Skip to content

Commit 2168228

Browse files
author
Bradley Wood
committed
x86: Improve String.hashCode for medium/long strings
This commit improves String.hashCode() performance by calling vectorizedHashCodeLoopHelper(...) to process longer strings in an unrolled vectorized loop with larger vector lengths. Minor tweaks were also made, including a change to how multiplication vectors are loaded. Signed-off-by: Bradley Wood <bradley.wood@ibm.com>
1 parent 3a88c81 commit 2168228

File tree

1 file changed

+60
-11
lines changed

1 file changed

+60
-11
lines changed

runtime/compiler/x/codegen/J9TreeEvaluator.cpp

Lines changed: 60 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -9372,7 +9372,10 @@ TR::Register* J9::X86::TreeEvaluator::inlineMathFma(TR::Node* node, TR::CodeGene
93729372
return result;
93739373
}
93749374

9375-
// Convert serial String.hashCode computation into vectorization copy and implement with SSE instruction
9375+
// Convert serial String.hashCode computation into vectorization copy and implement with vector instructions.
9376+
// This algorithm processes 4 characters at a time in a vectorized loop and prepends zeros to the input characters
9377+
// to handle residual elements. To improve performance for large strings, this code calls vectorizedHashCodeLoopHelper
9378+
// and makes use of loop unrolling and larger vector lengths.
93769379
//
93779380
// Conversion process example:
93789381
//
@@ -9464,6 +9467,9 @@ static TR::Register* inlineStringHashCode(TR::Node* node, bool isCompressed, TR:
94649467
{
94659468
TR_ASSERT(node->getChild(1)->getOpCodeValue() == TR::iconst && node->getChild(1)->getInt() == 0, "String hashcode offset can only be const zero.");
94669469

9470+
TR::VectorLength vl = cg->getMaxPreferredVectorLength();
9471+
static int32_t vectorSizes[3] = { 4, 8, 16 };
9472+
94679473
const int size = 4;
94689474
auto shift = isCompressed ? 0 : 1;
94699475

@@ -9472,29 +9478,49 @@ static TR::Register* inlineStringHashCode(TR::Node* node, bool isCompressed, TR:
94729478
auto index = cg->allocateRegister();
94739479
auto hash = cg->allocateRegister();
94749480
auto tmp = cg->allocateRegister();
9481+
auto loopLimit = cg->allocateRegister();
94759482
auto hashXMM = cg->allocateRegister(TR_VRF);
94769483
auto tmpXMM = cg->allocateRegister(TR_VRF);
94779484
auto multiplierXMM = cg->allocateRegister(TR_VRF);
94789485

94799486
auto begLabel = generateLabelSymbol(cg);
94809487
auto endLabel = generateLabelSymbol(cg);
94819488
auto loopLabel = generateLabelSymbol(cg);
9489+
auto bigLoopLabel = generateLabelSymbol(cg);
9490+
auto doneLabel = generateLabelSymbol(cg);
94829491
begLabel->setStartInternalControlFlow();
94839492
endLabel->setEndInternalControlFlow();
94849493
auto deps = generateRegisterDependencyConditions((uint8_t)6, (uint8_t)6, cg);
94859494
deps->addPreCondition(address, TR::RealRegister::NoReg, cg);
94869495
deps->addPreCondition(index, TR::RealRegister::NoReg, cg);
9487-
deps->addPreCondition(length, TR::RealRegister::NoReg, cg);
9496+
deps->addPreCondition(loopLimit, TR::RealRegister::NoReg, cg);
94889497
deps->addPreCondition(multiplierXMM, TR::RealRegister::NoReg, cg);
94899498
deps->addPreCondition(tmpXMM, TR::RealRegister::NoReg, cg);
94909499
deps->addPreCondition(hashXMM, TR::RealRegister::NoReg, cg);
94919500
deps->addPostCondition(address, TR::RealRegister::NoReg, cg);
94929501
deps->addPostCondition(index, TR::RealRegister::NoReg, cg);
9493-
deps->addPostCondition(length, TR::RealRegister::NoReg, cg);
9502+
deps->addPostCondition(loopLimit, TR::RealRegister::NoReg, cg);
94949503
deps->addPostCondition(multiplierXMM, TR::RealRegister::NoReg, cg);
94959504
deps->addPostCondition(tmpXMM, TR::RealRegister::NoReg, cg);
94969505
deps->addPostCondition(hashXMM, TR::RealRegister::NoReg, cg);
94979506

9507+
// Generate Main Loop; 4x Unrolled seems to yield the best performance for large arrays
9508+
static char *unrollVar = feGetEnv("TR_setInlineStringHashCodeUnrollCount");
9509+
9510+
#ifdef TR_TARGET_64BIT
9511+
int32_t unrollCount = unrollVar ? atoi(unrollVar) : 4;
9512+
#else
9513+
int32_t unrollCount = 1;
9514+
#endif
9515+
9516+
int32_t charsPerMainLoopIteration = unrollCount * vectorSizes[vl];
9517+
9518+
generateRegRegInstruction(TR::InstOpCode::XORRegReg(), node, index, index, cg);
9519+
generateRegRegInstruction(TR::InstOpCode::XORRegReg(), node, hash, hash, cg);
9520+
generateRegRegInstruction(TR::InstOpCode::MOV4RegReg, node, loopLimit, length, cg);
9521+
generateRegImmInstruction(TR::InstOpCode::AND4RegImm4, node, loopLimit, charsPerMainLoopIteration - 1, cg);
9522+
generateLabelInstruction(TR::InstOpCode::JE4, node, bigLoopLabel, cg);
9523+
94989524
generateRegRegInstruction(TR::InstOpCode::MOV4RegReg, node, index, length, cg);
94999525
generateRegImmInstruction(TR::InstOpCode::AND4RegImms, node, index, size-1, cg); // mod size
95009526
generateRegMemInstruction(TR::InstOpCode::CMOVE4RegMem, node, index, generateX86MemoryReference(cg->findOrCreate4ByteConstant(node, size), cg), cg);
@@ -9523,17 +9549,21 @@ static TR::Register* inlineStringHashCode(TR::Node* node, bool isCompressed, TR:
95239549

95249550
// Reduction Loop
95259551
{
9526-
static uint32_t multiplier[] = { 31*31*31*31, 31*31*31*31, 31*31*31*31, 31*31*31*31 };
95279552
generateLabelInstruction(TR::InstOpCode::label, node, begLabel, cg);
9528-
generateRegRegInstruction(TR::InstOpCode::CMP4RegReg, node, index, length, cg);
9553+
generateRegRegInstruction(TR::InstOpCode::CMP4RegReg, node, index, loopLimit, cg);
95299554
generateLabelInstruction(TR::InstOpCode::JGE4, node, endLabel, cg);
9530-
generateRegMemInstruction(TR::InstOpCode::MOVDQURegMem, node, multiplierXMM, generateX86MemoryReference(cg->findOrCreate16ByteConstant(node, multiplier), cg), cg);
9555+
9556+
TR::Register *broadcastReg = TR::TreeEvaluator::loadConstant(node, 31*31*31*31, TR_RematerializableInt, cg);
9557+
generateRegRegInstruction(TR::InstOpCode::MOVDRegReg4, node, multiplierXMM, broadcastReg, cg);
9558+
TR::TreeEvaluator::broadcastHelper(node, multiplierXMM, vl, TR::Int32, cg);
9559+
cg->stopUsingRegister(broadcastReg);
9560+
95319561
generateLabelInstruction(TR::InstOpCode::label, node, loopLabel, cg);
95329562
generateRegRegInstruction(TR::InstOpCode::PMULLDRegReg, node, hashXMM, multiplierXMM, cg);
95339563
generateRegMemInstruction(isCompressed ? TR::InstOpCode::PMOVZXBDRegMem : TR::InstOpCode::PMOVZXWDRegMem, node, tmpXMM, generateX86MemoryReference(address, index, shift, TR::Compiler->om.contiguousArrayHeaderSizeInBytes(), cg), cg);
95349564
generateRegImmInstruction(TR::InstOpCode::ADD4RegImms, node, index, 4, cg);
95359565
generateRegRegInstruction(TR::InstOpCode::PADDDRegReg, node, hashXMM, tmpXMM, cg);
9536-
generateRegRegInstruction(TR::InstOpCode::CMP4RegReg, node, index, length, cg);
9566+
generateRegRegInstruction(TR::InstOpCode::CMP4RegReg, node, index, loopLimit, cg);
95379567
generateLabelInstruction(TR::InstOpCode::JL4, node, loopLabel, cg);
95389568
generateLabelInstruction(TR::InstOpCode::label, node, endLabel, deps, cg);
95399569
}
@@ -9550,6 +9580,17 @@ static TR::Register* inlineStringHashCode(TR::Node* node, bool isCompressed, TR:
95509580

95519581
generateRegRegInstruction(TR::InstOpCode::MOVDReg4Reg, node, hash, hashXMM, cg);
95529582

9583+
// Skip secondary loop for small arrays
9584+
generateRegImmInstruction(TR::InstOpCode::CMPRegImm4(), node, length, charsPerMainLoopIteration, cg);
9585+
generateLabelInstruction(TR::InstOpCode::JL4, node, doneLabel, cg);
9586+
9587+
generateLabelInstruction(TR::InstOpCode::label, node, bigLoopLabel, cg);
9588+
9589+
// Secondary unrolled vectorized loop with larger vector lengths
9590+
TR::TreeEvaluator::vectorizedHashCodeLoopHelper(node, isCompressed ? TR::Int8 : TR::Int16, vl, false, hash, hash, index, length, address, unrollCount, cg);
9591+
9592+
generateLabelInstruction(TR::InstOpCode::label, node, doneLabel, cg);
9593+
95539594
cg->stopUsingRegister(index);
95549595
cg->stopUsingRegister(tmp);
95559596
cg->stopUsingRegister(hashXMM);
@@ -9608,7 +9649,7 @@ J9::X86::TreeEvaluator::vectorizedHashCodeReductionHelper(TR::Node* node, TR::Re
96089649
// then proceed to do horizontal reduction
96099650
for (int32_t i = 1; i < numVectors; i++)
96109651
{
9611-
OMR::X86::Encoding opcodeEncoding = opcode.getSIMDEncoding(&cg->comp()->target().cpu, vl);
9652+
OMR::X86::Encoding opcodeEncoding = opcode.getSIMDEncoding(&cg->comp()->target().cpu, vl, vl == TR::VectorLength512);
96129653
generateRegRegInstruction(TR::InstOpCode::PADDDRegReg, node, vectorRegVRF, vectorRegisters[i], cg, opcodeEncoding);
96139654
}
96149655

@@ -9625,7 +9666,7 @@ J9::X86::TreeEvaluator::vectorizedHashCodeReductionHelper(TR::Node* node, TR::Re
96259666
case TR::VectorLength256:
96269667
// extract 128 bits from ymm and store in xmm, then perform vertical operation
96279668
generateRegRegImmInstruction(TR::InstOpCode::VEXTRACTF128RegRegImm1, node, tmpVectorRegVRF, vectorRegVRF, 0xFF, cg);
9628-
generateRegRegInstruction(opcode.getMnemonic(), node, vectorRegVRF, tmpVectorRegVRF, cg, opcode.getSIMDEncoding(&cg->comp()->target().cpu, TR::VectorLength128));
9669+
generateRegRegInstruction(opcode.getMnemonic(), node, vectorRegVRF, tmpVectorRegVRF, cg, OMR::X86::VEX_L128);
96299670
// Fallthrough to treat remaining result as 128-bit vector
96309671
case TR::VectorLength128:
96319672
generateRegRegImmInstruction(TR::InstOpCode::PSHUFDRegRegImm1, node, tmpVectorRegVRF, vectorRegVRF, 0x0e, cg);
@@ -9732,7 +9773,11 @@ J9::X86::TreeEvaluator::vectorizedHashCodeLoopHelper(TR::Node *node,
97329773
begLabel->setStartInternalControlFlow();
97339774
endLabel->setEndInternalControlFlow();
97349775

9735-
generateRegRegInstruction(TR::InstOpCode::MOVRegReg(), node, result, initialHash, cg);
9776+
if (result != initialHash)
9777+
{
9778+
generateRegRegInstruction(TR::InstOpCode::MOVRegReg(), node, result, initialHash, cg);
9779+
}
9780+
97369781
generateLabelInstruction(TR::InstOpCode::label, node, begLabel, cg);
97379782
generateRegRegInstruction(TR::InstOpCode::MOVRegReg(), node, tmp, length, cg);
97389783
generateRegImmInstruction(TR::InstOpCode::AND4RegImm4, node, tmp, ~(numElements - 1), cg);
@@ -9750,7 +9795,11 @@ J9::X86::TreeEvaluator::vectorizedHashCodeLoopHelper(TR::Node *node,
97509795
int32_t multiplier31PowNData[16];
97519796
// Fill multiplier array with 31^numElements
97529797
std::fill_n(multiplier31PowNData, 16, powersOf31[64 - numElements]);
9753-
generateRegMemInstruction(TR::InstOpCode::MOVDQURegMem, node, multiplierVRF, generateX86MemoryReference(cg->findOrCreateConstantDataSnippet(node, multiplier31PowNData, vectorSizeElements * sizeof(int32_t)), cg), cg, vectorEncoding);
9798+
9799+
TR::Register *broadcastReg = TR::TreeEvaluator::loadConstant(node, powersOf31[64 - numElements], TR_RematerializableInt, cg);
9800+
generateRegRegInstruction(TR::InstOpCode::MOVDRegReg4, node, multiplierVRF, broadcastReg, cg);
9801+
TR::TreeEvaluator::broadcastHelper(node, multiplierVRF, vl, TR::Int32, cg);
9802+
cg->stopUsingRegister(broadcastReg);
97549803

97559804
for (int32_t i = 0; i < unrollCount; i++)
97569805
{

0 commit comments

Comments
 (0)