Skip to content

Commit 768f1b1

Browse files
author
Bradley Wood
committed
x86: Improve String.hashCode for medium/long strings
This commit improves String.hashCode() performance by calling vectorizedHashCodeLoopHelper(...) to process longer strings in an unrolled vectorized loop with larger vector lengths. Minor tweaks were also made, including changing how the multiplier vectors are loaded. Signed-off-by: Bradley Wood <bradley.wood@ibm.com>
1 parent 3a88c81 commit 768f1b1

File tree

1 file changed

+59
-11
lines changed

1 file changed

+59
-11
lines changed

runtime/compiler/x/codegen/J9TreeEvaluator.cpp

Lines changed: 59 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -9372,7 +9372,10 @@ TR::Register* J9::X86::TreeEvaluator::inlineMathFma(TR::Node* node, TR::CodeGene
93729372
return result;
93739373
}
93749374

9375-
// Convert serial String.hashCode computation into vectorization copy and implement with SSE instruction
9375+
// Convert the serial String.hashCode computation into a vectorized form implemented with vector instructions.
9376+
// This algorithm processes 4 characters at a time in a vectorized loop and prepends zeros to the input characters
9377+
// to handle residual elements. To improve performance for large strings, this code calls vectorizedHashCodeLoopHelper
9378+
// and makes use of loop unrolling and larger vector lengths.
93769379
//
93779380
// Conversion process example:
93789381
//
@@ -9464,6 +9467,9 @@ static TR::Register* inlineStringHashCode(TR::Node* node, bool isCompressed, TR:
94649467
{
94659468
TR_ASSERT(node->getChild(1)->getOpCodeValue() == TR::iconst && node->getChild(1)->getInt() == 0, "String hashcode offset can only be const zero.");
94669469

9470+
TR::VectorLength vl = cg->getMaxPreferredVectorLength();
9471+
static int32_t vectorSizes[3] = { 4, 8, 16 };
9472+
94679473
const int size = 4;
94689474
auto shift = isCompressed ? 0 : 1;
94699475

@@ -9472,29 +9478,48 @@ static TR::Register* inlineStringHashCode(TR::Node* node, bool isCompressed, TR:
94729478
auto index = cg->allocateRegister();
94739479
auto hash = cg->allocateRegister();
94749480
auto tmp = cg->allocateRegister();
9481+
auto loopLimit = cg->allocateRegister();
94759482
auto hashXMM = cg->allocateRegister(TR_VRF);
94769483
auto tmpXMM = cg->allocateRegister(TR_VRF);
94779484
auto multiplierXMM = cg->allocateRegister(TR_VRF);
94789485

94799486
auto begLabel = generateLabelSymbol(cg);
94809487
auto endLabel = generateLabelSymbol(cg);
94819488
auto loopLabel = generateLabelSymbol(cg);
9489+
auto bigLoopLabel = generateLabelSymbol(cg);
9490+
auto doneLabel = generateLabelSymbol(cg);
94829491
begLabel->setStartInternalControlFlow();
94839492
endLabel->setEndInternalControlFlow();
94849493
auto deps = generateRegisterDependencyConditions((uint8_t)6, (uint8_t)6, cg);
94859494
deps->addPreCondition(address, TR::RealRegister::NoReg, cg);
94869495
deps->addPreCondition(index, TR::RealRegister::NoReg, cg);
9487-
deps->addPreCondition(length, TR::RealRegister::NoReg, cg);
9496+
deps->addPreCondition(loopLimit, TR::RealRegister::NoReg, cg);
94889497
deps->addPreCondition(multiplierXMM, TR::RealRegister::NoReg, cg);
94899498
deps->addPreCondition(tmpXMM, TR::RealRegister::NoReg, cg);
94909499
deps->addPreCondition(hashXMM, TR::RealRegister::NoReg, cg);
94919500
deps->addPostCondition(address, TR::RealRegister::NoReg, cg);
94929501
deps->addPostCondition(index, TR::RealRegister::NoReg, cg);
9493-
deps->addPostCondition(length, TR::RealRegister::NoReg, cg);
9502+
deps->addPostCondition(loopLimit, TR::RealRegister::NoReg, cg);
94949503
deps->addPostCondition(multiplierXMM, TR::RealRegister::NoReg, cg);
94959504
deps->addPostCondition(tmpXMM, TR::RealRegister::NoReg, cg);
94969505
deps->addPostCondition(hashXMM, TR::RealRegister::NoReg, cg);
94979506

9507+
// Generate Main Loop; 4x Unrolled seems to yield the best performance for large arrays
9508+
#ifdef TR_TARGET_64BIT
9509+
static char *unrollVar = feGetEnv("TR_setInlineStringHashCodeUnrollCount");
9510+
int32_t unrollCount = unrollVar ? atoi(unrollVar) : 4;
9511+
#else
9512+
int32_t unrollCount = 1;
9513+
#endif
9514+
9515+
int32_t charsPerMainLoopIteration = unrollCount * vectorSizes[vl - TR::VectorLength128];
9516+
9517+
generateRegRegInstruction(TR::InstOpCode::XORRegReg(), node, index, index, cg);
9518+
generateRegRegInstruction(TR::InstOpCode::XORRegReg(), node, hash, hash, cg);
9519+
generateRegRegInstruction(TR::InstOpCode::MOV4RegReg, node, loopLimit, length, cg);
9520+
generateRegImmInstruction(TR::InstOpCode::AND4RegImm4, node, loopLimit, charsPerMainLoopIteration - 1, cg);
9521+
generateLabelInstruction(TR::InstOpCode::JE4, node, bigLoopLabel, cg);
9522+
94989523
generateRegRegInstruction(TR::InstOpCode::MOV4RegReg, node, index, length, cg);
94999524
generateRegImmInstruction(TR::InstOpCode::AND4RegImms, node, index, size-1, cg); // mod size
95009525
generateRegMemInstruction(TR::InstOpCode::CMOVE4RegMem, node, index, generateX86MemoryReference(cg->findOrCreate4ByteConstant(node, size), cg), cg);
@@ -9523,17 +9548,21 @@ static TR::Register* inlineStringHashCode(TR::Node* node, bool isCompressed, TR:
95239548

95249549
// Reduction Loop
95259550
{
9526-
static uint32_t multiplier[] = { 31*31*31*31, 31*31*31*31, 31*31*31*31, 31*31*31*31 };
95279551
generateLabelInstruction(TR::InstOpCode::label, node, begLabel, cg);
9528-
generateRegRegInstruction(TR::InstOpCode::CMP4RegReg, node, index, length, cg);
9552+
generateRegRegInstruction(TR::InstOpCode::CMP4RegReg, node, index, loopLimit, cg);
95299553
generateLabelInstruction(TR::InstOpCode::JGE4, node, endLabel, cg);
9530-
generateRegMemInstruction(TR::InstOpCode::MOVDQURegMem, node, multiplierXMM, generateX86MemoryReference(cg->findOrCreate16ByteConstant(node, multiplier), cg), cg);
9554+
9555+
TR::Register *broadcastReg = TR::TreeEvaluator::loadConstant(node, 31*31*31*31, TR_RematerializableInt, cg);
9556+
generateRegRegInstruction(TR::InstOpCode::MOVDRegReg4, node, multiplierXMM, broadcastReg, cg);
9557+
TR::TreeEvaluator::broadcastHelper(node, multiplierXMM, vl, TR::Int32, cg);
9558+
cg->stopUsingRegister(broadcastReg);
9559+
95319560
generateLabelInstruction(TR::InstOpCode::label, node, loopLabel, cg);
95329561
generateRegRegInstruction(TR::InstOpCode::PMULLDRegReg, node, hashXMM, multiplierXMM, cg);
95339562
generateRegMemInstruction(isCompressed ? TR::InstOpCode::PMOVZXBDRegMem : TR::InstOpCode::PMOVZXWDRegMem, node, tmpXMM, generateX86MemoryReference(address, index, shift, TR::Compiler->om.contiguousArrayHeaderSizeInBytes(), cg), cg);
95349563
generateRegImmInstruction(TR::InstOpCode::ADD4RegImms, node, index, 4, cg);
95359564
generateRegRegInstruction(TR::InstOpCode::PADDDRegReg, node, hashXMM, tmpXMM, cg);
9536-
generateRegRegInstruction(TR::InstOpCode::CMP4RegReg, node, index, length, cg);
9565+
generateRegRegInstruction(TR::InstOpCode::CMP4RegReg, node, index, loopLimit, cg);
95379566
generateLabelInstruction(TR::InstOpCode::JL4, node, loopLabel, cg);
95389567
generateLabelInstruction(TR::InstOpCode::label, node, endLabel, deps, cg);
95399568
}
@@ -9550,6 +9579,17 @@ static TR::Register* inlineStringHashCode(TR::Node* node, bool isCompressed, TR:
95509579

95519580
generateRegRegInstruction(TR::InstOpCode::MOVDReg4Reg, node, hash, hashXMM, cg);
95529581

9582+
// Skip secondary loop for small arrays
9583+
generateRegImmInstruction(TR::InstOpCode::CMPRegImm4(), node, length, charsPerMainLoopIteration, cg);
9584+
generateLabelInstruction(TR::InstOpCode::JL4, node, doneLabel, cg);
9585+
9586+
generateLabelInstruction(TR::InstOpCode::label, node, bigLoopLabel, cg);
9587+
9588+
// Secondary unrolled vectorized loop with larger vector lengths
9589+
TR::TreeEvaluator::vectorizedHashCodeLoopHelper(node, isCompressed ? TR::Int8 : TR::Int16, vl, false, hash, hash, index, length, address, unrollCount, cg);
9590+
9591+
generateLabelInstruction(TR::InstOpCode::label, node, doneLabel, cg);
9592+
95539593
cg->stopUsingRegister(index);
95549594
cg->stopUsingRegister(tmp);
95559595
cg->stopUsingRegister(hashXMM);
@@ -9608,7 +9648,7 @@ J9::X86::TreeEvaluator::vectorizedHashCodeReductionHelper(TR::Node* node, TR::Re
96089648
// then proceed to do horizontal reduction
96099649
for (int32_t i = 1; i < numVectors; i++)
96109650
{
9611-
OMR::X86::Encoding opcodeEncoding = opcode.getSIMDEncoding(&cg->comp()->target().cpu, vl);
9651+
OMR::X86::Encoding opcodeEncoding = opcode.getSIMDEncoding(&cg->comp()->target().cpu, vl, vl == TR::VectorLength512);
96129652
generateRegRegInstruction(TR::InstOpCode::PADDDRegReg, node, vectorRegVRF, vectorRegisters[i], cg, opcodeEncoding);
96139653
}
96149654

@@ -9625,7 +9665,7 @@ J9::X86::TreeEvaluator::vectorizedHashCodeReductionHelper(TR::Node* node, TR::Re
96259665
case TR::VectorLength256:
96269666
// extract 128 bits from ymm and store in xmm, then perform vertical operation
96279667
generateRegRegImmInstruction(TR::InstOpCode::VEXTRACTF128RegRegImm1, node, tmpVectorRegVRF, vectorRegVRF, 0xFF, cg);
9628-
generateRegRegInstruction(opcode.getMnemonic(), node, vectorRegVRF, tmpVectorRegVRF, cg, opcode.getSIMDEncoding(&cg->comp()->target().cpu, TR::VectorLength128));
9668+
generateRegRegInstruction(opcode.getMnemonic(), node, vectorRegVRF, tmpVectorRegVRF, cg, OMR::X86::VEX_L128);
96299669
// Fallthrough to treat remaining result as 128-bit vector
96309670
case TR::VectorLength128:
96319671
generateRegRegImmInstruction(TR::InstOpCode::PSHUFDRegRegImm1, node, tmpVectorRegVRF, vectorRegVRF, 0x0e, cg);
@@ -9732,7 +9772,11 @@ J9::X86::TreeEvaluator::vectorizedHashCodeLoopHelper(TR::Node *node,
97329772
begLabel->setStartInternalControlFlow();
97339773
endLabel->setEndInternalControlFlow();
97349774

9735-
generateRegRegInstruction(TR::InstOpCode::MOVRegReg(), node, result, initialHash, cg);
9775+
if (result != initialHash)
9776+
{
9777+
generateRegRegInstruction(TR::InstOpCode::MOVRegReg(), node, result, initialHash, cg);
9778+
}
9779+
97369780
generateLabelInstruction(TR::InstOpCode::label, node, begLabel, cg);
97379781
generateRegRegInstruction(TR::InstOpCode::MOVRegReg(), node, tmp, length, cg);
97389782
generateRegImmInstruction(TR::InstOpCode::AND4RegImm4, node, tmp, ~(numElements - 1), cg);
@@ -9750,7 +9794,11 @@ J9::X86::TreeEvaluator::vectorizedHashCodeLoopHelper(TR::Node *node,
97509794
int32_t multiplier31PowNData[16];
97519795
// Fill multiplier array with 31^numElements
97529796
std::fill_n(multiplier31PowNData, 16, powersOf31[64 - numElements]);
9753-
generateRegMemInstruction(TR::InstOpCode::MOVDQURegMem, node, multiplierVRF, generateX86MemoryReference(cg->findOrCreateConstantDataSnippet(node, multiplier31PowNData, vectorSizeElements * sizeof(int32_t)), cg), cg, vectorEncoding);
9797+
9798+
TR::Register *broadcastReg = TR::TreeEvaluator::loadConstant(node, powersOf31[64 - numElements], TR_RematerializableInt, cg);
9799+
generateRegRegInstruction(TR::InstOpCode::MOVDRegReg4, node, multiplierVRF, broadcastReg, cg);
9800+
TR::TreeEvaluator::broadcastHelper(node, multiplierVRF, vl, TR::Int32, cg);
9801+
cg->stopUsingRegister(broadcastReg);
97549802

97559803
for (int32_t i = 0; i < unrollCount; i++)
97569804
{

0 commit comments

Comments
 (0)