Skip to content

Commit c4ce760

Browse files
author
Bradley Wood
committed
x86: Improve String.hashCode for medium/long strings
This commit improves String.hashCode() performance by calling vectorizedHashCodeLoopHelper(...) to process longer strings in an unrolled vectorized loop with larger vector lengths. Minor tweaks were also made, including a change to how multiplication vectors are loaded. Signed-off-by: Bradley Wood <bradley.wood@ibm.com>
1 parent 3a88c81 commit c4ce760

File tree

1 file changed

+63
-11
lines changed

1 file changed

+63
-11
lines changed

runtime/compiler/x/codegen/J9TreeEvaluator.cpp

Lines changed: 63 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -9372,7 +9372,10 @@ TR::Register* J9::X86::TreeEvaluator::inlineMathFma(TR::Node* node, TR::CodeGene
93729372
return result;
93739373
}
93749374

9375-
// Convert serial String.hashCode computation into vectorization copy and implement with SSE instruction
9375+
// Convert serial String.hashCode computation into vectorization copy and implement with vector instructions.
9376+
// This algorithm processes 4 characters at a time in a vectorized loop and prepends zeros to the input characters
9377+
// to handle residual elements. To improve performance for large strings, this code calls vectorizedHashCodeLoopHelper
9378+
// and makes use of loop unrolling and larger vector lengths.
93769379
//
93779380
// Conversion process example:
93789381
//
@@ -9464,6 +9467,9 @@ static TR::Register* inlineStringHashCode(TR::Node* node, bool isCompressed, TR:
94649467
{
94659468
TR_ASSERT(node->getChild(1)->getOpCodeValue() == TR::iconst && node->getChild(1)->getInt() == 0, "String hashcode offset can only be const zero.");
94669469

9470+
TR::VectorLength vl = cg->getMaxPreferredVectorLength();
9471+
static int32_t vectorSizes[3] = { 4, 8, 16 };
9472+
94679473
const int size = 4;
94689474
auto shift = isCompressed ? 0 : 1;
94699475

@@ -9472,29 +9478,52 @@ static TR::Register* inlineStringHashCode(TR::Node* node, bool isCompressed, TR:
94729478
auto index = cg->allocateRegister();
94739479
auto hash = cg->allocateRegister();
94749480
auto tmp = cg->allocateRegister();
9481+
auto loopLimit = cg->allocateRegister();
94759482
auto hashXMM = cg->allocateRegister(TR_VRF);
94769483
auto tmpXMM = cg->allocateRegister(TR_VRF);
94779484
auto multiplierXMM = cg->allocateRegister(TR_VRF);
94789485

94799486
auto begLabel = generateLabelSymbol(cg);
94809487
auto endLabel = generateLabelSymbol(cg);
94819488
auto loopLabel = generateLabelSymbol(cg);
9489+
auto bigLoopLabel = generateLabelSymbol(cg);
9490+
auto doneLabel = generateLabelSymbol(cg);
94829491
begLabel->setStartInternalControlFlow();
94839492
endLabel->setEndInternalControlFlow();
9484-
auto deps = generateRegisterDependencyConditions((uint8_t)6, (uint8_t)6, cg);
9493+
auto deps = generateRegisterDependencyConditions((uint8_t)8, (uint8_t)8, cg);
94859494
deps->addPreCondition(address, TR::RealRegister::NoReg, cg);
94869495
deps->addPreCondition(index, TR::RealRegister::NoReg, cg);
94879496
deps->addPreCondition(length, TR::RealRegister::NoReg, cg);
9497+
deps->addPreCondition(hash, TR::RealRegister::NoReg, cg);
9498+
deps->addPreCondition(loopLimit, TR::RealRegister::NoReg, cg);
94889499
deps->addPreCondition(multiplierXMM, TR::RealRegister::NoReg, cg);
94899500
deps->addPreCondition(tmpXMM, TR::RealRegister::NoReg, cg);
94909501
deps->addPreCondition(hashXMM, TR::RealRegister::NoReg, cg);
94919502
deps->addPostCondition(address, TR::RealRegister::NoReg, cg);
94929503
deps->addPostCondition(index, TR::RealRegister::NoReg, cg);
94939504
deps->addPostCondition(length, TR::RealRegister::NoReg, cg);
9505+
deps->addPostCondition(hash, TR::RealRegister::NoReg, cg);
9506+
deps->addPostCondition(loopLimit, TR::RealRegister::NoReg, cg);
94949507
deps->addPostCondition(multiplierXMM, TR::RealRegister::NoReg, cg);
94959508
deps->addPostCondition(tmpXMM, TR::RealRegister::NoReg, cg);
94969509
deps->addPostCondition(hashXMM, TR::RealRegister::NoReg, cg);
94979510

9511+
// Generate Main Loop; 4x Unrolled seems to yield the best performance for large arrays
9512+
#ifdef TR_TARGET_64BIT
9513+
static char *unrollVar = feGetEnv("TR_setInlineStringHashCodeUnrollCount");
9514+
int32_t unrollCount = unrollVar ? atoi(unrollVar) : 4;
9515+
#else
9516+
int32_t unrollCount = 1;
9517+
#endif
9518+
9519+
int32_t charsPerMainLoopIteration = unrollCount * vectorSizes[vl - TR::VectorLength128];
9520+
9521+
generateRegRegInstruction(TR::InstOpCode::XORRegReg(), node, index, index, cg);
9522+
generateRegRegInstruction(TR::InstOpCode::XORRegReg(), node, hash, hash, cg);
9523+
generateRegRegInstruction(TR::InstOpCode::MOV4RegReg, node, loopLimit, length, cg);
9524+
generateRegImmInstruction(TR::InstOpCode::AND4RegImm4, node, loopLimit, charsPerMainLoopIteration - 1, cg);
9525+
generateLabelInstruction(TR::InstOpCode::JE4, node, bigLoopLabel, cg);
9526+
94989527
generateRegRegInstruction(TR::InstOpCode::MOV4RegReg, node, index, length, cg);
94999528
generateRegImmInstruction(TR::InstOpCode::AND4RegImms, node, index, size-1, cg); // mod size
95009529
generateRegMemInstruction(TR::InstOpCode::CMOVE4RegMem, node, index, generateX86MemoryReference(cg->findOrCreate4ByteConstant(node, size), cg), cg);
@@ -9523,19 +9552,23 @@ static TR::Register* inlineStringHashCode(TR::Node* node, bool isCompressed, TR:
95239552

95249553
// Reduction Loop
95259554
{
9526-
static uint32_t multiplier[] = { 31*31*31*31, 31*31*31*31, 31*31*31*31, 31*31*31*31 };
95279555
generateLabelInstruction(TR::InstOpCode::label, node, begLabel, cg);
9528-
generateRegRegInstruction(TR::InstOpCode::CMP4RegReg, node, index, length, cg);
9556+
generateRegRegInstruction(TR::InstOpCode::CMP4RegReg, node, index, loopLimit, cg);
95299557
generateLabelInstruction(TR::InstOpCode::JGE4, node, endLabel, cg);
9530-
generateRegMemInstruction(TR::InstOpCode::MOVDQURegMem, node, multiplierXMM, generateX86MemoryReference(cg->findOrCreate16ByteConstant(node, multiplier), cg), cg);
9558+
9559+
TR::Register *broadcastReg = TR::TreeEvaluator::loadConstant(node, 31*31*31*31, TR_RematerializableInt, cg);
9560+
generateRegRegInstruction(TR::InstOpCode::MOVDRegReg4, node, multiplierXMM, broadcastReg, cg);
9561+
TR::TreeEvaluator::broadcastHelper(node, multiplierXMM, vl, TR::Int32, cg);
9562+
cg->stopUsingRegister(broadcastReg);
9563+
95319564
generateLabelInstruction(TR::InstOpCode::label, node, loopLabel, cg);
95329565
generateRegRegInstruction(TR::InstOpCode::PMULLDRegReg, node, hashXMM, multiplierXMM, cg);
95339566
generateRegMemInstruction(isCompressed ? TR::InstOpCode::PMOVZXBDRegMem : TR::InstOpCode::PMOVZXWDRegMem, node, tmpXMM, generateX86MemoryReference(address, index, shift, TR::Compiler->om.contiguousArrayHeaderSizeInBytes(), cg), cg);
95349567
generateRegImmInstruction(TR::InstOpCode::ADD4RegImms, node, index, 4, cg);
95359568
generateRegRegInstruction(TR::InstOpCode::PADDDRegReg, node, hashXMM, tmpXMM, cg);
9536-
generateRegRegInstruction(TR::InstOpCode::CMP4RegReg, node, index, length, cg);
9569+
generateRegRegInstruction(TR::InstOpCode::CMP4RegReg, node, index, loopLimit, cg);
95379570
generateLabelInstruction(TR::InstOpCode::JL4, node, loopLabel, cg);
9538-
generateLabelInstruction(TR::InstOpCode::label, node, endLabel, deps, cg);
9571+
generateLabelInstruction(TR::InstOpCode::label, node, endLabel, cg);
95399572
}
95409573

95419574
// Finalization
@@ -9550,6 +9583,17 @@ static TR::Register* inlineStringHashCode(TR::Node* node, bool isCompressed, TR:
95509583

95519584
generateRegRegInstruction(TR::InstOpCode::MOVDReg4Reg, node, hash, hashXMM, cg);
95529585

9586+
// Skip secondary loop for small arrays
9587+
generateRegImmInstruction(TR::InstOpCode::CMPRegImm4(), node, length, charsPerMainLoopIteration, cg);
9588+
generateLabelInstruction(TR::InstOpCode::JL4, node, doneLabel, cg);
9589+
9590+
generateLabelInstruction(TR::InstOpCode::label, node, bigLoopLabel, cg);
9591+
9592+
// Secondary unrolled vectorized loop with larger vector lengths
9593+
TR::TreeEvaluator::vectorizedHashCodeLoopHelper(node, isCompressed ? TR::Int8 : TR::Int16, vl, false, hash, hash, index, length, address, unrollCount, cg);
9594+
9595+
generateLabelInstruction(TR::InstOpCode::label, node, doneLabel, deps, cg);
9596+
95539597
cg->stopUsingRegister(index);
95549598
cg->stopUsingRegister(tmp);
95559599
cg->stopUsingRegister(hashXMM);
@@ -9608,7 +9652,7 @@ J9::X86::TreeEvaluator::vectorizedHashCodeReductionHelper(TR::Node* node, TR::Re
96089652
// then proceed to do horizontal reduction
96099653
for (int32_t i = 1; i < numVectors; i++)
96109654
{
9611-
OMR::X86::Encoding opcodeEncoding = opcode.getSIMDEncoding(&cg->comp()->target().cpu, vl);
9655+
OMR::X86::Encoding opcodeEncoding = opcode.getSIMDEncoding(&cg->comp()->target().cpu, vl, vl == TR::VectorLength512);
96129656
generateRegRegInstruction(TR::InstOpCode::PADDDRegReg, node, vectorRegVRF, vectorRegisters[i], cg, opcodeEncoding);
96139657
}
96149658

@@ -9625,7 +9669,7 @@ J9::X86::TreeEvaluator::vectorizedHashCodeReductionHelper(TR::Node* node, TR::Re
96259669
case TR::VectorLength256:
96269670
// extract 128 bits from ymm and store in xmm, then perform vertical operation
96279671
generateRegRegImmInstruction(TR::InstOpCode::VEXTRACTF128RegRegImm1, node, tmpVectorRegVRF, vectorRegVRF, 0xFF, cg);
9628-
generateRegRegInstruction(opcode.getMnemonic(), node, vectorRegVRF, tmpVectorRegVRF, cg, opcode.getSIMDEncoding(&cg->comp()->target().cpu, TR::VectorLength128));
9672+
generateRegRegInstruction(opcode.getMnemonic(), node, vectorRegVRF, tmpVectorRegVRF, cg, OMR::X86::VEX_L128);
96299673
// Fallthrough to treat remaining result as 128-bit vector
96309674
case TR::VectorLength128:
96319675
generateRegRegImmInstruction(TR::InstOpCode::PSHUFDRegRegImm1, node, tmpVectorRegVRF, vectorRegVRF, 0x0e, cg);
@@ -9732,7 +9776,11 @@ J9::X86::TreeEvaluator::vectorizedHashCodeLoopHelper(TR::Node *node,
97329776
begLabel->setStartInternalControlFlow();
97339777
endLabel->setEndInternalControlFlow();
97349778

9735-
generateRegRegInstruction(TR::InstOpCode::MOVRegReg(), node, result, initialHash, cg);
9779+
if (result != initialHash)
9780+
{
9781+
generateRegRegInstruction(TR::InstOpCode::MOVRegReg(), node, result, initialHash, cg);
9782+
}
9783+
97369784
generateLabelInstruction(TR::InstOpCode::label, node, begLabel, cg);
97379785
generateRegRegInstruction(TR::InstOpCode::MOVRegReg(), node, tmp, length, cg);
97389786
generateRegImmInstruction(TR::InstOpCode::AND4RegImm4, node, tmp, ~(numElements - 1), cg);
@@ -9750,7 +9798,11 @@ J9::X86::TreeEvaluator::vectorizedHashCodeLoopHelper(TR::Node *node,
97509798
int32_t multiplier31PowNData[16];
97519799
// Fill multiplier array with 31^numElements
97529800
std::fill_n(multiplier31PowNData, 16, powersOf31[64 - numElements]);
9753-
generateRegMemInstruction(TR::InstOpCode::MOVDQURegMem, node, multiplierVRF, generateX86MemoryReference(cg->findOrCreateConstantDataSnippet(node, multiplier31PowNData, vectorSizeElements * sizeof(int32_t)), cg), cg, vectorEncoding);
9801+
9802+
TR::Register *broadcastReg = TR::TreeEvaluator::loadConstant(node, powersOf31[64 - numElements], TR_RematerializableInt, cg);
9803+
generateRegRegInstruction(TR::InstOpCode::MOVDRegReg4, node, multiplierVRF, broadcastReg, cg);
9804+
TR::TreeEvaluator::broadcastHelper(node, multiplierVRF, vl, TR::Int32, cg);
9805+
cg->stopUsingRegister(broadcastReg);
97549806

97559807
for (int32_t i = 0; i < unrollCount; i++)
97569808
{

0 commit comments

Comments
 (0)