@@ -9372,7 +9372,10 @@ TR::Register* J9::X86::TreeEvaluator::inlineMathFma(TR::Node* node, TR::CodeGene
93729372 return result;
93739373 }
93749374
9375- // Convert serial String.hashCode computation into vectorization copy and implement with SSE instruction
9375+ // Convert the serial String.hashCode computation into a vectorized form implemented with vector instructions.
9376+ // This algorithm processes 4 characters at a time in a vectorized loop and prepends zeros to the input characters
9377+ // to handle residual elements. To improve performance for large strings, this code calls vectorizedHashCodeLoopHelper
9378+ // and makes use of loop unrolling and larger vector lengths.
93769379//
93779380// Conversion process example:
93789381//
@@ -9464,6 +9467,9 @@ static TR::Register* inlineStringHashCode(TR::Node* node, bool isCompressed, TR:
94649467 {
94659468 TR_ASSERT(node->getChild(1)->getOpCodeValue() == TR::iconst && node->getChild(1)->getInt() == 0, "String hashcode offset can only be const zero.");
94669469
9470+ TR::VectorLength vl = cg->getMaxPreferredVectorLength();
9471+ static int32_t vectorSizes[3] = { 4, 8, 16 };
9472+
94679473 const int size = 4;
94689474 auto shift = isCompressed ? 0 : 1;
94699475
@@ -9472,29 +9478,48 @@ static TR::Register* inlineStringHashCode(TR::Node* node, bool isCompressed, TR:
94729478 auto index = cg->allocateRegister();
94739479 auto hash = cg->allocateRegister();
94749480 auto tmp = cg->allocateRegister();
9481+ auto loopLimit = cg->allocateRegister();
94759482 auto hashXMM = cg->allocateRegister(TR_VRF);
94769483 auto tmpXMM = cg->allocateRegister(TR_VRF);
94779484 auto multiplierXMM = cg->allocateRegister(TR_VRF);
94789485
94799486 auto begLabel = generateLabelSymbol(cg);
94809487 auto endLabel = generateLabelSymbol(cg);
94819488 auto loopLabel = generateLabelSymbol(cg);
9489+ auto bigLoopLabel = generateLabelSymbol(cg);
9490+ auto doneLabel = generateLabelSymbol(cg);
94829491 begLabel->setStartInternalControlFlow();
94839492 endLabel->setEndInternalControlFlow();
94849493 auto deps = generateRegisterDependencyConditions((uint8_t)6, (uint8_t)6, cg);
94859494 deps->addPreCondition(address, TR::RealRegister::NoReg, cg);
94869495 deps->addPreCondition(index, TR::RealRegister::NoReg, cg);
9487- deps->addPreCondition(length , TR::RealRegister::NoReg, cg);
9496+ deps->addPreCondition(loopLimit , TR::RealRegister::NoReg, cg);
94889497 deps->addPreCondition(multiplierXMM, TR::RealRegister::NoReg, cg);
94899498 deps->addPreCondition(tmpXMM, TR::RealRegister::NoReg, cg);
94909499 deps->addPreCondition(hashXMM, TR::RealRegister::NoReg, cg);
94919500 deps->addPostCondition(address, TR::RealRegister::NoReg, cg);
94929501 deps->addPostCondition(index, TR::RealRegister::NoReg, cg);
9493- deps->addPostCondition(length , TR::RealRegister::NoReg, cg);
9502+ deps->addPostCondition(loopLimit , TR::RealRegister::NoReg, cg);
94949503 deps->addPostCondition(multiplierXMM, TR::RealRegister::NoReg, cg);
94959504 deps->addPostCondition(tmpXMM, TR::RealRegister::NoReg, cg);
94969505 deps->addPostCondition(hashXMM, TR::RealRegister::NoReg, cg);
94979506
9507+ // Generate Main Loop; 4x Unrolled seems to yield the best performance for large arrays
9508+ #ifdef TR_TARGET_64BIT
9509+ static char *unrollVar = feGetEnv("TR_setInlineStringHashCodeUnrollCount");
9510+ int32_t unrollCount = unrollVar ? atoi(unrollVar) : 4;
9511+ #else
9512+ int32_t unrollCount = 1;
9513+ #endif
9514+
9515+ int32_t charsPerMainLoopIteration = unrollCount * vectorSizes[vl - TR::VectorLength128];
9516+
9517+ generateRegRegInstruction(TR::InstOpCode::XORRegReg(), node, index, index, cg);
9518+ generateRegRegInstruction(TR::InstOpCode::XORRegReg(), node, hash, hash, cg);
9519+ generateRegRegInstruction(TR::InstOpCode::MOV4RegReg, node, loopLimit, length, cg);
9520+ generateRegImmInstruction(TR::InstOpCode::AND4RegImm4, node, loopLimit, charsPerMainLoopIteration - 1, cg);
9521+ generateLabelInstruction(TR::InstOpCode::JE4, node, bigLoopLabel, cg);
9522+
94989523 generateRegRegInstruction(TR::InstOpCode::MOV4RegReg, node, index, length, cg);
94999524 generateRegImmInstruction(TR::InstOpCode::AND4RegImms, node, index, size-1, cg); // mod size
95009525 generateRegMemInstruction(TR::InstOpCode::CMOVE4RegMem, node, index, generateX86MemoryReference(cg->findOrCreate4ByteConstant(node, size), cg), cg);
@@ -9523,17 +9548,21 @@ static TR::Register* inlineStringHashCode(TR::Node* node, bool isCompressed, TR:
95239548
95249549 // Reduction Loop
95259550 {
9526- static uint32_t multiplier[] = { 31*31*31*31, 31*31*31*31, 31*31*31*31, 31*31*31*31 };
95279551 generateLabelInstruction(TR::InstOpCode::label, node, begLabel, cg);
9528- generateRegRegInstruction(TR::InstOpCode::CMP4RegReg, node, index, length , cg);
9552+ generateRegRegInstruction(TR::InstOpCode::CMP4RegReg, node, index, loopLimit , cg);
95299553 generateLabelInstruction(TR::InstOpCode::JGE4, node, endLabel, cg);
9530- generateRegMemInstruction(TR::InstOpCode::MOVDQURegMem, node, multiplierXMM, generateX86MemoryReference(cg->findOrCreate16ByteConstant(node, multiplier), cg), cg);
9554+
9555+ TR::Register *broadcastReg = TR::TreeEvaluator::loadConstant(node, 31*31*31*31, TR_RematerializableInt, cg);
9556+ generateRegRegInstruction(TR::InstOpCode::MOVDRegReg4, node, multiplierXMM, broadcastReg, cg);
9557+ TR::TreeEvaluator::broadcastHelper(node, multiplierXMM, vl, TR::Int32, cg);
9558+ cg->stopUsingRegister(broadcastReg);
9559+
95319560 generateLabelInstruction(TR::InstOpCode::label, node, loopLabel, cg);
95329561 generateRegRegInstruction(TR::InstOpCode::PMULLDRegReg, node, hashXMM, multiplierXMM, cg);
95339562 generateRegMemInstruction(isCompressed ? TR::InstOpCode::PMOVZXBDRegMem : TR::InstOpCode::PMOVZXWDRegMem, node, tmpXMM, generateX86MemoryReference(address, index, shift, TR::Compiler->om.contiguousArrayHeaderSizeInBytes(), cg), cg);
95349563 generateRegImmInstruction(TR::InstOpCode::ADD4RegImms, node, index, 4, cg);
95359564 generateRegRegInstruction(TR::InstOpCode::PADDDRegReg, node, hashXMM, tmpXMM, cg);
9536- generateRegRegInstruction(TR::InstOpCode::CMP4RegReg, node, index, length , cg);
9565+ generateRegRegInstruction(TR::InstOpCode::CMP4RegReg, node, index, loopLimit , cg);
95379566 generateLabelInstruction(TR::InstOpCode::JL4, node, loopLabel, cg);
95389567 generateLabelInstruction(TR::InstOpCode::label, node, endLabel, deps, cg);
95399568 }
@@ -9550,6 +9579,17 @@ static TR::Register* inlineStringHashCode(TR::Node* node, bool isCompressed, TR:
95509579
95519580 generateRegRegInstruction(TR::InstOpCode::MOVDReg4Reg, node, hash, hashXMM, cg);
95529581
9582+ // Skip secondary loop for small arrays
9583+ generateRegImmInstruction(TR::InstOpCode::CMPRegImm4(), node, length, charsPerMainLoopIteration, cg);
9584+ generateLabelInstruction(TR::InstOpCode::JL4, node, doneLabel, cg);
9585+
9586+ generateLabelInstruction(TR::InstOpCode::label, node, bigLoopLabel, cg);
9587+
9588+ // Secondary unrolled vectorized loop with larger vector lengths
9589+ TR::TreeEvaluator::vectorizedHashCodeLoopHelper(node, isCompressed ? TR::Int8 : TR::Int16, vl, false, hash, hash, index, length, address, unrollCount, cg);
9590+
9591+ generateLabelInstruction(TR::InstOpCode::label, node, doneLabel, cg);
9592+
95539593 cg->stopUsingRegister(index);
95549594 cg->stopUsingRegister(tmp);
95559595 cg->stopUsingRegister(hashXMM);
@@ -9608,7 +9648,7 @@ J9::X86::TreeEvaluator::vectorizedHashCodeReductionHelper(TR::Node* node, TR::Re
96089648 // then proceed to do horizontal reduction
96099649 for (int32_t i = 1; i < numVectors; i++)
96109650 {
9611- OMR::X86::Encoding opcodeEncoding = opcode.getSIMDEncoding(&cg->comp()->target().cpu, vl);
9651+ OMR::X86::Encoding opcodeEncoding = opcode.getSIMDEncoding(&cg->comp()->target().cpu, vl, vl == TR::VectorLength512 );
96129652 generateRegRegInstruction(TR::InstOpCode::PADDDRegReg, node, vectorRegVRF, vectorRegisters[i], cg, opcodeEncoding);
96139653 }
96149654
@@ -9625,7 +9665,7 @@ J9::X86::TreeEvaluator::vectorizedHashCodeReductionHelper(TR::Node* node, TR::Re
96259665 case TR::VectorLength256:
96269666 // extract 128 bits from ymm and store in xmm, then perform vertical operation
96279667 generateRegRegImmInstruction(TR::InstOpCode::VEXTRACTF128RegRegImm1, node, tmpVectorRegVRF, vectorRegVRF, 0xFF, cg);
9628- generateRegRegInstruction(opcode.getMnemonic(), node, vectorRegVRF, tmpVectorRegVRF, cg, opcode.getSIMDEncoding(&cg->comp()->target().cpu, TR::VectorLength128) );
9668+ generateRegRegInstruction(opcode.getMnemonic(), node, vectorRegVRF, tmpVectorRegVRF, cg, OMR::X86::VEX_L128 );
96299669 // Fallthrough to treat remaining result as 128-bit vector
96309670 case TR::VectorLength128:
96319671 generateRegRegImmInstruction(TR::InstOpCode::PSHUFDRegRegImm1, node, tmpVectorRegVRF, vectorRegVRF, 0x0e, cg);
@@ -9732,7 +9772,11 @@ J9::X86::TreeEvaluator::vectorizedHashCodeLoopHelper(TR::Node *node,
97329772 begLabel->setStartInternalControlFlow();
97339773 endLabel->setEndInternalControlFlow();
97349774
9735- generateRegRegInstruction(TR::InstOpCode::MOVRegReg(), node, result, initialHash, cg);
9775+ if (result != initialHash)
9776+ {
9777+ generateRegRegInstruction(TR::InstOpCode::MOVRegReg(), node, result, initialHash, cg);
9778+ }
9779+
97369780 generateLabelInstruction(TR::InstOpCode::label, node, begLabel, cg);
97379781 generateRegRegInstruction(TR::InstOpCode::MOVRegReg(), node, tmp, length, cg);
97389782 generateRegImmInstruction(TR::InstOpCode::AND4RegImm4, node, tmp, ~(numElements - 1), cg);
@@ -9750,7 +9794,11 @@ J9::X86::TreeEvaluator::vectorizedHashCodeLoopHelper(TR::Node *node,
97509794 int32_t multiplier31PowNData[16];
97519795 // Fill multiplier array with 31^numElements
97529796 std::fill_n(multiplier31PowNData, 16, powersOf31[64 - numElements]);
9753- generateRegMemInstruction(TR::InstOpCode::MOVDQURegMem, node, multiplierVRF, generateX86MemoryReference(cg->findOrCreateConstantDataSnippet(node, multiplier31PowNData, vectorSizeElements * sizeof(int32_t)), cg), cg, vectorEncoding);
9797+
9798+ TR::Register *broadcastReg = TR::TreeEvaluator::loadConstant(node, powersOf31[64 - numElements], TR_RematerializableInt, cg);
9799+ generateRegRegInstruction(TR::InstOpCode::MOVDRegReg4, node, multiplierVRF, broadcastReg, cg);
9800+ TR::TreeEvaluator::broadcastHelper(node, multiplierVRF, vl, TR::Int32, cg);
9801+ cg->stopUsingRegister(broadcastReg);
97549802
97559803 for (int32_t i = 0; i < unrollCount; i++)
97569804 {
0 commit comments