@@ -9372,7 +9372,10 @@ TR::Register* J9::X86::TreeEvaluator::inlineMathFma(TR::Node* node, TR::CodeGene
93729372 return result;
93739373 }
93749374
9375- // Convert serial String.hashCode computation into vectorization copy and implement with SSE instruction
9375+ // Convert the serial String.hashCode computation into a vectorized form and implement it with vector instructions.
9376+ // This algorithm processes 4 characters at a time in a vectorized loop and prepends zeros to the input characters
9377+ // to handle residual elements. To improve performance for large strings, this code calls vectorizedHashCodeLoopHelper
9378+ // and makes use of loop unrolling and larger vector lengths.
93769379//
93779380// Conversion process example:
93789381//
@@ -9464,6 +9467,9 @@ static TR::Register* inlineStringHashCode(TR::Node* node, bool isCompressed, TR:
94649467 {
94659468 TR_ASSERT(node->getChild(1)->getOpCodeValue() == TR::iconst && node->getChild(1)->getInt() == 0, "String hashcode offset can only be const zero.");
94669469
9470+ TR::VectorLength vl = cg->getMaxPreferredVectorLength();
9471+ static int32_t vectorSizes[3] = { 4, 8, 16 };
9472+
94679473 const int size = 4;
94689474 auto shift = isCompressed ? 0 : 1;
94699475
@@ -9472,29 +9478,52 @@ static TR::Register* inlineStringHashCode(TR::Node* node, bool isCompressed, TR:
94729478 auto index = cg->allocateRegister();
94739479 auto hash = cg->allocateRegister();
94749480 auto tmp = cg->allocateRegister();
9481+ auto loopLimit = cg->allocateRegister();
94759482 auto hashXMM = cg->allocateRegister(TR_VRF);
94769483 auto tmpXMM = cg->allocateRegister(TR_VRF);
94779484 auto multiplierXMM = cg->allocateRegister(TR_VRF);
94789485
94799486 auto begLabel = generateLabelSymbol(cg);
94809487 auto endLabel = generateLabelSymbol(cg);
94819488 auto loopLabel = generateLabelSymbol(cg);
9489+ auto bigLoopLabel = generateLabelSymbol(cg);
9490+ auto doneLabel = generateLabelSymbol(cg);
94829491 begLabel->setStartInternalControlFlow();
94839492 endLabel->setEndInternalControlFlow();
9484- auto deps = generateRegisterDependencyConditions((uint8_t)6 , (uint8_t)6 , cg);
9493+ auto deps = generateRegisterDependencyConditions((uint8_t)8 , (uint8_t)8 , cg);
94859494 deps->addPreCondition(address, TR::RealRegister::NoReg, cg);
94869495 deps->addPreCondition(index, TR::RealRegister::NoReg, cg);
94879496 deps->addPreCondition(length, TR::RealRegister::NoReg, cg);
9497+ deps->addPreCondition(hash, TR::RealRegister::NoReg, cg);
9498+ deps->addPreCondition(loopLimit, TR::RealRegister::NoReg, cg);
94889499 deps->addPreCondition(multiplierXMM, TR::RealRegister::NoReg, cg);
94899500 deps->addPreCondition(tmpXMM, TR::RealRegister::NoReg, cg);
94909501 deps->addPreCondition(hashXMM, TR::RealRegister::NoReg, cg);
94919502 deps->addPostCondition(address, TR::RealRegister::NoReg, cg);
94929503 deps->addPostCondition(index, TR::RealRegister::NoReg, cg);
94939504 deps->addPostCondition(length, TR::RealRegister::NoReg, cg);
9505+ deps->addPostCondition(hash, TR::RealRegister::NoReg, cg);
9506+ deps->addPostCondition(loopLimit, TR::RealRegister::NoReg, cg);
94949507 deps->addPostCondition(multiplierXMM, TR::RealRegister::NoReg, cg);
94959508 deps->addPostCondition(tmpXMM, TR::RealRegister::NoReg, cg);
94969509 deps->addPostCondition(hashXMM, TR::RealRegister::NoReg, cg);
94979510
9511+ // Generate Main Loop; 4x Unrolled seems to yield the best performance for large arrays
9512+ #ifdef TR_TARGET_64BIT
9513+ static char *unrollVar = feGetEnv("TR_setInlineStringHashCodeUnrollCount");
9514+ int32_t unrollCount = unrollVar ? atoi(unrollVar) : 4;
9515+ #else
9516+ int32_t unrollCount = 1;
9517+ #endif
9518+
9519+ int32_t charsPerMainLoopIteration = unrollCount * vectorSizes[vl - TR::VectorLength128];
9520+
9521+ generateRegRegInstruction(TR::InstOpCode::XORRegReg(), node, index, index, cg);
9522+ generateRegRegInstruction(TR::InstOpCode::XORRegReg(), node, hash, hash, cg);
9523+ generateRegRegInstruction(TR::InstOpCode::MOV4RegReg, node, loopLimit, length, cg);
9524+ generateRegImmInstruction(TR::InstOpCode::AND4RegImm4, node, loopLimit, charsPerMainLoopIteration - 1, cg);
9525+ generateLabelInstruction(TR::InstOpCode::JE4, node, bigLoopLabel, cg);
9526+
94989527 generateRegRegInstruction(TR::InstOpCode::MOV4RegReg, node, index, length, cg);
94999528 generateRegImmInstruction(TR::InstOpCode::AND4RegImms, node, index, size-1, cg); // mod size
95009529 generateRegMemInstruction(TR::InstOpCode::CMOVE4RegMem, node, index, generateX86MemoryReference(cg->findOrCreate4ByteConstant(node, size), cg), cg);
@@ -9523,19 +9552,23 @@ static TR::Register* inlineStringHashCode(TR::Node* node, bool isCompressed, TR:
95239552
95249553 // Reduction Loop
95259554 {
9526- static uint32_t multiplier[] = { 31*31*31*31, 31*31*31*31, 31*31*31*31, 31*31*31*31 };
95279555 generateLabelInstruction(TR::InstOpCode::label, node, begLabel, cg);
9528- generateRegRegInstruction(TR::InstOpCode::CMP4RegReg, node, index, length , cg);
9556+ generateRegRegInstruction(TR::InstOpCode::CMP4RegReg, node, index, loopLimit , cg);
95299557 generateLabelInstruction(TR::InstOpCode::JGE4, node, endLabel, cg);
9530- generateRegMemInstruction(TR::InstOpCode::MOVDQURegMem, node, multiplierXMM, generateX86MemoryReference(cg->findOrCreate16ByteConstant(node, multiplier), cg), cg);
9558+
9559+ TR::Register *broadcastReg = TR::TreeEvaluator::loadConstant(node, 31*31*31*31, TR_RematerializableInt, cg);
9560+ generateRegRegInstruction(TR::InstOpCode::MOVDRegReg4, node, multiplierXMM, broadcastReg, cg);
9561+ TR::TreeEvaluator::broadcastHelper(node, multiplierXMM, vl, TR::Int32, cg);
9562+ cg->stopUsingRegister(broadcastReg);
9563+
95319564 generateLabelInstruction(TR::InstOpCode::label, node, loopLabel, cg);
95329565 generateRegRegInstruction(TR::InstOpCode::PMULLDRegReg, node, hashXMM, multiplierXMM, cg);
95339566 generateRegMemInstruction(isCompressed ? TR::InstOpCode::PMOVZXBDRegMem : TR::InstOpCode::PMOVZXWDRegMem, node, tmpXMM, generateX86MemoryReference(address, index, shift, TR::Compiler->om.contiguousArrayHeaderSizeInBytes(), cg), cg);
95349567 generateRegImmInstruction(TR::InstOpCode::ADD4RegImms, node, index, 4, cg);
95359568 generateRegRegInstruction(TR::InstOpCode::PADDDRegReg, node, hashXMM, tmpXMM, cg);
9536- generateRegRegInstruction(TR::InstOpCode::CMP4RegReg, node, index, length , cg);
9569+ generateRegRegInstruction(TR::InstOpCode::CMP4RegReg, node, index, loopLimit , cg);
95379570 generateLabelInstruction(TR::InstOpCode::JL4, node, loopLabel, cg);
9538- generateLabelInstruction(TR::InstOpCode::label, node, endLabel, deps, cg);
9571+ generateLabelInstruction(TR::InstOpCode::label, node, endLabel, cg);
95399572 }
95409573
95419574 // Finalization
@@ -9550,6 +9583,17 @@ static TR::Register* inlineStringHashCode(TR::Node* node, bool isCompressed, TR:
95509583
95519584 generateRegRegInstruction(TR::InstOpCode::MOVDReg4Reg, node, hash, hashXMM, cg);
95529585
9586+ // Skip secondary loop for small arrays
9587+ generateRegImmInstruction(TR::InstOpCode::CMPRegImm4(), node, length, charsPerMainLoopIteration, cg);
9588+ generateLabelInstruction(TR::InstOpCode::JL4, node, doneLabel, cg);
9589+
9590+ generateLabelInstruction(TR::InstOpCode::label, node, bigLoopLabel, cg);
9591+
9592+ // Secondary unrolled vectorized loop with larger vector lengths
9593+ TR::TreeEvaluator::vectorizedHashCodeLoopHelper(node, isCompressed ? TR::Int8 : TR::Int16, vl, false, hash, hash, index, length, address, unrollCount, cg);
9594+
9595+ generateLabelInstruction(TR::InstOpCode::label, node, doneLabel, deps, cg);
9596+
95539597 cg->stopUsingRegister(index);
95549598 cg->stopUsingRegister(tmp);
95559599 cg->stopUsingRegister(hashXMM);
@@ -9608,7 +9652,7 @@ J9::X86::TreeEvaluator::vectorizedHashCodeReductionHelper(TR::Node* node, TR::Re
96089652 // then proceed to do horizontal reduction
96099653 for (int32_t i = 1; i < numVectors; i++)
96109654 {
9611- OMR::X86::Encoding opcodeEncoding = opcode.getSIMDEncoding(&cg->comp()->target().cpu, vl);
9655+ OMR::X86::Encoding opcodeEncoding = opcode.getSIMDEncoding(&cg->comp()->target().cpu, vl, vl == TR::VectorLength512 );
96129656 generateRegRegInstruction(TR::InstOpCode::PADDDRegReg, node, vectorRegVRF, vectorRegisters[i], cg, opcodeEncoding);
96139657 }
96149658
@@ -9625,7 +9669,7 @@ J9::X86::TreeEvaluator::vectorizedHashCodeReductionHelper(TR::Node* node, TR::Re
96259669 case TR::VectorLength256:
96269670 // extract 128 bits from ymm and store in xmm, then perform vertical operation
96279671 generateRegRegImmInstruction(TR::InstOpCode::VEXTRACTF128RegRegImm1, node, tmpVectorRegVRF, vectorRegVRF, 0xFF, cg);
9628- generateRegRegInstruction(opcode.getMnemonic(), node, vectorRegVRF, tmpVectorRegVRF, cg, opcode.getSIMDEncoding(&cg->comp()->target().cpu, TR::VectorLength128) );
9672+ generateRegRegInstruction(opcode.getMnemonic(), node, vectorRegVRF, tmpVectorRegVRF, cg, OMR::X86::VEX_L128 );
96299673 // Fallthrough to treat remaining result as 128-bit vector
96309674 case TR::VectorLength128:
96319675 generateRegRegImmInstruction(TR::InstOpCode::PSHUFDRegRegImm1, node, tmpVectorRegVRF, vectorRegVRF, 0x0e, cg);
@@ -9732,7 +9776,11 @@ J9::X86::TreeEvaluator::vectorizedHashCodeLoopHelper(TR::Node *node,
97329776 begLabel->setStartInternalControlFlow();
97339777 endLabel->setEndInternalControlFlow();
97349778
9735- generateRegRegInstruction(TR::InstOpCode::MOVRegReg(), node, result, initialHash, cg);
9779+ if (result != initialHash)
9780+ {
9781+ generateRegRegInstruction(TR::InstOpCode::MOVRegReg(), node, result, initialHash, cg);
9782+ }
9783+
97369784 generateLabelInstruction(TR::InstOpCode::label, node, begLabel, cg);
97379785 generateRegRegInstruction(TR::InstOpCode::MOVRegReg(), node, tmp, length, cg);
97389786 generateRegImmInstruction(TR::InstOpCode::AND4RegImm4, node, tmp, ~(numElements - 1), cg);
@@ -9750,7 +9798,11 @@ J9::X86::TreeEvaluator::vectorizedHashCodeLoopHelper(TR::Node *node,
97509798 int32_t multiplier31PowNData[16];
97519799 // Fill multiplier array with 31^numElements
97529800 std::fill_n(multiplier31PowNData, 16, powersOf31[64 - numElements]);
9753- generateRegMemInstruction(TR::InstOpCode::MOVDQURegMem, node, multiplierVRF, generateX86MemoryReference(cg->findOrCreateConstantDataSnippet(node, multiplier31PowNData, vectorSizeElements * sizeof(int32_t)), cg), cg, vectorEncoding);
9801+
9802+ TR::Register *broadcastReg = TR::TreeEvaluator::loadConstant(node, powersOf31[64 - numElements], TR_RematerializableInt, cg);
9803+ generateRegRegInstruction(TR::InstOpCode::MOVDRegReg4, node, multiplierVRF, broadcastReg, cg);
9804+ TR::TreeEvaluator::broadcastHelper(node, multiplierVRF, vl, TR::Int32, cg);
9805+ cg->stopUsingRegister(broadcastReg);
97549806
97559807 for (int32_t i = 0; i < unrollCount; i++)
97569808 {
0 commit comments