@@ -9372,7 +9372,10 @@ TR::Register* J9::X86::TreeEvaluator::inlineMathFma(TR::Node* node, TR::CodeGene
93729372 return result;
93739373 }
93749374
9375- // Convert serial String.hashCode computation into vectorization copy and implement with SSE instruction
9375+ // Convert serial String.hashCode computation into vectorization copy and implement with vector instructions.
9376+ // This algorithm processes 4 characters at a time in a vectorized loop and prepends zeros to the input characters
9377+ // to handle residual elements. To improve performance for large strings, this code calls vectorizedHashCodeLoopHelper
9378+ // and makes use of loop unrolling and larger vector lengths.
93769379//
93779380// Conversion process example:
93789381//
@@ -9464,6 +9467,9 @@ static TR::Register* inlineStringHashCode(TR::Node* node, bool isCompressed, TR:
94649467 {
94659468 TR_ASSERT(node->getChild(1)->getOpCodeValue() == TR::iconst && node->getChild(1)->getInt() == 0, "String hashcode offset can only be const zero.");
94669469
9470+ TR::VectorLength vl = cg->getMaxPreferredVectorLength();
9471+ static int32_t vectorSizes[3] = { 4, 8, 16 };
9472+
94679473 const int size = 4;
94689474 auto shift = isCompressed ? 0 : 1;
94699475
@@ -9472,29 +9478,49 @@ static TR::Register* inlineStringHashCode(TR::Node* node, bool isCompressed, TR:
94729478 auto index = cg->allocateRegister();
94739479 auto hash = cg->allocateRegister();
94749480 auto tmp = cg->allocateRegister();
9481+ auto loopLimit = cg->allocateRegister();
94759482 auto hashXMM = cg->allocateRegister(TR_VRF);
94769483 auto tmpXMM = cg->allocateRegister(TR_VRF);
94779484 auto multiplierXMM = cg->allocateRegister(TR_VRF);
94789485
94799486 auto begLabel = generateLabelSymbol(cg);
94809487 auto endLabel = generateLabelSymbol(cg);
94819488 auto loopLabel = generateLabelSymbol(cg);
9489+ auto bigLoopLabel = generateLabelSymbol(cg);
9490+ auto doneLabel = generateLabelSymbol(cg);
94829491 begLabel->setStartInternalControlFlow();
94839492 endLabel->setEndInternalControlFlow();
94849493 auto deps = generateRegisterDependencyConditions((uint8_t)6, (uint8_t)6, cg);
94859494 deps->addPreCondition(address, TR::RealRegister::NoReg, cg);
94869495 deps->addPreCondition(index, TR::RealRegister::NoReg, cg);
9487- deps->addPreCondition(length , TR::RealRegister::NoReg, cg);
9496+ deps->addPreCondition(loopLimit , TR::RealRegister::NoReg, cg);
94889497 deps->addPreCondition(multiplierXMM, TR::RealRegister::NoReg, cg);
94899498 deps->addPreCondition(tmpXMM, TR::RealRegister::NoReg, cg);
94909499 deps->addPreCondition(hashXMM, TR::RealRegister::NoReg, cg);
94919500 deps->addPostCondition(address, TR::RealRegister::NoReg, cg);
94929501 deps->addPostCondition(index, TR::RealRegister::NoReg, cg);
9493- deps->addPostCondition(length , TR::RealRegister::NoReg, cg);
9502+ deps->addPostCondition(loopLimit , TR::RealRegister::NoReg, cg);
94949503 deps->addPostCondition(multiplierXMM, TR::RealRegister::NoReg, cg);
94959504 deps->addPostCondition(tmpXMM, TR::RealRegister::NoReg, cg);
94969505 deps->addPostCondition(hashXMM, TR::RealRegister::NoReg, cg);
94979506
9507+ // Generate Main Loop; 4x Unrolled seems to yield the best performance for large arrays
9508+ static char *unrollVar = feGetEnv("TR_setInlineStringHashCodeUnrollCount");
9509+
9510+ #ifdef TR_TARGET_64BIT
9511+ int32_t unrollCount = unrollVar ? atoi(unrollVar) : 4;
9512+ #else
9513+ int32_t unrollCount = 1;
9514+ #endif
9515+
9516+ int32_t charsPerMainLoopIteration = unrollCount * vectorSizes[vl];
9517+
9518+ generateRegRegInstruction(TR::InstOpCode::XORRegReg(), node, index, index, cg);
9519+ generateRegRegInstruction(TR::InstOpCode::XORRegReg(), node, hash, hash, cg);
9520+ generateRegRegInstruction(TR::InstOpCode::MOV4RegReg, node, loopLimit, length, cg);
9521+ generateRegImmInstruction(TR::InstOpCode::AND4RegImm4, node, loopLimit, charsPerMainLoopIteration - 1, cg);
9522+ generateLabelInstruction(TR::InstOpCode::JE4, node, bigLoopLabel, cg);
9523+
94989524 generateRegRegInstruction(TR::InstOpCode::MOV4RegReg, node, index, length, cg);
94999525 generateRegImmInstruction(TR::InstOpCode::AND4RegImms, node, index, size-1, cg); // mod size
95009526 generateRegMemInstruction(TR::InstOpCode::CMOVE4RegMem, node, index, generateX86MemoryReference(cg->findOrCreate4ByteConstant(node, size), cg), cg);
@@ -9523,17 +9549,21 @@ static TR::Register* inlineStringHashCode(TR::Node* node, bool isCompressed, TR:
95239549
95249550 // Reduction Loop
95259551 {
9526- static uint32_t multiplier[] = { 31*31*31*31, 31*31*31*31, 31*31*31*31, 31*31*31*31 };
95279552 generateLabelInstruction(TR::InstOpCode::label, node, begLabel, cg);
9528- generateRegRegInstruction(TR::InstOpCode::CMP4RegReg, node, index, length , cg);
9553+ generateRegRegInstruction(TR::InstOpCode::CMP4RegReg, node, index, loopLimit , cg);
95299554 generateLabelInstruction(TR::InstOpCode::JGE4, node, endLabel, cg);
9530- generateRegMemInstruction(TR::InstOpCode::MOVDQURegMem, node, multiplierXMM, generateX86MemoryReference(cg->findOrCreate16ByteConstant(node, multiplier), cg), cg);
9555+
9556+ TR::Register *broadcastReg = TR::TreeEvaluator::loadConstant(node, 31*31*31*31, TR_RematerializableInt, cg);
9557+ generateRegRegInstruction(TR::InstOpCode::MOVDRegReg4, node, multiplierXMM, broadcastReg, cg);
9558+ TR::TreeEvaluator::broadcastHelper(node, multiplierXMM, vl, TR::Int32, cg);
9559+ cg->stopUsingRegister(broadcastReg);
9560+
95319561 generateLabelInstruction(TR::InstOpCode::label, node, loopLabel, cg);
95329562 generateRegRegInstruction(TR::InstOpCode::PMULLDRegReg, node, hashXMM, multiplierXMM, cg);
95339563 generateRegMemInstruction(isCompressed ? TR::InstOpCode::PMOVZXBDRegMem : TR::InstOpCode::PMOVZXWDRegMem, node, tmpXMM, generateX86MemoryReference(address, index, shift, TR::Compiler->om.contiguousArrayHeaderSizeInBytes(), cg), cg);
95349564 generateRegImmInstruction(TR::InstOpCode::ADD4RegImms, node, index, 4, cg);
95359565 generateRegRegInstruction(TR::InstOpCode::PADDDRegReg, node, hashXMM, tmpXMM, cg);
9536- generateRegRegInstruction(TR::InstOpCode::CMP4RegReg, node, index, length , cg);
9566+ generateRegRegInstruction(TR::InstOpCode::CMP4RegReg, node, index, loopLimit , cg);
95379567 generateLabelInstruction(TR::InstOpCode::JL4, node, loopLabel, cg);
95389568 generateLabelInstruction(TR::InstOpCode::label, node, endLabel, deps, cg);
95399569 }
@@ -9550,6 +9580,17 @@ static TR::Register* inlineStringHashCode(TR::Node* node, bool isCompressed, TR:
95509580
95519581 generateRegRegInstruction(TR::InstOpCode::MOVDReg4Reg, node, hash, hashXMM, cg);
95529582
9583+ // Skip secondary loop for small arrays
9584+ generateRegImmInstruction(TR::InstOpCode::CMPRegImm4(), node, length, charsPerMainLoopIteration, cg);
9585+ generateLabelInstruction(TR::InstOpCode::JL4, node, doneLabel, cg);
9586+
9587+ generateLabelInstruction(TR::InstOpCode::label, node, bigLoopLabel, cg);
9588+
9589+ // Secondary unrolled vectorized loop with larger vector lengths
9590+ TR::TreeEvaluator::vectorizedHashCodeLoopHelper(node, isCompressed ? TR::Int8 : TR::Int16, vl, false, hash, hash, index, length, address, unrollCount, cg);
9591+
9592+ generateLabelInstruction(TR::InstOpCode::label, node, doneLabel, cg);
9593+
95539594 cg->stopUsingRegister(index);
95549595 cg->stopUsingRegister(tmp);
95559596 cg->stopUsingRegister(hashXMM);
@@ -9608,7 +9649,7 @@ J9::X86::TreeEvaluator::vectorizedHashCodeReductionHelper(TR::Node* node, TR::Re
96089649 // then proceed to do horizontal reduction
96099650 for (int32_t i = 1; i < numVectors; i++)
96109651 {
9611- OMR::X86::Encoding opcodeEncoding = opcode.getSIMDEncoding(&cg->comp()->target().cpu, vl);
9652+ OMR::X86::Encoding opcodeEncoding = opcode.getSIMDEncoding(&cg->comp()->target().cpu, vl, vl == TR::VectorLength512 );
96129653 generateRegRegInstruction(TR::InstOpCode::PADDDRegReg, node, vectorRegVRF, vectorRegisters[i], cg, opcodeEncoding);
96139654 }
96149655
@@ -9625,7 +9666,7 @@ J9::X86::TreeEvaluator::vectorizedHashCodeReductionHelper(TR::Node* node, TR::Re
96259666 case TR::VectorLength256:
96269667 // extract 128 bits from ymm and store in xmm, then perform vertical operation
96279668 generateRegRegImmInstruction(TR::InstOpCode::VEXTRACTF128RegRegImm1, node, tmpVectorRegVRF, vectorRegVRF, 0xFF, cg);
9628- generateRegRegInstruction(opcode.getMnemonic(), node, vectorRegVRF, tmpVectorRegVRF, cg, opcode.getSIMDEncoding(&cg->comp()->target().cpu, TR::VectorLength128) );
9669+ generateRegRegInstruction(opcode.getMnemonic(), node, vectorRegVRF, tmpVectorRegVRF, cg, OMR::X86::VEX_L128 );
96299670 // Fallthrough to treat remaining result as 128-bit vector
96309671 case TR::VectorLength128:
96319672 generateRegRegImmInstruction(TR::InstOpCode::PSHUFDRegRegImm1, node, tmpVectorRegVRF, vectorRegVRF, 0x0e, cg);
@@ -9732,7 +9773,11 @@ J9::X86::TreeEvaluator::vectorizedHashCodeLoopHelper(TR::Node *node,
97329773 begLabel->setStartInternalControlFlow();
97339774 endLabel->setEndInternalControlFlow();
97349775
9735- generateRegRegInstruction(TR::InstOpCode::MOVRegReg(), node, result, initialHash, cg);
9776+ if (result != initialHash)
9777+ {
9778+ generateRegRegInstruction(TR::InstOpCode::MOVRegReg(), node, result, initialHash, cg);
9779+ }
9780+
97369781 generateLabelInstruction(TR::InstOpCode::label, node, begLabel, cg);
97379782 generateRegRegInstruction(TR::InstOpCode::MOVRegReg(), node, tmp, length, cg);
97389783 generateRegImmInstruction(TR::InstOpCode::AND4RegImm4, node, tmp, ~(numElements - 1), cg);
@@ -9750,7 +9795,11 @@ J9::X86::TreeEvaluator::vectorizedHashCodeLoopHelper(TR::Node *node,
97509795 int32_t multiplier31PowNData[16];
97519796 // Fill multiplier array with 31^numElements
97529797 std::fill_n(multiplier31PowNData, 16, powersOf31[64 - numElements]);
9753- generateRegMemInstruction(TR::InstOpCode::MOVDQURegMem, node, multiplierVRF, generateX86MemoryReference(cg->findOrCreateConstantDataSnippet(node, multiplier31PowNData, vectorSizeElements * sizeof(int32_t)), cg), cg, vectorEncoding);
9798+
9799+ TR::Register *broadcastReg = TR::TreeEvaluator::loadConstant(node, powersOf31[64 - numElements], TR_RematerializableInt, cg);
9800+ generateRegRegInstruction(TR::InstOpCode::MOVDRegReg4, node, multiplierVRF, broadcastReg, cg);
9801+ TR::TreeEvaluator::broadcastHelper(node, multiplierVRF, vl, TR::Int32, cg);
9802+ cg->stopUsingRegister(broadcastReg);
97549803
97559804 for (int32_t i = 0; i < unrollCount; i++)
97569805 {
0 commit comments