diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 837cc3c402dd9c..97b4b54ab4029e 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -7515,6 +7515,9 @@ class Compiler // To represent sets of VN's that have already been hoisted in outer loops. typedef JitHashTable, bool> VNSet; + // To track per-VN counts of constant-tree occurrences for hoist profitability (#92170). + typedef JitHashTable, unsigned> VNToCountMap; + struct LoopHoistContext { private: @@ -7526,6 +7529,12 @@ class Compiler // Previous decisions on loop-invariance of value numbers in the current loop. VNSet m_curLoopVnInvariantCache; + // Method-wide count of constant-tree occurrences by VN. Used to gate hoisting of plain + // integral constants: a single occurrence isn't worth hoisting because the hoist just + // injects a preheader copy without removing the original in-loop materialization. + // Built lazily once per method (only when there is at least one loop to hoist from). + VNToCountMap* m_constantUseCount; + int m_loopVarInOutCount; int m_loopVarCount; int m_hoistedExprCount; @@ -7558,7 +7567,9 @@ class Compiler } LoopHoistContext(Compiler* comp) - : m_pHoistedInCurLoop(nullptr), m_curLoopVnInvariantCache(comp->getAllocatorLoopHoist()) + : m_pHoistedInCurLoop(nullptr) + , m_curLoopVnInvariantCache(comp->getAllocatorLoopHoist()) + , m_constantUseCount(nullptr) { } }; diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h index 8a87207fe96c8d..01a4a65845c586 100644 --- a/src/coreclr/jit/jitconfigvalues.h +++ b/src/coreclr/jit/jitconfigvalues.h @@ -475,18 +475,22 @@ RELEASE_CONFIG_INTEGER(EnableApxZU, "EnableApxZU", RELEASE_CONFIG_INTEGER(JitDisableSimdVN, "JitDisableSimdVN", 0) #endif -// Default 0, enable the CSE of Constants, including nearby offsets. (only for ARM/ARM64/RISCV64) -// If 1, disable all the CSE of Constants -// If 2, enable the CSE of Constants but don't combine with nearby offsets. (only for ARM/ARM64/RISCV64) -// If 3, enable the CSE of Constants including nearby offsets. (all platforms) -// If 4, enable the CSE of Constants but don't combine with nearby offsets. (all platforms) +// Default 0, target-gated CSE of constants. +// On ARM/ARM64/RISCV64 this enables CSE of individual constants and bucketing of +// nearby-value constants (see optConstantCSEEnabled and optSharedConstantCSEEnabled). +// On x86/x64 only the nearby-value bucketing is enabled by default; individual +// constants are not CSE'd because they typically fit as instruction immediates. +// If 1, disable all the CSE of Constants. +// If 2, target-gated CSE of constants but without nearby-value bucketing. +// If 3, enable both CSE of constants and nearby-value bucketing on all platforms. +// If 4, enable CSE of constants on all platforms without nearby-value bucketing. // -#define CONST_CSE_ENABLE_ARM_RISCV64 0 -#define CONST_CSE_DISABLE_ALL 1 -#define CONST_CSE_ENABLE_ARM_RISCV64_NO_SHARING 2 -#define CONST_CSE_ENABLE_ALL 3 -#define CONST_CSE_ENABLE_ALL_NO_SHARING 4 -RELEASE_CONFIG_INTEGER(JitConstCSE, "JitConstCSE", CONST_CSE_ENABLE_ARM_RISCV64) +#define CONST_CSE_ENABLE_TARGETED 0 +#define CONST_CSE_DISABLE_ALL 1 +#define CONST_CSE_ENABLE_TARGETED_NO_SHARING 2 +#define CONST_CSE_ENABLE_ALL 3 +#define CONST_CSE_ENABLE_ALL_NO_SHARING 4 +RELEASE_CONFIG_INTEGER(JitConstCSE, "JitConstCSE", CONST_CSE_ENABLE_TARGETED) // If nonzero, use the greedy RL policy. // diff --git a/src/coreclr/jit/optcse.cpp b/src/coreclr/jit/optcse.cpp index d14d9ebd570bb9..fa1f84fdef3a5d 100644 --- a/src/coreclr/jit/optcse.cpp +++ b/src/coreclr/jit/optcse.cpp @@ -1790,7 +1790,12 @@ bool CSE_HeuristicCommon::CanConsiderTree(GenTree* tree, bool isReturn) if (!enableConstCSE && // Unconditionally allow these constant handles to be CSE'd !tree->IsIconHandle(GTF_ICON_STATIC_HDL) && !tree->IsIconHandle(GTF_ICON_CLASS_HDL) && - !tree->IsIconHandle(GTF_ICON_STR_HDL) && !tree->IsIconHandle(GTF_ICON_OBJ_HDL)) + !tree->IsIconHandle(GTF_ICON_STR_HDL) && !tree->IsIconHandle(GTF_ICON_OBJ_HDL) && + // Also unconditionally allow CSE/hoist for constants that don't fit as imm32 + // or that require relocation. On targets where these can't be encoded as an + // instruction immediate, every use costs a full-size constant load, so reuse + // via CSE or a hoisted temp is generally profitable. See issue #92170. + tree->IsIntCnsFitsInI32() && !tree->AsIntConCommon()->ImmedValNeedsReloc(m_compiler)) { return false; } @@ -4717,6 +4722,24 @@ bool CSE_Heuristic::PromotionCheck(CSE_Candidate* candidate) cse_def_cost += 1; cse_use_cost += 1; } + + // For constants, the alternative to CSE is to rematerialize the value at each + // use - essentially as cheap as a callee-trash register move. So a low-use-count + // constant CSE that spans a call may not save enough to justify forcing a + // callee-saved register into the prolog/epilog. Charge for that here when the + // use count is low enough that the trade-off is borderline. See issue #92170. + // + // Restricted to TARGET_XARCH: on arm64/loongarch64/riscv64 a 64-bit constant + // takes multiple instructions to rematerialize, so the alternative isn't as + // cheap and discouraging the CSE actively bloats code. + // +#if defined(TARGET_XARCH) + if (candidate->Expr()->OperIsConst() && (candidate->CseDsc()->csdUseCount <= 2)) + { + cse_def_cost += 1; + cse_use_cost += 1; + } +#endif } // If we don't have a lot of variables to enregister or we have a floating point type @@ -4734,6 +4757,61 @@ bool CSE_Heuristic::PromotionCheck(CSE_Candidate* candidate) } } +#if defined(TARGET_XARCH) + // For shared-constant CSE: not every use can fold the +delta into a useful encoding. + // Address-mode uses (IND, LEA, ADD) absorb the delta into a lea/disp; compare/call/return + // consume the value with no register-write follow-on. But ALU ops like XOR/AND/OR/MUL + // must MATERIALIZE the value into a fresh register before consuming it, which on AMD pays + // the 2-cycle "complex-lea" latency and can extend the critical path. Charge for those + // uses so the heuristic prefers keeping the original mov-imm at the use site when most + // consumers are computes. See issue #92170. + // + if (candidate->IsSharedConst()) + { + weight_t computeUseWtCnt = 0; + for (treeStmtLst* lst = &candidate->CseDsc()->csdTreeList; lst != nullptr; lst = lst->tslNext) + { + GenTree* const useTree = lst->tslTree; + if (!IS_CSE_USE(useTree->gtCSEnum)) + { + continue; + } + Compiler::FindLinkData link = m_compiler->gtFindLink(lst->tslStmt, useTree); + GenTree* const parent = link.parent; + if (parent == nullptr) + { + continue; + } + switch (parent->OperGet()) + { + case GT_XOR: + case GT_AND: + case GT_OR: + case GT_MUL: + case GT_MULHI: + case GT_NEG: + case GT_NOT: + case GT_BSWAP: + case GT_BSWAP16: + computeUseWtCnt += lst->tslBlock->getBBWeight(m_compiler); + break; + default: + // ADD/SUB fold into lea; IND/STOREIND/LEA fold into addressing mode; + // CMP/EQ/NE/LT/.. and CALL/RETURN don't extend a dep chain through a + // written register. No extra charge. + break; + } + } + if (computeUseWtCnt > 0) + { + // Charge ~2 cycles per compute use to model the complex-lea materialization + // cost on AMD (lea r, [base+index+disp] is 2 cycles on Zen, vs 1 cycle for a + // simple `add reg, imm` or a `mov reg, imm`). + extra_yes_cost += (unsigned)(computeUseWtCnt * 2); + } + } +#endif + // estimate the cost from lost codesize reduction if we do not perform the CSE if (candidate->Size() > cse_use_cost) { @@ -4897,13 +4975,15 @@ void CSE_HeuristicCommon::PerformCSE(CSE_Candidate* successfulCandidate) // Verify that all of the ValueNumbers in this list are correct as // Morph will change them when it performs a mutating operation. // - bool setRefCnt = true; - bool allSame = true; - bool isSharedConst = successfulCandidate->IsSharedConst(); - ValueNum bestVN = ValueNumStore::NoVN; - bool bestIsDef = false; - ssize_t bestConstValue = 0; - treeStmtLst* lst = &dsc->csdTreeList; + bool setRefCnt = true; + bool allSame = true; + bool isSharedConst = successfulCandidate->IsSharedConst(); + ValueNum bestVN = ValueNumStore::NoVN; + bool bestIsDef = false; + ssize_t bestConstValue = 0; + ssize_t maxConstValue = 0; + bool seenSharedConstValue = false; + treeStmtLst* lst = &dsc->csdTreeList; while (lst != nullptr) { @@ -4927,8 +5007,10 @@ void CSE_HeuristicCommon::PerformCSE(CSE_Candidate* successfulCandidate) if (isSharedConst) { // set bestConstValue and bestIsDef - bestConstValue = curConstValue; - bestIsDef = isDef; + bestConstValue = curConstValue; + maxConstValue = curConstValue; + seenSharedConstValue = true; + bestIsDef = isDef; } } else if (currVN != bestVN) @@ -4952,6 +5034,10 @@ void CSE_HeuristicCommon::PerformCSE(CSE_Candidate* successfulCandidate) bestConstValue = curConstValue; bestIsDef = isDef; } + if (curConstValue > maxConstValue) + { + maxConstValue = curConstValue; + } } BasicBlock* blk = lst->tslBlock; @@ -4979,8 +5065,41 @@ void CSE_HeuristicCommon::PerformCSE(CSE_Candidate* successfulCandidate) lst = lst->tslNext; } - dsc->csdConstDefValue = bestConstValue; - dsc->csdConstDefVN = bestVN; + // For shared-constant CSEs on x64, center the def value at the midpoint of the + // actual constant range so the use-side deltas distribute symmetrically around + // zero. This lets lea use the disp8 (signed 8-bit) displacement encoding for the + // full bucket range, halving the encoded size compared with a min-anchored base. + // + // Other targets use unsigned offsets in their primary addressing modes (or + // separate add/sub instructions for negative displacements), so centering + // forces extra subtracts and is a net loss there. + // +#if defined(TARGET_AMD64) + const bool centerSharedConst = true; +#else + const bool centerSharedConst = false; +#endif + if (centerSharedConst && isSharedConst && seenSharedConstValue && + ((cseLclVarTyp == TYP_INT) || (cseLclVarTyp == TYP_LONG))) + { + const ssize_t centerValue = bestConstValue + ((maxConstValue - bestConstValue) / 2); + dsc->csdConstDefValue = centerValue; + // Allocate a VN that matches the centered constant value, so the stored + // VN agrees with the actual value of the temp's def node. + if (cseLclVarTyp == TYP_LONG) + { + dsc->csdConstDefVN = m_compiler->vnStore->VNForLongCon(static_cast(centerValue)); + } + else + { + dsc->csdConstDefVN = m_compiler->vnStore->VNForIntCon(static_cast(centerValue)); + } + } + else + { + dsc->csdConstDefValue = bestConstValue; + dsc->csdConstDefVN = bestVN; + } #ifdef DEBUG if (m_compiler->verbose) @@ -5886,12 +6005,12 @@ bool Compiler::optSharedConstantCSEEnabled() { enableSharedConstCSE = true; } -#if defined(TARGET_ARMARCH) || defined(TARGET_RISCV64) - else if (configValue == CONST_CSE_ENABLE_ARM_RISCV64) +#if defined(TARGET_ARMARCH) || defined(TARGET_RISCV64) || defined(TARGET_XARCH) + else if (configValue == CONST_CSE_ENABLE_TARGETED) { enableSharedConstCSE = true; } -#endif // TARGET_ARMARCH || TARGET_RISCV64 +#endif // TARGET_ARMARCH || TARGET_RISCV64 || TARGET_XARCH return enableSharedConstCSE; } @@ -5912,7 +6031,7 @@ bool Compiler::optConstantCSEEnabled() enableConstCSE = true; } #if defined(TARGET_ARMARCH) || defined(TARGET_RISCV64) - else if ((configValue == CONST_CSE_ENABLE_ARM_RISCV64) || (configValue == CONST_CSE_ENABLE_ARM_RISCV64_NO_SHARING)) + else if ((configValue == CONST_CSE_ENABLE_TARGETED) || (configValue == CONST_CSE_ENABLE_TARGETED_NO_SHARING)) { enableConstCSE = true; } diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp index 74e598e0ea579a..d8da40af451490 100644 --- a/src/coreclr/jit/optimizer.cpp +++ b/src/coreclr/jit/optimizer.cpp @@ -3741,10 +3741,47 @@ PhaseStatus Compiler::optHoistLoopCode() optComputeInterestingVarSets(); - // Consider all the loops, visiting child loops first. + // Build a method-wide tally of integral-constant occurrences keyed by VN. + // + // The hoister uses this to gate hoisting of plain integral constants: a single-occurrence + // constant isn't worth hoisting because the preheader copy doesn't replace the original + // in-loop materialization (CSE would need a second occurrence to introduce a temp). + // See issue #92170. + // + // Skip when both constant CSE flavors are disabled; in that case the hoister can't + // produce a profitable constant hoist anyway, so there's no point paying for the scan. // - bool modified = false; LoopHoistContext hoistCtxt(this); + if (optConstantCSEEnabled() || optSharedConstantCSEEnabled()) + { + hoistCtxt.m_constantUseCount = new (this, CMK_LoopHoist) VNToCountMap(getAllocator(CMK_LoopHoist)); + for (BasicBlock* const block : Blocks()) + { + for (Statement* const stmt : block->NonPhiStatements()) + { + for (GenTree* tree = stmt->GetTreeList(); tree != nullptr; tree = tree->gtNext) + { + if (!tree->OperIs(GT_CNS_INT, GT_CNS_LNG)) + { + continue; + } + + ValueNum vn = tree->gtVNPair.GetLiberal(); + if (vn == ValueNumStore::NoVN) + { + continue; + } + + unsigned* pCount = hoistCtxt.m_constantUseCount->LookupPointerOrAdd(vn, 0); + (*pCount)++; + } + } + } + } + + // Consider all the loops, visiting child loops first. + // + bool modified = false; for (FlowGraphNaturalLoop* loop : m_loops->InPostOrder()) { #if LOOP_HOIST_STATS @@ -4423,6 +4460,29 @@ void Compiler::optHoistLoopBlocks(FlowGraphNaturalLoop* loop, // that decision. This can happen in JitOptRepeat, where hoisting can follow assertion prop. return false; } + else if (node->IsIntegralConst() && !node->IsIconHandle(GTF_ICON_STATIC_HDL) && + !node->IsIconHandle(GTF_ICON_CLASS_HDL) && !node->IsIconHandle(GTF_ICON_STR_HDL) && + !node->IsIconHandle(GTF_ICON_OBJ_HDL)) + { + // Don't hoist plain integral constants unless they appear more than once in the method. + // Hoisting injects a preheader clone that relies on CSE to introduce a temp; if there is + // only one in-method occurrence, the preheader copy just bloats code without removing the + // original in-loop materialization. See issue #92170. + // + // When constant CSE is disabled, m_constantUseCount is null and we don't gate hoisting + // here (the in-loop materialization will be left alone anyway since CSE can't replace it). + // + if (m_hoistContext->m_constantUseCount != nullptr) + { + ValueNum vn = node->gtVNPair.GetLiberal(); + unsigned count = 0; + if ((vn == ValueNumStore::NoVN) || !m_hoistContext->m_constantUseCount->Lookup(vn, &count) || + (count < 2)) + { + return false; + } + } + } // Tree must be a suitable CSE candidate for us to be able to hoist it. return m_compiler->optIsCSEcandidate(node); diff --git a/src/coreclr/jit/target.h b/src/coreclr/jit/target.h index 4c96dbe40ee1ca..423056be8a3885 100644 --- a/src/coreclr/jit/target.h +++ b/src/coreclr/jit/target.h @@ -71,7 +71,7 @@ inline bool compUnixX86Abi() // with static const members of Target #if defined(TARGET_AMD64) #define REGMASK_BITS 64 -#define CSE_CONST_SHARED_LOW_BITS 16 +#define CSE_CONST_SHARED_LOW_BITS 8 #elif defined(TARGET_X86) #define REGMASK_BITS 32