Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion src/coreclr/jit/compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -7515,6 +7515,9 @@ class Compiler
// To represent sets of VN's that have already been hoisted in outer loops.
typedef JitHashTable<ValueNum, JitSmallPrimitiveKeyFuncs<ValueNum>, bool> VNSet;

// To track per-VN counts of constant-tree occurrences for hoist profitability (#92170).
typedef JitHashTable<ValueNum, JitSmallPrimitiveKeyFuncs<ValueNum>, unsigned> VNToCountMap;

struct LoopHoistContext
{
private:
Expand All @@ -7526,6 +7529,12 @@ class Compiler
// Previous decisions on loop-invariance of value numbers in the current loop.
VNSet m_curLoopVnInvariantCache;

// Method-wide count of constant-tree occurrences by VN. Used to gate hoisting of plain
// integral constants: a single occurrence isn't worth hoisting because the hoist just
// injects a preheader copy without removing the original in-loop materialization.
// Built lazily once per method (only when there is at least one loop to hoist from).
VNToCountMap* m_constantUseCount;

int m_loopVarInOutCount;
int m_loopVarCount;
int m_hoistedExprCount;
Expand Down Expand Up @@ -7558,7 +7567,9 @@ class Compiler
}

LoopHoistContext(Compiler* comp)
: m_pHoistedInCurLoop(nullptr), m_curLoopVnInvariantCache(comp->getAllocatorLoopHoist())
: m_pHoistedInCurLoop(nullptr)
, m_curLoopVnInvariantCache(comp->getAllocatorLoopHoist())
, m_constantUseCount(nullptr)
{
}
};
Expand Down
26 changes: 15 additions & 11 deletions src/coreclr/jit/jitconfigvalues.h
Original file line number Diff line number Diff line change
Expand Up @@ -475,18 +475,22 @@ RELEASE_CONFIG_INTEGER(EnableApxZU, "EnableApxZU",
RELEASE_CONFIG_INTEGER(JitDisableSimdVN, "JitDisableSimdVN", 0)
#endif

// Default 0, enable the CSE of Constants, including nearby offsets. (only for ARM/ARM64/RISCV64)
// If 1, disable all the CSE of Constants
// If 2, enable the CSE of Constants but don't combine with nearby offsets. (only for ARM/ARM64/RISCV64)
// If 3, enable the CSE of Constants including nearby offsets. (all platforms)
// If 4, enable the CSE of Constants but don't combine with nearby offsets. (all platforms)
// Default 0, target-gated CSE of constants.
// On ARM/ARM64/RISCV64 this enables CSE of individual constants and bucketing of
// nearby-value constants (see optConstantCSEEnabled and optSharedConstantCSEEnabled).
// On x86/x64 only the nearby-value bucketing is enabled by default; individual
// constants are not CSE'd because they typically fit as instruction immediates.
// If 1, disable all the CSE of Constants.
// If 2, target-gated CSE of constants but without nearby-value bucketing.
// If 3, enable both CSE of constants and nearby-value bucketing on all platforms.
// If 4, enable CSE of constants on all platforms without nearby-value bucketing.
//
#define CONST_CSE_ENABLE_ARM_RISCV64 0
#define CONST_CSE_DISABLE_ALL 1
#define CONST_CSE_ENABLE_ARM_RISCV64_NO_SHARING 2
#define CONST_CSE_ENABLE_ALL 3
#define CONST_CSE_ENABLE_ALL_NO_SHARING 4
RELEASE_CONFIG_INTEGER(JitConstCSE, "JitConstCSE", CONST_CSE_ENABLE_ARM_RISCV64)
#define CONST_CSE_ENABLE_TARGETED 0
#define CONST_CSE_DISABLE_ALL 1
#define CONST_CSE_ENABLE_TARGETED_NO_SHARING 2
#define CONST_CSE_ENABLE_ALL 3
#define CONST_CSE_ENABLE_ALL_NO_SHARING 4
RELEASE_CONFIG_INTEGER(JitConstCSE, "JitConstCSE", CONST_CSE_ENABLE_TARGETED)

// If nonzero, use the greedy RL policy.
//
Expand Down
151 changes: 135 additions & 16 deletions src/coreclr/jit/optcse.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1790,7 +1790,12 @@ bool CSE_HeuristicCommon::CanConsiderTree(GenTree* tree, bool isReturn)
if (!enableConstCSE &&
// Unconditionally allow these constant handles to be CSE'd
!tree->IsIconHandle(GTF_ICON_STATIC_HDL) && !tree->IsIconHandle(GTF_ICON_CLASS_HDL) &&
!tree->IsIconHandle(GTF_ICON_STR_HDL) && !tree->IsIconHandle(GTF_ICON_OBJ_HDL))
!tree->IsIconHandle(GTF_ICON_STR_HDL) && !tree->IsIconHandle(GTF_ICON_OBJ_HDL) &&
// Also unconditionally allow CSE/hoist for constants that don't fit as imm32
// or that require relocation. On targets where these can't be encoded as an
// instruction immediate, every use costs a full-size constant load, so reuse
// via CSE or a hoisted temp is generally profitable. See issue #92170.
tree->IsIntCnsFitsInI32() && !tree->AsIntConCommon()->ImmedValNeedsReloc(m_compiler))
{
return false;
}
Expand Down Expand Up @@ -4717,6 +4722,24 @@ bool CSE_Heuristic::PromotionCheck(CSE_Candidate* candidate)
cse_def_cost += 1;
cse_use_cost += 1;
}

// For constants, the alternative to CSE is to rematerialize the value at each
// use - essentially as cheap as a callee-trash register move. So a low-use-count
// constant CSE that spans a call may not save enough to justify forcing a
// callee-saved register into the prolog/epilog. Charge for that here when the
// use count is low enough that the trade-off is borderline. See issue #92170.
//
// Restricted to TARGET_XARCH: on arm64/loongarch64/riscv64 a 64-bit constant
// takes multiple instructions to rematerialize, so the alternative isn't as
// cheap and discouraging the CSE actively bloats code.
//
#if defined(TARGET_XARCH)
if (candidate->Expr()->OperIsConst() && (candidate->CseDsc()->csdUseCount <= 2))
{
cse_def_cost += 1;
cse_use_cost += 1;
}
#endif
}

// If we don't have a lot of variables to enregister or we have a floating point type
Expand All @@ -4734,6 +4757,61 @@ bool CSE_Heuristic::PromotionCheck(CSE_Candidate* candidate)
}
}

#if defined(TARGET_XARCH)
// For shared-constant CSE: not every use can fold the +delta into a useful encoding.
// Address-mode uses (IND, LEA, ADD) absorb the delta into a lea/disp; compare/call/return
// consume the value with no register-write follow-on. But ALU ops like XOR/AND/OR/MUL
// must MATERIALIZE the value into a fresh register before consuming it, which on AMD pays
// the 2-cycle "complex-lea" latency and can extend the critical path. Charge for those
// uses so the heuristic prefers keeping the original mov-imm at the use site when most
// consumers are computes. See issue #92170.
//
if (candidate->IsSharedConst())
{
weight_t computeUseWtCnt = 0;
for (treeStmtLst* lst = &candidate->CseDsc()->csdTreeList; lst != nullptr; lst = lst->tslNext)
{
GenTree* const useTree = lst->tslTree;
if (!IS_CSE_USE(useTree->gtCSEnum))
{
continue;
}
Compiler::FindLinkData link = m_compiler->gtFindLink(lst->tslStmt, useTree);
GenTree* const parent = link.parent;
if (parent == nullptr)
{
continue;
}
switch (parent->OperGet())
{
case GT_XOR:
case GT_AND:
case GT_OR:
case GT_MUL:
case GT_MULHI:
case GT_NEG:
case GT_NOT:
case GT_BSWAP:
case GT_BSWAP16:
computeUseWtCnt += lst->tslBlock->getBBWeight(m_compiler);
break;
default:
// ADD/SUB fold into lea; IND/STOREIND/LEA fold into addressing mode;
// CMP/EQ/NE/LT/.. and CALL/RETURN don't extend a dep chain through a
// written register. No extra charge.
break;
}
}
if (computeUseWtCnt > 0)
{
// Charge ~2 cycles per compute use to model the complex-lea materialization
// cost on AMD (lea r, [base+index+disp] is 2 cycles on Zen, vs 1 cycle for a
// simple `add reg, imm` or a `mov reg, imm`).
extra_yes_cost += (unsigned)(computeUseWtCnt * 2);
}
}
#endif

// estimate the cost from lost codesize reduction if we do not perform the CSE
if (candidate->Size() > cse_use_cost)
{
Expand Down Expand Up @@ -4897,13 +4975,15 @@ void CSE_HeuristicCommon::PerformCSE(CSE_Candidate* successfulCandidate)
// Verify that all of the ValueNumbers in this list are correct as
// Morph will change them when it performs a mutating operation.
//
bool setRefCnt = true;
bool allSame = true;
bool isSharedConst = successfulCandidate->IsSharedConst();
ValueNum bestVN = ValueNumStore::NoVN;
bool bestIsDef = false;
ssize_t bestConstValue = 0;
treeStmtLst* lst = &dsc->csdTreeList;
bool setRefCnt = true;
bool allSame = true;
bool isSharedConst = successfulCandidate->IsSharedConst();
ValueNum bestVN = ValueNumStore::NoVN;
bool bestIsDef = false;
ssize_t bestConstValue = 0;
ssize_t maxConstValue = 0;
bool seenSharedConstValue = false;
treeStmtLst* lst = &dsc->csdTreeList;

while (lst != nullptr)
{
Expand All @@ -4927,8 +5007,10 @@ void CSE_HeuristicCommon::PerformCSE(CSE_Candidate* successfulCandidate)
if (isSharedConst)
{
// set bestConstValue and bestIsDef
bestConstValue = curConstValue;
bestIsDef = isDef;
bestConstValue = curConstValue;
maxConstValue = curConstValue;
seenSharedConstValue = true;
bestIsDef = isDef;
}
}
else if (currVN != bestVN)
Expand All @@ -4952,6 +5034,10 @@ void CSE_HeuristicCommon::PerformCSE(CSE_Candidate* successfulCandidate)
bestConstValue = curConstValue;
bestIsDef = isDef;
}
if (curConstValue > maxConstValue)
{
maxConstValue = curConstValue;
}
}

BasicBlock* blk = lst->tslBlock;
Expand Down Expand Up @@ -4979,8 +5065,41 @@ void CSE_HeuristicCommon::PerformCSE(CSE_Candidate* successfulCandidate)
lst = lst->tslNext;
}

dsc->csdConstDefValue = bestConstValue;
dsc->csdConstDefVN = bestVN;
// For shared-constant CSEs on x64, center the def value at the midpoint of the
// actual constant range so the use-side deltas distribute symmetrically around
// zero. This lets lea use the disp8 (signed 8-bit) displacement encoding for the
// full bucket range, halving the encoded size compared with a min-anchored base.
//
// Other targets use unsigned offsets in their primary addressing modes (or
// separate add/sub instructions for negative displacements), so centering
// forces extra subtracts and is a net loss there.
//
#if defined(TARGET_AMD64)
const bool centerSharedConst = true;
#else
const bool centerSharedConst = false;
#endif
if (centerSharedConst && isSharedConst && seenSharedConstValue &&
((cseLclVarTyp == TYP_INT) || (cseLclVarTyp == TYP_LONG)))
{
const ssize_t centerValue = bestConstValue + ((maxConstValue - bestConstValue) / 2);
dsc->csdConstDefValue = centerValue;
Comment on lines +5082 to +5086
// Allocate a VN that matches the centered constant value, so the stored
// VN agrees with the actual value of the temp's def node.
if (cseLclVarTyp == TYP_LONG)
{
dsc->csdConstDefVN = m_compiler->vnStore->VNForLongCon(static_cast<INT64>(centerValue));
}
else
{
dsc->csdConstDefVN = m_compiler->vnStore->VNForIntCon(static_cast<INT32>(centerValue));
}
}
else
{
dsc->csdConstDefValue = bestConstValue;
dsc->csdConstDefVN = bestVN;
}

#ifdef DEBUG
if (m_compiler->verbose)
Expand Down Expand Up @@ -5886,12 +6005,12 @@ bool Compiler::optSharedConstantCSEEnabled()
{
enableSharedConstCSE = true;
}
#if defined(TARGET_ARMARCH) || defined(TARGET_RISCV64)
else if (configValue == CONST_CSE_ENABLE_ARM_RISCV64)
#if defined(TARGET_ARMARCH) || defined(TARGET_RISCV64) || defined(TARGET_XARCH)
else if (configValue == CONST_CSE_ENABLE_TARGETED)
{
enableSharedConstCSE = true;
}
#endif // TARGET_ARMARCH || TARGET_RISCV64
#endif // TARGET_ARMARCH || TARGET_RISCV64 || TARGET_XARCH

return enableSharedConstCSE;
}
Expand All @@ -5912,7 +6031,7 @@ bool Compiler::optConstantCSEEnabled()
enableConstCSE = true;
}
#if defined(TARGET_ARMARCH) || defined(TARGET_RISCV64)
else if ((configValue == CONST_CSE_ENABLE_ARM_RISCV64) || (configValue == CONST_CSE_ENABLE_ARM_RISCV64_NO_SHARING))
else if ((configValue == CONST_CSE_ENABLE_TARGETED) || (configValue == CONST_CSE_ENABLE_TARGETED_NO_SHARING))
{
enableConstCSE = true;
}
Expand Down
64 changes: 62 additions & 2 deletions src/coreclr/jit/optimizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3741,10 +3741,47 @@ PhaseStatus Compiler::optHoistLoopCode()

optComputeInterestingVarSets();

// Consider all the loops, visiting child loops first.
// Build a method-wide tally of integral-constant occurrences keyed by VN.
//
// The hoister uses this to gate hoisting of plain integral constants: a single-occurrence
// constant isn't worth hoisting because the preheader copy doesn't replace the original
// in-loop materialization (CSE would need a second occurrence to introduce a temp).
// See issue #92170.
//
// Skip when both constant CSE flavors are disabled; in that case the hoister can't
// produce a profitable constant hoist anyway, so there's no point paying for the scan.
//
bool modified = false;
LoopHoistContext hoistCtxt(this);
if (optConstantCSEEnabled() || optSharedConstantCSEEnabled())
{
hoistCtxt.m_constantUseCount = new (this, CMK_LoopHoist) VNToCountMap(getAllocator(CMK_LoopHoist));
for (BasicBlock* const block : Blocks())
{
for (Statement* const stmt : block->NonPhiStatements())
{
for (GenTree* tree = stmt->GetTreeList(); tree != nullptr; tree = tree->gtNext)
{
if (!tree->OperIs(GT_CNS_INT, GT_CNS_LNG))
{
continue;
}

ValueNum vn = tree->gtVNPair.GetLiberal();
if (vn == ValueNumStore::NoVN)
{
continue;
}

unsigned* pCount = hoistCtxt.m_constantUseCount->LookupPointerOrAdd(vn, 0);
(*pCount)++;
}
}
}
}

// Consider all the loops, visiting child loops first.
//
bool modified = false;
for (FlowGraphNaturalLoop* loop : m_loops->InPostOrder())
{
#if LOOP_HOIST_STATS
Expand Down Expand Up @@ -4423,6 +4460,29 @@ void Compiler::optHoistLoopBlocks(FlowGraphNaturalLoop* loop,
// that decision. This can happen in JitOptRepeat, where hoisting can follow assertion prop.
return false;
}
else if (node->IsIntegralConst() && !node->IsIconHandle(GTF_ICON_STATIC_HDL) &&
!node->IsIconHandle(GTF_ICON_CLASS_HDL) && !node->IsIconHandle(GTF_ICON_STR_HDL) &&
!node->IsIconHandle(GTF_ICON_OBJ_HDL))
{
// Don't hoist plain integral constants unless they appear more than once in the method.
// Hoisting injects a preheader clone that relies on CSE to introduce a temp; if there is
// only one in-method occurrence, the preheader copy just bloats code without removing the
// original in-loop materialization. See issue #92170.
//
// When constant CSE is disabled, m_constantUseCount is null and we don't gate hoisting
// here (the in-loop materialization will be left alone anyway since CSE can't replace it).
Comment on lines +4472 to +4473
//
if (m_hoistContext->m_constantUseCount != nullptr)
{
ValueNum vn = node->gtVNPair.GetLiberal();
unsigned count = 0;
if ((vn == ValueNumStore::NoVN) || !m_hoistContext->m_constantUseCount->Lookup(vn, &count) ||
(count < 2))
{
return false;
}
}
}

// Tree must be a suitable CSE candidate for us to be able to hoist it.
return m_compiler->optIsCSEcandidate(node);
Expand Down
2 changes: 1 addition & 1 deletion src/coreclr/jit/target.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ inline bool compUnixX86Abi()
// with static const members of Target
#if defined(TARGET_AMD64)
#define REGMASK_BITS 64
#define CSE_CONST_SHARED_LOW_BITS 16
#define CSE_CONST_SHARED_LOW_BITS 8

#elif defined(TARGET_X86)
#define REGMASK_BITS 32
Expand Down
Loading