From 1bfa1767423c4ad43276359564fb62256e704a81 Mon Sep 17 00:00:00 2001 From: cocolato Date: Wed, 1 Apr 2026 00:57:51 +0800 Subject: [PATCH 1/8] add fitness && exit quality mechanism --- Include/cpython/pystats.h | 2 + Include/internal/pycore_interp_structs.h | 9 ++ Include/internal/pycore_optimizer.h | 16 +- Python/optimizer.c | 179 ++++++++++++++++++++++- Python/pystate.c | 23 +++ Python/pystats.c | 2 + 6 files changed, 225 insertions(+), 6 deletions(-) diff --git a/Include/cpython/pystats.h b/Include/cpython/pystats.h index e473110eca7415..33b812e7b81a39 100644 --- a/Include/cpython/pystats.h +++ b/Include/cpython/pystats.h @@ -144,6 +144,8 @@ typedef struct _optimization_stats { uint64_t unknown_callee; uint64_t trace_immediately_deopts; uint64_t executors_invalidated; + uint64_t fitness_terminated_traces; + uint64_t best_exit_fallback; UOpStats opcode[PYSTATS_MAX_UOP_ID + 1]; uint64_t unsupported_opcode[256]; uint64_t trace_length_hist[_Py_UOP_HIST_SIZE]; diff --git a/Include/internal/pycore_interp_structs.h b/Include/internal/pycore_interp_structs.h index f76d4f41c55119..4fbd36e1b2670f 100644 --- a/Include/internal/pycore_interp_structs.h +++ b/Include/internal/pycore_interp_structs.h @@ -449,6 +449,15 @@ typedef struct _PyOptimizationConfig { uint16_t side_exit_initial_value; uint16_t side_exit_initial_backoff; + // Trace fitness thresholds + uint16_t fitness_initial; + uint16_t fitness_initial_side; + uint16_t fitness_per_instruction; + uint16_t fitness_branch_biased; + uint16_t fitness_branch_unbiased; + uint16_t fitness_backward_edge; + uint16_t fitness_frame_entry; + // Optimization flags bool specialization_enabled; bool uops_optimize_enabled; diff --git a/Include/internal/pycore_optimizer.h b/Include/internal/pycore_optimizer.h index 2986afb142b5d1..ab7130a86c4d2b 100644 --- a/Include/internal/pycore_optimizer.h +++ b/Include/internal/pycore_optimizer.h @@ -15,6 +15,16 @@ extern "C" { #include "pycore_optimizer_types.h" #include +/* Default fitness 
configuration values for trace quality control. + * These can be overridden via PYTHON_JIT_FITNESS_* environment variables. */ +#define FITNESS_INITIAL 1000 +#define FITNESS_INITIAL_SIDE 800 +#define FITNESS_PER_INSTRUCTION 2 +#define FITNESS_BRANCH_BIASED 5 +#define FITNESS_BRANCH_UNBIASED 25 +#define FITNESS_BACKWARD_EDGE 80 +#define FITNESS_FRAME_ENTRY 10 + typedef struct _PyJitUopBuffer { _PyUOpInstruction *start; @@ -101,7 +111,11 @@ typedef struct _PyJitTracerPreviousState { } _PyJitTracerPreviousState; typedef struct _PyJitTracerTranslatorState { - int jump_backward_seen; + int32_t fitness; // Current trace fitness, starts high, decrements + int32_t best_exit_quality; // Best exit quality seen so far + int best_exit_buffer_pos; // Position in code_buffer of best exit (-1=none) + uint32_t best_exit_target; // Bytecode target of best exit point + int frame_depth; // Current inline depth (0 = root frame) } _PyJitTracerTranslatorState; typedef struct _PyJitTracerState { diff --git a/Python/optimizer.c b/Python/optimizer.c index f09bf778587b12..1efca0351e5589 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -549,8 +549,11 @@ dynamic_exit_uop[MAX_UOP_ID + 1] = { }; -#define CONFIDENCE_RANGE 1000 -#define CONFIDENCE_CUTOFF 333 +/* Exit quality constants for fitness-based trace termination. + * Higher values mean better places to stop the trace. */ +#define EXIT_QUALITY_ENTER_EXECUTOR 500 // An executor already exists here +#define EXIT_QUALITY_DEFAULT 200 // Ordinary bytecode position +#define EXIT_QUALITY_SPECIALIZABLE 50 // Specializable instruction — avoid stopping here #ifdef Py_DEBUG #define DPRINTF(level, ...) \ @@ -598,6 +601,86 @@ add_to_trace( ((uint32_t)((INSTR) - ((_Py_CODEUNIT *)(CODE)->co_code_adaptive))) +/* Compute branch bias from the 16-bit branch history register. + * Returns 0 (completely unpredictable, 50/50) to 8 (fully biased). 
*/ +static inline int +compute_branch_bias(uint16_t history) +{ + int ones = _Py_popcount32((uint32_t)history); + return abs(ones - 8); +} + +/* Compute exit quality for the current trace position. + * Higher values mean it's a better place to stop the trace. */ +static inline int32_t +compute_exit_quality(_Py_CODEUNIT *target_instr, int opcode) +{ + if (target_instr->op.code == ENTER_EXECUTOR) { + return EXIT_QUALITY_ENTER_EXECUTOR; + } + if (_PyOpcode_Caches[_PyOpcode_Deopt[opcode]] > 0) { + return EXIT_QUALITY_SPECIALIZABLE; + } + return EXIT_QUALITY_DEFAULT; +} + +/* Try to truncate the trace to the best recorded exit point. + * Returns 1 if successful, 0 if no valid best exit exists. + * Enforces progress constraints: the fallback position must satisfy + * the minimum trace length requirements. */ +static inline int +try_best_exit_fallback( + _PyJitUopBuffer *trace, + _PyJitTracerTranslatorState *ts, + bool progress_needed) +{ + int best_pos = ts->best_exit_buffer_pos; + if (best_pos <= 0) { + return 0; + } else if (progress_needed && best_pos <= CODE_SIZE_NO_PROGRESS) { + return 0; + } else if (!progress_needed && best_pos <= CODE_SIZE_EMPTY) { + return 0; + } + trace->next = trace->start + best_pos; + /* Caller must add terminator (_EXIT_TRACE) after this */ + return 1; +} + +/* Update trace fitness after translating one bytecode instruction. 
*/ +static inline void +update_trace_fitness( + _PyJitTracerTranslatorState *ts, + int opcode, + _Py_CODEUNIT *target_instr, + const _PyOptimizationConfig *cfg) +{ + ts->fitness -= cfg->fitness_per_instruction; + + switch (opcode) { + case POP_JUMP_IF_FALSE: + case POP_JUMP_IF_TRUE: + case POP_JUMP_IF_NONE: + case POP_JUMP_IF_NOT_NONE: { + int bias = compute_branch_bias(target_instr[1].cache); + /* Linear interpolation: bias 0 → unbiased penalty, bias 8 → biased penalty */ + int penalty = cfg->fitness_branch_unbiased + - (bias * (cfg->fitness_branch_unbiased - cfg->fitness_branch_biased)) / 8; + ts->fitness -= penalty; + break; + } + case JUMP_BACKWARD: + case JUMP_BACKWARD_JIT: + case JUMP_BACKWARD_NO_JIT: + ts->fitness -= cfg->fitness_backward_edge; + break; + /* JUMP_BACKWARD_NO_INTERRUPT: exempt from backward edge penalty (coroutines) */ + default: + break; + } +} + + static int is_terminator(const _PyUOpInstruction *uop) { @@ -730,17 +813,46 @@ _PyJit_translate_single_bytecode_to_trace( goto unsupported; } + // Track frame depth changes for fitness (only for supported frame transitions) + if (frame != tracer->prev_state.instr_frame) { + _PyJitTracerTranslatorState *ts_depth = &tracer->translator_state; + if (frame->previous == tracer->prev_state.instr_frame) { + // Entered a deeper frame (function call inlined) + ts_depth->frame_depth++; + // Penalty scales with depth: shallow inlining is cheap, + // deep inlining gets progressively more expensive. + int32_t penalty = (int32_t)tstate->interp->opt_config.fitness_frame_entry + * ts_depth->frame_depth; + ts_depth->fitness -= penalty; + } else if (ts_depth->frame_depth > 0) { + // Returned to a shallower frame + ts_depth->frame_depth--; + } + } + if (oparg > 0xFFFF) { DPRINTF(2, "Unsupported: oparg too large\n"); unsupported: { - // Rewind to previous instruction and replace with _EXIT_TRACE. 
+ // If we have a high-quality best_exit (enter_executor, etc.), + // prefer it over rewinding to last _SET_IP — this covers the + // main unsupported path, not just the edge case. + _PyJitTracerTranslatorState *ts_unsup = &tracer->translator_state; + if (ts_unsup->best_exit_quality > EXIT_QUALITY_DEFAULT && + try_best_exit_fallback(trace, ts_unsup, progress_needed)) { + ADD_TO_TRACE(_EXIT_TRACE, 0, 0, ts_unsup->best_exit_target); + uop_buffer_last(trace)->operand1 = true; // is_control_flow + OPT_STAT_INC(best_exit_fallback); + DPRINTF(2, "Best-exit fallback at unsupported (pos=%d, quality=%d)\n", + ts_unsup->best_exit_buffer_pos, ts_unsup->best_exit_quality); + goto done; + } + // Fall back: rewind to last _SET_IP and replace with _DEOPT. _PyUOpInstruction *curr = uop_buffer_last(trace); while (curr->opcode != _SET_IP && uop_buffer_length(trace) > 2) { trace->next--; curr = uop_buffer_last(trace); } - assert(curr->opcode == _SET_IP || uop_buffer_length(trace) == 2); if (curr->opcode == _SET_IP) { int32_t old_target = (int32_t)uop_get_target(curr); curr->opcode = _DEOPT; @@ -763,6 +875,40 @@ _PyJit_translate_single_bytecode_to_trace( return 1; } + // Fitness-based trace quality check (before reserving space for this instruction) + { + _PyJitTracerTranslatorState *ts = &tracer->translator_state; + int32_t eq = compute_exit_quality(target_instr, opcode); + + // Record best exit candidate. + // Only record after minimum progress to avoid truncating to near-empty traces. + if (eq > ts->best_exit_quality && + uop_buffer_length(trace) > CODE_SIZE_NO_PROGRESS) { + ts->best_exit_quality = eq; + ts->best_exit_buffer_pos = uop_buffer_length(trace); + ts->best_exit_target = target; + } + + // Check if fitness is depleted — should we stop the trace? 
+ if (ts->fitness < eq && + !(progress_needed && uop_buffer_length(trace) < CODE_SIZE_NO_PROGRESS)) { + // Prefer stopping at the best recorded exit point + if (try_best_exit_fallback(trace, ts, progress_needed)) { + ADD_TO_TRACE(_EXIT_TRACE, 0, 0, ts->best_exit_target); + uop_buffer_last(trace)->operand1 = true; // is_control_flow + } + else { + // No valid best exit — stop at current position + ADD_TO_TRACE(_EXIT_TRACE, 0, 0, target); + uop_buffer_last(trace)->operand1 = true; // is_control_flow + } + OPT_STAT_INC(fitness_terminated_traces); + DPRINTF(2, "Fitness terminated: fitness=%d < exit_quality=%d\n", + ts->fitness, eq); + goto done; + } + } + // One for possible _DEOPT, one because _CHECK_VALIDITY itself might _DEOPT trace->end -= 2; @@ -793,6 +939,12 @@ _PyJit_translate_single_bytecode_to_trace( DPRINTF(2, "No room for expansions and guards (need %d, got %d)\n", space_needed, uop_buffer_remaining_space(trace)); OPT_STAT_INC(trace_too_long); + // Try best-exit fallback before giving up + if (try_best_exit_fallback(trace, &tracer->translator_state, progress_needed)) { + ADD_TO_TRACE(_EXIT_TRACE, 0, 0, tracer->translator_state.best_exit_target); + uop_buffer_last(trace)->operand1 = true; // is_control_flow + OPT_STAT_INC(best_exit_fallback); + } goto done; } @@ -986,7 +1138,12 @@ _PyJit_translate_single_bytecode_to_trace( ADD_TO_TRACE(_JUMP_TO_TOP, 0, 0, 0); goto done; } - DPRINTF(2, "Trace continuing\n"); + // Update fitness AFTER translation, BEFORE returning to continue tracing. + // This ensures the next iteration's fitness check reflects the cost of + // all instructions translated so far. 
+ update_trace_fitness(&tracer->translator_state, opcode, target_instr, + &tstate->interp->opt_config); + DPRINTF(2, "Trace continuing (fitness=%d)\n", tracer->translator_state.fitness); return 1; done: DPRINTF(2, "Trace done\n"); @@ -1069,6 +1226,18 @@ _PyJit_TryInitializeTracing( assert(curr_instr->op.code == JUMP_BACKWARD_JIT || curr_instr->op.code == RESUME_CHECK_JIT || (exit != NULL)); tracer->initial_state.jump_backward_instr = curr_instr; + // Initialize fitness tracking state + const _PyOptimizationConfig *cfg = &tstate->interp->opt_config; + _PyJitTracerTranslatorState *ts = &tracer->translator_state; + bool is_side_trace = (exit != NULL); + ts->fitness = is_side_trace + ? (int32_t)cfg->fitness_initial_side + : (int32_t)cfg->fitness_initial; + ts->best_exit_quality = 0; + ts->best_exit_buffer_pos = -1; + ts->best_exit_target = 0; + ts->frame_depth = 0; + tracer->is_tracing = true; return 1; } diff --git a/Python/pystate.c b/Python/pystate.c index 143175da0f45c7..5b5ac071f38b1f 100644 --- a/Python/pystate.c +++ b/Python/pystate.c @@ -635,6 +635,29 @@ init_interpreter(PyInterpreterState *interp, "PYTHON_JIT_SIDE_EXIT_INITIAL_BACKOFF", SIDE_EXIT_INITIAL_BACKOFF, 0, MAX_BACKOFF); + // Trace fitness configuration + init_policy(&interp->opt_config.fitness_initial, + "PYTHON_JIT_FITNESS_INITIAL", + FITNESS_INITIAL, 100, 10000); + init_policy(&interp->opt_config.fitness_initial_side, + "PYTHON_JIT_FITNESS_INITIAL_SIDE", + FITNESS_INITIAL_SIDE, 50, 5000); + init_policy(&interp->opt_config.fitness_per_instruction, + "PYTHON_JIT_FITNESS_PER_INSTRUCTION", + FITNESS_PER_INSTRUCTION, 0, 100); + init_policy(&interp->opt_config.fitness_branch_biased, + "PYTHON_JIT_FITNESS_BRANCH_BIASED", + FITNESS_BRANCH_BIASED, 0, 500); + init_policy(&interp->opt_config.fitness_branch_unbiased, + "PYTHON_JIT_FITNESS_BRANCH_UNBIASED", + FITNESS_BRANCH_UNBIASED, 0, 500); + init_policy(&interp->opt_config.fitness_backward_edge, + "PYTHON_JIT_FITNESS_BACKWARD_EDGE", + FITNESS_BACKWARD_EDGE, 
0, 1000); + init_policy(&interp->opt_config.fitness_frame_entry, + "PYTHON_JIT_FITNESS_FRAME_ENTRY", + FITNESS_FRAME_ENTRY, 0, 1000); + interp->opt_config.specialization_enabled = !is_env_enabled("PYTHON_SPECIALIZATION_OFF"); interp->opt_config.uops_optimize_enabled = !is_env_disabled("PYTHON_UOPS_OPTIMIZE"); if (interp != &runtime->_main_interpreter) { diff --git a/Python/pystats.c b/Python/pystats.c index a057ad884566d8..b563da14858861 100644 --- a/Python/pystats.c +++ b/Python/pystats.c @@ -274,6 +274,8 @@ print_optimization_stats(FILE *out, OptimizationStats *stats) fprintf(out, "Optimization low confidence: %" PRIu64 "\n", stats->low_confidence); fprintf(out, "Optimization unknown callee: %" PRIu64 "\n", stats->unknown_callee); fprintf(out, "Executors invalidated: %" PRIu64 "\n", stats->executors_invalidated); + fprintf(out, "Optimization fitness terminated: %" PRIu64 "\n", stats->fitness_terminated_traces); + fprintf(out, "Optimization best exit fallback: %" PRIu64 "\n", stats->best_exit_fallback); print_histogram(out, "Trace length", stats->trace_length_hist); print_histogram(out, "Trace run length", stats->trace_run_length_hist); From 2f9438a25f17f52cf32f0f9d8f1ae3c2bee78395 Mon Sep 17 00:00:00 2001 From: LloydZ <35182391+cocolato@users.noreply.github.com> Date: Wed, 1 Apr 2026 08:54:47 +0000 Subject: [PATCH 2/8] Rewrite the code structure --- Include/internal/pycore_interp_structs.h | 5 ++ Include/internal/pycore_optimizer.h | 7 +++ Python/optimizer.c | 76 +++++++++++------------- Python/pystate.c | 11 ++++ 4 files changed, 57 insertions(+), 42 deletions(-) diff --git a/Include/internal/pycore_interp_structs.h b/Include/internal/pycore_interp_structs.h index 4fbd36e1b2670f..77fff55b1c815e 100644 --- a/Include/internal/pycore_interp_structs.h +++ b/Include/internal/pycore_interp_structs.h @@ -458,6 +458,11 @@ typedef struct _PyOptimizationConfig { uint16_t fitness_backward_edge; uint16_t fitness_frame_entry; + // Exit quality thresholds for fitness-based 
trace termination + uint16_t exit_quality_enter_executor; + uint16_t exit_quality_default; + uint16_t exit_quality_specializable; + // Optimization flags bool specialization_enabled; bool uops_optimize_enabled; diff --git a/Include/internal/pycore_optimizer.h b/Include/internal/pycore_optimizer.h index ab7130a86c4d2b..63462cf070c544 100644 --- a/Include/internal/pycore_optimizer.h +++ b/Include/internal/pycore_optimizer.h @@ -25,6 +25,13 @@ extern "C" { #define FITNESS_BACKWARD_EDGE 80 #define FITNESS_FRAME_ENTRY 10 +/* Default exit quality constants for fitness-based trace termination. + * Higher values mean better places to stop the trace. + * These can be overridden via PYTHON_JIT_EXIT_QUALITY_* environment variables. */ +#define EXIT_QUALITY_ENTER_EXECUTOR 500 +#define EXIT_QUALITY_DEFAULT 200 +#define EXIT_QUALITY_SPECIALIZABLE 50 + typedef struct _PyJitUopBuffer { _PyUOpInstruction *start; diff --git a/Python/optimizer.c b/Python/optimizer.c index 1efca0351e5589..ffe1317aad09c7 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -549,11 +549,6 @@ dynamic_exit_uop[MAX_UOP_ID + 1] = { }; -/* Exit quality constants for fitness-based trace termination. - * Higher values mean better places to stop the trace. */ -#define EXIT_QUALITY_ENTER_EXECUTOR 500 // An executor already exists here -#define EXIT_QUALITY_DEFAULT 200 // Ordinary bytecode position -#define EXIT_QUALITY_SPECIALIZABLE 50 // Specializable instruction — avoid stopping here #ifdef Py_DEBUG #define DPRINTF(level, ...) \ @@ -613,15 +608,16 @@ compute_branch_bias(uint16_t history) /* Compute exit quality for the current trace position. * Higher values mean it's a better place to stop the trace. 
*/ static inline int32_t -compute_exit_quality(_Py_CODEUNIT *target_instr, int opcode) +compute_exit_quality(_Py_CODEUNIT *target_instr, int opcode, + const _PyOptimizationConfig *cfg) { if (target_instr->op.code == ENTER_EXECUTOR) { - return EXIT_QUALITY_ENTER_EXECUTOR; + return (int32_t)cfg->exit_quality_enter_executor; } if (_PyOpcode_Caches[_PyOpcode_Deopt[opcode]] > 0) { - return EXIT_QUALITY_SPECIALIZABLE; + return (int32_t)cfg->exit_quality_specializable; } - return EXIT_QUALITY_DEFAULT; + return (int32_t)cfg->exit_quality_default; } /* Try to truncate the trace to the best recorded exit point. @@ -674,7 +670,6 @@ update_trace_fitness( case JUMP_BACKWARD_NO_JIT: ts->fitness -= cfg->fitness_backward_edge; break; - /* JUMP_BACKWARD_NO_INTERRUPT: exempt from backward edge penalty (coroutines) */ default: break; } @@ -817,7 +812,6 @@ _PyJit_translate_single_bytecode_to_trace( if (frame != tracer->prev_state.instr_frame) { _PyJitTracerTranslatorState *ts_depth = &tracer->translator_state; if (frame->previous == tracer->prev_state.instr_frame) { - // Entered a deeper frame (function call inlined) ts_depth->frame_depth++; // Penalty scales with depth: shallow inlining is cheap, // deep inlining gets progressively more expensive. @@ -825,7 +819,6 @@ _PyJit_translate_single_bytecode_to_trace( * ts_depth->frame_depth; ts_depth->fitness -= penalty; } else if (ts_depth->frame_depth > 0) { - // Returned to a shallower frame ts_depth->frame_depth--; } } @@ -838,7 +831,7 @@ _PyJit_translate_single_bytecode_to_trace( // prefer it over rewinding to last _SET_IP — this covers the // main unsupported path, not just the edge case. 
_PyJitTracerTranslatorState *ts_unsup = &tracer->translator_state; - if (ts_unsup->best_exit_quality > EXIT_QUALITY_DEFAULT && + if (ts_unsup->best_exit_quality > (int32_t)tstate->interp->opt_config.exit_quality_default && try_best_exit_fallback(trace, ts_unsup, progress_needed)) { ADD_TO_TRACE(_EXIT_TRACE, 0, 0, ts_unsup->best_exit_target); uop_buffer_last(trace)->operand1 = true; // is_control_flow @@ -876,37 +869,36 @@ _PyJit_translate_single_bytecode_to_trace( } // Fitness-based trace quality check (before reserving space for this instruction) - { - _PyJitTracerTranslatorState *ts = &tracer->translator_state; - int32_t eq = compute_exit_quality(target_instr, opcode); - - // Record best exit candidate. - // Only record after minimum progress to avoid truncating to near-empty traces. - if (eq > ts->best_exit_quality && - uop_buffer_length(trace) > CODE_SIZE_NO_PROGRESS) { - ts->best_exit_quality = eq; - ts->best_exit_buffer_pos = uop_buffer_length(trace); - ts->best_exit_target = target; + _PyJitTracerTranslatorState *ts = &tracer->translator_state; + int32_t eq = compute_exit_quality(target_instr, opcode, + &tstate->interp->opt_config); + + // Record best exit candidate. + // Only record after minimum progress to avoid truncating to near-empty traces. + if (eq > ts->best_exit_quality && + uop_buffer_length(trace) > CODE_SIZE_NO_PROGRESS) { + ts->best_exit_quality = eq; + ts->best_exit_buffer_pos = uop_buffer_length(trace); + ts->best_exit_target = target; + } + + // Check if fitness is depleted — should we stop the trace? + if (ts->fitness < eq && + !(progress_needed && uop_buffer_length(trace) < CODE_SIZE_NO_PROGRESS)) { + // Prefer stopping at the best recorded exit point + if (try_best_exit_fallback(trace, ts, progress_needed)) { + ADD_TO_TRACE(_EXIT_TRACE, 0, 0, ts->best_exit_target); + uop_buffer_last(trace)->operand1 = true; // is_control_flow } - - // Check if fitness is depleted — should we stop the trace? 
- if (ts->fitness < eq && - !(progress_needed && uop_buffer_length(trace) < CODE_SIZE_NO_PROGRESS)) { - // Prefer stopping at the best recorded exit point - if (try_best_exit_fallback(trace, ts, progress_needed)) { - ADD_TO_TRACE(_EXIT_TRACE, 0, 0, ts->best_exit_target); - uop_buffer_last(trace)->operand1 = true; // is_control_flow - } - else { - // No valid best exit — stop at current position - ADD_TO_TRACE(_EXIT_TRACE, 0, 0, target); - uop_buffer_last(trace)->operand1 = true; // is_control_flow - } - OPT_STAT_INC(fitness_terminated_traces); - DPRINTF(2, "Fitness terminated: fitness=%d < exit_quality=%d\n", - ts->fitness, eq); - goto done; + else { + // No valid best exit — stop at current position + ADD_TO_TRACE(_EXIT_TRACE, 0, 0, target); + uop_buffer_last(trace)->operand1 = true; // is_control_flow } + OPT_STAT_INC(fitness_terminated_traces); + DPRINTF(2, "Fitness terminated: fitness=%d < exit_quality=%d\n", + ts->fitness, eq); + goto done; } // One for possible _DEOPT, one because _CHECK_VALIDITY itself might _DEOPT diff --git a/Python/pystate.c b/Python/pystate.c index 5b5ac071f38b1f..557e6fc309e373 100644 --- a/Python/pystate.c +++ b/Python/pystate.c @@ -658,6 +658,17 @@ init_interpreter(PyInterpreterState *interp, "PYTHON_JIT_FITNESS_FRAME_ENTRY", FITNESS_FRAME_ENTRY, 0, 1000); + // Exit quality thresholds + init_policy(&interp->opt_config.exit_quality_enter_executor, + "PYTHON_JIT_EXIT_QUALITY_ENTER_EXECUTOR", + EXIT_QUALITY_ENTER_EXECUTOR, 0, 10000); + init_policy(&interp->opt_config.exit_quality_default, + "PYTHON_JIT_EXIT_QUALITY_DEFAULT", + EXIT_QUALITY_DEFAULT, 0, 10000); + init_policy(&interp->opt_config.exit_quality_specializable, + "PYTHON_JIT_EXIT_QUALITY_SPECIALIZABLE", + EXIT_QUALITY_SPECIALIZABLE, 0, 10000); + interp->opt_config.specialization_enabled = !is_env_enabled("PYTHON_SPECIALIZATION_OFF"); interp->opt_config.uops_optimize_enabled = !is_env_disabled("PYTHON_UOPS_OPTIMIZE"); if (interp != &runtime->_main_interpreter) { From 
709c0a1bff91931979889b0e39a80a8c56e6a8ed Mon Sep 17 00:00:00 2001 From: cocolato Date: Wed, 1 Apr 2026 22:49:33 +0800 Subject: [PATCH 3/8] address review --- Include/internal/pycore_optimizer.h | 4 ++-- Python/optimizer.c | 37 +++++++++++++++++------------ 2 files changed, 24 insertions(+), 17 deletions(-) diff --git a/Include/internal/pycore_optimizer.h b/Include/internal/pycore_optimizer.h index 63462cf070c544..c79d26ade80f4e 100644 --- a/Include/internal/pycore_optimizer.h +++ b/Include/internal/pycore_optimizer.h @@ -17,9 +17,9 @@ extern "C" { /* Default fitness configuration values for trace quality control. * These can be overridden via PYTHON_JIT_FITNESS_* environment variables. */ -#define FITNESS_INITIAL 1000 -#define FITNESS_INITIAL_SIDE 800 #define FITNESS_PER_INSTRUCTION 2 +#define FITNESS_INITIAL (UOP_MAX_TRACE_LENGTH * FITNESS_PER_INSTRUCTION) +#define FITNESS_INITIAL_SIDE 800 #define FITNESS_BRANCH_BIASED 5 #define FITNESS_BRANCH_UNBIASED 25 #define FITNESS_BACKWARD_EDGE 80 diff --git a/Python/optimizer.c b/Python/optimizer.c index ffe1317aad09c7..435fb1f06749fa 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -808,21 +808,6 @@ _PyJit_translate_single_bytecode_to_trace( goto unsupported; } - // Track frame depth changes for fitness (only for supported frame transitions) - if (frame != tracer->prev_state.instr_frame) { - _PyJitTracerTranslatorState *ts_depth = &tracer->translator_state; - if (frame->previous == tracer->prev_state.instr_frame) { - ts_depth->frame_depth++; - // Penalty scales with depth: shallow inlining is cheap, - // deep inlining gets progressively more expensive. 
- int32_t penalty = (int32_t)tstate->interp->opt_config.fitness_frame_entry - * ts_depth->frame_depth; - ts_depth->fitness -= penalty; - } else if (ts_depth->frame_depth > 0) { - ts_depth->frame_depth--; - } - } - if (oparg > 0xFFFF) { DPRINTF(2, "Unsupported: oparg too large\n"); unsupported: @@ -1089,6 +1074,28 @@ _PyJit_translate_single_bytecode_to_trace( assert(next->op.code == STORE_FAST); operand = next->op.arg; } + else if (uop == _PUSH_FRAME) { + _PyJitTracerTranslatorState *ts_depth = &tracer->translator_state; + ts_depth->frame_depth++; + if (ts_depth->frame_depth >= MAX_ABSTRACT_FRAME_DEPTH) { + // The optimizer can't handle frames this deep, + // so there's no point continuing the trace. + DPRINTF(2, "Unsupported: frame depth %d >= MAX_ABSTRACT_FRAME_DEPTH\n", + ts_depth->frame_depth); + goto unsupported; + } + int32_t penalty = (int32_t)tstate->interp->opt_config.fitness_frame_entry + * ts_depth->frame_depth; + ts_depth->fitness -= penalty; + } + else if (uop == _RETURN_VALUE || uop == _RETURN_GENERATOR || uop == _YIELD_VALUE) { + _PyJitTracerTranslatorState *ts_depth = &tracer->translator_state; + if (ts_depth->frame_depth <= 0) { + // Underflow + ts_depth->fitness -= (int32_t)tstate->interp->opt_config.fitness_frame_entry * 2; + } + ts_depth->frame_depth = ts_depth->frame_depth <= 0 ? 
0 : ts_depth->frame_depth - 1; + } else if (_PyUop_Flags[uop] & HAS_RECORDS_VALUE_FLAG) { PyObject *recorded_value = tracer->prev_state.recorded_value; tracer->prev_state.recorded_value = NULL; From ef6ac24576882884004684dc1d4cf366e31a664a Mon Sep 17 00:00:00 2001 From: LloydZ <35182391+cocolato@users.noreply.github.com> Date: Thu, 2 Apr 2026 07:38:10 +0000 Subject: [PATCH 4/8] address many reviews --- Include/cpython/pystats.h | 1 - Include/internal/pycore_interp_structs.h | 10 -- Include/internal/pycore_optimizer.h | 18 +-- Python/optimizer.c | 156 ++++++----------------- Python/pystate.c | 34 ++--- Python/pystats.c | 1 - 6 files changed, 55 insertions(+), 165 deletions(-) diff --git a/Include/cpython/pystats.h b/Include/cpython/pystats.h index 33b812e7b81a39..5d1f44988a6df1 100644 --- a/Include/cpython/pystats.h +++ b/Include/cpython/pystats.h @@ -145,7 +145,6 @@ typedef struct _optimization_stats { uint64_t trace_immediately_deopts; uint64_t executors_invalidated; uint64_t fitness_terminated_traces; - uint64_t best_exit_fallback; UOpStats opcode[PYSTATS_MAX_UOP_ID + 1]; uint64_t unsupported_opcode[256]; uint64_t trace_length_hist[_Py_UOP_HIST_SIZE]; diff --git a/Include/internal/pycore_interp_structs.h b/Include/internal/pycore_interp_structs.h index 77fff55b1c815e..0cebe1b4b9e995 100644 --- a/Include/internal/pycore_interp_structs.h +++ b/Include/internal/pycore_interp_structs.h @@ -452,16 +452,6 @@ typedef struct _PyOptimizationConfig { // Trace fitness thresholds uint16_t fitness_initial; uint16_t fitness_initial_side; - uint16_t fitness_per_instruction; - uint16_t fitness_branch_biased; - uint16_t fitness_branch_unbiased; - uint16_t fitness_backward_edge; - uint16_t fitness_frame_entry; - - // Exit quality thresholds for fitness-based trace termination - uint16_t exit_quality_enter_executor; - uint16_t exit_quality_default; - uint16_t exit_quality_specializable; // Optimization flags bool specialization_enabled; diff --git 
a/Include/internal/pycore_optimizer.h b/Include/internal/pycore_optimizer.h index c79d26ade80f4e..a0cdde0b94490b 100644 --- a/Include/internal/pycore_optimizer.h +++ b/Include/internal/pycore_optimizer.h @@ -16,18 +16,17 @@ extern "C" { #include /* Default fitness configuration values for trace quality control. - * These can be overridden via PYTHON_JIT_FITNESS_* environment variables. */ + * FITNESS_INITIAL and FITNESS_INITIAL_SIDE can be overridden via + * PYTHON_JIT_FITNESS_INITIAL and PYTHON_JIT_FITNESS_INITIAL_SIDE */ #define FITNESS_PER_INSTRUCTION 2 -#define FITNESS_INITIAL (UOP_MAX_TRACE_LENGTH * FITNESS_PER_INSTRUCTION) +#define FITNESS_INITIAL 2000 #define FITNESS_INITIAL_SIDE 800 -#define FITNESS_BRANCH_BIASED 5 -#define FITNESS_BRANCH_UNBIASED 25 +#define FITNESS_BRANCH_BASE 5 #define FITNESS_BACKWARD_EDGE 80 -#define FITNESS_FRAME_ENTRY 10 -/* Default exit quality constants for fitness-based trace termination. - * Higher values mean better places to stop the trace. - * These can be overridden via PYTHON_JIT_EXIT_QUALITY_* environment variables. */ +/* Exit quality constants for fitness-based trace termination. + * Higher values mean better places to stop the trace. 
*/ +#define EXIT_QUALITY_CLOSE_LOOP 800 #define EXIT_QUALITY_ENTER_EXECUTOR 500 #define EXIT_QUALITY_DEFAULT 200 #define EXIT_QUALITY_SPECIALIZABLE 50 @@ -119,9 +118,6 @@ typedef struct _PyJitTracerPreviousState { typedef struct _PyJitTracerTranslatorState { int32_t fitness; // Current trace fitness, starts high, decrements - int32_t best_exit_quality; // Best exit quality seen so far - int best_exit_buffer_pos; // Position in code_buffer of best exit (-1=none) - uint32_t best_exit_target; // Bytecode target of best exit point int frame_depth; // Current inline depth (0 = root frame) } _PyJitTracerTranslatorState; diff --git a/Python/optimizer.c b/Python/optimizer.c index 435fb1f06749fa..59cff49df477a9 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -596,86 +596,45 @@ add_to_trace( ((uint32_t)((INSTR) - ((_Py_CODEUNIT *)(CODE)->co_code_adaptive))) -/* Compute branch bias from the 16-bit branch history register. - * Returns 0 (completely unpredictable, 50/50) to 8 (fully biased). */ +/* Compute branch fitness penalty based on how likely the traced path is. + * The penalty is small when the traced path is common, large when rare. + * A branch that historically goes the other way gets a heavy penalty. */ static inline int -compute_branch_bias(uint16_t history) +compute_branch_penalty(uint16_t history, bool branch_taken) { - int ones = _Py_popcount32((uint32_t)history); - return abs(ones - 8); + int taken_count = _Py_popcount32((uint32_t)history); + int on_trace_count = branch_taken ? taken_count : 16 - taken_count; + int off_trace = 16 - on_trace_count; + /* Quadratic scaling: off_trace^2 ranges from 0 (fully biased our way) + * to 256 (fully biased against us, e.g. 15/16 left but traced right). */ + return FITNESS_BRANCH_BASE + off_trace * off_trace; } /* Compute exit quality for the current trace position. - * Higher values mean it's a better place to stop the trace. */ + * Higher values mean better places to stop the trace. 
*/ static inline int32_t compute_exit_quality(_Py_CODEUNIT *target_instr, int opcode, - const _PyOptimizationConfig *cfg) + const _PyJitTracerState *tracer) { + if (target_instr == tracer->initial_state.start_instr || + target_instr == tracer->initial_state.close_loop_instr) { + return EXIT_QUALITY_CLOSE_LOOP; + } if (target_instr->op.code == ENTER_EXECUTOR) { - return (int32_t)cfg->exit_quality_enter_executor; + return EXIT_QUALITY_ENTER_EXECUTOR; } if (_PyOpcode_Caches[_PyOpcode_Deopt[opcode]] > 0) { - return (int32_t)cfg->exit_quality_specializable; - } - return (int32_t)cfg->exit_quality_default; -} - -/* Try to truncate the trace to the best recorded exit point. - * Returns 1 if successful, 0 if no valid best exit exists. - * Enforces progress constraints: the fallback position must satisfy - * the minimum trace length requirements. */ -static inline int -try_best_exit_fallback( - _PyJitUopBuffer *trace, - _PyJitTracerTranslatorState *ts, - bool progress_needed) -{ - int best_pos = ts->best_exit_buffer_pos; - if (best_pos <= 0) { - return 0; - } else if (progress_needed && best_pos <= CODE_SIZE_NO_PROGRESS) { - return 0; - } else if (!progress_needed && best_pos <= CODE_SIZE_EMPTY) { - return 0; + return EXIT_QUALITY_SPECIALIZABLE; } - trace->next = trace->start + best_pos; - /* Caller must add terminator (_EXIT_TRACE) after this */ - return 1; + return EXIT_QUALITY_DEFAULT; } -/* Update trace fitness after translating one bytecode instruction. 
*/ -static inline void -update_trace_fitness( - _PyJitTracerTranslatorState *ts, - int opcode, - _Py_CODEUNIT *target_instr, - const _PyOptimizationConfig *cfg) +static inline int32_t +compute_frame_penalty(const _PyOptimizationConfig *cfg) { - ts->fitness -= cfg->fitness_per_instruction; - - switch (opcode) { - case POP_JUMP_IF_FALSE: - case POP_JUMP_IF_TRUE: - case POP_JUMP_IF_NONE: - case POP_JUMP_IF_NOT_NONE: { - int bias = compute_branch_bias(target_instr[1].cache); - /* Linear interpolation: bias 0 → unbiased penalty, bias 8 → biased penalty */ - int penalty = cfg->fitness_branch_unbiased - - (bias * (cfg->fitness_branch_unbiased - cfg->fitness_branch_biased)) / 8; - ts->fitness -= penalty; - break; - } - case JUMP_BACKWARD: - case JUMP_BACKWARD_JIT: - case JUMP_BACKWARD_NO_JIT: - ts->fitness -= cfg->fitness_backward_edge; - break; - default: - break; - } + return (int32_t)cfg->fitness_initial / 5 + 1; } - static int is_terminator(const _PyUOpInstruction *uop) { @@ -812,20 +771,6 @@ _PyJit_translate_single_bytecode_to_trace( DPRINTF(2, "Unsupported: oparg too large\n"); unsupported: { - // If we have a high-quality best_exit (enter_executor, etc.), - // prefer it over rewinding to last _SET_IP — this covers the - // main unsupported path, not just the edge case. - _PyJitTracerTranslatorState *ts_unsup = &tracer->translator_state; - if (ts_unsup->best_exit_quality > (int32_t)tstate->interp->opt_config.exit_quality_default && - try_best_exit_fallback(trace, ts_unsup, progress_needed)) { - ADD_TO_TRACE(_EXIT_TRACE, 0, 0, ts_unsup->best_exit_target); - uop_buffer_last(trace)->operand1 = true; // is_control_flow - OPT_STAT_INC(best_exit_fallback); - DPRINTF(2, "Best-exit fallback at unsupported (pos=%d, quality=%d)\n", - ts_unsup->best_exit_buffer_pos, ts_unsup->best_exit_quality); - goto done; - } - // Fall back: rewind to last _SET_IP and replace with _DEOPT. 
_PyUOpInstruction *curr = uop_buffer_last(trace); while (curr->opcode != _SET_IP && uop_buffer_length(trace) > 2) { trace->next--; @@ -855,31 +800,13 @@ _PyJit_translate_single_bytecode_to_trace( // Fitness-based trace quality check (before reserving space for this instruction) _PyJitTracerTranslatorState *ts = &tracer->translator_state; - int32_t eq = compute_exit_quality(target_instr, opcode, - &tstate->interp->opt_config); - - // Record best exit candidate. - // Only record after minimum progress to avoid truncating to near-empty traces. - if (eq > ts->best_exit_quality && - uop_buffer_length(trace) > CODE_SIZE_NO_PROGRESS) { - ts->best_exit_quality = eq; - ts->best_exit_buffer_pos = uop_buffer_length(trace); - ts->best_exit_target = target; - } + int32_t eq = compute_exit_quality(target_instr, opcode, tracer); // Check if fitness is depleted — should we stop the trace? - if (ts->fitness < eq && - !(progress_needed && uop_buffer_length(trace) < CODE_SIZE_NO_PROGRESS)) { - // Prefer stopping at the best recorded exit point - if (try_best_exit_fallback(trace, ts, progress_needed)) { - ADD_TO_TRACE(_EXIT_TRACE, 0, 0, ts->best_exit_target); - uop_buffer_last(trace)->operand1 = true; // is_control_flow - } - else { - // No valid best exit — stop at current position - ADD_TO_TRACE(_EXIT_TRACE, 0, 0, target); - uop_buffer_last(trace)->operand1 = true; // is_control_flow - } + if (ts->fitness < eq) { + // This is a tracer heuristic rather than normal program control flow, + // so leave operand1 clear and let the resulting side exit increase chain_depth. 
+ ADD_TO_TRACE(_EXIT_TRACE, 0, 0, target); OPT_STAT_INC(fitness_terminated_traces); DPRINTF(2, "Fitness terminated: fitness=%d < exit_quality=%d\n", ts->fitness, eq); @@ -916,12 +843,6 @@ _PyJit_translate_single_bytecode_to_trace( DPRINTF(2, "No room for expansions and guards (need %d, got %d)\n", space_needed, uop_buffer_remaining_space(trace)); OPT_STAT_INC(trace_too_long); - // Try best-exit fallback before giving up - if (try_best_exit_fallback(trace, &tracer->translator_state, progress_needed)) { - ADD_TO_TRACE(_EXIT_TRACE, 0, 0, tracer->translator_state.best_exit_target); - uop_buffer_last(trace)->operand1 = true; // is_control_flow - OPT_STAT_INC(best_exit_fallback); - } goto done; } @@ -945,6 +866,8 @@ _PyJit_translate_single_bytecode_to_trace( assert(jump_happened ? (next_instr == computed_jump_instr) : (next_instr == computed_next_instr)); uint32_t uopcode = BRANCH_TO_GUARD[opcode - POP_JUMP_IF_FALSE][jump_happened]; ADD_TO_TRACE(uopcode, 0, 0, INSTR_IP(jump_happened ? computed_next_instr : computed_jump_instr, old_code)); + tracer->translator_state.fitness -= compute_branch_penalty( + target_instr[1].cache, jump_happened); break; } case JUMP_BACKWARD_JIT: @@ -952,6 +875,7 @@ _PyJit_translate_single_bytecode_to_trace( case JUMP_BACKWARD_NO_JIT: case JUMP_BACKWARD: ADD_TO_TRACE(_CHECK_PERIODIC, 0, 0, target); + tracer->translator_state.fitness -= FITNESS_BACKWARD_EDGE; _Py_FALLTHROUGH; case JUMP_BACKWARD_NO_INTERRUPT: { @@ -1084,15 +1008,19 @@ _PyJit_translate_single_bytecode_to_trace( ts_depth->frame_depth); goto unsupported; } - int32_t penalty = (int32_t)tstate->interp->opt_config.fitness_frame_entry - * ts_depth->frame_depth; - ts_depth->fitness -= penalty; + int32_t frame_penalty = compute_frame_penalty(&tstate->interp->opt_config); + ts_depth->fitness -= frame_penalty * ts_depth->frame_depth; } else if (uop == _RETURN_VALUE || uop == _RETURN_GENERATOR || uop == _YIELD_VALUE) { _PyJitTracerTranslatorState *ts_depth = &tracer->translator_state; + 
int32_t frame_penalty = compute_frame_penalty(&tstate->interp->opt_config); if (ts_depth->frame_depth <= 0) { - // Underflow - ts_depth->fitness -= (int32_t)tstate->interp->opt_config.fitness_frame_entry * 2; + // Underflow: returning from a frame we didn't enter + ts_depth->fitness -= frame_penalty * 2; + } + else { + // Reward returning: small inlined calls should be encouraged + ts_depth->fitness += frame_penalty; } ts_depth->frame_depth = ts_depth->frame_depth <= 0 ? 0 : ts_depth->frame_depth - 1; } @@ -1140,8 +1068,7 @@ _PyJit_translate_single_bytecode_to_trace( // Update fitness AFTER translation, BEFORE returning to continue tracing. // This ensures the next iteration's fitness check reflects the cost of // all instructions translated so far. - update_trace_fitness(&tracer->translator_state, opcode, target_instr, - &tstate->interp->opt_config); + tracer->translator_state.fitness -= FITNESS_PER_INSTRUCTION; DPRINTF(2, "Trace continuing (fitness=%d)\n", tracer->translator_state.fitness); return 1; done: @@ -1232,9 +1159,6 @@ _PyJit_TryInitializeTracing( ts->fitness = is_side_trace ? 
(int32_t)cfg->fitness_initial_side : (int32_t)cfg->fitness_initial; - ts->best_exit_quality = 0; - ts->best_exit_buffer_pos = -1; - ts->best_exit_target = 0; ts->frame_depth = 0; tracer->is_tracing = true; diff --git a/Python/pystate.c b/Python/pystate.c index 557e6fc309e373..78eab7cc7d2459 100644 --- a/Python/pystate.c +++ b/Python/pystate.c @@ -642,32 +642,14 @@ init_interpreter(PyInterpreterState *interp, init_policy(&interp->opt_config.fitness_initial_side, "PYTHON_JIT_FITNESS_INITIAL_SIDE", FITNESS_INITIAL_SIDE, 50, 5000); - init_policy(&interp->opt_config.fitness_per_instruction, - "PYTHON_JIT_FITNESS_PER_INSTRUCTION", - FITNESS_PER_INSTRUCTION, 0, 100); - init_policy(&interp->opt_config.fitness_branch_biased, - "PYTHON_JIT_FITNESS_BRANCH_BIASED", - FITNESS_BRANCH_BIASED, 0, 500); - init_policy(&interp->opt_config.fitness_branch_unbiased, - "PYTHON_JIT_FITNESS_BRANCH_UNBIASED", - FITNESS_BRANCH_UNBIASED, 0, 500); - init_policy(&interp->opt_config.fitness_backward_edge, - "PYTHON_JIT_FITNESS_BACKWARD_EDGE", - FITNESS_BACKWARD_EDGE, 0, 1000); - init_policy(&interp->opt_config.fitness_frame_entry, - "PYTHON_JIT_FITNESS_FRAME_ENTRY", - FITNESS_FRAME_ENTRY, 0, 1000); - - // Exit quality thresholds - init_policy(&interp->opt_config.exit_quality_enter_executor, - "PYTHON_JIT_EXIT_QUALITY_ENTER_EXECUTOR", - EXIT_QUALITY_ENTER_EXECUTOR, 0, 10000); - init_policy(&interp->opt_config.exit_quality_default, - "PYTHON_JIT_EXIT_QUALITY_DEFAULT", - EXIT_QUALITY_DEFAULT, 0, 10000); - init_policy(&interp->opt_config.exit_quality_specializable, - "PYTHON_JIT_EXIT_QUALITY_SPECIALIZABLE", - EXIT_QUALITY_SPECIALIZABLE, 0, 10000); + /* The tracer starts at start_instr, so initial fitness must not be below + * the close-loop exit quality or tracing will terminate immediately. 
*/ + if (interp->opt_config.fitness_initial < EXIT_QUALITY_CLOSE_LOOP) { + interp->opt_config.fitness_initial = EXIT_QUALITY_CLOSE_LOOP; + } + if (interp->opt_config.fitness_initial_side < EXIT_QUALITY_CLOSE_LOOP) { + interp->opt_config.fitness_initial_side = EXIT_QUALITY_CLOSE_LOOP; + } interp->opt_config.specialization_enabled = !is_env_enabled("PYTHON_SPECIALIZATION_OFF"); interp->opt_config.uops_optimize_enabled = !is_env_disabled("PYTHON_UOPS_OPTIMIZE"); diff --git a/Python/pystats.c b/Python/pystats.c index b563da14858861..2fac2db1b738c7 100644 --- a/Python/pystats.c +++ b/Python/pystats.c @@ -275,7 +275,6 @@ print_optimization_stats(FILE *out, OptimizationStats *stats) fprintf(out, "Optimization unknown callee: %" PRIu64 "\n", stats->unknown_callee); fprintf(out, "Executors invalidated: %" PRIu64 "\n", stats->executors_invalidated); fprintf(out, "Optimization fitness terminated: %" PRIu64 "\n", stats->fitness_terminated_traces); - fprintf(out, "Optimization best exit fallback: %" PRIu64 "\n", stats->best_exit_fallback); print_histogram(out, "Trace length", stats->trace_length_hist); print_histogram(out, "Trace run length", stats->trace_run_length_hist); From b99fe610ca11552119f6763759f591cd28ba5de5 Mon Sep 17 00:00:00 2001 From: cocolato Date: Thu, 2 Apr 2026 23:15:36 +0800 Subject: [PATCH 5/8] optimize some constants --- Include/internal/pycore_optimizer.h | 17 +++++++++-------- Python/optimizer.c | 7 ++++--- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/Include/internal/pycore_optimizer.h b/Include/internal/pycore_optimizer.h index a0cdde0b94490b..ebf2fcb0099bb4 100644 --- a/Include/internal/pycore_optimizer.h +++ b/Include/internal/pycore_optimizer.h @@ -18,18 +18,19 @@ extern "C" { /* Default fitness configuration values for trace quality control. 
* FITNESS_INITIAL and FITNESS_INITIAL_SIDE can be overridden via * PYTHON_JIT_FITNESS_INITIAL and PYTHON_JIT_FITNESS_INITIAL_SIDE */ -#define FITNESS_PER_INSTRUCTION 2 -#define FITNESS_INITIAL 2000 -#define FITNESS_INITIAL_SIDE 800 -#define FITNESS_BRANCH_BASE 5 -#define FITNESS_BACKWARD_EDGE 80 +#define FITNESS_PER_INSTRUCTION 2 +#define FITNESS_BRANCH_BASE 5 +#define FITNESS_INITIAL (FITNESS_PER_INSTRUCTION * 1000) +#define FITNESS_INITIAL_SIDE (FITNESS_INITIAL / 2) +#define FITNESS_BACKWARD_EDGE (FITNESS_PER_INSTRUCTION / 10) /* Exit quality constants for fitness-based trace termination. * Higher values mean better places to stop the trace. */ -#define EXIT_QUALITY_CLOSE_LOOP 800 -#define EXIT_QUALITY_ENTER_EXECUTOR 500 + #define EXIT_QUALITY_DEFAULT 200 -#define EXIT_QUALITY_SPECIALIZABLE 50 +#define EXIT_QUALITY_CLOSE_LOOP (4 * EXIT_QUALITY_DEFAULT) +#define EXIT_QUALITY_ENTER_EXECUTOR (2 * EXIT_QUALITY_DEFAULT + 100) +#define EXIT_QUALITY_SPECIALIZABLE (EXIT_QUALITY_DEFAULT / 4) typedef struct _PyJitUopBuffer { diff --git a/Python/optimizer.c b/Python/optimizer.c index 59cff49df477a9..177092170bd849 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -605,9 +605,10 @@ compute_branch_penalty(uint16_t history, bool branch_taken) int taken_count = _Py_popcount32((uint32_t)history); int on_trace_count = branch_taken ? taken_count : 16 - taken_count; int off_trace = 16 - on_trace_count; - /* Quadratic scaling: off_trace^2 ranges from 0 (fully biased our way) - * to 256 (fully biased against us, e.g. 15/16 left but traced right). */ - return FITNESS_BRANCH_BASE + off_trace * off_trace; + /* Linear scaling: off_trace ranges from 0 (fully biased our way) + * to 16 (fully biased against us), so the penalty ranges from + * FITNESS_BRANCH_BASE to FITNESS_BRANCH_BASE + 48. */ + return FITNESS_BRANCH_BASE + off_trace * 2; } /* Compute exit quality for the current trace position. 
From d09afb5ee40817ea7e6341cef784a8eb7ca3e033 Mon Sep 17 00:00:00 2001
From: cocolato
Date: Thu, 2 Apr 2026 23:22:12 +0800
Subject: [PATCH 6/8] fix comment

---
 Python/optimizer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Python/optimizer.c b/Python/optimizer.c
index 177092170bd849..45c44016f3b639 100644
--- a/Python/optimizer.c
+++ b/Python/optimizer.c
@@ -607,7 +607,7 @@ compute_branch_penalty(uint16_t history, bool branch_taken)
     int off_trace = 16 - on_trace_count;
     /* Linear scaling: off_trace ranges from 0 (fully biased our way)
      * to 16 (fully biased against us), so the penalty ranges from
-     * FITNESS_BRANCH_BASE to FITNESS_BRANCH_BASE + 48. */
+     * FITNESS_BRANCH_BASE to FITNESS_BRANCH_BASE + 32. */
     return FITNESS_BRANCH_BASE + off_trace * 2;
 }
 
From c9957c31ec2dd4995ba873effa6dbec80f6a3fa0 Mon Sep 17 00:00:00 2001
From: cocolato
Date: Thu, 2 Apr 2026 23:31:43 +0800
Subject: [PATCH 7/8] fix constant

---
 Include/internal/pycore_optimizer.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Include/internal/pycore_optimizer.h b/Include/internal/pycore_optimizer.h
index ebf2fcb0099bb4..820ee32201c1f8 100644
--- a/Include/internal/pycore_optimizer.h
+++ b/Include/internal/pycore_optimizer.h
@@ -22,7 +22,7 @@ extern "C" {
 #define FITNESS_BRANCH_BASE 5
 #define FITNESS_INITIAL (FITNESS_PER_INSTRUCTION * 1000)
 #define FITNESS_INITIAL_SIDE (FITNESS_INITIAL / 2)
-#define FITNESS_BACKWARD_EDGE (FITNESS_PER_INSTRUCTION / 10)
+#define FITNESS_BACKWARD_EDGE (FITNESS_INITIAL / 10)
 
 /* Exit quality constants for fitness-based trace termination.
  * Higher values mean better places to stop the trace. 
*/ From 9447546ea1dbc6e9750f32ad7f272fdbe4c1c1ad Mon Sep 17 00:00:00 2001 From: cocolato Date: Fri, 3 Apr 2026 20:07:06 +0800 Subject: [PATCH 8/8] reduce frame penalty --- Python/optimizer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Python/optimizer.c b/Python/optimizer.c index 45c44016f3b639..d1b709ffdc68f2 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -633,7 +633,7 @@ compute_exit_quality(_Py_CODEUNIT *target_instr, int opcode, static inline int32_t compute_frame_penalty(const _PyOptimizationConfig *cfg) { - return (int32_t)cfg->fitness_initial / 5 + 1; + return (int32_t)cfg->fitness_initial / 10 + 1; } static int