diff --git a/Include/cpython/pystats.h b/Include/cpython/pystats.h index e473110eca7415..5d1f44988a6df1 100644 --- a/Include/cpython/pystats.h +++ b/Include/cpython/pystats.h @@ -144,6 +144,7 @@ typedef struct _optimization_stats { uint64_t unknown_callee; uint64_t trace_immediately_deopts; uint64_t executors_invalidated; + uint64_t fitness_terminated_traces; UOpStats opcode[PYSTATS_MAX_UOP_ID + 1]; uint64_t unsupported_opcode[256]; uint64_t trace_length_hist[_Py_UOP_HIST_SIZE]; diff --git a/Include/internal/pycore_interp_structs.h b/Include/internal/pycore_interp_structs.h index f76d4f41c55119..0cebe1b4b9e995 100644 --- a/Include/internal/pycore_interp_structs.h +++ b/Include/internal/pycore_interp_structs.h @@ -449,6 +449,10 @@ typedef struct _PyOptimizationConfig { uint16_t side_exit_initial_value; uint16_t side_exit_initial_backoff; + // Trace fitness thresholds + uint16_t fitness_initial; + uint16_t fitness_initial_side; + // Optimization flags bool specialization_enabled; bool uops_optimize_enabled; diff --git a/Include/internal/pycore_optimizer.h b/Include/internal/pycore_optimizer.h index 2986afb142b5d1..820ee32201c1f8 100644 --- a/Include/internal/pycore_optimizer.h +++ b/Include/internal/pycore_optimizer.h @@ -15,6 +15,23 @@ extern "C" { #include "pycore_optimizer_types.h" #include +/* Default fitness configuration values for trace quality control. + * FITNESS_INITIAL and FITNESS_INITIAL_SIDE can be overridden via + * PYTHON_JIT_FITNESS_INITIAL and PYTHON_JIT_FITNESS_INITIAL_SIDE */ +#define FITNESS_PER_INSTRUCTION 2 +#define FITNESS_BRANCH_BASE 5 +#define FITNESS_INITIAL (FITNESS_PER_INSTRUCTION * 1000) +#define FITNESS_INITIAL_SIDE (FITNESS_INITIAL / 2) +#define FITNESS_BACKWARD_EDGE (FITNESS_INITIAL / 10) + +/* Exit quality constants for fitness-based trace termination. + * Higher values mean better places to stop the trace. */ + +#define EXIT_QUALITY_DEFAULT 200 +#define EXIT_QUALITY_CLOSE_LOOP (4 * EXIT_QUALITY_DEFAULT) +#define EXIT_QUALITY_ENTER_EXECUTOR (2 * EXIT_QUALITY_DEFAULT + 100) +#define EXIT_QUALITY_SPECIALIZABLE (EXIT_QUALITY_DEFAULT / 4) + typedef struct _PyJitUopBuffer { _PyUOpInstruction *start; @@ -101,7 +118,8 @@ typedef struct _PyJitTracerPreviousState { } _PyJitTracerPreviousState; typedef struct _PyJitTracerTranslatorState { - int jump_backward_seen; + int32_t fitness; // Current trace fitness, starts high, decrements + int frame_depth; // Current inline depth (0 = root frame) } _PyJitTracerTranslatorState; typedef struct _PyJitTracerState { diff --git a/Python/optimizer.c b/Python/optimizer.c index f09bf778587b12..c7a6b7e746545c 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -549,8 +549,6 @@ dynamic_exit_uop[MAX_UOP_ID + 1] = { }; -#define CONFIDENCE_RANGE 1000 -#define CONFIDENCE_CUTOFF 333 #ifdef Py_DEBUG #define DPRINTF(level, ...) \ @@ -598,6 +596,46 @@ add_to_trace( ((uint32_t)((INSTR) - ((_Py_CODEUNIT *)(CODE)->co_code_adaptive))) +/* Compute branch fitness penalty based on how likely the traced path is. + * The penalty is small when the traced path is common, large when rare. + * A branch that historically goes the other way gets a heavy penalty. */ +static inline int +compute_branch_penalty(uint16_t history, bool branch_taken) +{ + int taken_count = _Py_popcount32((uint32_t)history); + int on_trace_count = branch_taken ? taken_count : 16 - taken_count; + int off_trace = 16 - on_trace_count; + /* Linear scaling: off_trace ranges from 0 (fully biased our way) + * to 16 (fully biased against us), so the penalty ranges from + * FITNESS_BRANCH_BASE to FITNESS_BRANCH_BASE + 32. */ + return FITNESS_BRANCH_BASE + off_trace * 2; +} + +/* Compute exit quality for the current trace position. + * Higher values mean better places to stop the trace. */ +static inline int32_t +compute_exit_quality(_Py_CODEUNIT *target_instr, int opcode, + const _PyJitTracerState *tracer) +{ + if (target_instr == tracer->initial_state.start_instr || + target_instr == tracer->initial_state.close_loop_instr) { + return EXIT_QUALITY_CLOSE_LOOP; + } + if (target_instr->op.code == ENTER_EXECUTOR) { + return EXIT_QUALITY_ENTER_EXECUTOR; + } + if (_PyOpcode_Caches[_PyOpcode_Deopt[opcode]] > 0) { + return EXIT_QUALITY_SPECIALIZABLE; + } + return EXIT_QUALITY_DEFAULT; +} + +static inline int32_t +compute_frame_penalty(const _PyOptimizationConfig *cfg) +{ + return (int32_t)cfg->fitness_initial / 10 + 1; +} + static int is_terminator(const _PyUOpInstruction *uop) { @@ -637,6 +675,7 @@ _PyJit_translate_single_bytecode_to_trace( _Py_CODEUNIT *this_instr = tracer->prev_state.instr; _Py_CODEUNIT *target_instr = this_instr; uint32_t target = 0; + int end_trace_opcode = _DEOPT; target = Py_IsNone((PyObject *)old_code) ? (uint32_t)(target_instr - _Py_INTERPRETER_TRAMPOLINE_INSTRUCTIONS_PTR) @@ -734,16 +773,14 @@ _PyJit_translate_single_bytecode_to_trace( DPRINTF(2, "Unsupported: oparg too large\n"); unsupported: { - // Rewind to previous instruction and replace with _EXIT_TRACE. _PyUOpInstruction *curr = uop_buffer_last(trace); while (curr->opcode != _SET_IP && uop_buffer_length(trace) > 2) { trace->next--; curr = uop_buffer_last(trace); } - assert(curr->opcode == _SET_IP || uop_buffer_length(trace) == 2); if (curr->opcode == _SET_IP) { int32_t old_target = (int32_t)uop_get_target(curr); - curr->opcode = _DEOPT; + curr->opcode = end_trace_opcode; curr->format = UOP_FORMAT_TARGET; curr->target = old_target; } @@ -763,6 +800,23 @@ _PyJit_translate_single_bytecode_to_trace( return 1; } + // Fitness-based trace quality check (before reserving space for this instruction) + _PyJitTracerTranslatorState *ts = &tracer->translator_state; + int32_t eq = compute_exit_quality(target_instr, opcode, tracer); + DPRINTF(3, "Fitness check: %s(%d) fitness=%d, exit_quality=%d, depth=%d\n", + _PyOpcode_OpName[opcode], oparg, ts->fitness, eq, ts->frame_depth); + + // Check if fitness is depleted — should we stop the trace? + if (ts->fitness < eq) { + // This is a tracer heuristic rather than normal program control flow, + // so leave operand1 clear and let the resulting side exit increase chain_depth. + ADD_TO_TRACE(_EXIT_TRACE, 0, 0, target); + OPT_STAT_INC(fitness_terminated_traces); + DPRINTF(2, "Fitness terminated: %s(%d) fitness=%d < exit_quality=%d\n", + _PyOpcode_OpName[opcode], oparg, ts->fitness, eq); + goto done; + } + // One for possible _DEOPT, one because _CHECK_VALIDITY itself might _DEOPT trace->end -= 2; @@ -816,6 +870,12 @@ _PyJit_translate_single_bytecode_to_trace( assert(jump_happened ? (next_instr == computed_jump_instr) : (next_instr == computed_next_instr)); uint32_t uopcode = BRANCH_TO_GUARD[opcode - POP_JUMP_IF_FALSE][jump_happened]; ADD_TO_TRACE(uopcode, 0, 0, INSTR_IP(jump_happened ? computed_next_instr : computed_jump_instr, old_code)); + int bp = compute_branch_penalty(target_instr[1].cache, jump_happened); + tracer->translator_state.fitness -= bp; + DPRINTF(3, " branch penalty: -%d (history=0x%04x, taken=%d) -> fitness=%d\n", + bp, target_instr[1].cache, jump_happened, + tracer->translator_state.fitness); + break; } case JUMP_BACKWARD_JIT: @@ -823,6 +883,9 @@ _PyJit_translate_single_bytecode_to_trace( case JUMP_BACKWARD_NO_JIT: case JUMP_BACKWARD: ADD_TO_TRACE(_CHECK_PERIODIC, 0, 0, target); + tracer->translator_state.fitness -= FITNESS_BACKWARD_EDGE; + DPRINTF(3, " backward edge penalty: -%d -> fitness=%d\n", + FITNESS_BACKWARD_EDGE, tracer->translator_state.fitness); _Py_FALLTHROUGH; case JUMP_BACKWARD_NO_INTERRUPT: { @@ -945,6 +1008,44 @@ _PyJit_translate_single_bytecode_to_trace( assert(next->op.code == STORE_FAST); operand = next->op.arg; } + else if (uop == _PUSH_FRAME) { + _PyJitTracerTranslatorState *ts_depth = &tracer->translator_state; + ts_depth->frame_depth++; + if (ts_depth->frame_depth >= MAX_ABSTRACT_FRAME_DEPTH) { + // The optimizer can't handle frames this deep, + // so there's no point continuing the trace. + DPRINTF(2, "Unsupported: frame depth %d >= MAX_ABSTRACT_FRAME_DEPTH\n", + ts_depth->frame_depth); + end_trace_opcode = _EXIT_TRACE; + goto unsupported; + } + int32_t frame_penalty = compute_frame_penalty(&tstate->interp->opt_config); + int32_t cost = frame_penalty * ts_depth->frame_depth; + ts_depth->fitness -= cost; + DPRINTF(3, " _PUSH_FRAME: depth=%d, penalty=-%d (per_frame=%d) -> fitness=%d\n", + ts_depth->frame_depth, cost, frame_penalty, + ts_depth->fitness); + } + else if (uop == _RETURN_VALUE || uop == _RETURN_GENERATOR || uop == _YIELD_VALUE) { + _PyJitTracerTranslatorState *ts_depth = &tracer->translator_state; + int32_t frame_penalty = compute_frame_penalty(&tstate->interp->opt_config); + if (ts_depth->frame_depth <= 0) { + // Underflow: returning from a frame we didn't enter + ts_depth->fitness -= frame_penalty * 2; + DPRINTF(3, " %s: underflow penalty=-%d -> fitness=%d\n", + _PyOpcode_uop_name[uop], frame_penalty * 2, + ts_depth->fitness); + } + else { + // Reward returning: small inlined calls should be encouraged + ts_depth->fitness += frame_penalty; + DPRINTF(3, " %s: return reward=+%d, depth=%d -> fitness=%d\n", + _PyOpcode_uop_name[uop], frame_penalty, + ts_depth->frame_depth - 1, + ts_depth->fitness); + } + ts_depth->frame_depth = ts_depth->frame_depth <= 0 ? 0 : ts_depth->frame_depth - 1; + } else if (_PyUop_Flags[uop] & HAS_RECORDS_VALUE_FLAG) { PyObject *recorded_value = tracer->prev_state.recorded_value; tracer->prev_state.recorded_value = NULL; @@ -986,7 +1087,13 @@ _PyJit_translate_single_bytecode_to_trace( ADD_TO_TRACE(_JUMP_TO_TOP, 0, 0, 0); goto done; } - DPRINTF(2, "Trace continuing\n"); + // Update fitness AFTER translation, BEFORE returning to continue tracing. + // This ensures the next iteration's fitness check reflects the cost of + // all instructions translated so far. + tracer->translator_state.fitness -= FITNESS_PER_INSTRUCTION; + DPRINTF(3, " per-insn cost: -%d -> fitness=%d\n", + FITNESS_PER_INSTRUCTION, tracer->translator_state.fitness); + DPRINTF(2, "Trace continuing (fitness=%d)\n", tracer->translator_state.fitness); return 1; done: DPRINTF(2, "Trace done\n"); @@ -1069,6 +1176,17 @@ _PyJit_TryInitializeTracing( assert(curr_instr->op.code == JUMP_BACKWARD_JIT || curr_instr->op.code == RESUME_CHECK_JIT || (exit != NULL)); tracer->initial_state.jump_backward_instr = curr_instr; + // Initialize fitness tracking state + const _PyOptimizationConfig *cfg = &tstate->interp->opt_config; + _PyJitTracerTranslatorState *ts = &tracer->translator_state; + bool is_side_trace = (exit != NULL); + ts->fitness = is_side_trace + ? (int32_t)cfg->fitness_initial_side + : (int32_t)cfg->fitness_initial; + ts->frame_depth = 0; + DPRINTF(3, "Fitness init: %s trace, fitness=%d\n", + is_side_trace ? "side" : "root", ts->fitness); + tracer->is_tracing = true; return 1; } diff --git a/Python/pystate.c b/Python/pystate.c index 143175da0f45c7..78eab7cc7d2459 100644 --- a/Python/pystate.c +++ b/Python/pystate.c @@ -635,6 +635,22 @@ init_interpreter(PyInterpreterState *interp, "PYTHON_JIT_SIDE_EXIT_INITIAL_BACKOFF", SIDE_EXIT_INITIAL_BACKOFF, 0, MAX_BACKOFF); + // Trace fitness configuration + init_policy(&interp->opt_config.fitness_initial, + "PYTHON_JIT_FITNESS_INITIAL", + FITNESS_INITIAL, 100, 10000); + init_policy(&interp->opt_config.fitness_initial_side, + "PYTHON_JIT_FITNESS_INITIAL_SIDE", + FITNESS_INITIAL_SIDE, 50, 5000); + /* The tracer starts at start_instr, so initial fitness must not be below + * the close-loop exit quality or tracing will terminate immediately. */ + if (interp->opt_config.fitness_initial < EXIT_QUALITY_CLOSE_LOOP) { + interp->opt_config.fitness_initial = EXIT_QUALITY_CLOSE_LOOP; + } + if (interp->opt_config.fitness_initial_side < EXIT_QUALITY_CLOSE_LOOP) { + interp->opt_config.fitness_initial_side = EXIT_QUALITY_CLOSE_LOOP; + } + interp->opt_config.specialization_enabled = !is_env_enabled("PYTHON_SPECIALIZATION_OFF"); interp->opt_config.uops_optimize_enabled = !is_env_disabled("PYTHON_UOPS_OPTIMIZE"); if (interp != &runtime->_main_interpreter) { diff --git a/Python/pystats.c b/Python/pystats.c index a057ad884566d8..2fac2db1b738c7 100644 --- a/Python/pystats.c +++ b/Python/pystats.c @@ -274,6 +274,7 @@ print_optimization_stats(FILE *out, OptimizationStats *stats) fprintf(out, "Optimization low confidence: %" PRIu64 "\n", stats->low_confidence); fprintf(out, "Optimization unknown callee: %" PRIu64 "\n", stats->unknown_callee); fprintf(out, "Executors invalidated: %" PRIu64 "\n", stats->executors_invalidated); + fprintf(out, "Optimization fitness terminated: %" PRIu64 "\n", stats->fitness_terminated_traces); print_histogram(out, "Trace length", stats->trace_length_hist); print_histogram(out, "Trace run length", stats->trace_run_length_hist);