diff --git a/.gitignore b/.gitignore
index 67fda7c..c019973 100644
--- a/.gitignore
+++ b/.gitignore
@@ -51,4 +51,7 @@ examples/**/.CondaPkg/*
 *.err
 *.tsv
 *.pdf
-plan.md
+plan/
+*_cuts.json
+settings.json
+*.sh
diff --git a/Project.toml b/Project.toml
index bf7226f..5e9b71c 100644
--- a/Project.toml
+++ b/Project.toml
@@ -16,6 +16,7 @@ Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
 MathOptInterface = "b8f27783-ece8-5eb3-8dc8-9495eed66fee"
 ParametricOptInterface = "0ce4ce61-57bf-432b-a095-efac525d185e"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 
 [compat]
@@ -35,6 +36,7 @@ MadNLP = "0.8, 0.9, 0.10"
 MadNLPGPU = "0.7, 0.8, 0.9, 0.10"
 MathOptInterface = "1.48.0"
 ParametricOptInterface = "0.14.1, 0.15, 0.16"
+Statistics = "1.10, 1.11"
 Zygote = "0.6.77, 0.7"
 julia = "1.10, 1.11, 1.12"
 
diff --git a/README.md b/README.md
index 8fb6b16..710b710 100644
--- a/README.md
+++ b/README.md
@@ -27,7 +27,7 @@ DecisionRules.jl implements this workflow in three flavors:
 
 ```julia
 using Pkg
-Pkg.add(url="https://github.com/LearningToOptimize/DecisionRules.jl.git")
+Pkg.add("DecisionRules")
 ```
 
 ## What you need to provide
@@ -202,6 +202,18 @@ Each evaluation reports (a) the rollout objective **excluding** the target-slack
 
 Per-sample debugging hooks can be attached with `SampleLog(on_sample=(s, models, log) -> ...)`; the training loop calls the hook after each sample's solve with the live JuMP model(s). The previous `record_loss=(iter, model, loss, tag) -> ...` keyword keeps working as a deprecated adapter.
 
+## GPU acceleration with DecisionRulesExa.jl
+
+For large-scale problems where the inner NLP solve is the bottleneck (e.g., AC-OPF with hundreds of buses), [DecisionRulesExa.jl](https://github.com/LearningToOptimize/DecisionRulesExa.jl) provides a GPU-accelerated backend that replaces JuMP with [ExaModels.jl](https://github.com/exanauts/ExaModels.jl) and solves with [MadNLP.jl](https://github.com/MadNLP/MadNLP.jl) + CUDSS on GPU.
+
+DecisionRulesExa.jl implements the same TS-DDR algorithm (deterministic-equivalent mode) with the same envelope-theorem gradient computation but formulates the NLP in ExaModels' SIMD-compatible modeling layer. This enables:
+
+- **GPU-native interior-point solves** via MadNLP + CUDSS
+- **Parallel GPU solves** for multiple training samples per gradient step
+- **Runtime parameter updates** via `ExaModels.set_parameter!` (no model reconstruction)
+
+See the [GPU Acceleration](https://LearningToOptimize.github.io/DecisionRules.jl/dev/gpu_acceleration/) page in the documentation for a tutorial on getting started with DecisionRulesExa.jl.
+
 ## Examples and tests
 
 Examples live in `examples/`. Run tests with:
diff --git a/docs/Project.toml b/docs/Project.toml
index 6829622..cadd9b3 100644
--- a/docs/Project.toml
+++ b/docs/Project.toml
@@ -3,9 +3,12 @@ DecisionRules = "47937410-f832-486f-8300-12c95b225dfc"
 DiffOpt = "930fe3bc-9c6b-11ea-2d94-6184641e85e7"
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
 Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
+Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196"
+HiGHS = "87dc4568-4c63-4d18-b0c0-bb2238e4078b"
 Ipopt = "b6b21f68-93f8-5de0-b562-5493be1d77c9"
 JuMP = "4076af6c-e467-56ae-b986-b466b2749572"
 Literate = "98b081ad-f1c9-55d3-8b20-4c87d4299306"
+MathOptInterface = "b8f27783-ece8-5eb3-8dc8-9495eed66fee"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 
diff --git a/docs/make.jl b/docs/make.jl
index 9f169c8..490de9b 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -24,6 +24,9 @@ makedocs(;
     pages=[
         "Home" => "index.md",
         "Algorithm" => "algorithm.md",
+        "Gradient Fallback" => "gradient_fallback.md",
+        "Uncertainty Sampling" => "sampling.md",
+        "GPU Acceleration" => "gpu_acceleration.md",
         "Examples" => [
             "Hydropower Scheduling" => "examples/hydro.md",
             "Rocket Control" => "examples/rocket.md",
diff --git a/docs/src/algorithm.md b/docs/src/algorithm.md
index 22cbe1e..36c3d24 100644
--- a/docs/src/algorithm.md
+++ b/docs/src/algorithm.md
@@ -108,6 +108,86 @@ for k = 1, ..., ⌈T/W⌉:
 **Pros**: balances coupling (within windows) with tractability; parallelizable windows.
 **Cons**: continuity gaps between windows require penalty tuning.
 
+## Mixed gradient: score-function (REINFORCE) correction
+
+For problems with integer variables or non-smooth subproblems, the dual
+gradient can be biased — it is local to a fixed integer assignment and cannot
+see the effect of discrete switches (e.g., opening a setup variable).
+
+DecisionRules provides a **score-function (REINFORCE)** correction that mixes
+the dual gradient with a model-free policy gradient estimated from stage-wise
+rollouts under perturbed targets.
+
+### How the score-function estimator works
+
+1. **Perturb**: add Gaussian noise to the policy targets:
+   ``\tilde{x}_t = \hat{x}_t(\theta) + \delta_t``, where
+   ``\delta_t \sim \mathcal{N}(0, \sigma^2 I)``.
+
+2. **Rollout**: solve the stage-wise subproblems with the perturbed targets to
+   obtain realized costs ``R_m`` for ``m = 1, \ldots, M`` rollouts. These
+   rollouts solve the models exactly as built (MIPs stay MIPs), so the costs
+   reflect true integer-feasible decisions.
+
+3. **Advantage**: center the costs ``A_m = R_m - \bar{R}`` (mean baseline
+   reduces variance without changing the expected gradient).
+
+4. **Surrogate loss**: the differentiable scalar whose gradient recovers the
+   REINFORCE estimate:
+
+```math
+L_{\text{sf}}(\theta)
+\;=\;
+\frac{1}{M} \sum_{m=1}^{M}
+  A_m
+  \sum_{t=1}^{T}
+  \left\langle
+    \frac{\delta_{m,t}}{\sigma^2},\;
+    \hat{x}_{t}(\theta)
+  \right\rangle.
+```
+
+This is the standard score-function estimator for Gaussian perturbations.
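+The key identity is
+``\nabla_{\hat{x}_t} \log p(\tilde{x}_t \mid \hat{x}_t) = (\tilde{x}_t - \hat{x}_t) / \sigma^2 = \delta_t / \sigma^2``
+for a Gaussian centered at ``\hat{x}_t(\theta)``; the chain rule through ``\hat{x}_t(\theta)`` then gives the gradient with respect to ``\theta``.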
+
+### Mixed gradient
+
+The final training gradient combines both signals:
+
+```math
+\nabla L
+\;=\;
+\alpha\, \nabla L_{\text{dual}}
++ (1 - \alpha)\, \nabla L_{\text{sf}},
+```
+
+where ``\alpha \in [0, 1]`` is the `dual_weight`.
+
+There are two separate solve paths in the mixed-gradient training loop:
+
+- **Dual path**: controlled by `integer_strategy`, which determines how local
+  dual information is read from the deterministic equivalent
+  (e.g., [`FixedDiscreteIntegerStrategy`](@ref) solves the MIP, fixes integers,
+  re-solves the LP, and reads LP duals).
+- **Score-function path**: controlled by [`ScoreFunctionConfig`](@ref), which
+  owns separate rollout subproblems. These are solved exactly as built, and
+  their realized costs define the Monte Carlo score-function term.
+
+### Scheduled ramp-in
+
+A [`ScoreFunctionSchedule`](@ref) can ramp ``\alpha`` from 1 (pure dual) to
+its final value over a warmup period.  Let ``k`` be the current iteration,
+``k_0`` the iteration at which the ramp starts, ``r`` the ramp length in iterations, and
+``\rho_k = \operatorname{clip}((k - k_0) / r,\, 0,\, 1)``.  The effective score-function weight is ``\rho_k (1 - \alpha)``.
+
+This lets the DE dual gradient establish a good initial policy before
+introducing the higher-variance REINFORCE signal.
+
+See the [Stochastic Lot-Sizing with Fixed Ordering Costs](@ref) example for a
+complete worked example with integer variables and mixed gradients.
+
 ## Penalty annealing
 
 The target penalty ``\lambda`` is critical: too small and the optimizer ignores
diff --git a/docs/src/api.md b/docs/src/api.md
index 02d432c..c231205 100644
--- a/docs/src/api.md
+++ b/docs/src/api.md
@@ -16,4 +16,5 @@ Private = false
 ```@autodocs
 Modules = [DecisionRules]
 Public = false
+Filter = t -> t != DecisionRules
 ```
diff --git a/docs/src/assets/hydro_generation_comparison.png b/docs/src/assets/hydro_generation_comparison.png
new file mode 100644
index 0000000..b601e59
Binary files /dev/null and b/docs/src/assets/hydro_generation_comparison.png differ
diff --git a/docs/src/assets/hydro_volume_comparison.png b/docs/src/assets/hydro_volume_comparison.png
new file mode 100644
index 0000000..4e0f863
Binary files /dev/null and b/docs/src/assets/hydro_volume_comparison.png differ
diff --git a/docs/src/assets/inventory_integer_results.png b/docs/src/assets/inventory_integer_results.png
index e905f7a..0f26dd2 100644
Binary files a/docs/src/assets/inventory_integer_results.png and b/docs/src/assets/inventory_integer_results.png differ
diff --git a/docs/src/assets/inventory_relaxed_results.png b/docs/src/assets/inventory_relaxed_results.png
index 73e5dab..8a16f85 100644
Binary files a/docs/src/assets/inventory_relaxed_results.png and b/docs/src/assets/inventory_relaxed_results.png differ
diff --git a/docs/src/examples/hydro.jl b/docs/src/examples/hydro.jl
index c5e476f..e7bf3fd 100644
--- a/docs/src/examples/hydro.jl
+++ b/docs/src/examples/hydro.jl
@@ -1,16 +1,86 @@
 # # Hydropower Scheduling
 #
-# This example trains TS-DDR policies for the Bolivia long-term hydrothermal
-# dispatch (LTHD) problem using all three formulations: deterministic equivalent,
-# stage-wise subproblem decomposition, and multiple shooting.
+# This example trains target-setting decision rules for the Bolivia
+# long-term hydrothermal dispatch (LTHD) problem — both **TS-DDR** (deep,
+# LSTM-based) and **TS-LDR** (linear) — and compares them against an SDDP
+# baseline that uses inconsistent formulations (a convex relaxation to build cuts, the nonconvex AC model to simulate).
 #
-# The Bolivia system has 10 hydro plants, 96 monthly stages, and AC power flow
-# constraints. Inflow uncertainty is sampled from historical scenarios.
+# The Bolivia system has **10 hydro plants**, **96 monthly stages**, and
+# **AC power flow** constraints.  Inflow uncertainty is sampled from 47
+# historical scenarios.
 #
+# ## Overview of the TS-DDR approach
+#
+# Classical stochastic programming (e.g., SDDP) constructs piecewise-linear
+# value-function approximations.  TS-DDR takes a different route: a neural
+# network policy ``\pi_\theta`` maps observations to **target states**, and a
+# projection subproblem at each stage enforces physical feasibility while
+# tracking those targets as closely as possible.
+#
+# The key insight is that the gradient of the projection subproblem's optimal
+# value with respect to the target parameters is available through Lagrange duality
+# (or equivalently, implicit differentiation of the KKT conditions).
+# This avoids differentiating through the full optimization solver.
+#
+# ## Problem formulation
+#
+# At each stage ``t``, the operator observes inflows ``w_t`` and the current
+# reservoir state ``x_{t-1}``.  The policy predicts target volumes:
+#
+# ```math
+# \hat{x}_t = \pi_\theta(w_{1:t},\, x_{t-1}).
+# ```
+#
+# A stage subproblem projects onto the feasible set:
+#
+# ```math
+# \begin{aligned}
+# q_t(x_{t-1},\, w_t;\; \hat{x}_t)
+#   \;=\;
+#   \min_{x_t, u_t, \delta_t}
+#   \quad &
+#   c_t(x_t, u_t) + C_\delta\, \|\delta_t\| \\
+# \text{s.t.}\quad
+#   & x_t = x_{t-1} + w_t - \text{turbined}_t - \text{spilled}_t,
+#         && \text{(reservoir balance)} \\
+#   & x_t + \delta_t = \hat{x}_t,
+#         && : \lambda_t \quad \text{(target constraint)} \\
+#   & \text{AC-OPF}(u_t),
+#         && \text{(power flow)}  \\
+#   & x_t \in [0, \bar{x}],\; u_t \ge 0.
+# \end{aligned}
+# ```
+#
+# The slack variable ``\delta_t`` absorbs infeasible targets; ``\lambda_t`` is
+# the dual multiplier that provides the gradient signal.
+#
+# ## Gradient computation: the envelope theorem
+#
+# By the envelope theorem, the sensitivity of the optimal value with respect
+# to the target parameter is simply the dual:
+#
+# ```math
+# \frac{\partial q_t}{\partial \hat{x}_t}
+# \;=\; \lambda_t.
+# ```
+#
+# Combined with backpropagation through the policy network, the full gradient
+# of the expected cost is:
+#
+# ```math
+# \nabla_\theta \mathbb{E}[Q]
+# \;\approx\;
+# \frac{1}{S} \sum_{s=1}^{S} \sum_{t=1}^{T}
+#   \lambda_t^s \odot \nabla_\theta \hat{x}_t^s(\theta),
+# ```
+#
+# where ``S`` is the number of sampled trajectories per batch and ``\odot``
+# denotes elementwise multiplication.
+
 # ## Problem setup
 #
 # The JuMP subproblems are built from a MOF file (exported from PowerModels.jl)
-# plus hydro data (reservoir limits, inflow scenarios). Each subproblem contains:
+# plus hydro data (reservoir limits, inflow scenarios).  Each subproblem contains:
 # - AC optimal power flow constraints
 # - Reservoir balance: `vol_out = vol_in + inflow - turbined - spilled`
 # - Target-slack deficit variables penalizing deviation from the policy's targets
@@ -25,7 +95,7 @@ using Flux
 using Statistics, Random
 
 # Load the problem builder (reads MOF + hydro JSON + inflow CSV).
-
+#
 # ```julia
 # include("load_hydropowermodels.jl")
 # ```
@@ -51,8 +121,28 @@ using Statistics, Random
 
 # ## Policy architecture
 #
-# The policy is a `StateConditionedPolicy` with an LSTM encoder. At each stage it
-# receives `[inflow_t; reservoir_state_{t-1}]` and outputs target reservoir volumes:
+# The policy is a [`StateConditionedPolicy`](@ref) with two components:
+#
+# 1. **Encoder** — a stack of LSTM cells that processes only the uncertainty
+#    (inflow) sequence, capturing temporal dependencies across stages.
+# 2. **Combiner** — a Dense layer that merges the encoded uncertainty with the
+#    previous state to produce the next target.
+#
+# At each stage the policy receives ``[w_t;\; x_{t-1}]`` and outputs
+# target reservoir volumes ``\hat{x}_t``:
+#
+# ```
+#  ┌─────────┐      ┌────────────────┐      ┌──────────────┐
+#  │   w_t   │─────▶│  LSTM encoder  │─────▶│              │
+#  └─────────┘      └────────────────┘      │    Dense     │──▶ x̂_t
+#  ┌─────────┐                              │   combiner   │
+#  │ x_{t-1} │─────────────────────────────▶│              │
+#  └─────────┘                              └──────────────┘
+# ```
+#
+# The LSTM carries hidden state across stages, giving the policy memory of
+# past inflows.  The activation is `sigmoid`, which bounds outputs to ``[0,1]``;
+# these are then scaled by the feasibility mapping.
 
 # ```julia
 # models = state_conditioned_policy(
@@ -61,11 +151,86 @@ using Statistics, Random
 # )
 # ```
 
-# ## Training: Deterministic Equivalent
+# ## TS-LDR: Linear Decision Rules
+#
+# As a baseline, we also train a **linear** policy (TS-LDR).  This uses
+# `dense_multilayer_nn` with identity activation — a composition of linear
+# layers equivalent to a single affine map:
 #
-# The deterministic equivalent couples all 96 stages into a single NLP. The policy
-# generates targets in one forward pass; the coupled solve determines realized states.
-# This gives the strongest gradient signal but requires solving the largest subproblem.
+# ```math
+# \hat{x}_t = W [w_t;\; x_{t-1}] + b.
+# ```
+#
+# TS-LDR uses the same target-setting framework and training pipeline as
+# TS-DDR.  The only difference is the policy class: linear maps have fewer
+# parameters and cannot capture nonlinear inflow patterns, but they are a
+# natural baseline from the classical LDR literature.
+
+# ```julia
+# num_inputs = DecisionRules.policy_input_dim(num_uncertainties, num_hydro)
+# models = dense_multilayer_nn(num_inputs, num_hydro, [64, 64]; activation=identity)
+# ```
+
+# ## Training pipeline 1: Deterministic Equivalent
+#
+# The deterministic equivalent (DE) couples all 96 stages into a **single NLP**
+# for each sampled trajectory.  This is the most direct formulation: the policy
+# generates the full target trajectory ``\hat{x}_{1:T}`` in one forward pass,
+# and a single coupled solve determines all realized states simultaneously.
+#
+# ### How it works
+#
+# ```
+#  ┌──────────────────────────────────────────────────────────┐
+#  │  For each sampled trajectory w_{1:T}:                    │
+#  │                                                          │
+#  │  1. Forward pass: x̂_{1:T} = π_θ(w_{1:T}, x_0)          │
+#  │                                                          │
+#  │  2. Solve coupled NLP:                                   │
+#  │     min  Σ_t c_t(x_t, u_t) + C_δ Σ_t ‖δ_t‖             │
+#  │     s.t. dynamics + AC-OPF for ALL stages simultaneously │
+#  │          x_t + δ_t = x̂_t(θ)   ∀t  (target constraint)  │
+#  │                                                          │
+#  │  3. Read duals λ_t of target constraints                 │
+#  │     Gradient: Σ_t λ_t ⊙ ∇_θ x̂_t(θ)                     │
+#  └──────────────────────────────────────────────────────────┘
+# ```
+#
+# ### Mathematical formulation
+#
+# ```math
+# \begin{aligned}
+# Q(w;\, \theta)
+#   \;=\;
+#   \min_{\{x_t, u_t, \delta_t\}_{t=1}^{T}}
+#   \quad &
+#   \sum_{t=1}^{T} c_t(x_t, u_t)
+#   + C_\delta \sum_{t=1}^{T} \|\delta_t\| \\
+# \text{s.t.}\quad
+#   & x_t = T_t(w_t,\, u_t,\, x_{t-1}),
+#         && t=1,\ldots,T \\
+#   & x_t + \delta_t = \hat{x}_t(\theta),
+#         && : \lambda_t,\quad t=1,\ldots,T \\
+#   & h_t(x_t, u_t) \ge 0,
+#         && t=1,\ldots,T
+# \end{aligned}
+# ```
+#
+# The gradient is exact by the envelope theorem:
+#
+# ```math
+# \nabla_\theta Q
+# \;=\;
+# \sum_{t=1}^{T}
+# \lambda_t \odot \nabla_\theta \hat{x}_t(\theta).
+# ```
+#
+# **Advantages**: strongest gradient signal — full cross-stage coupling
+# captures how a target at stage 3 affects costs at stage 50.
+#
+# **Disadvantage**: the NLP has ``96 \times (\text{AC-OPF variables})``
+# decision variables; the policy generates targets without seeing realized
+# states (open-loop target generation).
 
 # ```julia
 # det_equivalent, uncertainty_samples_det = DecisionRules.deterministic_equivalent!(
@@ -76,31 +241,129 @@ using Statistics, Random
 # train_multistage(
 #     models, initial_state, det_equivalent,
 #     state_params_in, state_params_out, uncertainty_samples;
-#     num_batches=2000, optimizer=Flux.Adam(),
-#     penalty_schedule=:default_annealed,
+#     num_batches=4000, optimizer=Flux.Adam(),
+#     penalty_schedule=[(1,100,0.1), (101,210,1.0), (211,300,10.0), (301,4000,30.0)],
 # )
 # ```
 
-# ## Training: Stage-wise Subproblems
+# ## Training pipeline 2: Stage-wise Decomposition (Single Shooting)
+#
+# Stage-wise decomposition solves one subproblem per stage sequentially.
+# Unlike the DE, the policy operates in **closed loop**: after each stage
+# solve, the realized state ``x_t`` (not the predicted target) is fed back
+# as input to the next stage.
+#
+# ### How it works
+#
+# ```
+#  ┌─────────────────────────────────────────────────────────────┐
+#  │  For each sampled trajectory w_{1:T}:                       │
+#  │                                                             │
+#  │  x_0 = initial state                                        │
+#  │  for t = 1, ..., T:                                         │
+#  │      x̂_t = π_θ(w_t, x_{t-1})          ← predict target     │
+#  │      solve stage-t subproblem          ← project to feasible│
+#  │      x_t = realized state from solver  ← closed-loop        │
+#  │      accumulate c_t + C_δ ‖δ_t‖                             │
+#  │                                                             │
+#  │  Gradient: chain rule through all stage solves               │
+#  └─────────────────────────────────────────────────────────────┘
+# ```
+#
+# ### Gradient chain
+#
+# The gradient must account for how the realized state at stage ``t``
+# depends on the targets at all earlier stages.  By the chain rule:
 #
-# Stage-wise decomposition solves one subproblem per stage sequentially. The policy
-# receives the realized state from the previous stage (closed-loop). Gradients
-# combine dual information with DiffOpt sensitivities along the rollout.
+# ```math
+# \frac{\partial Q}{\partial \hat{x}_t}
+# \;=\;
+# \lambda_t
+# + \sum_{k>t}
+#   \frac{\partial q_k}{\partial x_{k-1}}
+#   \cdot \prod_{j=t+1}^{k-1}
+#   \frac{\partial x_j}{\partial x_{j-1}}
+#   \cdot \frac{\partial x_t}{\partial \hat{x}_t}.
+# ```
+#
+# In practice, automatic differentiation (Zygote + ChainRules `rrule`s
+# defined on each stage solve) handles this chain automatically.
+# The `rrule` for each stage solve reads the dual ``\lambda_t`` for the
+# target constraint and uses DiffOpt's implicit differentiation for the
+# state-transition sensitivities.
+#
+# **Advantages**: closed-loop — the policy sees realized states, matching
+# deployment semantics.  Each solve is small (single-stage AC-OPF).
+#
+# **Disadvantage**: gradients weaken over long horizons because the
+# chain rule multiplies many Jacobians; sequential solve prevents
+# parallelism.
 
 # ```julia
 # train_multistage(
 #     models, initial_state, subproblems,
 #     state_params_in, state_params_out, uncertainty_samples;
-#     num_batches=2000, optimizer=Flux.Adam(),
+#     num_batches=3000, optimizer=Flux.Adam(),
 #     penalty_schedule=:default_annealed,
 # )
 # ```
 
-# ## Training: Multiple Shooting
+# ## Training pipeline 3: Multiple Shooting
+#
+# Multiple shooting partitions the ``T``-stage horizon into ``K`` windows of
+# ``W`` stages each.  Within each window, a local deterministic equivalent
+# couples the stages (strong gradient signal).  Between windows, the realized
+# end-state is passed to the next window (closed-loop continuity).
+#
+# ### How it works
+#
+# ```
+#  ┌────────────────────────────────────────────────────────────────┐
+#  │  Partition T=96 stages into K=⌈96/12⌉=8 windows of W=12      │
+#  │                                                                │
+#  │  x_0 = initial state                                           │
+#  │  for k = 1, ..., K:                                            │
+#  │      stages = [(k-1)W+1, ..., kW]                              │
+#  │      x̂_{stages} = π_θ(w_{stages}, x_{start_k})                │
+#  │      solve window-k DE (12-stage coupled NLP)                  │
+#  │      x_{end_k} = realized end-state from window solve          │
+#  │      x_{start_{k+1}} = x_{end_k}                               │
+#  │                                                                │
+#  │  Gradient:                                                     │
+#  │    Within window: duals from the coupled solve (like full DE)  │
+#  │    Across windows: DiffOpt chain rule through end-states       │
+#  └────────────────────────────────────────────────────────────────┘
+# ```
+#
+# ### Gradient structure
+#
+# Let ``Q_k`` be the cost of window ``k``.  The total cost is
+# ``Q = \sum_k Q_k``.  Within a window, the gradient is identical to the
+# DE case (duals of the target constraints in the coupled model).  Across
+# windows, the chain rule threads through the realized end-state:
 #
-# Multiple shooting partitions the 96-stage horizon into windows (e.g., 12 stages
-# each). Each window solves a local deterministic equivalent, then passes the
-# realized end-state to the next window.
+# ```math
+# \frac{dQ}{d\theta}
+# \;=\;
+# \sum_{k=1}^{K}
+# \left(
+#   \frac{\partial Q_k}{\partial \hat{x}_k}
+#   \cdot \frac{\partial \hat{x}_k}{\partial \theta}
+#   \;+\;
+#   \frac{\partial Q_k}{\partial x_{\text{start}_k}}
+#   \cdot \frac{d x_{\text{start}_k}}{d\theta}
+# \right),
+# ```
+#
+# where ``\frac{d x_{\text{start}_k}}{d\theta}`` involves the chain
+# through all prior windows via ``x_{\text{end}_{k-1}}``.
+#
+# **Advantages**: balances gradient quality (12-stage coupling) with
+# tractability (8 small DEs instead of one large one); inter-window
+# chain provides some closed-loop signal.
+#
+# **Disadvantage**: window boundaries introduce gradient discontinuities;
+# the full-horizon coupling is weaker than the single DE.
 
 # ```julia
 # windows = DecisionRules.setup_shooting_windows(
@@ -112,20 +375,40 @@ using Statistics, Random
 #
 # train_multiple_shooting(
 #     models, initial_state, windows, () -> uncertainty_samples;
-#     num_batches=2000, optimizer=Flux.Adam(),
+#     num_batches=3000, optimizer=Flux.Adam(),
 #     penalty_schedule=:default_annealed,
 # )
 # ```
 
+# ## Penalty annealing
+#
+# The target penalty ``C_\delta`` controls the trade-off between following
+# the policy's targets and minimizing operational cost.  DecisionRules
+# supports a **penalty annealing schedule** that ramps the penalty multiplier
+# during training:
+#
+# | Phase | Multiplier | Purpose |
+# |:------|:----------:|:--------|
+# | Warmup | ``0.1 \times C_\delta`` | Let the policy explore freely |
+# | Nominal | ``1.0 \times C_\delta`` | Standard training |
+# | Tighten | ``10.0 \times C_\delta`` | Sharpen target tracking |
+# | Lock | ``30.0 \times C_\delta`` | Final precision |
+#
+# This is activated with `penalty_schedule=:default_annealed` or by passing
+# an explicit list of `(start_iter, end_iter, multiplier)` tuples.
+
 # ## Evaluation
 #
 # After training, we evaluate the policy using stage-wise rollout on held-out
-# scenarios. Two modes:
-# - **Target feedback** (`policy_state=:target`): matches DE training semantics
-# - **Realized feedback** (`policy_state=:realized`): deployment/closed-loop semantics
+# scenarios.  Two modes:
+# - **Target feedback** (`policy_state=:target`): the policy receives its own
+#   predicted target as input, matching DE training semantics.
+# - **Realized feedback** (`policy_state=:realized`): the policy receives the
+#   realized state from the solver, matching deployment semantics.
 #
-# The target-violation share measures how much cost comes from the slack penalty
-# rather than actual operations — it should be small (≤ 5%) for a well-trained policy.
+# The **target-violation share** measures how much cost comes from the slack
+# penalty rather than actual operations — it should be small (``\le 5\%``) for
+# a well-trained policy.
 
 # ```julia
 # rollout_eval = RolloutEvaluation(
@@ -137,20 +420,62 @@ using Statistics, Random
 # println("Violation share:  ", rollout_eval.last_violation_share)
 # ```
 
+# ## SDDP baseline
+#
+# For comparison, we also train an SDDP policy using
+# [SDDP.jl](https://github.com/odow/SDDP.jl) with **inconsistent
+# formulations**: a convex SOC-WR relaxation for the backward pass
+# (cut generation) and the nonconvex ACP formulation for the forward
+# pass (simulation).  This is a pragmatic approach when the true problem
+# (AC-OPF) is nonconvex — SDDP requires convexity for valid cuts, so a
+# convex relaxation approximates the value function while the forward pass
+# evaluates under the true physics.
+#
+# The SDDP policy is trained for up to 2000 iterations and the learned
+# cuts are saved to a JSON file, which can be loaded to simulate the
+# policy under the ACP formulation.
+
 # ## Results
 #
-# The plots below compare all three training formulations on the Bolivia case.
-# Training curves, out-of-sample cost distributions, and reservoir trajectories
-# are generated from full training runs (20 epochs × 100 batches each).
+# The plots below compare the TS-DDR and TS-LDR training formulations and
+# the SDDP baseline on the Bolivia case.  Training curves, out-of-sample
+# cost distributions, reservoir volume trajectories, and thermal generation
+# profiles are shown.
+#
+# ### Training convergence (TS-DDR methods)
 #
 # ![Training convergence](../assets/hydro_training_convergence.png)
 #
+# ### Out-of-sample cost (TS-DDR methods)
+#
 # ![Out-of-sample cost comparison](../assets/hydro_cost_comparison.png)
 #
-# ![Reservoir trajectories](../assets/hydro_trajectories.png)
+# ### Target-violation share (TS-DDR methods)
+#
+# ![Violation share](../assets/hydro_violation_share.png)
+#
+# ### Reservoir volume comparison (all methods)
+#
+# ![Volume comparison](../assets/hydro_volume_comparison.png)
+#
+# ### Thermal generation comparison (all methods)
+#
+# ![Generation comparison](../assets/hydro_generation_comparison.png)
+#
+# ### Summary
+#
+# | Method | Policy | Mean Cost | Std | N |
+# |:-------|:------:|----------:|----:|--:|
+# | TS-DDR (DE) | LSTM | 325 540 | 6 266 | 100 |
+# | TS-DDR (DE, anneal) | LSTM | 324 445 | 6 134 | 100 |
+# | TS-DDR (shooting w=12) | LSTM | 323 289 | 5 593 | 100 |
+# | TS-DDR (shooting w=12, anneal) | LSTM | 322 812 | 6 081 | 100 |
+# | TS-DDR (stage-wise, anneal) | LSTM | 321 543 | 6 214 | 100 |
+# | SDDP (SOC-WR / ACP) | cuts | 303 684 | — | 100 |
+#
+# All three TS-DDR methods with penalty annealing converge to similar
+# costs (roughly 321.5K–324.5K).  SDDP trains on 126 stages (96 + 30 margin).
 #
-# | Method | Mean Cost | Std | Violation % | Train Time |
-# |:---|---:|---:|---:|---:|
-# | Deterministic Equivalent | 321189.0 | — | 48.66% | 158 steps |
-# | Stage-wise Subproblems | 364110.0 | — | 0.59% | 159 steps |
-# | Multiple Shooting | 319462.0 | — | 36.18% | 236 steps |
+# !!! note "Preliminary results"
+#     These numbers reflect the current default training scripts.
+#     They will be updated as the package evolves.
diff --git a/docs/src/examples/hydro.md b/docs/src/examples/hydro.md
index 5f43985..6c23019 100644
--- a/docs/src/examples/hydro.md
+++ b/docs/src/examples/hydro.md
@@ -4,17 +4,87 @@ EditURL = "hydro.jl"
 
 # Hydropower Scheduling
 
-This example trains TS-DDR policies for the Bolivia long-term hydrothermal
-dispatch (LTHD) problem using all three formulations: deterministic equivalent,
-stage-wise subproblem decomposition, and multiple shooting.
+This example trains target-setting decision rules for the Bolivia
+long-term hydrothermal dispatch (LTHD) problem — both **TS-DDR** (deep,
+LSTM-based) and **TS-LDR** (linear) — and compares them against an SDDP
+baseline that uses inconsistent formulations (a convex relaxation to build cuts, the nonconvex AC model to simulate).
 
-The Bolivia system has 10 hydro plants, 96 monthly stages, and AC power flow
-constraints. Inflow uncertainty is sampled from historical scenarios.
+The Bolivia system has **10 hydro plants**, **96 monthly stages**, and
+**AC power flow** constraints.  Inflow uncertainty is sampled from 47
+historical scenarios.
+
+## Overview of the TS-DDR approach
+
+Classical stochastic programming (e.g., SDDP) constructs piecewise-linear
+value-function approximations.  TS-DDR takes a different route: a neural
+network policy ``\pi_\theta`` maps observations to **target states**, and a
+projection subproblem at each stage enforces physical feasibility while
+tracking those targets as closely as possible.
+
+The key insight is that the gradient of the projection subproblem's optimal
+value with respect to the target parameters is available through Lagrange duality
+(or equivalently, implicit differentiation of the KKT conditions).
+This avoids differentiating through the full optimization solver.
+
+## Problem formulation
+
+At each stage ``t``, the operator observes inflows ``w_t`` and the current
+reservoir state ``x_{t-1}``.  The policy predicts target volumes:
+
+```math
+\hat{x}_t = \pi_\theta(w_{1:t},\, x_{t-1}).
+```
+
+A stage subproblem projects onto the feasible set:
+
+```math
+\begin{aligned}
+q_t(x_{t-1},\, w_t;\; \hat{x}_t)
+  \;=\;
+  \min_{x_t, u_t, \delta_t}
+  \quad &
+  c_t(x_t, u_t) + C_\delta\, \|\delta_t\| \\
+\text{s.t.}\quad
+  & x_t = x_{t-1} + w_t - \text{turbined}_t - \text{spilled}_t,
+        && \text{(reservoir balance)} \\
+  & x_t + \delta_t = \hat{x}_t,
+        && : \lambda_t \quad \text{(target constraint)} \\
+  & \text{AC-OPF}(u_t),
+        && \text{(power flow)}  \\
+  & x_t \in [0, \bar{x}],\; u_t \ge 0.
+\end{aligned}
+```
+
+The slack variable ``\delta_t`` absorbs infeasible targets; ``\lambda_t`` is
+the dual multiplier that provides the gradient signal.
+
+## Gradient computation: the envelope theorem
+
+By the envelope theorem, the sensitivity of the optimal value with respect
+to the target parameter is simply the dual:
+
+```math
+\frac{\partial q_t}{\partial \hat{x}_t}
+\;=\; \lambda_t.
+```
+
+Combined with backpropagation through the policy network, the full gradient
+of the expected cost is:
+
+```math
+\nabla_\theta \mathbb{E}[Q]
+\;\approx\;
+\frac{1}{S} \sum_{s=1}^{S} \sum_{t=1}^{T}
+  \lambda_t^s \odot \nabla_\theta \hat{x}_t^s(\theta),
+```
+
+where ``S`` is the number of sampled trajectories per batch and ``\odot``
+denotes elementwise multiplication.
 
 ## Problem setup
 
 The JuMP subproblems are built from a MOF file (exported from PowerModels.jl)
-plus hydro data (reservoir limits, inflow scenarios). Each subproblem contains:
+plus hydro data (reservoir limits, inflow scenarios).  Each subproblem contains:
 - AC optimal power flow constraints
 - Reservoir balance: `vol_out = vol_in + inflow - turbined - spilled`
 - Target-slack deficit variables penalizing deviation from the policy's targets
@@ -57,8 +127,28 @@ subproblems, state_params_in, state_params_out, uncertainty_samples, initial_sta
 
 ## Policy architecture
 
-The policy is a `StateConditionedPolicy` with an LSTM encoder. At each stage it
-receives `[inflow_t; reservoir_state_{t-1}]` and outputs target reservoir volumes:
+The policy is a [`StateConditionedPolicy`](@ref) with two components:
+
+1. **Encoder** — a stack of LSTM cells that processes only the uncertainty
+   (inflow) sequence, capturing temporal dependencies across stages.
+2. **Combiner** — a Dense layer that merges the encoded uncertainty with the
+   previous state to produce the next target.
+
+At each stage the policy receives ``[w_t;\; x_{t-1}]`` and outputs
+target reservoir volumes ``\hat{x}_t``:
+
+```
+ ┌─────────┐      ┌────────────────┐      ┌──────────────┐
+ │   w_t   │─────▶│  LSTM encoder  │─────▶│              │
+ └─────────┘      └────────────────┘      │    Dense     │──▶ x̂_t
+ ┌─────────┐                              │   combiner   │
+ │ x_{t-1} │─────────────────────────────▶│              │
+ └─────────┘                              └──────────────┘
+```
+
+The LSTM carries hidden state across stages, giving the policy memory of
+past inflows.  The activation is `sigmoid` (bounding outputs to ``[0,1]``,
+which are then scaled by the feasibility mapping).
 
 ```julia
 models = state_conditioned_policy(
@@ -67,11 +157,86 @@ models = state_conditioned_policy(
 )
 ```
 
-## Training: Deterministic Equivalent
+## TS-LDR: Linear Decision Rules
+
+As a baseline, we also train a **linear** policy (TS-LDR).  This uses
+`dense_multilayer_nn` with identity activation — a composition of linear
+layers equivalent to a single affine map:
+
+```math
+\hat{x}_t = W [w_{1:t};\; x_{t-1}] + b.
+```
+
+TS-LDR uses the same target-setting framework and training pipeline as
+TS-DDR.  The only difference is the policy class: linear maps have fewer
+parameters and cannot capture nonlinear inflow patterns, but they are a
+natural baseline from the classical LDR literature.
+
+```julia
+num_inputs = DecisionRules.policy_input_dim(num_uncertainties, num_hydro)
+models = dense_multilayer_nn(num_inputs, num_hydro, [64, 64]; activation=identity)
+```
+
+## Training pipeline 1: Deterministic Equivalent
+
+The deterministic equivalent (DE) couples all 96 stages into a **single NLP**
+for each sampled trajectory.  This is the most direct formulation: the policy
+generates the full target trajectory ``\hat{x}_{1:T}`` in one forward pass,
+and a single coupled solve determines all realized states simultaneously.
+
+### How it works
+
+```
+ ┌──────────────────────────────────────────────────────────┐
+ │  For each sampled trajectory w_{1:T}:                    │
+ │                                                          │
+ │  1. Forward pass: x̂_{1:T} = π_θ(w_{1:T}, x_0)          │
+ │                                                          │
+ │  2. Solve coupled NLP:                                   │
+ │     min  Σ_t c_t(x_t, u_t) + C_δ Σ_t ‖δ_t‖             │
+ │     s.t. dynamics + AC-OPF for ALL stages simultaneously │
+ │          x_t + δ_t = x̂_t(θ)   ∀t  (target constraint)  │
+ │                                                          │
+ │  3. Read duals λ_t of target constraints                 │
+ │     Gradient: Σ_t λ_t ⊙ ∇_θ x̂_t(θ)                     │
+ └──────────────────────────────────────────────────────────┘
+```
+
+### Mathematical formulation
+
+```math
+\begin{aligned}
+Q(w;\, \theta)
+  \;=\;
+  \min_{\{x_t, u_t, \delta_t\}_{t=1}^{T}}
+  \quad &
+  \sum_{t=1}^{T} c_t(x_t, u_t)
+  + C_\delta \sum_{t=1}^{T} \|\delta_t\| \\
+\text{s.t.}\quad
+  & x_t = T_t(w_t,\, u_t,\, x_{t-1}),
+        && t=1,\ldots,T \\
+  & x_t + \delta_t = \hat{x}_t(\theta),
+        && : \lambda_t,\quad t=1,\ldots,T \\
+  & h_t(x_t, u_t) \ge 0,
+        && t=1,\ldots,T
+\end{aligned}
+```
+
+The gradient is exact by the envelope theorem:
+
+```math
+\nabla_\theta Q
+\;=\;
+\sum_{t=1}^{T}
+\lambda_t \odot \nabla_\theta \hat{x}_t(\theta).
+```
+
+**Advantage**: strongest gradient signal — full cross-stage coupling
+captures how a target at stage 3 affects costs at stage 50.
 
-The deterministic equivalent couples all 96 stages into a single NLP. The policy
-generates targets in one forward pass; the coupled solve determines realized states.
-This gives the strongest gradient signal but requires solving the largest subproblem.
+**Disadvantages**: the NLP has ``96 \times (\text{AC-OPF variables})``
+decision variables; the policy generates targets without seeing realized
+states (open-loop target generation).
 
 ```julia
 det_equivalent, uncertainty_samples_det = DecisionRules.deterministic_equivalent!(
@@ -82,31 +247,129 @@ det_equivalent, uncertainty_samples_det = DecisionRules.deterministic_equivalent
 train_multistage(
     models, initial_state, det_equivalent,
     state_params_in, state_params_out, uncertainty_samples;
-    num_batches=2000, optimizer=Flux.Adam(),
-    penalty_schedule=:default_annealed,
+    num_batches=4000, optimizer=Flux.Adam(),
+    penalty_schedule=[(1,100,0.1), (101,210,1.0), (211,300,10.0), (301,4000,30.0)],
 )
 ```
 
-## Training: Stage-wise Subproblems
+## Training pipeline 2: Stage-wise Decomposition (Single Shooting)
 
-Stage-wise decomposition solves one subproblem per stage sequentially. The policy
-receives the realized state from the previous stage (closed-loop). Gradients
-combine dual information with DiffOpt sensitivities along the rollout.
+Stage-wise decomposition solves one subproblem per stage sequentially.
+Unlike the DE, the policy operates in **closed loop**: after each stage
+solve, the realized state ``x_t`` (not the predicted target) is fed back
+as input to the next stage.
+
+### How it works
+
+```
+ ┌─────────────────────────────────────────────────────────────┐
+ │  For each sampled trajectory w_{1:T}:                       │
+ │                                                             │
+ │  x_0 = initial state                                        │
+ │  for t = 1, ..., T:                                         │
+ │      x̂_t = π_θ(w_t, x_{t-1})          ← predict target     │
+ │      solve stage-t subproblem          ← project to feasible│
+ │      x_t = realized state from solver  ← closed-loop        │
+ │      accumulate c_t + C_δ ‖δ_t‖                             │
+ │                                                             │
+ │  Gradient: chain rule through all stage solves               │
+ └─────────────────────────────────────────────────────────────┘
+```
+
+### Gradient chain
+
+The gradient must account for how the realized state at stage ``t``
+depends on the targets at all earlier stages.  By the chain rule:
+
+```math
+\frac{\partial Q}{\partial \hat{x}_t}
+\;=\;
+\lambda_t
++ \sum_{k>t}
+  \frac{\partial q_k}{\partial x_{k-1}}
+  \cdot \prod_{j=t+1}^{k-1}
+  \frac{\partial x_j}{\partial x_{j-1}}
+  \cdot \frac{\partial x_t}{\partial \hat{x}_t}.
+```
+
+In practice, automatic differentiation (Zygote + ChainRules `rrule`s
+defined on each stage solve) handles this chain automatically.
+The `rrule` for each stage solve reads the dual ``\lambda_t`` for the
+target constraint and uses DiffOpt's implicit differentiation for the
+state-transition sensitivities.
+
+**Advantages**: closed-loop — the policy sees realized states, matching
+deployment semantics.  Each solve is small (single-stage AC-OPF).
+
+**Disadvantages**: gradients weaken over long horizons because the
+chain rule multiplies many Jacobians; sequential solve prevents
+parallelism.
 
 ```julia
 train_multistage(
     models, initial_state, subproblems,
     state_params_in, state_params_out, uncertainty_samples;
-    num_batches=2000, optimizer=Flux.Adam(),
+    num_batches=3000, optimizer=Flux.Adam(),
     penalty_schedule=:default_annealed,
 )
 ```
 
-## Training: Multiple Shooting
+## Training pipeline 3: Multiple Shooting
+
+Multiple shooting partitions the ``T``-stage horizon into ``K`` windows of
+``W`` stages each.  Within each window, a local deterministic equivalent
+couples the stages (strong gradient signal).  Between windows, the realized
+end-state is passed to the next window (closed-loop continuity).
+
+### How it works
+
+```
+ ┌────────────────────────────────────────────────────────────────┐
+ │  Partition T=96 stages into K=⌈96/12⌉=8 windows of W=12      │
+ │                                                                │
+ │  x_0 = initial state                                           │
+ │  for k = 1, ..., K:                                            │
+ │      stages = [(k-1)W+1, ..., kW]                              │
+ │      x̂_{stages} = π_θ(w_{stages}, x_{start_k})                │
+ │      solve window-k DE (12-stage coupled NLP)                  │
+ │      x_{end_k} = realized end-state from window solve          │
+ │      x_{start_{k+1}} = x_{end_k}                               │
+ │                                                                │
+ │  Gradient:                                                     │
+ │    Within window: duals from the coupled solve (like full DE)  │
+ │    Across windows: DiffOpt chain rule through end-states       │
+ └────────────────────────────────────────────────────────────────┘
+```
+
+### Gradient structure
+
+Let ``Q_k`` be the cost of window ``k``.  The total cost is
+``Q = \sum_k Q_k``.  Within a window, the gradient is identical to the
+DE case (duals of the target constraints in the coupled model).  Across
+windows, the chain rule threads through the realized end-state:
+
+```math
+\frac{dQ}{d\theta}
+\;=\;
+\sum_{k=1}^{K}
+\left(
+  \frac{\partial Q_k}{\partial \hat{x}_k}
+  \cdot \frac{\partial \hat{x}_k}{\partial \theta}
+  \;+\;
+  \frac{\partial Q_k}{\partial x_{\text{start}_k}}
+  \cdot \frac{d x_{\text{start}_k}}{d\theta}
+\right),
+```
 
-Multiple shooting partitions the 96-stage horizon into windows (e.g., 12 stages
-each). Each window solves a local deterministic equivalent, then passes the
-realized end-state to the next window.
+where ``\frac{d x_{\text{start}_k}}{d\theta}`` involves the chain
+through all prior windows via ``x_{\text{end}_{k-1}}``.
+
+**Advantages**: balances gradient quality (12-stage coupling) with
+tractability (8 small DEs instead of one large one); inter-window
+chain provides some closed-loop signal.
+
+**Disadvantages**: window boundaries introduce gradient discontinuities;
+the full-horizon coupling is weaker than the single DE.
 
 ```julia
 windows = DecisionRules.setup_shooting_windows(
@@ -118,20 +381,40 @@ windows = DecisionRules.setup_shooting_windows(
 
 train_multiple_shooting(
     models, initial_state, windows, () -> uncertainty_samples;
-    num_batches=2000, optimizer=Flux.Adam(),
+    num_batches=3000, optimizer=Flux.Adam(),
     penalty_schedule=:default_annealed,
 )
 ```
 
+## Penalty annealing
+
+The target penalty ``C_\delta`` controls the trade-off between following
+the policy's targets and minimizing operational cost.  DecisionRules
+supports a **penalty annealing schedule** that ramps the penalty multiplier
+during training:
+
+| Phase | Multiplier | Purpose |
+|:------|:----------:|:--------|
+| Warmup | ``0.1 \times C_\delta`` | Let the policy explore freely |
+| Nominal | ``1.0 \times C_\delta`` | Standard training |
+| Tighten | ``10.0 \times C_\delta`` | Sharpen target tracking |
+| Lock | ``30.0 \times C_\delta`` | Final precision |
+
+This is activated with `penalty_schedule=:default_annealed` or by passing
+an explicit list of `(start_iter, end_iter, multiplier)` tuples.
+
 ## Evaluation
 
 After training, we evaluate the policy using stage-wise rollout on held-out
-scenarios. Two modes:
-- **Target feedback** (`policy_state=:target`): matches DE training semantics
-- **Realized feedback** (`policy_state=:realized`): deployment/closed-loop semantics
+scenarios.  Two modes:
+- **Target feedback** (`policy_state=:target`): the policy receives its own
+  predicted target as input, matching DE training semantics.
+- **Realized feedback** (`policy_state=:realized`): the policy receives the
+  realized state from the solver, matching deployment semantics.
 
-The target-violation share measures how much cost comes from the slack penalty
-rather than actual operations — it should be small (≤ 5%) for a well-trained policy.
+The **target-violation share** measures how much cost comes from the slack
+penalty rather than actual operations — it should be small (``\le 5\%``) for
+a well-trained policy.
 
 ```julia
 rollout_eval = RolloutEvaluation(
@@ -143,20 +426,63 @@ println("Operational cost: ", rollout_eval.last_objective_no_deficit)
 println("Violation share:  ", rollout_eval.last_violation_share)
 ```
 
+## SDDP baseline
+
+For comparison, we also train an SDDP policy using
+[SDDP.jl](https://github.com/odow/SDDP.jl) with **inconsistent
+formulations**: a convex SOC-WR relaxation for the backward pass
+(cut generation) and the nonconvex ACP formulation for the forward
+pass (simulation).  This is a pragmatic approach when the true problem
+(AC-OPF) is nonconvex — SDDP requires convexity for valid cuts, so a
+convex relaxation approximates the value function while the forward pass
+evaluates under the true physics.
+
+The SDDP policy is trained for up to 2000 iterations and the learned
+cuts are saved to a JSON file, which can be loaded to simulate the
+policy under the ACP formulation.
+
 ## Results
 
-The plots below compare all three training formulations on the Bolivia case.
-Training curves, out-of-sample cost distributions, and reservoir trajectories
-are generated from full training runs (20 epochs × 100 batches each).
+The plots below compare the TS-DDR and TS-LDR training formulations and
+the SDDP baseline on the Bolivia case.  Training curves, out-of-sample
+cost distributions, reservoir volume trajectories, and thermal generation
+profiles are shown.
+
+### Training convergence (TS-DDR methods)
 
 ![Training convergence](../assets/hydro_training_convergence.png)
 
+### Out-of-sample cost (TS-DDR methods)
+
 ![Out-of-sample cost comparison](../assets/hydro_cost_comparison.png)
 
-![Reservoir trajectories](../assets/hydro_trajectories.png)
+### Target-violation share (TS-DDR methods)
+
+![Violation share](../assets/hydro_violation_share.png)
+
+### Reservoir volume comparison (all methods)
+
+![Volume comparison](../assets/hydro_volume_comparison.png)
+
+### Thermal generation comparison (all methods)
+
+![Generation comparison](../assets/hydro_generation_comparison.png)
+
+### Summary
+
+| Method | Policy | Mean Cost | Std | N |
+|:-------|:------:|----------:|----:|--:|
+| TS-DDR (DE) | LSTM | 325 540 | 6 266 | 100 |
+| TS-DDR (DE, anneal) | LSTM | 324 445 | 6 134 | 100 |
+| TS-DDR (shooting w=12) | LSTM | 323 289 | 5 593 | 100 |
+| TS-DDR (shooting w=12, anneal) | LSTM | 322 812 | 6 081 | 100 |
+| TS-DDR (stage-wise, anneal) | LSTM | 321 543 | 6 214 | 100 |
+| SDDP (SOC-WR / ACP) | cuts | 303 684 | — | 100 |
+
+All three TS-DDR methods with penalty annealing converge to similar
+costs (321K–325K).  SDDP trains on 126 stages (96 + 30 margin).
+
+!!! note "Preliminary results"
+    These numbers reflect the current default training scripts.
+    They will be updated as the package evolves.
 
-| Method | Mean Cost | Std | Violation % | Train Time |
-|:---|---:|---:|---:|---:|
-| Deterministic Equivalent | 321189.0 | — | 48.66% | 158 steps |
-| Stage-wise Subproblems | 364110.0 | — | 0.59% | 159 steps |
-| Multiple Shooting | 319462.0 | — | 36.18% | 236 steps |
diff --git a/docs/src/examples/inventory.jl b/docs/src/examples/inventory.jl
index efb3606..a3d0058 100644
--- a/docs/src/examples/inventory.jl
+++ b/docs/src/examples/inventory.jl
@@ -1,187 +1,410 @@
-# # Inventory Control with Ordering Costs
+# # Stochastic Lot-Sizing with Fixed Ordering Costs
 #
-# This example studies a 12-period stochastic lot-sizing problem with two
-# formulations — a **relaxed** (continuous) case and an **integer** (MIP) case
-# with fixed ordering costs.  The comparison shows:
+# This example shows how to train target-state decision rules for a stochastic
+# inventory problem with ex-ante ordering decisions.
 #
-# 1. **Relaxed problem**: SDDP with a PAR(1) demand approximation is
-#    near-optimal and outperforms TS-DDR.
-# 2. **Integer problem**: TS-DDR with `FixedDiscreteIntegerStrategy` outperforms
-#    both SDDP and TS-DDR with `ContinuousRelaxationIntegerStrategy`, because
-#    SDDP and continuous relaxation both underestimate the fixed ordering cost.
+# The example has two purposes:
+#
+# 1. show the complete optimization model before discussing implementation
+#    details; and
+# 2. show the code in the same order a reader would run it.
 
 using DecisionRules
-using JuMP, HiGHS
 using Flux
-using Statistics, Random
+using HiGHS
+using JuMP
+using Random
+using Statistics
+
+# The runnable experiment lives outside the documentation tree. The file included
+# below defines the demand process, JuMP builders, and policy architecture used below.
+include(joinpath(@__DIR__, "..", "..", "..", "examples", "inventory_control",
+    "build_inventory_problem.jl"))
 
 # ## Information Pattern
 #
-# At the beginning of a period, the controller observes current inventory and
-# recent realized demand.  It does **not** observe current demand before ordering.
-# The order is therefore ex-ante.  After ordering, demand is realized and becomes
-# part of the state for the next period.
+# At the beginning of period `t`, the controller knows
 #
-# The state carried between periods is:
-#
-# ```julia
-# [net_inventory, last_demand, previous_demand]
+# ```math
+# x_t = (s_{t-1}, d_{t-1}, d_{t-2}),
 # ```
 #
-# This lets a time-invariant policy infer the latent demand regime from recent
-# observations without receiving a period counter or synthetic seasonal features.
-
-# ## Inventory Model
+# where `s` is net inventory and `d` is realized demand. The controller chooses
+# the order quantity before seeing current demand `d_t`. This is an ex-ante
+# decision.
 #
-# ### Relaxed formulation
+# The neural policy receives `[d_t, x_t...]` during training because
+# DecisionRules policies output target states after the stage uncertainty is
+# sampled. The implementation below uses that target only to guide the
+# optimization model; the actual order still respects the model's information
+# pattern.
+
+# ## Complete Stage Model
 #
-# The order quantity is continuous with no setup cost:
+# For each period `t = 1, ..., T`, the stage model is
 #
 # ```math
-# 0 \le q_t \le Q_{\max}, \qquad
-# \text{cost}_t = c\,q_t + h\max(s_t,0) + p\max(-s_t,0).
+# \begin{aligned}
+# \min_{q_t,z_t,s_t^{mid},s_t,h_t,b_t}
+#     \quad & K z_t + c q_t + h h_t + p b_t
+#             + \lambda |s_t^{mid} - \hat{s}_t| \\
+# \text{s.t.}\quad
+#     & 0 \le q_t \le Q_{\max} z_t, && \text{(1) order capacity} \\
+#     & z_t \in \{0,1\},             && \text{(2) setup decision} \\
+#     & s_t^{mid} = s_{t-1} + q_t,   && \text{(3) order arrives} \\
+#     & s_t = s_t^{mid} - d_t,       && \text{(4) demand realizes} \\
+#     & h_t - b_t = s_t,             && \text{(5) inventory split} \\
+#     & h_t \ge 0,\; b_t \ge 0.      && \text{(6) split bounds}
+# \end{aligned}
 # ```
 #
-# ### Integer formulation
+# The relaxed model removes (2) and replaces (1) by
+# ``0 \le q_t \le Q_{\max}``; it also removes the fixed cost ``K z_t`` from the
+# objective.
+#
+# The target ``\hat{s}_t`` is not an operational requirement. It is the state
+# target produced by the neural decision rule, and the penalty term gives the
+# policy a gradient signal.
+
+# ## Parameters
+
+inventory_parameters = (
+    T = INVENTORY_T,
+    setup_cost = INVENTORY_K,
+    unit_order_cost = INVENTORY_C,
+    holding_cost = INVENTORY_H,
+    backlog_cost = INVENTORY_P,
+    order_capacity = INVENTORY_Q_MAX,
+    initial_inventory = INVENTORY_I0,
+    target_penalty = INVENTORY_PENALTY,
+)
+
+# ## Demand Process
 #
-# A binary variable ``z_t \in \{0,1\}`` controls whether an order is placed.
-# If ``z_t = 0``, then ``q_t`` must be zero; if ``z_t = 1``, the model pays a
-# fixed setup cost ``K``:
+# Demand has a hidden seasonal phase, a persistent hidden regime, and an AR(1)
+# shock:
 #
 # ```math
-# 0 \le q_t \le Q_{\max}\,z_t, \qquad
-# \text{cost}_t = K\,z_t + c\,q_t + h\max(s_t,0) + p\max(-s_t,0).
+# \epsilon_t = 0.92 \epsilon_{t-1} + 0.35 \eta_t,
 # ```
 #
-# In both cases, ordered units arrive before demand:
-#
 # ```math
-# s^{mid}_t = s_{t-1} + q_t, \qquad s_t = s^{mid}_t - d_t.
+# d_t =
+# \operatorname{clip}\!\left(
+#     m_{\kappa_t}
+#     + w_{\kappa_t}(0.85 r_t + 0.42 \epsilon_t + 0.12 \eta'_t)
+# \right),
 # ```
 #
-# | Parameter | Value | Meaning |
-# |:--|--:|:--|
-# | ``T`` | 12 | periods |
-# | ``K`` | 500 | fixed order/setup cost (integer case) |
-# | ``c`` | 2 | unit ordering cost |
-# | ``h`` | 1 | holding cost |
-# | ``p`` | 25 | backlog penalty |
-# | ``Q_{\max}`` | 350 | order capacity |
-# | ``s_0`` | 30 | initial inventory |
+# where `r_t` is the hidden regime and
+# ``\kappa_t = 1 + ((t + \phi - 1) \bmod T)`` is the hidden seasonal index.
 
-# ## Demand Process
+Random.seed!(11)
+demand_paths = [sample_inventory_demand_path() for _ in 1:3]
+
+# ## Build the Continuous and Integer Models
+#
+# The builders return the JuMP model(s), input-state parameters, output-target
+# parameters, an uncertainty sampler, and the initial state.
+
+relaxed_subproblems,
+relaxed_state_in,
+relaxed_state_out,
+relaxed_sampler,
+initial_state = build_inventory_subproblems(;
+    num_scenarios = 100,
+    integer = false,
+)
+
+integer_subproblems,
+integer_state_in,
+integer_state_out,
+integer_sampler,
+_ = build_inventory_subproblems(;
+    num_scenarios = 100,
+    integer = true,
+)
+
+# The deterministic equivalent is the full-horizon model used by direct
+# transcription training.
+
+integer_det_equivalent,
+integer_det_state_in,
+integer_det_state_out,
+integer_det_sampler,
+_ = build_inventory_det_equivalent(;
+    num_scenarios = 50,
+    integer = true,
+)
+
+# ## Integer Sensitivity Strategies
+#
+# Mixed-integer models do not have ordinary LP duals. DecisionRules therefore
+# makes the chosen postprocessing strategy explicit.
+
+fixed_discrete = FixedDiscreteIntegerStrategy()
+continuous_relaxation = ContinuousRelaxationIntegerStrategy()
+
+# `FixedDiscreteIntegerStrategy` solves the MIP, fixes the incumbent integer
+# variables, re-solves the fixed LP, and reads local dual information.
 #
-# Each trajectory has a path-level phase shift ``\phi \sim \mathrm{Unif}\{0,\ldots,T-1\}``,
-# a persistent latent regime ``r_t \in \{-1,0,1\}`` (switch probability 0.04),
-# and an autoregressive shock ``\epsilon_t``:
+# `ContinuousRelaxationIntegerStrategy` relaxes integer variables first and reads
+# duals from the relaxed LP. This is smoother and faster, but the gradient is for
+# the relaxation, not for an integer-feasible decision.
+
+# ## Score-Function Correction
+#
+# Local LP duals do not see a discrete switch such as "open the setup variable".
+# A score-function correction estimates the effect of target changes by solving
+# perturbed integer rollouts:
 #
 # ```math
-# \epsilon_t = 0.92\,\epsilon_{t-1} + 0.35\,\eta_t, \qquad
-# d_t = \operatorname{clip}\!\bigl(
-#   m_{\kappa_t} + w_{\kappa_t}(0.85\,r_t + 0.42\,\epsilon_t + 0.12\,\eta'_t)
-# \bigr),
+# \nabla L
+# =
+# \alpha \nabla L_{\mathrm{dual}}
+# + (1-\alpha)
+#   \frac{1}{M}
+#   \sum_{m=1}^{M}
+#   (R_m - b)
+#   \nabla_\theta
+#   \sum_{t=1}^{T}
+#   \left\langle
+#       \delta_{m,t}/\sigma^2,
+#       \hat{x}_{t+1}(\theta)
+#   \right\rangle .
 # ```
 #
-# where ``\kappa_t = 1 + ((t + \phi - 1) \bmod T)`` is the shifted seasonal
-# index, and ``m_{\kappa_t}`` and ``w_{\kappa_t}`` are the midpoint and
-# half-width of the seasonal demand band.  None of the latent variables are
-# observed; the policy sees only inventory and realized demand history.
+# There are two different solves in the mixed-gradient training loop:
 #
-# The plot below shows 24 sampled demand paths.  Because each trajectory has a
-# different phase and persistent regime, the same calendar period can correspond
-# to high, medium, or low demand across scenarios.
+# - `train_multistage(...; integer_strategy = fixed_discrete)` controls the
+#   deterministic-equivalent solve used for the dual-gradient term
+#   ``\nabla L_{\mathrm{dual}}``. This solve needs a postprocessing strategy
+#   because duals are not directly defined for a MIP.
+# - `ScoreFunctionConfig(integer_subproblems, ...)` controls the Monte Carlo
+#   rollout term. These rollout models are solved exactly as they are built.
+#   Because `integer_subproblems` contain binary setup variables, the rollout
+#   costs `R_m` are true MIP rollout costs.
 #
-# ![Demand process](../assets/inventory_demand_process.png)
+# In short: `integer_strategy` is for reading local duals; score-function
+# rollouts are for measuring realized costs.
 
-# ## Integer Postprocessing Strategies
-#
-# DecisionRules.jl provides two strategies for extracting gradient information
-# from subproblems with discrete variables:
+score_function = ScoreFunctionConfig(
+    integer_subproblems,
+    integer_state_in,
+    integer_state_out;
+    dual_weight = 0.5,
+    perturbation_std = 1.0,
+    num_rollouts = 8,
+)
+
+score_schedule = ScoreFunctionSchedule(
+    score_function;
+    sf_start = 200,
+    ramp_batches = 300,
+    perturbation_std_initial = 0.1,
+    num_rollouts_initial = 2,
+)
+
+# ## Policy
 #
-# **`FixedDiscreteIntegerStrategy`**: (1) solve the MIP for incumbent binary
-# values ``z^*_t``; (2) fix ``z_t = z^*_t`` and relax integrality; (3) re-solve
-# the resulting LP; (4) read LP duals as gradient signal.  This is the same
-# principle as SDDP.jl's `FixedDiscreteDuality`.
+# A DecisionRules policy is any callable `π(x) -> target` where `x` is the
+# concatenation `[uncertainty..., state...]` and `target` is the desired
+# next state. The only requirements are that it is differentiable via
+# `Zygote.gradient` and registered with `Functors.@functor` so that
+# `Flux.loadmodel!` can checkpoint its parameters.
 #
-# **`ContinuousRelaxationIntegerStrategy`**: relax all binary/integer
-# constraints to continuous bounds (binary → [0,1]), solve the resulting LP,
-# and read duals directly.  This is faster (one LP instead of MIP + LP) and
-# gives smoother gradients, but the solution may have fractional integer
-# variables — the gradient does not correspond to any feasible integer
-# assignment.
+# ### Feedforward policy
 #
-# For the relaxed formulation (no integer variables), `NoIntegerStrategy` is
-# used and subproblems are solved as-is.
+# The simplest architecture is a feedforward MLP. This policy is ex-ante:
+# the network input omits the current demand `d_t` (index 1) and uses only the state
+# entries `[inventory, d_{t-1}, d_{t-2}]`; `d_t` is only copied into the returned target.
+
+using Functors: @functor
+
+struct ExAntePolicy{N}
+    net::N
+end
 
-# ## Relaxed (Continuous) Problem
+@functor ExAntePolicy (net,)
+
+# The callable normalizes features to ≈[0,1] and maps through the network.
+# The sigmoid output bounds the target to `[0, 500]`.
+function (p::ExAntePolicy)(x)
+    inventory = Float32(x[2])
+    d_prev    = Float32(x[3])
+    d_prev2   = Float32(x[4])
+    features  = Float32[inventory / 100, d_prev / 100, d_prev2 / 100]
+    target    = 500f0 .* Flux.sigmoid.(p.net(features))
+    return Float32[target[1], x[1], d_prev]
+end
+
+Random.seed!(2024)
+policy = ExAntePolicy(Chain(Dense(3, 32, relu), Dense(32, 24, relu), Dense(24, 1)))
+
+# ### Recurrent (LSTM) policy
 #
-# When there are no integer variables, SDDP can model the demand process
-# exactly via a PAR(1) approximation that carries ``d_{t-1}`` as a state
-# variable.  This makes SDDP near-optimal for the relaxed problem.
+# When the uncertainty process has temporal structure (regimes, trends,
+# seasonality), a recurrent encoder can learn patterns that a feedforward
+# MLP cannot detect from a fixed-length window.
 #
-# SDDP uses a PAR(1) fit: ``d_t \approx \mu_t + \alpha(d_{t-1} - \mu_{t-1}) + \omega_t``
-# with per-stage means ``\mu_t``, autocorrelation ``\alpha \approx 0.86``, and
-# 9 equiprobable innovation points fitted from 10,000 simulated demand paths.
+# The design below uses `Flux.LSTMCell` to process one *lagged* demand
+# value per stage. The LSTM hidden state accumulates across stages within
+# a scenario, then resets between scenarios via `Flux.reset!`.
 #
-# ![Relaxed results](../assets/inventory_relaxed_results.png)
+# The affine output `raw × 200 + 150` avoids sigmoid saturation and
+# centers the target on typical inventory levels.
+
+mutable struct RecurrentExAntePolicy{E,C,S}
+    encoder::E
+    combiner::C
+    state::S
+end
+
+@functor RecurrentExAntePolicy (encoder, combiner)
+
+function (p::RecurrentExAntePolicy)(x)
+    d_prev    = Float32(x[3])
+    inventory = Float32(x[2])
+    d_prev2   = Float32(x[4])
+    T = eltype(first(p.state))
+    encoded, new_state = p.encoder(T[d_prev / 100], p.state)
+    p.state = new_state
+    raw = p.combiner(vcat(encoded, T[inventory / 100, d_prev2 / 100]))
+    target = raw[1] * 200f0 + 150f0
+    return Float32[target, x[1], d_prev]
+end
+
+function Flux.reset!(p::RecurrentExAntePolicy)
+    p.state = Flux.initialstates(p.encoder)
+    return nothing
+end
+
+Random.seed!(2024)
+lstm_encoder = Flux.LSTMCell(1 => 16)
+lstm_policy = RecurrentExAntePolicy(
+    lstm_encoder,
+    Dense(16 + 2, 1),
+    Flux.initialstates(lstm_encoder),
+)
+
+# ## Training Calls
 #
-# | Method                   |   N | Mean cost |   Std | 95% CI | vs TS-DDR | Fit (s) | Eval (s) |
-# |:-------------------------|----:|----------:|------:|-------:|----------:|--------:|---------:|
-# | TS-DDR (trained)         | 300 |    2667.3 | 594.5 |   67.3 |     +0.0% |    54.6 |   0.0018 |
-# | SDDP (PAR)              | 300 |    2434.2 | 774.8 |   87.7 |     -8.7% |     0.0 |  20.6455 |
-# | Base-stock (S\*=160)    | 300 |    3035.6 | 506.8 |   57.3 |    +13.8% |     0.0 |   0.0002 |
-# | Random (untrained)      | 300 |    3751.7 | 221.7 |   25.1 |    +40.7% |     0.0 |   0.0018 |
+# The continuous problem uses ordinary dual information.
+
+# ```julia
+# train_multistage(
+#     policy,
+#     initial_state,
+#     relaxed_subproblems,
+#     relaxed_state_in,
+#     relaxed_state_out,
+#     relaxed_sampler;
+#     num_batches = 400,
+#     num_train_per_batch = 5,
+#     optimizer = Flux.Adam(0.0015),
+#     integer_strategy = NoIntegerStrategy(),
+#     penalty_schedule = [(1, 80, 0.4), (81, 400, 1.0)],
+# )
+# ```
+
+# The integer deterministic-equivalent run uses the fixed-discrete local dual
+# path plus the scheduled score-function correction.
+
+# ```julia
+# train_multistage(
+#     policy,
+#     initial_state,
+#     integer_det_equivalent,
+#     integer_det_state_in,
+#     integer_det_state_out,
+#     integer_det_sampler;
+#     num_batches = 800,
+#     num_train_per_batch = 10,
+#     optimizer = Flux.Adam(0.0008),
+#     integer_strategy = fixed_discrete,
+#     penalty_schedule = [(1, 120, 0.4), (121, 800, 1.0)],
+#     score_function = score_schedule,
+# )
+# ```
+
+# ## Evaluation
 #
-# SDDP clearly dominates: 8.7% lower cost than TS-DDR, 40.7% lower than
-# Random.  The SDDP and Random cost distributions are non-overlapping,
-# confirming that informed methods have a clear edge on this demand process.
+# A trained policy should be evaluated by stage-wise rollout, because that is
+# the deployment semantics: solve one period, observe the realized next state,
+# then solve the next period.
 
-# ## Integer (MIP) Problem
+uncertainty_sample = sample(integer_sampler)
+rollout_cost = simulate_multistage(
+    integer_subproblems,
+    integer_state_in,
+    integer_state_out,
+    initial_state,
+    uncertainty_sample,
+    policy;
+    integer_strategy = fixed_discrete,
+)
+
+# ## Experiment Scripts
 #
-# Introducing the binary ``z_t`` and fixed cost ``K=500`` changes the
-# landscape.  SDDP can only use LP relaxation for training (``z \in [0,1]``),
-# which systematically underestimates ``K``: when the LP says ``z=0.3``,
-# ``q=20``, the relaxed cost is ``0.3 \times 500 + 2 \times 20 = 190``, but
-# the true integer cost with ``z=1`` is ``500 + 40 = 540``.
+# Each variant can be trained independently via SLURM or directly:
 #
-# TS-DDR with `FixedDiscreteIntegerStrategy` handles this correctly: it
-# solves the full MIP, fixes the binary incumbent, and reads LP duals in
-# that integer-consistent state.
+# ```bash
+# # Single variant
+# julia --project=. train_dr_inventory.jl integer_lstm
 #
-# ![Integer results](../assets/inventory_integer_results.png)
+# # All variants in parallel via SLURM
+# cd examples/inventory_control && bash launch_all.sh
+# ```
 #
-# | Method                   |   N | Mean cost |   Std | 95% CI | vs TS-DDR (FD) | Fit (s) | Eval (s) |
-# |:-------------------------|----:|----------:|------:|-------:|---------------:|--------:|---------:|
-# | TS-DDR (FixedDiscrete)   | 300 |    8015.8 | 719.5 |   81.4 |          +0.0% |   339.2 |   0.0112 |
-# | TS-DDR (ContRelax)       | 300 |    8318.1 | 720.0 |   81.5 |          +3.8% |   109.4 |   0.0117 |
-# | SDDP integer rollout     | 300 |    8274.2 | 912.5 |  103.3 |          +3.2% |     0.0 |   7.9088 |
-# | Base-stock (S\*=160)    | 300 |    9035.6 | 506.8 |   57.3 |         +12.7% |     0.0 |   0.0000 |
-# | Random (untrained)      | 300 |    9594.6 | 361.1 |   40.9 |         +19.7% |     0.0 |   0.0120 |
+# Available variant tags: `relaxed`, `relaxed_lstm`, `relaxed_hp`,
+# `relaxed_lstm_hp`, `integer`, `integer_cr`, `integer_sf`, `integer_hp`,
+# `integer_lstm`, `integer_lstm_sf`.
 #
-# `FixedDiscreteIntegerStrategy` achieves the lowest cost (8016), beating both
-# SDDP (8274, +3.2%) and `ContinuousRelaxationIntegerStrategy` (8318, +3.8%).
-# The continuous relaxation strategy performs similarly to SDDP — both use LP
-# relaxation and both underestimate the fixed ordering cost.
+# After training, run the baseline, SDDP, and comparison scripts to regenerate tables and figures:
 #
-# `ContinuousRelaxationIntegerStrategy` trains 3× faster (109s vs 339s)
-# because it only solves LPs, but the resulting policy is less accurate on
-# integer-constrained problems.
-
-# ## Runnable Scripts
+# ```bash
+# julia --project=. evaluate_inventory.jl
+# julia --project=. solve_sddp.jl
+# julia --project=. compare_results.jl
+# ```
 #
-# The complete experiment lives in `examples/inventory_control/`:
+# The figures used by this page are generated by `compare_results.jl`.
+
+# ![Demand process](../assets/inventory_demand_process.png)
 #
-# | Script | Purpose |
-# |:-------|:--------|
-# | `build_inventory_problem.jl` | JuMP subproblem and det-equivalent builders, demand process, policy architecture |
-# | `train_dr_inventory.jl` | TS-DDR training (relaxed, FixedDiscrete, ContRelax) and trajectory evaluation |
-# | `evaluate_inventory.jl` | Base-stock grid-search and random baseline evaluation |
-# | `solve_sddp.jl` | SDDP (2T-stage PAR(1)) training and rollout |
-# | `compare_results.jl` | Load all CSVs, print summary tables, save plots |
+# ![Relaxed results](../assets/inventory_relaxed_results.png)
 #
-# ```bash
-# julia --project=examples/inventory_control examples/inventory_control/train_dr_inventory.jl
-# julia --project=examples/inventory_control examples/inventory_control/evaluate_inventory.jl
-# julia --project=examples/inventory_control examples/inventory_control/solve_sddp.jl
-# julia --project=examples/inventory_control examples/inventory_control/compare_results.jl
-# ```
+# ![Integer results](../assets/inventory_integer_results.png)
+
+# ### Relaxed (continuous) results
+#
+# SDDP uses a PAR(1) approximation of the true latent demand process, which
+# is not exact for this problem. Despite this advantage for TS-DDR, SDDP still
+# attains the lower cost: the best TS-DDR variant is ~7% more expensive.
+#
+# The LSTM encoder closes ~25% of the gap versus the feedforward baseline by
+# learning temporal demand patterns from lagged observations.
+#
+# | Method                          |   N | Mean cost |   Std | vs SDDP |
+# |:--------------------------------|----:|----------:|------:|--------:|
+# | SDDP (PAR)                      | 300 |    2434.0 |     — |   0.0%  |
+# | TS-DDR (LSTM)                   | 300 |    2610.6 | 540.3 |  +7.3%  |
+# | TS-DDR (feedforward)            | 300 |    2667.3 | 593.5 |  +9.6%  |
+# | TS-DDR (HighPenalty)            | 300 |    2677.5 | 547.0 | +10.0%  |
+# | TS-DDR (LSTM+HP)                | 300 |    2712.0 | 554.6 | +11.4%  |
+#
+# ### Integer (MIP) results
+#
+# SDDP uses an `AlternativeForwardPass`: MIP in the forward pass, LP
+# relaxation in the backward pass for valid cuts. The TS-DDR gap is ~36%.
+#
+# | Method                          |   N | Mean cost |   Std | vs SDDP |
+# |:--------------------------------|----:|----------:|------:|--------:|
+# | SDDP (MIP fwd)                  | 300 |    5871.6 |1087.4 |   0.0%  |
+# | TS-DDR (FixedDiscrete)          | 300 |    8015.8 | 718.3 | +36.5%  |
+# | TS-DDR (MixedGrad)              | 300 |    8268.0 | 715.3 | +40.8%  |
+# | TS-DDR (ContRelax)              | 300 |    8318.1 | 718.8 | +41.7%  |
+# | TS-DDR (HighPenalty)            | 300 |    8388.4 | 615.9 | +42.8%  |
+# | SDDP (LP relax)                 | 300 |    8274.2 | 912.5 | +40.9%  |
+# | Base-stock (S\*=160)            | 300 |    9035.6 | 506.8 | +53.9%  |
+# | Random (untrained)              | 300 |    9594.6 | 361.1 | +63.4%  |
\ No newline at end of file
diff --git a/docs/src/examples/inventory.md b/docs/src/examples/inventory.md
index ba10084..f962219 100644
--- a/docs/src/examples/inventory.md
+++ b/docs/src/examples/inventory.md
@@ -2,194 +2,440 @@
 EditURL = "inventory.jl"
 ```
 
-# Inventory Control with Ordering Costs
+# Stochastic Lot-Sizing with Fixed Ordering Costs
 
-This example studies a 12-period stochastic lot-sizing problem with two
-formulations — a **relaxed** (continuous) case and an **integer** (MIP) case
-with fixed ordering costs.  The comparison shows:
+This example shows how to train target-state decision rules for a stochastic
+inventory problem with ex-ante ordering decisions.
 
-1. **Relaxed problem**: SDDP with a PAR(1) demand approximation is
-   near-optimal and outperforms TS-DDR.
-2. **Integer problem**: TS-DDR with `FixedDiscreteIntegerStrategy` outperforms
-   both SDDP and TS-DDR with `ContinuousRelaxationIntegerStrategy`, because
-   SDDP and continuous relaxation both underestimate the fixed ordering cost.
+The example has two purposes:
 
-## Information Pattern
-
-At the beginning of a period, the controller observes current inventory and
-recent realized demand.  It does **not** observe current demand before ordering.
-The order is therefore ex-ante.  After ordering, demand is realized and becomes
-part of the state for the next period.
+1. show the complete optimization model before discussing implementation
+   details; and
+2. show the code in the same order a reader would run it.
 
-The state carried between periods is:
+````@example inventory
+using DecisionRules
+using Flux
+using HiGHS
+using JuMP
+using Random
+using Statistics
+````
 
-```julia
-[net_inventory, last_demand, previous_demand]
-```
+The runnable experiment lives outside the documentation tree. The file included
+next defines the demand process, JuMP builders, and policy architecture used below.
 
-This lets a time-invariant policy infer the latent demand regime from recent
-observations without receiving a period counter or synthetic seasonal features.
+````@example inventory
+include(joinpath(@__DIR__, "..", "..", "..", "examples", "inventory_control",
+    "build_inventory_problem.jl"))
+````
 
-## Inventory Model
-
-### Relaxed formulation
+## Information Pattern
 
-The order quantity is continuous with no setup cost:
+At the beginning of period `t`, the controller knows
 
 ```math
-0 \le q_t \le Q_{\max}, \qquad
-\text{cost}_t = c\,q_t + h\max(s_t,0) + p\max(-s_t,0).
+x_t = (s_{t-1}, d_{t-1}, d_{t-2}),
 ```
 
-### Integer formulation
+where `s` is net inventory and `d` is realized demand. The controller chooses
+the order quantity before seeing current demand `d_t`. This is an ex-ante
+decision.
 
-A binary variable ``z_t \in \{0,1\}`` controls whether an order is placed.
-If ``z_t = 0``, then ``q_t`` must be zero; if ``z_t = 1``, the model pays a
-fixed setup cost ``K``:
+The neural policy receives `[d_t, x_t...]` during training because
+DecisionRules policies output target states after the stage uncertainty is
+sampled. The implementation below uses that target only to guide the
+optimization model; the actual order still respects the model's information
+pattern.
 
-```math
-0 \le q_t \le Q_{\max}\,z_t, \qquad
-\text{cost}_t = K\,z_t + c\,q_t + h\max(s_t,0) + p\max(-s_t,0).
-```
+## Complete Stage Model
 
-In both cases, ordered units arrive before demand:
+For each period `t = 1, ..., T`, the stage model is
 
 ```math
-s^{mid}_t = s_{t-1} + q_t, \qquad s_t = s^{mid}_t - d_t.
+\begin{aligned}
+\min_{q_t,z_t,s_t^{mid},s_t,h_t,b_t}
+    \quad & K z_t + c q_t + h h_t + p b_t
+            + \lambda |s_t^{mid} - \hat{s}_t| \\
+\text{s.t.}\quad
+    & 0 \le q_t \le Q_{\max} z_t, && \text{(1) order capacity} \\
+    & z_t \in \{0,1\},             && \text{(2) setup decision} \\
+    & s_t^{mid} = s_{t-1} + q_t,   && \text{(3) order arrives} \\
+    & s_t = s_t^{mid} - d_t,       && \text{(4) demand realizes} \\
+    & h_t - b_t = s_t,             && \text{(5) inventory split} \\
+    & h_t \ge 0,\; b_t \ge 0.      && \text{(6) split bounds}
+\end{aligned}
 ```
 
-| Parameter | Value | Meaning |
-|:--|--:|:--|
-| ``T`` | 12 | periods |
-| ``K`` | 500 | fixed order/setup cost (integer case) |
-| ``c`` | 2 | unit ordering cost |
-| ``h`` | 1 | holding cost |
-| ``p`` | 25 | backlog penalty |
-| ``Q_{\max}`` | 350 | order capacity |
-| ``s_0`` | 30 | initial inventory |
+The relaxed model removes (2) and replaces (1) by
+``0 \le q_t \le Q_{\max}``; it also removes the fixed cost `K z_t` from the
+objective.
+
+The target ``\hat{s}_t`` is not an operational requirement. It is the state
+target produced by the neural decision rule, and the penalty term gives the
+policy a gradient signal.
+
+## Parameters
+
+````@example inventory
+inventory_parameters = (
+    T = INVENTORY_T,
+    setup_cost = INVENTORY_K,
+    unit_order_cost = INVENTORY_C,
+    holding_cost = INVENTORY_H,
+    backlog_cost = INVENTORY_P,
+    order_capacity = INVENTORY_Q_MAX,
+    initial_inventory = INVENTORY_I0,
+    target_penalty = INVENTORY_PENALTY,
+)
+````
 
 ## Demand Process
 
-Each trajectory has a path-level phase shift ``\phi \sim \mathrm{Unif}\{0,\ldots,T-1\}``,
-a persistent latent regime ``r_t \in \{-1,0,1\}`` (switch probability 0.04),
-and an autoregressive shock ``\epsilon_t``:
+Demand has a hidden seasonal phase, a persistent hidden regime, and an AR(1)
+shock:
 
 ```math
-\epsilon_t = 0.92\,\epsilon_{t-1} + 0.35\,\eta_t, \qquad
-d_t = \operatorname{clip}\!\bigl(
-  m_{\kappa_t} + w_{\kappa_t}(0.85\,r_t + 0.42\,\epsilon_t + 0.12\,\eta'_t)
-\bigr),
+\epsilon_t = 0.92 \epsilon_{t-1} + 0.35 \eta_t,
 ```
 
-where ``\kappa_t = 1 + ((t + \phi - 1) \bmod T)`` is the shifted seasonal
-index, and ``m_{\kappa_t}`` and ``w_{\kappa_t}`` are the midpoint and
-half-width of the seasonal demand band.  None of the latent variables are
-observed; the policy sees only inventory and realized demand history.
+```math
+d_t =
+\operatorname{clip}\!\left(
+    m_{\kappa_t}
+    + w_{\kappa_t}(0.85 r_t + 0.42 \epsilon_t + 0.12 \eta'_t)
+\right),
+```
 
-The plot below shows 24 sampled demand paths.  Because each trajectory has a
-different phase and persistent regime, the same calendar period can correspond
-to high, medium, or low demand across scenarios.
+where `r_t` is the hidden regime and
+``\kappa_t = 1 + ((t + \phi - 1) \bmod T)`` is the hidden seasonal index.
+
+````@example inventory
+Random.seed!(11)
+demand_paths = [sample_inventory_demand_path() for _ in 1:3]
+````
+
+## Build the Continuous and Integer Models
+
+The builders return the JuMP model(s), input-state parameters, output-target
+parameters, an uncertainty sampler, and the initial state.
+
+````@example inventory
+relaxed_subproblems,
+relaxed_state_in,
+relaxed_state_out,
+relaxed_sampler,
+initial_state = build_inventory_subproblems(;
+    num_scenarios = 100,
+    integer = false,
+)
+
+integer_subproblems,
+integer_state_in,
+integer_state_out,
+integer_sampler,
+_ = build_inventory_subproblems(;
+    num_scenarios = 100,
+    integer = true,
+)
+````
+
+The deterministic equivalent is the full-horizon model used by direct
+transcription training.
+
+````@example inventory
+integer_det_equivalent,
+integer_det_state_in,
+integer_det_state_out,
+integer_det_sampler,
+_ = build_inventory_det_equivalent(;
+    num_scenarios = 50,
+    integer = true,
+)
+````
+
+## Integer Sensitivity Strategies
+
+Mixed-integer models do not have ordinary LP duals. DecisionRules therefore
+makes the chosen postprocessing strategy explicit.
+
+````@example inventory
+fixed_discrete = FixedDiscreteIntegerStrategy()
+continuous_relaxation = ContinuousRelaxationIntegerStrategy()
+````
+
+`FixedDiscreteIntegerStrategy` solves the MIP, fixes the incumbent integer
+variables, re-solves the fixed LP, and reads local dual information.
+
+`ContinuousRelaxationIntegerStrategy` relaxes integer variables first and reads
+duals from the relaxed LP. This is smoother and faster, but the gradient is for
+the relaxation, not for an integer-feasible decision.
+
+## Score-Function Correction
+
+Local LP duals do not see a discrete switch such as "open the setup variable".
+A score-function correction estimates the effect of target changes by solving
+perturbed integer rollouts:
 
-![Demand process](../assets/inventory_demand_process.png)
+```math
+\nabla L
+=
+\alpha \nabla L_{\mathrm{dual}}
++ (1-\alpha)
+  \frac{1}{M}
+  \sum_{m=1}^{M}
+  (R_m - b)
+  \nabla_\theta
+  \sum_{t=1}^{T}
+  \left\langle
+      \delta_{m,t}/\sigma^2,
+      \hat{x}_{t+1}(\theta)
+  \right\rangle .
+```
 
-## Integer Postprocessing Strategies
+There are two different solves in the mixed-gradient training loop:
+
+- `train_multistage(...; integer_strategy = fixed_discrete)` controls the
+  deterministic-equivalent solve used for the dual-gradient term
+  ``\nabla L_{\mathrm{dual}}``. This solve needs a postprocessing strategy
+  because duals are not directly defined for a MIP.
+- `ScoreFunctionConfig(integer_subproblems, ...)` controls the Monte Carlo
+  rollout term. These rollout models are solved exactly as they are built.
+  Because `integer_subproblems` contain binary setup variables, the rollout
+  costs `R_m` are true MIP rollout costs.
+
+In short: `integer_strategy` is for reading local duals; score-function
+rollouts are for measuring realized costs.
+
+````@example inventory
+score_function = ScoreFunctionConfig(
+    integer_subproblems,
+    integer_state_in,
+    integer_state_out;
+    dual_weight = 0.5,
+    perturbation_std = 1.0,
+    num_rollouts = 8,
+)
+
+score_schedule = ScoreFunctionSchedule(
+    score_function;
+    sf_start = 200,
+    ramp_batches = 300,
+    perturbation_std_initial = 0.1,
+    num_rollouts_initial = 2,
+)
+````
+
+## Policy
+
+A DecisionRules policy is any callable `π(x) -> target` where `x` is the
+concatenation `[uncertainty..., state...]` and `target` is the desired
+next state. The only requirement is that it is differentiable via
+`Zygote.gradient` and registered with `Functors.@functor` so that
+`Flux.loadmodel!` can checkpoint its parameters.
+
+### Feedforward policy
+
+The simplest architecture is a feedforward MLP. This policy is ex-ante:
+the network never sees the current demand `d_t` (index 1), only the state
+entries `[inventory, d_{t-1}, d_{t-2}]`; `d_t` is merely copied into the output.
+
+````@example inventory
+using Functors: @functor
+
+struct ExAntePolicy{N}
+    net::N
+end
+
+@functor ExAntePolicy (net,)
+````
+
+The callable normalizes features to ≈[0,1] and maps through the network.
+The sigmoid output bounds the target to `[0, 500]`.
+
+````@example inventory
+function (p::ExAntePolicy)(x)
+    inventory = Float32(x[2])
+    d_prev    = Float32(x[3])
+    d_prev2   = Float32(x[4])
+    features  = Float32[inventory / 100, d_prev / 100, d_prev2 / 100]
+    target    = 500f0 .* Flux.sigmoid.(p.net(features))
+    return Float32[target[1], x[1], d_prev]
+end
+
+Random.seed!(2024)
+policy = ExAntePolicy(Chain(Dense(3, 32, relu), Dense(32, 24, relu), Dense(24, 1)))
+````
+
+### Recurrent (LSTM) policy
+
+When the uncertainty process has temporal structure (regimes, trends,
+seasonality), a recurrent encoder can learn patterns that a feedforward
+MLP cannot detect from a fixed-length window.
+
+The design below uses `Flux.LSTMCell` to process one *lagged* demand
+value per stage. The LSTM hidden state accumulates across stages within
+a scenario, then resets between scenarios via `Flux.reset!`.
+
+The affine output `raw × 200 + 150` avoids sigmoid saturation and
+centers the target on typical inventory levels.
+
+````@example inventory
+mutable struct RecurrentExAntePolicy{E,C,S}
+    encoder::E
+    combiner::C
+    state::S
+end
+
+@functor RecurrentExAntePolicy (encoder, combiner)
+
+function (p::RecurrentExAntePolicy)(x)
+    d_prev    = Float32(x[3])
+    inventory = Float32(x[2])
+    d_prev2   = Float32(x[4])
+    T = eltype(first(p.state))
+    encoded, new_state = p.encoder(T[d_prev / 100], p.state)
+    p.state = new_state
+    raw = p.combiner(vcat(encoded, T[inventory / 100, d_prev2 / 100]))
+    target = raw[1] * 200f0 + 150f0
+    return Float32[target, x[1], d_prev]
+end
+
+function Flux.reset!(p::RecurrentExAntePolicy)
+    p.state = Flux.initialstates(p.encoder)
+    return nothing
+end
+
+Random.seed!(2024)
+lstm_encoder = Flux.LSTMCell(1 => 16)
+lstm_policy = RecurrentExAntePolicy(
+    lstm_encoder,
+    Dense(16 + 2, 1),
+    Flux.initialstates(lstm_encoder),
+)
+````
+
+## Training Calls
+
+The continuous problem uses ordinary dual information.
 
-DecisionRules.jl provides two strategies for extracting gradient information
-from subproblems with discrete variables:
+```julia
+train_multistage(
+    policy,
+    initial_state,
+    relaxed_subproblems,
+    relaxed_state_in,
+    relaxed_state_out,
+    relaxed_sampler;
+    num_batches = 400,
+    num_train_per_batch = 5,
+    optimizer = Flux.Adam(0.0015),
+    integer_strategy = NoIntegerStrategy(),
+    penalty_schedule = [(1, 80, 0.4), (81, 400, 1.0)],
+)
+```
 
-**`FixedDiscreteIntegerStrategy`**: (1) solve the MIP for incumbent binary
-values ``z^*_t``; (2) fix ``z_t = z^*_t`` and relax integrality; (3) re-solve
-the resulting LP; (4) read LP duals as gradient signal.  This is the same
-principle as SDDP.jl's `FixedDiscreteDuality`.
+The integer deterministic-equivalent run uses the fixed-discrete local dual
+path plus the scheduled score-function correction.
 
-**`ContinuousRelaxationIntegerStrategy`**: relax all binary/integer
-constraints to continuous bounds (binary → [0,1]), solve the resulting LP,
-and read duals directly.  This is faster (one LP instead of MIP + LP) and
-gives smoother gradients, but the solution may have fractional integer
-variables — the gradient does not correspond to any feasible integer
-assignment.
+```julia
+train_multistage(
+    policy,
+    initial_state,
+    integer_det_equivalent,
+    integer_det_state_in,
+    integer_det_state_out,
+    integer_det_sampler;
+    num_batches = 800,
+    num_train_per_batch = 10,
+    optimizer = Flux.Adam(0.0008),
+    integer_strategy = fixed_discrete,
+    penalty_schedule = [(1, 120, 0.4), (121, 800, 1.0)],
+    score_function = score_schedule,
+)
+```
 
-For the relaxed formulation (no integer variables), `NoIntegerStrategy` is
-used and subproblems are solved as-is.
+## Evaluation
 
-## Relaxed (Continuous) Problem
+A trained policy should be evaluated by stage-wise rollout, because that is
+the deployment semantics: solve one period, observe the realized next state,
+then solve the next period.
 
-When there are no integer variables, SDDP can model the demand process
-exactly via a PAR(1) approximation that carries ``d_{t-1}`` as a state
-variable.  This makes SDDP near-optimal for the relaxed problem.
+````@example inventory
+uncertainty_sample = sample(integer_sampler)
+rollout_cost = simulate_multistage(
+    integer_subproblems,
+    integer_state_in,
+    integer_state_out,
+    initial_state,
+    uncertainty_sample,
+    policy;
+    integer_strategy = fixed_discrete,
+)
+````
 
-SDDP uses a PAR(1) fit: ``d_t \approx \mu_t + \alpha(d_{t-1} - \mu_{t-1}) + \omega_t``
-with per-stage means ``\mu_t``, autocorrelation ``\alpha \approx 0.86``, and
-9 equiprobable innovation points fitted from 10,000 simulated demand paths.
+## Experiment Scripts
 
-All costs below are out-of-sample operational costs evaluated on the same 300
-demand scenarios (seed 555).  **Fit** is the one-time offline cost (training
-or tuning).  **Eval** is the online deployment cost per decision point.
+Each variant can be trained independently via SLURM or directly:
 
-![Relaxed results](../assets/inventory_relaxed_results.png)
+```bash
+# Single variant
+julia --project=. train_dr_inventory.jl integer_lstm
 
-SDDP LP bound: **2162.0**
+# All variants in parallel via SLURM
+cd examples/inventory_control && bash launch_all.sh
+```
 
-| Method                   |   N | Mean cost |   Std | 95% CI | vs TS-DDR | Fit (s) | Eval (s) |
-|:-------------------------|----:|----------:|------:|-------:|----------:|--------:|---------:|
-| TS-DDR (trained)         | 300 |    2667.3 | 594.5 |   67.3 |     +0.0% |    54.6 |   0.0018 |
-| SDDP (PAR)              | 300 |    2434.2 | 774.8 |   87.7 |     -8.7% |     0.0 |  20.6455 |
-| Base-stock (S\*=160)    | 300 |    3035.6 | 506.8 |   57.3 |    +13.8% |     0.0 |   0.0002 |
-| Random (untrained)      | 300 |    3751.7 | 221.7 |   25.1 |    +40.7% |     0.0 |   0.0018 |
+Available variant tags: `relaxed`, `relaxed_lstm`, `relaxed_hp`,
+`relaxed_lstm_hp`, `integer`, `integer_cr`, `integer_sf`, `integer_hp`,
+`integer_lstm`, `integer_lstm_sf`.
 
-SDDP clearly dominates: 8.7% lower cost than TS-DDR, and the SDDP and Random
-cost distributions are non-overlapping.  This is expected for a convex problem
-where SDDP can represent the demand dynamics exactly through the PAR(1) state
-variable.
+After training, run the baseline, SDDP, and comparison scripts to regenerate tables and figures:
 
-## Integer (MIP) Problem
+```bash
+julia --project=. evaluate_inventory.jl
+julia --project=. solve_sddp.jl
+julia --project=. compare_results.jl
+```
+
+The figures used by this page are generated by `compare_results.jl`.
 
-Introducing the binary ``z_t`` and fixed cost ``K=500`` changes the
-landscape.  SDDP can only use LP relaxation for training (``z \in [0,1]``),
-which systematically underestimates ``K``: when the LP says ``z=0.3``,
-``q=20``, the relaxed cost is ``0.3 \times 500 + 2 \times 20 = 190``, but
-the true integer cost with ``z=1`` is ``500 + 40 = 540``.
+![Demand process](../assets/inventory_demand_process.png)
 
-TS-DDR with `FixedDiscreteIntegerStrategy` handles this correctly: it
-solves the full MIP, fixes the binary incumbent, and reads LP duals in
-that integer-consistent state.
+![Relaxed results](../assets/inventory_relaxed_results.png)
 
 ![Integer results](../assets/inventory_integer_results.png)
 
-SDDP LP bound: **3346.6**
+### Relaxed (continuous) results
 
-| Method                   |   N | Mean cost |   Std | 95% CI | vs TS-DDR (FD) | Fit (s) | Eval (s) |
-|:-------------------------|----:|----------:|------:|-------:|---------------:|--------:|---------:|
-| TS-DDR (FixedDiscrete)   | 300 |    8015.8 | 719.5 |   81.4 |          +0.0% |   339.2 |   0.0112 |
-| TS-DDR (ContRelax)       | 300 |    8318.1 | 720.0 |   81.5 |          +3.8% |   109.4 |   0.0117 |
-| SDDP integer rollout     | 300 |    8274.2 | 912.5 |  103.3 |          +3.2% |     0.0 |   7.9088 |
-| Base-stock (S\*=160)    | 300 |    9035.6 | 506.8 |   57.3 |         +12.7% |     0.0 |   0.0000 |
-| Random (untrained)      | 300 |    9594.6 | 361.1 |   40.9 |         +19.7% |     0.0 |   0.0120 |
+SDDP uses a PAR(1) approximation of the true latent demand process, which
+is not exact for this problem. Despite this advantage for TS-DDR, SDDP still
+attains the lower cost: the best TS-DDR variant is ~7% more expensive.
 
-`FixedDiscreteIntegerStrategy` achieves the lowest cost (8016), beating both
-SDDP (8274, +3.2%) and `ContinuousRelaxationIntegerStrategy` (8318, +3.8%).
-The continuous relaxation strategy performs similarly to SDDP — both use LP
-relaxation and both underestimate the fixed ordering cost.
+The LSTM encoder closes ~25% of the gap versus the feedforward baseline by
+learning temporal demand patterns from lagged observations.
 
-`ContinuousRelaxationIntegerStrategy` trains 3× faster (109s vs 339s)
-because it only solves LPs, but the resulting policy is less accurate on
-integer-constrained problems.
+| Method                          |   N | Mean cost |   Std | vs SDDP |
+|:--------------------------------|----:|----------:|------:|--------:|
+| SDDP (PAR)                      | 300 |    2434.0 |     — |   0.0%  |
+| TS-DDR (LSTM)                   | 300 |    2610.6 | 540.3 |  +7.3%  |
+| TS-DDR (feedforward)            | 300 |    2667.3 | 593.5 |  +9.6%  |
+| TS-DDR (HighPenalty)            | 300 |    2677.5 | 547.0 | +10.0%  |
+| TS-DDR (LSTM+HP)                | 300 |    2712.0 | 554.6 | +11.4%  |
 
-## Runnable Scripts
+### Integer (MIP) results
 
-The complete experiment lives in `examples/inventory_control/`:
+SDDP uses an `AlternativeForwardPass`: MIP in the forward pass, LP
+relaxation in the backward pass for valid cuts. The TS-DDR gap is ~36%.
 
-| Script | Purpose |
-|:-------|:--------|
-| `build_inventory_problem.jl` | JuMP subproblem and det-equivalent builders, demand process, policy architecture |
-| `train_dr_inventory.jl` | TS-DDR training (relaxed, FixedDiscrete, ContRelax) and trajectory evaluation |
-| `evaluate_inventory.jl` | Base-stock grid-search and random baseline evaluation |
-| `solve_sddp.jl` | SDDP (2T-stage PAR(1)) training and rollout |
-| `compare_results.jl` | Load all CSVs, print summary tables, save plots |
+| Method                          |   N | Mean cost |   Std | vs SDDP |
+|:--------------------------------|----:|----------:|------:|--------:|
+| SDDP (MIP fwd)                  | 300 |    5871.6 |1087.4 |   0.0%  |
+| TS-DDR (FixedDiscrete)          | 300 |    8015.8 | 718.3 | +36.5%  |
+| TS-DDR (MixedGrad)              | 300 |    8268.0 | 715.3 | +40.8%  |
+| TS-DDR (ContRelax)              | 300 |    8318.1 | 718.8 | +41.7%  |
+| TS-DDR (HighPenalty)            | 300 |    8388.4 | 615.9 | +42.8%  |
+| SDDP (LP relax)                 | 300 |    8274.2 | 912.5 | +40.9%  |
+| Base-stock (S\*=160)            | 300 |    9035.6 | 506.8 | +53.9%  |
+| Random (untrained)              | 300 |    9594.6 | 361.1 | +63.4%  |
 
-```bash
-julia --project=examples/inventory_control examples/inventory_control/train_dr_inventory.jl
-julia --project=examples/inventory_control examples/inventory_control/evaluate_inventory.jl
-julia --project=examples/inventory_control examples/inventory_control/solve_sddp.jl
-julia --project=examples/inventory_control examples/inventory_control/compare_results.jl
-```
diff --git a/docs/src/gpu_acceleration.md b/docs/src/gpu_acceleration.md
new file mode 100644
index 0000000..5cbf498
--- /dev/null
+++ b/docs/src/gpu_acceleration.md
@@ -0,0 +1,253 @@
+# GPU Acceleration with DecisionRulesExa.jl
+
+```@meta
+CurrentModule = DecisionRules
+```
+
+[DecisionRulesExa.jl](https://github.com/LearningToOptimize/DecisionRulesExa.jl) is a
+companion package that implements the same TS-DDR algorithm using
+[ExaModels.jl](https://github.com/exanauts/ExaModels.jl) instead of JuMP for the
+optimization backend. It targets GPU-accelerated training via
+[MadNLP.jl](https://github.com/MadNLP/MadNLP.jl) with CUDSS-backed interior-point
+solves.
+
+## When to use DecisionRulesExa.jl
+
+| | DecisionRules.jl (JuMP) | DecisionRulesExa.jl (ExaModels) |
+|:---|:---|:---|
+| **Backend** | JuMP + DiffOpt | ExaModels + MadNLP |
+| **Hardware** | CPU | CPU or GPU (CUDA) |
+| **Training modes** | Deterministic equivalent, stage-wise, multiple shooting | Deterministic equivalent |
+| **Gradient source** | DiffOpt implicit diff + duals | Envelope theorem (duals only) |
+| **Best for** | Moderate NLPs, integer variables, stage-wise decomposition | Large NLPs (AC-OPF), GPU speedup, many samples per batch |
+
+**Choose DecisionRulesExa.jl when** the inner NLP is large enough that GPU
+acceleration matters (e.g., AC-OPF with hundreds of buses and thousands of
+variables per stage) and you want to run many training samples per gradient
+step on a single GPU.
+
+**Choose DecisionRules.jl when** you need stage-wise or multiple-shooting
+decomposition, integer variable support, or DiffOpt-based solution sensitivities.
+
+## Installation
+
+```julia
+using Pkg
+Pkg.add(url="https://github.com/LearningToOptimize/DecisionRulesExa.jl.git")
+```
+
+For GPU support, also install CUDA.jl and MadNLPGPU:
+
+```julia
+Pkg.add(["CUDA", "MadNLPGPU"])
+```
+
+## Quick start: CPU
+
+The simplest way to get started is with the built-in linear tracking problem:
+
+```julia
+using DecisionRulesExa
+using ExaModels, Flux, MadNLP, Random
+
+Random.seed!(1)
+
+T  = 8   # horizon
+nx = 1   # state dimension
+
+# Build a parametric deterministic-equivalent NLP on CPU
+prob = build_linear_tracking_problem(
+    horizon       = T,
+    nx            = nx,
+    backend       = nothing,       # CPU
+    slack_penalty = 10.0,
+    u_bounds      = (-2.0, 2.0),
+)
+
+# LSTM policy: maps [w_t ; x_{t-1}] → target x̂_t at each stage
+policy = StateConditionedPolicy(nx, nx, nx, [64, 64])
+
+# Uncertainty sampler: returns a flat vector of length T * nw (here nw = nx)
+sampler() = Float32.(0.1 .* randn(T * nx))
+
+# Train with TS-DDR policy gradient (envelope theorem)
+train_tsddr(
+    policy,
+    Float32.([1.0]),               # initial state
+    prob,
+    prob.p_x0,
+    prob.p_target,
+    prob.p_w,
+    sampler;
+    num_batches         = 100,
+    num_train_per_batch = 4,
+    optimizer           = Flux.Adam(1f-3),
+    madnlp_kwargs       = (print_level = MadNLP.ERROR, tol = 1e-6),
+)
+```
+
+## Moving to GPU
+
+To run the same problem on GPU, change the backend and add a GPU-native
+linear solver:
+
+```julia
+using CUDA, MadNLPGPU
+
+prob_gpu = build_linear_tracking_problem(
+    horizon       = T,
+    nx            = nx,
+    backend       = CUDABackend(),
+    slack_penalty = 10.0,
+    u_bounds      = (-2.0, 2.0),
+)
+
+train_tsddr(
+    policy,
+    Float32.([1.0]),
+    prob_gpu,
+    prob_gpu.p_x0,
+    prob_gpu.p_target,
+    prob_gpu.p_w,
+    sampler;
+    num_batches         = 100,
+    num_train_per_batch = 4,
+    optimizer           = Flux.Adam(1f-3),
+    madnlp_kwargs       = (
+        print_level   = MadNLP.ERROR,
+        tol           = 1e-6,
+        linear_solver = CUDSSSolver,
+    ),
+)
+```
+
+The policy (Flux model) stays on CPU; only the NLP solve runs on GPU.
+Parameter updates (`ExaModels.set_parameter!`) and multiplier extraction
+handle CPU↔GPU transfers automatically.
+
+## Custom problems
+
+For domain-specific models (power systems, robotics, etc.), build the
+ExaModels NLP directly instead of using `build_linear_tracking_problem`.
+The key requirements are:
+
+1. **Add target constraints last** so their multipliers form a contiguous
+   slice of `result.multipliers`.
+2. **Parameterize** the initial state (`p_x0`), uncertainty trajectory
+   (`p_w`), and target trajectory (`p_target`) as ExaModels parameters.
+3. **Return** a struct with fields `.core`, `.model`, `.horizon`, and
+   `.target_con_range`.
+
+The `HydroPowerModels` example in DecisionRulesExa.jl demonstrates this
+pattern for a full AC-OPF problem with reservoir dynamics:
+
+```julia
+# In examples/HydroPowerModels/hydro_power_exa.jl
+prob = build_hydro_de(
+    data;
+    num_stages     = 96,
+    backend        = CUDABackend(),
+    formulation    = :ac_polar,
+    deficit_cost   = 1e5,
+    target_penalty = :auto,
+)
+```
+
+## Parallel GPU solves
+
+When training samples are independent, multiple NLP instances can be
+solved concurrently on the same GPU. Build a pool of independent problem
+copies and pass it to `train_tsddr`:
+
+```julia
+pool = [(prob, prob.p_x0, prob.p_target, prob.p_w)]
+for _ in 2:num_workers
+    p = build_my_problem(backend = CUDABackend())
+    push!(pool, (p, p.p_x0, p.p_target, p.p_w))
+end
+
+train_tsddr(policy, x0, prob, prob.p_x0, prob.p_target, prob.p_w, sampler;
+    problem_pool        = pool,
+    num_train_per_batch = num_workers,
+)
+```
+
+Each pool entry gets its own MadNLP solver instance. Samples are
+distributed round-robin across the pool and solved via `Threads.@spawn`.
+
+## Penalty annealing
+
+DecisionRulesExa.jl supports penalty annealing through the
+`adjust_hyperparameters` callback. The target penalty coefficient
+``\rho`` is stored as an ExaModels parameter and can be updated at
+runtime:
+
+```julia
+adjust_hyperparameters = function(iter, opt_state, num_train)
+    phase = iter < 100 ? 0.1 :
+            iter < 200 ? 1.0 :
+            iter < 300 ? 10.0 : 30.0
+    ρ = base_penalty * phase
+    penalty_vals = fill(ρ / 2, T * nx)
+    ExaModels.set_parameter!(prob.core, prob.p_penalty_half, penalty_vals)
+    return num_train
+end
+```
+
+This mirrors the `penalty_schedule` keyword in DecisionRules.jl's
+[`train_multistage`](@ref).
+
+## Rollout evaluation
+
+[`RolloutEvaluation`](@ref) in DecisionRules.jl evaluates policies
+stage-by-stage under deployment semantics. DecisionRulesExa.jl provides
+an analogous `RolloutEvaluation` that solves stage subproblems
+sequentially:
+
+```julia
+eval = RolloutEvaluation(
+    stage_problem, x0, eval_scenarios;
+    horizon              = T,
+    n_uncertainty        = nw,
+    set_stage_parameters! = my_stage_setter!,
+    realized_state       = my_realized_state,
+    stride               = 25,
+    policy_state         = :realized,
+)
+```
+
+Both packages report the same metrics: operational cost excluding
+target-deficit penalty, and target-violation share.
+
+## Mapping between packages
+
+| DecisionRules.jl | DecisionRulesExa.jl | Notes |
+|:---|:---|:---|
+| `train_multistage` | `train_tsddr` | Main training loop |
+| `state_conditioned_policy` | `StateConditionedPolicy` | LSTM policy |
+| `dense_multilayer_nn` | `MLPPolicy` | MLP policy |
+| `state_params_in` | `p_x0` | Initial state parameter |
+| `state_params_out` | `p_target` | Target parameter |
+| `uncertainty_samples` | `p_w` + sampler | Uncertainty parameter |
+| `SampleLog` / `record` | `record_loss` | Per-iteration callback |
+| `RolloutEvaluation` | `RolloutEvaluation` | Stage-wise eval |
+| `penalty_schedule` | `adjust_hyperparameters` | Penalty annealing |
+| `ScoreFunctionConfig` | — | Not yet ported to ExaModels |
+| Stage-wise decomposition | — | JuMP only |
+| Multiple shooting | — | JuMP only |
+
+## Full example: HydroPowerModels
+
+The `examples/HydroPowerModels/` directory in DecisionRulesExa.jl contains
+a complete AC-OPF hydrothermal scheduling example for the Bolivia test case
+— the same problem solved by DecisionRules.jl in the
+[Hydropower Scheduling](@ref) tutorial. It demonstrates:
+
+- Parsing PowerModels.jl network data and hydro reservoir parameters
+- Building a multi-stage deterministic-equivalent NLP in ExaModels
+  (DC or AC polar OPF formulations)
+- L1 + L2 penalty on target slack (δ⁺/δ⁻ splitting for smooth NLP)
+- GPU training with parallel MadNLP solves
+- Warm-start caching to prevent cascade solver failures
+- Penalty and sample-count annealing schedules
+- W&B metric logging
diff --git a/docs/src/gradient_fallback.md b/docs/src/gradient_fallback.md
new file mode 100644
index 0000000..3baf895
--- /dev/null
+++ b/docs/src/gradient_fallback.md
@@ -0,0 +1,145 @@
+# Gradient Fallback
+
+```@meta
+CurrentModule = DecisionRules
+```
+
+## Motivation
+
+TS-DDR training relies on solving an optimization subproblem at every stage and
+differentiating through it (via Lagrange duals or DiffOpt). In practice, some
+solves may fail — the solver hits numerical trouble, DiffOpt encounters
+degenerate duals, or the subproblem is infeasible for a particular sample. A
+single uncaught error kills the entire training run.
+
+The **gradient fallback** system provides a principled, extensible way to handle
+these errors at three levels:
+
+| Level | Where it fires | What it controls |
+|-------|----------------|------------------|
+| **rrule pullback** | Inside the ChainRules `rrule` for `get_next_state` | Whether a pullback that errors (DiffOpt / dual extraction failure) returns zero cotangents or throws |
+| **Training loop** | Around `Flux.gradient(...)` in `train_multistage` / `train_multiple_shooting` | Whether a DiffOpt error skips the iteration or crashes |
+| **Rollout evaluation** | Inside [`RolloutEvaluation`](@ref) per scenario | Whether a failed scenario is excluded from the metric or crashes |
+
+## Built-in fallback types
+
+- [`AbstractGradientFallback`](@ref) — abstract supertype for all fallback strategies
+- [`ZeroGradientFallback`](@ref) — log a warning, return zero gradients, continue training (default)
+- [`ErrorGradientFallback`](@ref) — re-throw the error (useful in tests)
+
+See the [API Reference](api.md) for full docstrings.
+
+## Usage
+
+### Default behavior (zero gradients)
+
+By default, all training functions use [`ZeroGradientFallback`](@ref). Failed
+iterations log a warning and skip the parameter update:
+
+```julia
+train_multistage(
+    model, x0, subproblems, spi, spo, uncertainty;
+    num_batches=500,
+    # gradient_fallback=ZeroGradientFallback()  # this is the default
+)
+```
+
+At training start you will see:
+
+```
+[ Info: Training with ZeroGradientFallback: solver/differentiation errors
+will be caught and the iteration skipped (zero gradient). Pass
+`gradient_fallback=ErrorGradientFallback()` to throw instead, or implement
+a custom `AbstractGradientFallback` subtype.
+```
+
+### Strict mode (for tests)
+
+Use [`ErrorGradientFallback`](@ref) when you want errors to surface
+immediately — typically in unit tests where every solve should succeed:
+
+```julia
+train_multistage(
+    model, x0, subproblems, spi, spo, uncertainty;
+    num_batches=10,
+    gradient_fallback=ErrorGradientFallback(),
+)
+```
+
+The same keyword works for [`train_multiple_shooting`](@ref) and
+[`RolloutEvaluation`](@ref):
+
+```julia
+rollout = RolloutEvaluation(
+    subproblems, spi, spo, x0, scenarios;
+    gradient_fallback=ErrorGradientFallback(),
+)
+```
+
+## Custom fallbacks (extending the type system)
+
+Subtype [`AbstractGradientFallback`](@ref) and implement three methods:
+
+```julia
+struct LoggingFallback <: DecisionRules.AbstractGradientFallback
+    log::Vector{Any}
+end
+
+# Called when the rrule pullback (DiffOpt / dual extraction) fails.
+# Return a tuple of cotangents matching the rrule signature, or rethrow.
+function DecisionRules.handle_gradient_error(fb::LoggingFallback, e, n_in, n_out)
+    push!(fb.log, (:gradient, e))
+    return DecisionRules._zero_cotangents(n_in, n_out)
+end
+
+# Called when Flux.gradient(...) throws in the training loop.
+# Return `true` to skip this iteration, or rethrow.
+function DecisionRules.handle_training_error(fb::LoggingFallback, e, iter)
+    push!(fb.log, (:training, iter, e))
+    return true
+end
+
+# Called when a rollout scenario fails during evaluation.
+# Return `true` to exclude this scenario from the metric, or rethrow.
+function DecisionRules.handle_rollout_error(fb::LoggingFallback, e, iter)
+    push!(fb.log, (:rollout, iter, e))
+    return true
+end
+```
+
+Then pass it to any training function:
+
+```julia
+fb = LoggingFallback(Any[])
+train_multistage(model, x0, subs, spi, spo, unc;
+    gradient_fallback=fb,
+)
+println("Caught $(length(fb.log)) errors during training")
+```
+
+This is useful for:
+- **Monitoring**: count how often solves fail and on which iterations
+- **Adaptive recovery**: adjust solver tolerances, restart from a checkpoint, etc.
+- **Selective rethrowing**: catch known benign errors but let unexpected ones through
+
+## Relationship to `STRICT_GRADIENTS`
+
+The global [`STRICT_GRADIENTS`](@ref DecisionRules.STRICT_GRADIENTS) flag controls a separate, lower-level mechanism:
+inside the `rrule` pullback, when the forward solver terminates with a
+non-optimal status (e.g., `ITERATION_LIMIT`), the pullback returns zero
+gradients (if `STRICT_GRADIENTS[] == false`, the default) or throws (if `true`).
+
+The `gradient_fallback` keyword operates at a higher level — it catches errors
+from DiffOpt's `reverse_differentiate!` (assertion errors, degenerate duals,
+etc.) and from the training loop itself. Both mechanisms are independent and
+complementary:
+
+```
+Forward solve
+  └─ bad termination status → STRICT_GRADIENTS controls behavior
+  └─ good status → DiffOpt reverse_differentiate!
+       └─ error (assertion, numerical) → gradient_fallback catches it
+            └─ in rrule pullback: handle_gradient_error
+            └─ in training loop: handle_training_error
+            └─ in rollout eval: handle_rollout_error
+```
diff --git a/docs/src/index.md b/docs/src/index.md
index 070f3c1..7051de0 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -30,7 +30,7 @@ Three training formulations are supported:
 
 ```julia
 using Pkg
-Pkg.add(url="https://github.com/LearningToOptimize/DecisionRules.jl.git")
+Pkg.add("DecisionRules")
 ```
 
 ## Quick start
@@ -55,8 +55,10 @@ train_multistage(
 )
 ```
 
-See the [Algorithm](@ref) page for the mathematical formulation and the
-examples for complete worked problems.
+See the [Algorithm](@ref) page for the mathematical formulation, the
+[Uncertainty Sampling](@ref) guide for how to prepare your scenario data, the
+[GPU Acceleration with DecisionRulesExa.jl](@ref) page for GPU-accelerated training,
+and the examples for complete worked problems.
 
 ## Citation
 
diff --git a/docs/src/sampling.md b/docs/src/sampling.md
new file mode 100644
index 0000000..bacdfad
--- /dev/null
+++ b/docs/src/sampling.md
@@ -0,0 +1,314 @@
+# Uncertainty Sampling
+
+```@meta
+CurrentModule = DecisionRules
+```
+
+## Why sampling matters in TS-DDR
+
+In the TS-DDR training loop, each SGD step approximates the stochastic objective
+
+```math
+\min_\theta \; \mathbb{E}_{w_{1:T}} \left[
+    \sum_{t=1}^{T} q_t\bigl(x_{t-1}, w_t;\, \hat{x}_t(\theta)\bigr)
+\right]
+```
+
+by drawing **sample trajectories** ``w_{1:T}^{(s)},\; s = 1,\ldots,S`` and
+differentiating through the subproblem solves. The `uncertainty_sampler`
+argument in [`train_multistage`](@ref) and [`train_multiple_shooting`](@ref)
+controls how these trajectories are generated.
+
+Once a trajectory ``w_{1:T}`` is realized — a concrete numeric value per
+uncertain parameter per stage — the rest of the training pipeline
+(policy rollout, subproblem solve, gradient computation) is **identical**
+regardless of how the trajectory was sampled. The sampler is therefore a
+pluggable component that lets you match the correlation structure of your
+problem.
+
+## Three sampling formats
+
+DecisionRules.jl supports three ways to specify uncertainty, offering
+increasing levels of correlation control.
+
+### 1. Independent (per-unit) pools
+
+Each uncertain parameter has its own finite support and is sampled
+**independently** at each stage.
+
+```
+                    ┌─ param₁: draw from {v₁₁, v₁₂, v₁₃}  ←── independent
+Stage t ────────────┤
+                    └─ param₂: draw from {v₂₁, v₂₂}        ←── independent
+```
+
+**Julia type**: `Vector{Vector{Tuple{VariableRef, Vector{T}}}}`
+
+```julia
+# uncertainty_pool[t][i] = (param_ref, [possible_values...])
+independent_pool = [
+    # stage 1
+    [(demand_param_1, [10.0, 15.0, 12.0]),
+     (demand_param_2, [8.0, 12.0, 9.0])],
+    # stage 2
+    [(demand_param_1, [11.0, 14.0, 13.0]),
+     (demand_param_2, [7.0, 11.0, 10.0])],
+]
+```
+
+Each call to `sample(independent_pool)` draws one value per parameter per
+stage, independently. With ``n`` parameters each having ``k`` scenarios, this
+samples from ``k^n`` possible combinations per stage — most of which may
+never have occurred in reality.
+
+**Use when**: parameters are genuinely independent, or you have a single
+uncertain parameter per stage.
+
+### 2. Joint-scenario pools (spatial correlation)
+
+Pre-defined joint realizations across **all** parameters. Sampling picks one
+complete scenario per stage, preserving cross-parameter (spatial) correlations.
+
+```
+                         ω=1: (param₁=v₁₁, param₂=v₂₁)
+                        ╱
+Stage t ── draw one ω ─── ω=2: (param₁=v₁₂, param₂=v₂₂)
+                        ╲
+                         ω=3: (param₁=v₁₃, param₂=v₂₃)
+```
+
+**Julia type**: `Vector{Vector{Vector{Tuple{VariableRef, T}}}}`
+
+```julia
+# uncertainty_pool[t][ω] = [(param₁, val), (param₂, val), ...]
+joint_pool = [
+    # stage 1: 3 scenarios
+    [[(inflow_1, 10.0), (inflow_2, 80.0)],   # ω=1
+     [(inflow_1, 20.0), (inflow_2, 120.0)],   # ω=2
+     [(inflow_1, 15.0), (inflow_2, 90.0)]],   # ω=3
+    # stage 2: 3 scenarios
+    [[(inflow_1, 11.0), (inflow_2, 70.0)],
+     [(inflow_1, 14.0), (inflow_2, 110.0)],
+     [(inflow_1, 13.0), (inflow_2, 100.0)]],
+]
+```
+
+Each call to `sample(joint_pool)` picks one scenario index ``\omega``
+per stage and returns all parameters from that scenario. Only historically
+observed combinations appear.
+
+!!! warning "Matching SDDP semantics"
+    SDDP.jl's `SDDP.parameterize` draws one ``\omega`` for all random
+    variables in a stage. If you compare TS-DDR against SDDP, you **must**
+    use joint-scenario pools to avoid a distributional mismatch.
+
+**Use when**: parameters are correlated (e.g., inflows across a river basin),
+or your benchmark uses joint scenarios (SDDP, scenario trees).
+
+### 3. Trajectory sampler (spatial + temporal correlation)
+
+A callable that generates each stage's realization **conditioned on previous
+stages**. This enables autoregressive (AR), Markovian, or any custom temporal
+dependence — something the data-pool formats above cannot express because
+they sample stages independently.
+
+```
+Stage 1 ── sampler(1, []) ──────────────────── w₁
+                                                │
+Stage 2 ── sampler(2, [w₁]) ───────────────── w₂
+                                                │
+Stage 3 ── sampler(3, [w₁, w₂]) ──────────── w₃
+```
+
+**Julia type**: `Function` with signature `(t::Int, past::Vector{...}) -> Vector{Tuple{VariableRef, T}}`
+
+```julia
+# AR(1) inflow sampler with spatial correlation
+function ar1_sampler(t, past)
+    if isempty(past)
+        # Stage 1: draw from marginal distribution
+        ω = rand(1:nScenarios)
+        return [(params[t][r], data[r][t, ω]) for r in 1:nHyd]
+    else
+        # Stage t > 1: AR(1) conditioned on previous stage
+        prev = [pair[2] for pair in past[end]]
+        noise = randn(nHyd) .* σ
+        vals = ρ .* prev .+ (1 .- ρ) .* μ .+ noise
+        return [(params[t][r], vals[r]) for r in 1:nHyd]
+    end
+end
+
+# Generate one trajectory
+trajectory = sample(ar1_sampler, T)
+
+# Use in training — wrap as zero-arg callable
+train_multistage(
+    policy, x0, subproblems,
+    state_in, state_out,
+    () -> sample(ar1_sampler, T);  # pass as callable
+    num_batches=500,
+)
+```
+
+**Use when**: your uncertainty process has temporal dependence (e.g.,
+autoregressive inflows, mean-reverting prices, regime-switching demands).
+
+## Comparison table
+
+| Feature | Independent | Joint-scenario | Trajectory sampler |
+|:--------|:-----------|:--------------|:-------------------|
+| Spatial correlation | ✗ | ✓ | ✓ |
+| Temporal correlation | ✗ | ✗ | ✓ |
+| Data format | Finite supports | Pre-built scenarios | Callable |
+| Combinations per stage | ``k^n`` | ``k`` | unlimited |
+| SDDP-compatible | only if ``n=1`` | ✓ | depends on model |
+| Ease of use | simplest | moderate | most flexible |
+
+## Building uncertainty pools
+
+### From historical data (joint scenarios — recommended)
+
+When your data comes as one matrix per unit (rows = stages, columns = scenarios):
+
+```julia
+# data[r] is a (T × nScenarios) matrix for reservoir r
+nHyd = length(data)
+nCen = size(data[1], 2)
+
+uncertainty_pool = Vector{Any}(undef, T)
+for t in 1:T
+    uncertainty_pool[t] = [
+        [(inflow_params[t][r], data[r][t, ω] + 0.0) for r in 1:nHyd]
+        for ω in 1:nCen      # ω is the OUTER loop — all units share it
+    ]
+end
+```
+
+### From independent distributions
+
+```julia
+uncertainty_pool = [
+    [(demand_param[t], [low, mid, high])]
+    for t in 1:T
+]
+```
+
+### From an AR(1) process
+
+```julia
+# Estimate AR(1) parameters from data
+μ = mean.(data)                      # long-run mean per unit (one entry per data[r])
+σ = std.(data)                       # marginal std per unit, used as the noise scale
+ρ = 0.7                              # autocorrelation coefficient
+
+function ar1_sampler(t, past)
+    if isempty(past)
+        vals = μ .+ σ .* randn(nHyd)
+    else
+        prev = [p[2] for p in past[end]]
+        vals = ρ .* prev .+ (1 .- ρ) .* μ .+ σ .* randn(nHyd)
+    end
+    return [(inflow_params[t][r], max(0.0, vals[r])) for r in 1:nHyd]
+end
+```
+
+## Sampling in practice
+
+All three formats produce the same **realized trajectory** type:
+`Vector{Vector{Tuple{VariableRef, Float64}}}`. This is what gets passed to
+`simulate_multistage`, `simulate_stage`, and all internal training code.
+
+```julia
+using DecisionRules
+
+# 1. From a data pool (independent or joint):
+trajectory = sample(uncertainty_pool)
+
+# 2. From a trajectory sampler:
+trajectory = sample(ar1_sampler, T)
+
+# Both produce the same type — downstream code is identical:
+objective = simulate_multistage(
+    subproblems, state_params_in, state_params_out,
+    initial_state, trajectory, policy,
+)
+```
+
+### Passing to training functions
+
+```julia
+# Data pools are passed directly:
+train_multistage(policy, x0, subs, s_in, s_out, uncertainty_pool; ...)
+
+# Trajectory samplers are wrapped as zero-arg callables:
+train_multistage(policy, x0, subs, s_in, s_out,
+    () -> sample(ar1_sampler, T); ...)
+```
+
+This works because `train_multistage` calls `sample(uncertainty_sampler)` to
+draw each trajectory. For data pools, `sample` dispatches on the pool type.
+For callables, `sample(f::Function)` simply calls `f()`.
+
+## Demonstrating the difference
+
+Consider 3 hydro reservoirs with 4 historical inflow scenarios:
+
+```
+Historical inflow data (columns = scenarios):
+
+         ω=1    ω=2    ω=3    ω=4
+Res 1:   10     20     15     25
+Res 2:   80    120     90    110
+Res 3:    5      8      6      9
+```
+
+**Independent sampling** draws one value per row independently. A sample
+might be `(10, 120, 9)` — reservoir 1 from ω=1, reservoir 2 from ω=2,
+reservoir 3 from ω=4. This combination never occurred historically and
+may violate the drought-affects-all-basins correlation.
+
+**Joint sampling** picks one column: `(10, 80, 5)` or `(25, 110, 9)` —
+always a historically observed combination.
+
+**Trajectory sampling** can additionally model temporal persistence:
+if ω=1 (dry year) was drawn at stage 1, the AR(1) sampler will likely
+produce below-average inflows at stage 2 as well.
+
+```
+Joint sampling (k=4 possible outcomes per stage):
+
+    Res 1 ──┐
+    Res 2 ──┼── same ω ──→ one of 4 historical vectors
+    Res 3 ──┘
+
+Independent sampling (k³=64 possible outcomes per stage):
+
+    Res 1 ── ω₁ ──┐
+    Res 2 ── ω₂ ──┼──→ one of 64 combinations (most never observed)
+    Res 3 ── ω₃ ──┘
+
+Trajectory sampling (conditioned on past):
+
+    Stage 1: same as joint ──→ w₁
+                                │
+    Stage 2: AR(1)(w₁) ──────→ w₂  (temporal correlation preserved)
+```
+
+## Internal functions
+
+The following internal helpers process uncertainty pools in different training
+formulations. They are not part of the public API but are documented here for
+maintainability.
+
+- [`_remap_uncertainties`](@ref): Remap JuMP `VariableRef` keys when
+  copying uncertainty pools into a deterministic-equivalent model.
+  Two methods dispatch on per-unit vs. joint-scenario pool types.
+- [`extract_uncertainty_params`](@ref): Extract just the `VariableRef`
+  parameters from an uncertainty pool, discarding the scenario values.
+  Used by `setup_shooting_windows` for multiple-shooting training.
+
+## API Reference
+
+```@docs
+sample
+```
diff --git a/examples/HydroPowerModels/README.md b/examples/HydroPowerModels/README.md
index e367d6f..7a6d62b 100644
--- a/examples/HydroPowerModels/README.md
+++ b/examples/HydroPowerModels/README.md
@@ -15,15 +15,30 @@ generation to meet those targets at minimum cost.
 
 ## Training scripts
 
+### TS-DDR (Deep Decision Rules — LSTM policy)
+
 | Script | Decomposition | Reference |
 |--------|--------------|-----------|
 | `train_dr_hydropowermodels.jl` | Deterministic equivalent (GPU-enabled) | Extension §1 |
 | `train_dr_hydropowermodels_subproblems.jl` | Stage-wise (single shooting) | Extension §2 |
 | `train_dr_hydropowermodels_multipleshooting.jl` | Windowed (multiple shooting) | Extension §3 |
 
-All three share the same data loader (`load_hydropowermodels.jl`) and
-policy architecture (LSTM encoder + state-conditioned dense layers).
-Training logs to Weights & Biases and saves the best model to JLD2.
+These use a `StateConditionedPolicy` (LSTM encoder + state-conditioned dense
+layers, `[128, 128]`, sigmoid activation).
+
+### TS-LDR (Linear Decision Rules — linear policy)
+
+| Script | Decomposition | Reference |
+|--------|--------------|-----------|
+| `train_ldr_hydropowermodels.jl` | Deterministic equivalent (GPU-enabled) | §3 |
+
+TS-LDR uses `dense_multilayer_nn` with identity activation — a composition
+of linear layers that is equivalent to a single linear map from
+(uncertainties, state) to targets.  Same training pipeline as TS-DDR; the
+only difference is the policy architecture.
+
+All training scripts share the data loader (`load_hydropowermodels.jl`),
+log to Weights & Biases, and save the best model to JLD2.
 
 ### GPU training
 
@@ -64,20 +79,30 @@ policy relying on slack.
 
 | Script | Purpose |
 |--------|---------|
+| `evaluate_hydro_policies.jl` | Load all trained TS-DDR and TS-LDR models and evaluate on a common out-of-sample scenario set using stage-wise ACP rollout; writes `eval_costs.csv` |
 | `eval_jump_de.jl` | Solve the DE with a constant policy and save a reference solution (JLD2) for cross-validation with ExaModels |
-| `test_dr_hydropowermodels.jl` | Load a trained model and produce volume/generation/cost comparison plots and CSVs |
 | `check_consistent_state_paths.jl` | Verify that stage-wise, deterministic equivalent, and multiple-shooting decompositions produce identical state trajectories under the same policy and inflows |
 
 ## SDDP baselines
 
-These scripts require [HydroPowerModels.jl](https://github.com/andrewrosemberg/HydroPowerModels.jl),
-Gurobi, and Mosek licenses.
+These scripts use a dedicated Julia environment in `sddp/`. The inconsistent
+SOC-backward/AC-forward baseline uses
+[HydroPowerModels.jl](https://github.com/LAMPSPUC/HydroPowerModels.jl), SDDP.jl,
+Clarabel for the SOC backward pass, and MadNLP for the AC forward pass. Training
+runs log iteration and final simulation metrics to Weights & Biases using the
+same keys as the DR runs: `metrics/loss` is the SDDP bound, and
+`metrics/rollout_realized_objective_no_deficit` is the SDDP forward-pass
+objective. SDDP iterations are logged as `batch` so W&B plots can share the same
+x-axis as the DR training runs. Because SDDP solves the forward policy
+stage-wise, that forward-pass objective is already the no-target-penalty
+objective.
 
 | Script | Description |
 |--------|-------------|
-| `run_sddp.jl` | Train SDDP with a consistent convex (SOCWRConic) formulation |
-| `run_sddp_inconsistent.jl` | Train SDDP with SOCWRConic backward pass and ACP forward pass |
-| `simulate_sddp_policy.jl` | Simulate a pre-trained SDDP policy under ACP and produce comparison plots |
+| `sddp/run_sddp.jl` | Train SDDP with a consistent convex (SOCWRConic) formulation |
+| `sddp/run_sddp_inconsistent.jl` | Train SDDP with SOCWRConic backward pass and ACP forward pass |
+| `sddp/run_sddp_inconsistent.sbatch` | Submit the SOC-backward/AC-forward run with a 12-hour wall time |
+| `sddp/simulate_sddp_policy.jl` | Simulate a pre-trained SDDP policy under ACP and produce comparison plots |
 
 ## Learning-to-Optimize (L2O) pipeline
 
diff --git a/examples/HydroPowerModels/bolivia/ACPPowerModel/MeanGeneration.csv b/examples/HydroPowerModels/bolivia/ACPPowerModel/MeanGeneration.csv
index 38c1229..d0b560d 100644
--- a/examples/HydroPowerModels/bolivia/ACPPowerModel/MeanGeneration.csv
+++ b/examples/HydroPowerModels/bolivia/ACPPowerModel/MeanGeneration.csv
@@ -1,97 +1,97 @@
 TS-DDR,TS-LDR,SDDP-DCLL,SDDP-SOC
-147.0247133419527,299.7450624492452,100.8484585807442,210.85805129912742
-157.58814081813028,225.6675464897668,104.86242775104952,211.20242951110967
-185.43371546547627,180.7943849292062,107.48502989328503,211.34903175695075
-199.3134646164604,199.83572444735833,107.55475020324891,211.50499902777818
-206.61858480532382,231.35910431382658,106.18650990754062,213.03900650304772
-214.3697084291256,189.64814901428522,113.67317158934263,213.3433026957727
-187.9473304345247,241.6103392896273,116.58714528380276,213.53001862630535
-171.46565827999535,187.4542766440176,129.25366035880148,213.42630840733494
-170.2002528391322,233.150743363177,114.76063596215229,213.53273980518378
-182.98142893312487,185.52122552406792,109.93768423864336,213.45398275330976
-202.19901785621693,215.06714163113537,116.98303295186679,213.00893342509022
-193.96716199312775,199.9910790500363,121.32762963754153,213.34946528253167
-193.52811950138334,205.21889018439646,118.81044678071419,211.36762915185463
-195.13814426290378,184.97486998527484,140.68919880246779,210.47242267421387
-226.57039256458856,167.1372529242544,157.98870219622455,208.66880566556748
-248.08925112087996,177.07636222131532,182.13344732161957,208.8072773221114
-233.5593017377245,206.24124265365077,213.25680127887568,208.85548225347776
-212.98635880030474,203.58562602361448,219.65332808686793,209.21183174817384
-208.1182840666282,225.62114837627547,215.53856096654266,209.00239858283362
-205.87501153760303,282.5815380137001,218.69325537706618,208.35800693518277
-218.77529830243512,261.2652681819222,229.70401987822862,207.84410265509277
-219.5621182240177,228.86267450841828,233.80086503295337,208.84146637556242
-215.15355785846822,278.8379771241615,219.9540629779893,208.46432000930798
-225.44578342326264,208.6334543644992,227.25750181666496,208.5398276805071
-216.46699729157837,233.62842636523723,228.98664748977347,208.87782967565062
-210.99855030225916,236.77260170176334,245.24308942520355,208.618115177171
-200.82120574954234,257.832718296366,266.63858091994916,210.3109443150775
-217.00901852303974,206.80316405135653,287.580525250383,210.15033697956687
-205.60116177559695,247.22715853816413,288.43119570669523,209.7157833146924
-204.48717949362427,242.7626603477798,295.40865231198757,210.07161367481694
-215.01700896149435,264.8798596812393,316.4539931784796,209.14538186180965
-194.39594990932568,253.05048627557306,329.7940889425171,209.0149875668554
-212.44246770637898,281.106369677497,327.59228806131625,208.94739091431111
-218.88119237987075,264.11335838123404,330.87683426769047,208.47934249184019
-209.54103152313093,265.672452188459,320.54817802711284,209.16634211748254
-232.50511220115487,242.47928625172216,336.16459151820277,210.2650760789873
-246.55559517109822,250.15790720527644,328.4530145309637,210.8958189450547
-259.1804770347488,241.4580174607301,306.6623006700101,210.21284850792782
-243.65212407401336,216.2392032573588,311.05102021329515,210.84944676111476
-268.3327047244221,244.87238185642838,313.43043807340126,216.04183457929315
-259.25135590920064,212.7607093726518,289.6446657672623,217.73032842094054
-239.7596346992519,211.60973323507133,236.34688350288008,212.03827129950005
-225.1627151901455,224.31226414469103,227.41520899884554,214.05479553340882
-226.300220128706,190.17002437882283,237.60749837482643,214.2688665376018
-246.5897657016228,197.34726823968018,242.14935260524663,214.4866449988023
-200.45447214778832,159.42600322266816,217.0164054870767,212.007886833974
-177.65036023699594,199.6529727342857,200.46699991530508,209.9183339789378
-152.7449270008933,184.7732720253377,215.7682105980341,210.72466969624674
-116.54920043933797,185.46879418205066,111.6333307935832,210.26544418415878
-118.11835868140619,166.0896692911,101.80408752144571,210.23698737179038
-135.40331893488192,159.49003675324482,106.15347696391866,210.35402839872972
-165.39822644830028,182.5511233035246,108.92328402254266,209.30901089378818
-164.63498605950215,157.20350355656583,103.94596690664551,208.01878145654337
-166.92019582010437,188.70842110946302,103.82258613108722,208.47415960986146
-153.73417427187314,161.11055875216002,112.13281054476954,209.23591063543003
-164.778321612267,181.93462654464733,161.52068981849445,208.22372981215094
-160.9396855926067,146.59967315111336,169.88300890301332,206.5932949414058
-161.6813291137583,157.86943128110318,177.59131665764,206.6902407990834
-161.9520731591163,129.58260488127797,172.45550740547597,206.73951637050718
-161.5166767553574,192.37516226972488,177.4638905803286,206.95619327342342
-171.23315428685606,169.81265452067095,176.73862367347002,207.51071253650153
-203.84334411204006,189.01908062873738,182.19685255887867,209.81694080008674
-243.88748126609443,185.3643782473636,192.64656185901407,208.53610364457444
-255.36863069633904,189.10691135198536,210.14038242475175,208.4617557397368
-275.72231730419776,177.47926018459844,214.11405852350882,207.85209901607163
-231.14746381351682,203.55835450764843,216.8881876993481,207.6260731591064
-239.70981393223852,185.15015653078225,206.20963274723462,208.4778921323274
-236.55408488066604,209.9345245251102,228.06281826263384,208.23115750760718
-232.50309001204022,247.09182945796942,232.53663833367236,208.3506533209856
-211.09249114427467,224.13559766576765,229.0267023271387,207.95548864145377
-222.7816613997703,251.3620122443193,232.90850460804563,206.62135419445332
-219.16351862667358,241.65598188514326,228.66399277324248,205.7351933331613
-207.22133488700123,260.0058643392084,266.0049119099325,204.86776664864328
-217.06224817213152,249.65113999794553,228.7078025295844,204.38830132480373
-210.07761177326952,201.43928958603075,257.88491102881073,205.79912219619206
-216.6438147791045,209.70322997573356,256.3000900779569,205.84504565445877
-201.41487156994177,254.6147418912871,260.0864123226794,206.04459937322108
-207.97135419475285,233.44163279290632,245.1593291381967,203.60562199452835
-211.26689971022438,206.86923643700797,266.2262958784643,202.65541198826796
-240.1582296197083,249.82887487767167,286.04078934122384,204.0215906015123
-233.02195021621122,218.9756433172769,302.4507102096619,204.25738636756918
-237.29598744217932,254.3292021981702,310.8026658382449,201.8306318980746
-262.7454036444481,254.22526995036333,312.90905574443457,200.48697941077813
-277.54946543623424,228.08210258025755,326.2879316740818,200.8089340858362
-259.14118496809317,237.00317892462166,325.13527205939647,203.97622797873652
-258.0603903695542,268.0249997632043,305.61306417662183,207.41992227810343
-263.1354853907414,238.83057850845063,298.7323501222888,207.68175937544407
-280.35492909981747,216.14279418347928,318.1451091369037,208.5046041321376
-271.56374567280704,229.39964558658272,286.6845872174376,209.98920231592425
-229.35820796650296,212.93348980868046,245.77044877320677,207.67020973138582
-230.49394051322878,203.67236378550288,224.55512987384654,202.38584149268576
-225.7601668320821,219.35714063162678,238.83881638414672,204.348082102984
-230.75407630714457,198.06757965648347,238.36899164573157,204.80416882648063
-221.31628509636886,149.20055233004982,218.55183620363255,200.6825798131114
-200.12300075458802,167.6799930782707,209.6087072026431,196.29537601929695
-138.65216945726675,143.81558253281227,219.38832428287833,206.63438898989997
+147.0247133419527,299.7450624492452,100.8484585807442,209.86380882088514
+157.58814081813028,225.6675464897668,104.86242775104952,209.53839385906838
+185.43371546547627,180.7943849292062,107.48502989328503,209.52017504171553
+199.3134646164604,199.83572444735833,107.55475020324891,209.4783394659705
+206.61858480532382,231.35910431382658,106.18650990754062,209.52518234738324
+214.3697084291256,189.64814901428522,113.67317158934263,209.6034477390574
+187.9473304345247,241.6103392896273,116.58714528380276,209.65384702850434
+171.46565827999535,187.4542766440176,129.25366035880148,209.51238800566185
+170.2002528391322,233.150743363177,114.76063596215229,209.45259355985954
+182.98142893312487,185.52122552406792,109.93768423864336,209.34525985187597
+202.19901785621693,215.06714163113537,116.98303295186679,209.44039837517906
+193.96716199312775,199.9910790500363,121.32762963754153,209.3672289663951
+193.52811950138334,205.21889018439646,118.81044678071419,209.32163376554064
+195.13814426290378,184.97486998527484,140.68919880246779,209.6614054735699
+226.57039256458856,167.1372529242544,157.98870219622455,210.19631956661485
+248.08925112087996,177.07636222131532,182.13344732161957,210.70698165532528
+233.5593017377245,206.24124265365077,213.25680127887568,211.4401169112003
+212.98635880030474,203.58562602361448,219.65332808686793,211.48105832602775
+208.1182840666282,225.62114837627547,215.53856096654266,211.94334659983778
+205.87501153760303,282.5815380137001,218.69325537706618,211.8784711569352
+218.77529830243512,261.2652681819222,229.70401987822862,211.85037808628837
+219.5621182240177,228.86267450841828,233.80086503295337,212.00653608776338
+215.15355785846822,278.8379771241615,219.9540629779893,211.71653085812682
+225.44578342326264,208.6334543644992,227.25750181666496,211.72866526590903
+216.46699729157837,233.62842636523723,228.98664748977347,211.79095480066385
+210.99855030225916,236.77260170176334,245.24308942520355,211.9396216916039
+200.82120574954234,257.832718296366,266.63858091994916,212.2412014072736
+217.00901852303974,206.80316405135653,287.580525250383,211.96614587275292
+205.60116177559695,247.22715853816413,288.43119570669523,212.0593948656155
+204.48717949362427,242.7626603477798,295.40865231198757,212.13189528353013
+215.01700896149435,264.8798596812393,316.4539931784796,212.48730840572054
+194.39594990932568,253.05048627557306,329.7940889425171,212.170191104127
+212.44246770637898,281.106369677497,327.59228806131625,212.68099082676446
+218.88119237987075,264.11335838123404,330.87683426769047,212.9854607332948
+209.54103152313093,265.672452188459,320.54817802711284,212.90364434114954
+232.50511220115487,242.47928625172216,336.16459151820277,212.98400917882333
+246.55559517109822,250.15790720527644,328.4530145309637,212.90706046302626
+259.1804770347488,241.4580174607301,306.6623006700101,212.78491180246007
+243.65212407401336,216.2392032573588,311.05102021329515,213.50440427222605
+268.3327047244221,244.87238185642838,313.43043807340126,214.71903130733853
+259.25135590920064,212.7607093726518,289.6446657672623,215.71309550713656
+239.7596346992519,211.60973323507133,236.34688350288008,214.21514981807115
+225.1627151901455,224.31226414469103,227.41520899884554,214.14602835221518
+226.300220128706,190.17002437882283,237.60749837482643,213.05570166141047
+246.5897657016228,197.34726823968018,242.14935260524663,214.3397929974062
+200.45447214778832,159.42600322266816,217.0164054870767,212.87589299860522
+177.65036023699594,199.6529727342857,200.46699991530508,210.512610777988
+152.7449270008933,184.7732720253377,215.7682105980341,208.87860648289214
+116.54920043933797,185.46879418205066,111.6333307935832,205.95870346385465
+118.11835868140619,166.0896692911,101.80408752144571,205.70698904522408
+135.40331893488192,159.49003675324482,106.15347696391866,205.4160873909239
+165.39822644830028,182.5511233035246,108.92328402254266,205.59700933991252
+164.63498605950215,157.20350355656583,103.94596690664551,205.58479725277505
+166.92019582010437,188.70842110946302,103.82258613108722,205.77526373955115
+153.73417427187314,161.11055875216002,112.13281054476954,205.88135777815637
+164.778321612267,181.93462654464733,161.52068981849445,206.02649485451556
+160.9396855926067,146.59967315111336,169.88300890301332,205.8871588917278
+161.6813291137583,157.86943128110318,177.59131665764,206.70433459244137
+161.9520731591163,129.58260488127797,172.45550740547597,206.366999884319
+161.5166767553574,192.37516226972488,177.4638905803286,206.38870455127955
+171.23315428685606,169.81265452067095,176.73862367347002,206.249716219982
+203.84334411204006,189.01908062873738,182.19685255887867,206.63455688813036
+243.88748126609443,185.3643782473636,192.64656185901407,205.9797537356252
+255.36863069633904,189.10691135198536,210.14038242475175,206.03555271847185
+275.72231730419776,177.47926018459844,214.11405852350882,206.51953984726723
+231.14746381351682,203.55835450764843,216.8881876993481,206.12018860291096
+239.70981393223852,185.15015653078225,206.20963274723462,206.250132372938
+236.55408488066604,209.9345245251102,228.06281826263384,206.1138429495153
+232.50309001204022,247.09182945796942,232.53663833367236,206.4146828063853
+211.09249114427467,224.13559766576765,229.0267023271387,206.5605843384091
+222.7816613997703,251.3620122443193,232.90850460804563,206.40037604289813
+219.16351862667358,241.65598188514326,228.66399277324248,206.5295113131882
+207.22133488700123,260.0058643392084,266.0049119099325,206.8315496621826
+217.06224817213152,249.65113999794553,228.7078025295844,207.13270789037426
+210.07761177326952,201.43928958603075,257.88491102881073,207.77068168915991
+216.6438147791045,209.70322997573356,256.3000900779569,208.16087070287398
+201.41487156994177,254.6147418912871,260.0864123226794,208.40830707759466
+207.97135419475285,233.44163279290632,245.1593291381967,207.38456438434162
+211.26689971022438,206.86923643700797,266.2262958784643,206.9039133561306
+240.1582296197083,249.82887487767167,286.04078934122384,207.77917011114187
+233.02195021621122,218.9756433172769,302.4507102096619,207.558738022174
+237.29598744217932,254.3292021981702,310.8026658382449,208.03806284333123
+262.7454036444481,254.22526995036333,312.90905574443457,209.270249411708
+277.54946543623424,228.08210258025755,326.2879316740818,208.91105191830513
+259.14118496809317,237.00317892462166,325.13527205939647,208.38620445160527
+258.0603903695542,268.0249997632043,305.61306417662183,207.98642330663995
+263.1354853907414,238.83057850845063,298.7323501222888,209.43087634497854
+280.35492909981747,216.14279418347928,318.1451091369037,211.27160719784078
+271.56374567280704,229.39964558658272,286.6845872174376,210.0236414480106
+229.35820796650296,212.93348980868046,245.77044877320677,208.8917459728048
+230.49394051322878,203.67236378550288,224.55512987384654,205.8564923994707
+225.7601668320821,219.35714063162678,238.83881638414672,208.55931411570532
+230.75407630714457,198.06757965648347,238.36899164573157,208.37478598750155
+221.31628509636886,149.20055233004982,218.55183620363255,205.669374135596
+200.12300075458802,167.6799930782707,209.6087072026431,200.7031326575932
+138.65216945726675,143.81558253281227,219.38832428287833,201.49222415721096
diff --git a/examples/HydroPowerModels/bolivia/ACPPowerModel/MeanVolume.csv b/examples/HydroPowerModels/bolivia/ACPPowerModel/MeanVolume.csv
index eaae969..6b0f78f 100644
--- a/examples/HydroPowerModels/bolivia/ACPPowerModel/MeanVolume.csv
+++ b/examples/HydroPowerModels/bolivia/ACPPowerModel/MeanVolume.csv
@@ -1,97 +1,97 @@
 TS-DDR,TS-LDR,SDDP-DCLL,SDDP-SOC
-22.51732496848532,37.919375441643616,15.010226053947687,29.33194347862757
-47.20258811999907,71.04409366252557,27.799866146567854,56.51380736510445
-72.83176865479658,87.01935425652064,41.809771632008996,84.20898277540019
-91.6036862853376,110.9546703800459,54.779619691636846,110.27630381823403
-111.75802365829193,124.62427072370988,67.41758956472614,134.27275994415598
-132.06225985386516,140.7114941122255,76.61311783876522,155.39600670884147
-151.88292311509423,163.09183590390472,85.64543943322955,176.79982271494404
-167.62574263315506,176.9949496837815,90.09866940534309,200.7038069282425
-184.1432711637443,192.12358945548075,97.21306040484191,224.62760324276306
-199.10712684414443,211.72514898332275,100.754649277593,245.07104162682876
-213.03560361830984,234.5026497996314,107.46682144224346,272.4317856234269
-224.76897734579637,261.6236901648443,111.30922724569007,296.4571689351554
-238.39447203378649,282.426645017808,114.62593628727466,314.971343907225
-245.37549219043413,285.94648083840264,113.43065917802855,323.59617431541096
-252.4494932363904,296.90814187996295,110.20680607797237,326.08839288669947
-250.45907641085287,296.9831633048792,106.20779976718435,318.4972488768354
-248.8524629192563,300.7632599043702,101.76735381105871,306.22180979408193
-245.3285907233545,290.30183231928174,95.88402955956306,292.9779998795557
-237.90416834064467,290.15844458648814,89.15470564095466,280.274491334906
-230.9413064277374,281.77833258453296,82.17436482998376,268.26270575999536
-221.1118971810357,281.1300748339776,74.6379921116223,254.79818728066502
-209.82043242344366,273.15550833073513,66.46062403663097,241.0049501227325
-197.17838472323572,255.4079970540009,58.1801555890566,229.11021885785559
-184.5082417295772,243.31108770222295,49.643407346969596,217.6998371921953
-171.364986731152,236.8863043514778,40.88630847779404,206.66414135128852
-158.67507679180278,228.75659130962944,32.42172641872411,194.91447234762936
-145.70893099014296,228.60197560222855,25.915677587474477,186.04793001489267
-132.80165423375078,219.4870486506905,17.949546658738377,173.36981451590145
-118.76446158949864,212.842623503242,10.91236331395401,163.46784473894132
-105.47673733968136,197.82735475771423,6.342665426831365,154.54992926931362
-92.28262527930282,183.60880119824964,2.9637113646432924,145.05997371744357
-80.08178289428561,178.21417462426308,1.4904346490510063,135.37427475280447
-68.05075167883604,174.38024725627528,0.6542767511259123,125.33618787791389
-57.33892828225943,163.75151223638323,-2.1629451753485584e-7,114.38326959746601
-47.91122996837452,158.30124789510745,0.01699339492037168,104.57111137056225
-39.71203072777365,146.6433762840449,0.013683141695575645,92.26121331847665
-32.08903884626882,138.74956149885196,-2.776369796429933e-5,77.95984718351467
-25.567879137873327,117.64643250247984,0.02029723087689403,68.14542674466739
-20.226109971544595,105.87824541767543,0.02942785780510852,57.80039246187566
-15.822029319315941,95.26193888634329,0.0015619407570932227,44.98308797532776
-11.86482403506727,82.88471848733366,0.0139202251526318,36.4345842628461
-11.41444238123061,89.84998297693535,0.0666095021126761,33.672607545128585
-11.760709387675428,80.56903002626132,0.02911519239806684,31.779550259167188
-15.608653812842075,84.4812008303581,0.1451610159592241,29.167467682825105
-18.46652374296125,75.10873866189522,0.257529864394238,25.52786987723517
-19.154852416425946,79.66424881896181,0.34310578392406726,24.862484554227695
-17.76382346268219,82.33214023398799,0.04020249114668608,27.440814552437722
-11.770417835270539,77.16028138611318,0.1601258590007935,28.178903593901776
-28.575902113160843,100.30120799881679,8.290616244150943,51.18755046563618
-44.6500656371219,133.7662232684492,18.826473984628194,79.46990149570847
-58.93359681990834,144.39896871097727,26.967830366959436,107.64334396878635
-78.81921579909906,157.05027238202555,36.00161441777166,132.62153676565572
-96.31823829542331,172.46207271242636,47.67488678462672,163.03607490960206
-113.68465589999252,199.6980903165839,56.815618070614086,185.5902142428461
-131.44733746695687,218.31345641963154,65.0298903498588,205.6040788843083
-143.57022598996326,248.0669589387606,78.12103160217615,225.18336274841536
-160.2545160514829,276.4730547448535,95.53937029048053,246.97356190959027
-173.71546658765067,294.32632069405474,107.5932705161822,262.4752046573908
-188.87717865623495,289.0248763377732,118.23444331568548,278.0369939094454
-200.72440725109396,310.47111738812987,125.58297208702875,291.21933275895447
-211.54578497851003,321.62467412938156,135.04096202155375,311.8472483005798
-219.67787470209927,322.9377982306577,138.23259862143857,326.9447866786039
-225.5911712053431,313.0818205198798,137.52220454328514,331.6321292047405
-225.39958719867292,303.0418832885243,134.56161843454288,334.41533503174685
-223.0003148921021,295.41511330313085,130.39480512697176,330.21097313357666
-219.3358500411617,292.2241103571665,124.0594015578791,324.70715783895025
-215.7865012983455,282.52479046838965,117.20362201093069,318.9013992484797
-208.9688074571684,270.8475139202375,109.91183630093335,309.0068767261927
-200.25820568279497,268.14577165979705,102.18413675040794,299.37788585534616
-189.3589474950667,274.8028403967962,94.29202902742942,291.0950904790007
-179.11347981019935,269.33461930444184,86.51972305194262,277.7102369291978
-166.46173064664583,261.0404132262943,77.8672879103141,263.4527539589003
-154.36450314445943,258.2200123763158,69.45071513510558,248.33011814559686
-142.61948735542381,250.57594722312155,61.04749888339048,235.64188913357484
-130.39403275688142,237.56983972063503,53.57905190860542,220.48242803133235
-118.09172135231981,232.9065992805976,45.31949038232733,204.10586308604817
-107.2702477985387,215.22999411819978,37.242727577570335,186.07281143864037
-97.0065973471623,205.80014155651824,28.961340379753675,172.53116562776583
-84.6333338632984,192.9275305544331,20.748060187265494,156.70296586552814
-74.44115642907373,175.65153529061962,13.951363389962466,138.83089629990818
-65.89676047851572,159.63588608030236,9.971531743391353,125.07480741352246
-59.101026640990746,157.6032563807769,5.649949808269948,109.34879298512287
-50.4628042074048,151.14613237451394,2.8422167357923183,93.4456297857743
-43.380434756627224,142.1297446607122,0.8089371675179539,77.96036680911685
-37.2402039626919,131.15093810046022,-2.7757075579902497e-5,64.8649411952869
-32.6377410503266,127.61272326799181,0.014576637404480192,58.483461680238136
-29.407339371204802,122.7377666202245,0.020931438505678426,49.55232088062062
-24.06393930942189,85.84155793018172,0.0062571966267834424,37.76748815084717
-23.127901917898463,65.85303059430143,0.03619878202007641,29.09502728354922
-22.924839043559988,62.40507538369134,0.23070127422069103,25.66711762242007
-22.007801580404273,59.154416378847564,0.49709652305249996,23.17058109737539
-21.136620549198433,54.197233720503064,0.12869349623927082,17.637496022430913
-19.031142363960612,38.14249299045677,0.2798932995958727,12.974210990765622
-17.40204173868524,30.7482859887398,0.6275685251939801,10.510870387587385
-12.246953654446704,19.1380483634613,0.7553403609073082,9.280752131127022
-0.40074148030873114,1.9740202514084515,-2.7761152226093412e-5,6.682329562456955
+22.51732496848532,37.919375441643616,15.010226053947687,29.88884403814745
+47.20258811999907,71.04409366252557,27.799866146567854,58.85398607133367
+72.83176865479658,87.01935425652064,41.809771632008996,82.61277155868845
+91.6036862853376,110.9546703800459,54.779619691636846,105.02407149811434
+111.75802365829193,124.62427072370988,67.41758956472614,127.58313808354393
+132.06225985386516,140.7114941122255,76.61311783876522,147.63328200911607
+151.88292311509423,163.09183590390472,85.64543943322955,168.65289410472099
+167.62574263315506,176.9949496837815,90.09866940534309,189.75454495424944
+184.1432711637443,192.12358945548075,97.21306040484191,218.11088982453344
+199.10712684414443,211.72514898332275,100.754649277593,246.93492472290026
+213.03560361830984,234.5026497996314,107.46682144224346,272.21531353974717
+224.76897734579637,261.6236901648443,111.30922724569007,292.90711864275784
+238.39447203378649,282.426645017808,114.62593628727466,311.83209342410777
+245.37549219043413,285.94648083840264,113.43065917802855,322.5848167767246
+252.4494932363904,296.90814187996295,110.20680607797237,322.2425933406325
+250.45907641085287,296.9831633048792,106.20779976718435,320.00444979375743
+248.8524629192563,300.7632599043702,101.76735381105871,312.8087106999844
+245.3285907233545,290.30183231928174,95.88402955956306,303.1966369847269
+237.90416834064467,290.15844458648814,89.15470564095466,296.46398240261857
+230.9413064277374,281.77833258453296,82.17436482998376,286.69479612549327
+221.1118971810357,281.1300748339776,74.6379921116223,275.7387873489527
+209.82043242344366,273.15550833073513,66.46062403663097,264.90582250183024
+197.17838472323572,255.4079970540009,58.1801555890566,252.08676360007652
+184.5082417295772,243.31108770222295,49.643407346969596,237.85231993230704
+171.364986731152,236.8863043514778,40.88630847779404,224.51757587451243
+158.67507679180278,228.75659130962944,32.42172641872411,211.62777515165263
+145.70893099014296,228.60197560222855,25.915677587474477,197.57381524080466
+132.80165423375078,219.4870486506905,17.949546658738377,182.8952051783611
+118.76446158949864,212.842623503242,10.91236331395401,169.0595060088052
+105.47673733968136,197.82735475771423,6.342665426831365,156.70187722582688
+92.28262527930282,183.60880119824964,2.9637113646432924,143.70426904824114
+80.08178289428561,178.21417462426308,1.4904346490510063,131.34345908273133
+68.05075167883604,174.38024725627528,0.6542767511259123,119.6560155151396
+57.33892828225943,163.75151223638323,-2.1629451753485584e-7,107.89499970347059
+47.91122996837452,158.30124789510745,0.01699339492037168,95.99205089892341
+39.71203072777365,146.6433762840449,0.013683141695575645,81.98936536386094
+32.08903884626882,138.74956149885196,-2.776369796429933e-5,70.22278074065235
+25.567879137873327,117.64643250247984,0.02029723087689403,57.957500191547524
+20.226109971544595,105.87824541767543,0.02942785780510852,48.218945817433884
+15.822029319315941,95.26193888634329,0.0015619407570932227,36.867704663165426
+11.86482403506727,82.88471848733366,0.0139202251526318,29.26424780181313
+11.41444238123061,89.84998297693535,0.0666095021126761,25.92639157504742
+11.760709387675428,80.56903002626132,0.02911519239806684,26.098422131593825
+15.608653812842075,84.4812008303581,0.1451610159592241,22.394312574066713
+18.46652374296125,75.10873866189522,0.257529864394238,19.330147292287013
+19.154852416425946,79.66424881896181,0.34310578392406726,19.064203280951464
+17.76382346268219,82.33214023398799,0.04020249114668608,20.09172473445721
+11.770417835270539,77.16028138611318,0.1601258590007935,19.35396878018543
+28.575902113160843,100.30120799881679,8.290616244150943,46.258638018871736
+44.6500656371219,133.7662232684492,18.826473984628194,73.04283772380003
+58.93359681990834,144.39896871097727,26.967830366959436,95.94410009405843
+78.81921579909906,157.05027238202555,36.00161441777166,117.71853421648952
+96.31823829542331,172.46207271242636,47.67488678462672,140.18906733662052
+113.68465589999252,199.6980903165839,56.815618070614086,161.20146693953137
+131.44733746695687,218.31345641963154,65.0298903498588,181.69307761993895
+143.57022598996326,248.0669589387606,78.12103160217615,199.8999392076057
+160.2545160514829,276.4730547448535,95.53937029048053,220.31192988484966
+173.71546658765067,294.32632069405474,107.5932705161822,239.09635346381702
+188.87717865623495,289.0248763377732,118.23444331568548,255.85211397472423
+200.72440725109396,310.47111738812987,125.58297208702875,272.7401007551301
+211.54578497851003,321.62467412938156,135.04096202155375,292.5534588326327
+219.67787470209927,322.9377982306577,138.23259862143857,305.2552176250861
+225.5911712053431,313.0818205198798,137.52220454328514,306.9497613201085
+225.39958719867292,303.0418832885243,134.56161843454288,303.58072451918565
+223.0003148921021,295.41511330313085,130.39480512697176,295.56194440948934
+219.3358500411617,292.2241103571665,124.0594015578791,285.94793100031006
+215.7865012983455,282.52479046838965,117.20362201093069,278.35828561341475
+208.9688074571684,270.8475139202375,109.91183630093335,268.7469595331369
+200.25820568279497,268.14577165979705,102.18413675040794,256.9288932038997
+189.3589474950667,274.8028403967962,94.29202902742942,246.23379738459545
+179.11347981019935,269.33461930444184,86.51972305194262,235.28235566184617
+166.46173064664583,261.0404132262943,77.8672879103141,222.9405685439762
+154.36450314445943,258.2200123763158,69.45071513510558,209.73529860734234
+142.61948735542381,250.57594722312155,61.04749888339048,197.4097594424145
+130.39403275688142,237.56983972063503,53.57905190860542,183.16067786345798
+118.09172135231981,232.9065992805976,45.31949038232733,167.08432725260056
+107.2702477985387,215.22999411819978,37.242727577570335,153.20481806438926
+97.0065973471623,205.80014155651824,28.961340379753675,140.27220558777037
+84.6333338632984,192.9275305544331,20.748060187265494,128.88089859059698
+74.44115642907373,175.65153529061962,13.951363389962466,115.32978413235291
+65.89676047851572,159.63588608030236,9.971531743391353,102.90650396427401
+59.101026640990746,157.6032563807769,5.649949808269948,91.77112364358631
+50.4628042074048,151.14613237451394,2.8422167357923183,81.38224893770428
+43.380434756627224,142.1297446607122,0.8089371675179539,69.40743842003496
+37.2402039626919,131.15093810046022,-2.7757075579902497e-5,58.51697840372556
+32.6377410503266,127.61272326799181,0.014576637404480192,49.72583740775145
+29.407339371204802,122.7377666202245,0.020931438505678426,41.99241630483234
+24.06393930942189,85.84155793018172,0.0062571966267834424,29.914239653894793
+23.127901917898463,65.85303059430143,0.03619878202007641,21.935606087997716
+22.924839043559988,62.40507538369134,0.23070127422069103,18.650205536148956
+22.007801580404273,59.154416378847564,0.49709652305249996,16.31324545284173
+21.136620549198433,54.197233720503064,0.12869349623927082,12.457190721911122
+19.031142363960612,38.14249299045677,0.2798932995958727,8.11780690376344
+17.40204173868524,30.7482859887398,0.6275685251939801,6.793617876065368
+12.246953654446704,19.1380483634613,0.7553403609073082,6.373971458330337
+0.40074148030873114,1.9740202514084515,-2.7761152226093412e-5,4.45582104449006
diff --git a/examples/HydroPowerModels/bolivia/ACPPowerModel/SDDP-bolivia-SOCWRConicPowerModel-ACPPowerModel-Volume.png b/examples/HydroPowerModels/bolivia/ACPPowerModel/SDDP-bolivia-SOCWRConicPowerModel-ACPPowerModel-Volume.png
new file mode 100644
index 0000000..b8d3b29
Binary files /dev/null and b/examples/HydroPowerModels/bolivia/ACPPowerModel/SDDP-bolivia-SOCWRConicPowerModel-ACPPowerModel-Volume.png differ
diff --git a/examples/HydroPowerModels/bolivia/ACPPowerModel/SDDP-bolivia-SOCWRConicPowerModel-ACPPowerModel-thermal.png b/examples/HydroPowerModels/bolivia/ACPPowerModel/SDDP-bolivia-SOCWRConicPowerModel-ACPPowerModel-thermal.png
new file mode 100644
index 0000000..715ae9f
Binary files /dev/null and b/examples/HydroPowerModels/bolivia/ACPPowerModel/SDDP-bolivia-SOCWRConicPowerModel-ACPPowerModel-thermal.png differ
diff --git a/examples/HydroPowerModels/evaluate_hydro_policies.jl b/examples/HydroPowerModels/evaluate_hydro_policies.jl
new file mode 100644
index 0000000..cf59032
--- /dev/null
+++ b/examples/HydroPowerModels/evaluate_hydro_policies.jl
@@ -0,0 +1,234 @@
+# Evaluate pre-trained TS-DDR and TS-LDR policies on the Bolivia LTHD problem
+# using stage-wise rollout under the ACP formulation with a fixed scenario set.
+#
+# This produces an apples-to-apples comparison across all methods using the
+# same evaluation protocol:
+#   - stage-wise AC-OPF subproblems (Ipopt)
+#   - realized-state feedback (closed-loop / deployment semantics)
+#   - same seed and number of out-of-sample scenarios
+#   - operational cost excluding target-deficit penalty
+#
+# The script auto-discovers saved .jld2 checkpoints and reconstructs the
+# correct policy architecture (LDR vs DDR) from the filename.
+# Results are written to eval_costs.csv.
+#
+# Usage:
+#   julia --project=. evaluate_hydro_policies.jl [NUM_SIMULATIONS]
+#
+# Environment overrides:
+#   DR_EVAL_SIMULATIONS=100   number of out-of-sample scenarios
+#   DR_EVAL_SEED=1221         random seed for scenario generation
+
+using DecisionRules
+using Statistics
+using Random
+using Flux
+using Ipopt
+using DiffOpt
+using JLD2
+using JuMP
+using CSV
+using DataFrames
+
+const HYDRO_DIR = dirname(@__FILE__)
+include(joinpath(HYDRO_DIR, "load_hydropowermodels.jl"))
+
+const CASE_NAME = "bolivia"
+const FORMULATION = "ACPPowerModel"
+const FORMULATION_FILE = FORMULATION * ".mof.json"
+const NUM_STAGES = 96
+const NUM_SIMULATIONS = parse(Int, get(ENV, "DR_EVAL_SIMULATIONS",
+    length(ARGS) >= 1 ? ARGS[1] : "100"))
+const SEED = parse(Int, get(ENV, "DR_EVAL_SEED", "1221"))
+
+const CASE_DIR = joinpath(HYDRO_DIR, CASE_NAME)
+const OUT_DIR = joinpath(CASE_DIR, FORMULATION)
+const MODEL_DIR = joinpath(OUT_DIR, "models")
+
+println("="^60)
+println("Policy Evaluation (TS-DDR + TS-LDR)")
+println("="^60)
+println("Case:         ", CASE_NAME)
+println("Formulation:  ", FORMULATION)
+println("Stages:       ", NUM_STAGES)
+println("Simulations:  ", NUM_SIMULATIONS)
+println("Seed:         ", SEED)
+println("="^60)
+
+# ── Build stage-wise subproblems ─────────────────────────────────────────────
+
+diff_optimizer = () -> DiffOpt.diff_optimizer(
+    optimizer_with_attributes(
+        Ipopt.Optimizer, "print_level" => 0, "linear_solver" => "mumps",
+    ),
+)
+
+subproblems, state_params_in, state_params_out, uncertainty_samples, initial_state, max_volume =
+    build_hydropowermodels(
+        CASE_DIR, FORMULATION_FILE;
+        num_stages=NUM_STAGES,
+        optimizer=diff_optimizer,
+        penalty_l1=:auto, penalty_l2=:auto,
+    )
+
+num_hydro = length(initial_state)
+num_uncertainties = length(uncertainty_samples[1][1])
+num_inputs = DecisionRules.policy_input_dim(num_uncertainties, num_hydro)
+
+# ── Generate fixed scenario set ──────────────────────────────────────────────
+
+Random.seed!(SEED)
+eval_scenarios = [DecisionRules.sample(uncertainty_samples) for _ in 1:NUM_SIMULATIONS]
+
+# ── Discover saved models ────────────────────────────────────────────────────
+#
+# Model files encode the training method and policy type in their filename:
+#   *-deteq-*   → DDR trained with deterministic equivalent
+#   *-subproblems-* → DDR trained with stage-wise decomposition
+#   *-shooting-* → DDR trained with multiple shooting
+#   *-ldr-*     → LDR (linear decision rule)
+#
+# DDR models use state_conditioned_policy (LSTM [128,128], sigmoid).
+# LDR models use dense_multilayer_nn (identity activation, [64,64]).
+# The most recent file (by lexicographic sort on timestamps) is selected
+# for each method.
+
+struct PolicySpec  # one discovered checkpoint plus what is needed to rebuild its policy
+    label::String       # display name; also used as the column name in `results`
+    model_file::String  # path of the .jld2 checkpoint passed to `JLD2.load`
+    is_ldr::Bool        # selects the architecture built by `build_policy`
+end
+
+# Variant key "<method>[-clip][-anneal|-const]" for a checkpoint name, or `nothing`.
+function _method_variant(base)
+    # Tested in priority order: a name containing "ldr" is always classed as LDR.
+    known_methods = ["ldr", "shooting", "subproblems", "deteq"]
+    hit = findfirst(m -> contains(base, m), known_methods)
+    isnothing(hit) && return nothing
+    variant = known_methods[hit]
+    if contains(base, "clip")
+        variant *= "-clip"
+    end
+    if contains(base, "anneal")  # "anneal" wins when both schedule tags appear
+        variant *= "-anneal"
+    elseif contains(base, "const")
+        variant *= "-const"
+    end
+    return variant
+end
+
+# Display name for a variant key; keys missing from the table are returned as-is.
+function _variant_label(variant)
+    display_names = Dict(
+        "subproblems-anneal" => "Subproblems (anneal)",
+        "subproblems-clip-anneal" => "Subproblems (clip, anneal)",
+        "subproblems-const" => "Subproblems (const)",
+        "subproblems-clip-const" => "Subproblems (clip, const)",
+        "subproblems" => "Subproblems",
+        "shooting-anneal" => "Shooting w=12 (anneal)",
+        "shooting-clip-anneal" => "Shooting w=12 (clip, anneal)",
+        "shooting" => "Shooting w=12",
+        "deteq-anneal" => "DE (anneal)",
+        "deteq-clip-anneal" => "DE (clip, anneal)",
+        "deteq" => "DE", "ldr" => "TS-LDR",
+    )
+    return haskey(display_names, variant) ? display_names[variant] : variant
+end
+
+# Scan `model_dir` for .jld2 checkpoints and return one PolicySpec per variant,
+# ordered by variant key. The lexicographically last file of each variant wins.
+function discover_policies(model_dir)
+    checkpoints = sort!([p for p in readdir(model_dir; join=true) if endswith(p, ".jld2")])
+    latest = Dict{String,Tuple{String,Bool}}()
+    for path in checkpoints
+        name = basename(path)
+        key = _method_variant(name)
+        isnothing(key) && continue
+        latest[key] = (path, contains(name, "ldr"))  # later paths overwrite earlier ones
+    end
+    return PolicySpec[
+        PolicySpec(_variant_label(key), latest[key]...)
+        for key in sort!(collect(keys(latest)))
+    ]
+end
+
+# Instantiate the untrained network matching `spec`: dense LDR or LSTM-encoded DDR.
+function build_policy(spec::PolicySpec, num_inputs, num_hydro, num_uncertainties)
+    spec.is_ldr && return dense_multilayer_nn(
+        num_inputs, num_hydro, Int64[64, 64]; activation=identity
+    )
+    return state_conditioned_policy(
+        num_uncertainties, num_hydro, num_hydro, Int64[128, 128];
+        activation=sigmoid, encoder_type=Flux.LSTM,
+    )
+end
+
+policies = discover_policies(MODEL_DIR)
+println("\nDiscovered policies:")
+for p in policies
+    tag = p.is_ldr ? " (LDR)" : " (DDR)"
+    println("  ", p.label, tag, " → ", basename(p.model_file))
+end
+
+# ── Evaluate each policy ─────────────────────────────────────────────────────
+
+results = DataFrame()
+# One column per policy; every policy is rolled out on the same `eval_scenarios`.
+for spec in policies
+    println("\nEvaluating: ", spec.label)
+    # Rebuild the architecture, then restore the saved weights from the checkpoint.
+    models = build_policy(spec, num_inputs, num_hydro, num_uncertainties)
+    model_state = JLD2.load(spec.model_file, "model_state")
+    Flux.loadmodel!(models, model_state)
+    # Per-scenario costs: without the target-deficit term, and as returned by the rollout.
+    objectives_no_deficit = Vector{Float64}(undef, NUM_SIMULATIONS)
+    objectives_total = Vector{Float64}(undef, NUM_SIMULATIONS)
+
+    for i in 1:NUM_SIMULATIONS
+        Flux.reset!(models)
+        # Reset above so each scenario starts from a fresh policy state.
+        objectives_total[i] = simulate_multistage(
+            subproblems,
+            state_params_in,
+            state_params_out,
+            initial_state,
+            eval_scenarios[i],
+            models;
+        )
+        # Keep directly after the rollout: presumably reads the just-solved subproblems.
+        objectives_no_deficit[i] = DecisionRules.get_objective_no_target_deficit(subproblems)
+    end
+    # Share of the mean total cost that is not covered by the no-deficit cost.
+    violation_share = 1.0 - mean(objectives_no_deficit) / mean(objectives_total)
+
+    println("  Mean cost (no deficit): ", round(mean(objectives_no_deficit); digits=1))
+    println("  Std:                    ", round(std(objectives_no_deficit); digits=1))
+    println("  Violation share:        ", round(violation_share * 100; digits=2), "%")
+    # Only the no-deficit costs are exported to the CSV.
+    results[!, spec.label] = objectives_no_deficit
+end
+
+# ── Write results ────────────────────────────────────────────────────────────
+
+costs_file = joinpath(OUT_DIR, "eval_costs.csv")
+CSV.write(costs_file, results)
+println("\nSaved: ", costs_file)
+
+# ── Summary table ────────────────────────────────────────────────────────────
+
+println("\n", "="^70)
+println(rpad("Method", 35), rpad("Mean", 12), rpad("Std", 12), "N")
+println("-"^70)
+for col in names(results)
+    vals = results[!, col]
+    println(
+        rpad(col, 35),
+        rpad(string(round(mean(vals); digits=1)), 12),
+        rpad(string(round(std(vals); digits=1)), 12),
+        length(vals),
+    )
+end
+println("="^70)
+println("\nNote: SDDP results are from sddp/simulate_sddp_policy.jl")
+println("SDDP uses 126 stages (96 + 30 margin) to avoid end-of-horizon effects,")
+println("while TS-DDR/TS-LDR use 96 stages. This gives SDDP a structural advantage.")
diff --git a/examples/HydroPowerModels/load_hydropowermodels.jl b/examples/HydroPowerModels/load_hydropowermodels.jl
index a86e287..613b425 100644
--- a/examples/HydroPowerModels/load_hydropowermodels.jl
+++ b/examples/HydroPowerModels/load_hydropowermodels.jl
@@ -47,7 +47,7 @@ function build_hydropowermodels(
     subproblems = Vector{JuMP.Model}(undef, num_stages)
     state_params_in = Vector{Vector{Any}}(undef, num_stages)
     state_params_out = Vector{Vector{Tuple{Any,VariableRef}}}(undef, num_stages)
-    uncertainty_samples = Vector{Vector{Tuple{VariableRef,Vector{Float64}}}}(
+    uncertainty_samples = Vector{Vector{Vector{Tuple{VariableRef,Float64}}}}(
         undef, num_stages
     )
 
@@ -78,13 +78,14 @@ function build_hydropowermodels(
             variable_to_parameter(subproblems[t], state_param_out[i]; deficit=_deficit[i])
             for i in 1:nHyd
         ]
-        inflow = [
-            (
-                variable_to_parameter(subproblems[t], inflow[i]),
-                vector_inflows[i][t, :] .+ 0.0,
-            ) for i in 1:nHyd
+        # Joint scenarios: all hydro units share the same scenario index ω,
+        # preserving the spatial correlation in the historical inflow data.
+        inflow_params = [variable_to_parameter(subproblems[t], inflow[i]) for i in 1:nHyd]
+        joint_scenarios = [
+            [(inflow_params[i], vector_inflows[i][t, ω] + 0.0) for i in 1:nHyd]
+            for ω in 1:nCen
         ]
-        uncertainty_samples[t] = inflow
+        uncertainty_samples[t] = joint_scenarios
     end
 
     return subproblems,
diff --git a/examples/HydroPowerModels/run_sddp.jl b/examples/HydroPowerModels/run_sddp.jl
deleted file mode 100644
index 6a806e2..0000000
--- a/examples/HydroPowerModels/run_sddp.jl
+++ /dev/null
@@ -1,120 +0,0 @@
-# SDDP baseline: train and simulate SDDP policy on the Bolivia LTHD problem
-# using a consistent (convex) formulation for both forward and backward passes.
-# Requires HydroPowerModels.jl and a Mosek license.
-using Gurobi
-using MosekTools
-using HydroPowerModels
-using JuMP
-using Statistics
-import SDDP: stopping_rule_status, convergence_test, PolicyGraph, AbstractStoppingRule, Log
-using Wandb, Dates, Logging
-
-using Random
-seed = 1221
-
-# Load case
-case = "bolivia"
-case_dir = joinpath(dirname(@__FILE__), case)
-alldata = HydroPowerModels.parse_folder(case_dir);
-for load in values(alldata[1]["powersystem"]["load"])
-    load["qd"] = load["qd"] * 0.6
-    load["pd"] = load["pd"] * 0.6
-end
-rm_stages = 30
-num_stages = 96 + rm_stages
-formulation = SOCWRConicPowerModel
-
-params = create_param(;
-    stages=num_stages,
-    model_constructor_grid=formulation,
-    post_method=PowerModels.build_opf,
-    optimizer=Mosek.Optimizer,
-);
-
-m = hydro_thermal_operation(alldata, params);
-
-# Wandb stopping rule: logs every iteration but never triggers termination
-mutable struct WandBLog <: SDDP.AbstractStoppingRule
-    lg
-end
-
-SDDP.stopping_rule_status(::WandBLog) = :not_solved
-
-save_file = "SDDP-$(case)-$(formulation)-$(formulation)-h$(num_stages)-$(now())"
-
-cuts_file = joinpath(
-    case_dir, string(formulation), string(formulation)*"-"*string(formulation)*".cuts.json"
-)
-
-function SDDP.convergence_test(
-    policy::SDDP.PolicyGraph, log::Vector{SDDP.Log}, rule::WandBLog
-)
-    SDDP.write_cuts_to_file(
-        policy,
-        joinpath(
-            case_dir,
-            string(formulation),
-            string(formulation)*"-"*string(formulation)*".cuts.json",
-        ),
-    )
-
-    Wandb.log(
-        rule.lg,
-        Dict(
-            "iteration" => length(log),
-            "bound" => log[end].bound,
-            "metrics/loss" => log[end].simulation_value,
-        ),
-    )
-    return false
-end
-
-lg = WandbLogger(; project="HydroPowerModels", name=save_file, save_code=false)
-
-# ## Train
-Random.seed!(seed)
-start_time = time()
-HydroPowerModels.train(
-    m;
-    iteration_limit=200,
-    stopping_rules=[
-        WandBLog(lg); SDDP.Statistical(; num_replications=300, iteration_period=50)
-    ],
-);
-end_time = time() - start_time
-
-# Termination Status and solve time (s)
-(SDDP.termination_status(m.forward_graph), end_time)
-
-# save cuts
-SDDP.write_cuts_to_file(
-    m.forward_graph,
-    joinpath(
-        case_dir,
-        string(formulation),
-        string(formulation)*"-"*string(formulation)*".cuts.json",
-    ),
-)
-
-# ## Simulation 
-using Random: Random
-Random.seed!(seed)
-results = HydroPowerModels.simulate(m, 300);
-
-# ## Objective
-objective_values = [
-    sum(results[:simulations][i][t][:stage_objective] for t in 1:(num_stages - rm_stages))
-    for i in 1:length(results[:simulations])
-]
-println("Mean Sim: ", mean(objective_values))
-
-Wandb.log(
-    lg,
-    Dict(
-        "bound" => SDDP.calculate_bound(m.forward_graph),
-        "metrics/final_loss" => mean(objective_values),
-    ),
-)
-
-# Finish the run
-close(lg)
diff --git a/examples/HydroPowerModels/run_sddp_inconsistent.jl b/examples/HydroPowerModels/run_sddp_inconsistent.jl
deleted file mode 100644
index de67550..0000000
--- a/examples/HydroPowerModels/run_sddp_inconsistent.jl
+++ /dev/null
@@ -1,114 +0,0 @@
-# SDDP baseline with inconsistent formulations: train SDDP with a convex
-# backward-pass formulation (SOCWRConic) and an AC forward-pass formulation,
-# then simulate the resulting policy under the AC model.
-# Requires HydroPowerModels.jl, Gurobi, Mosek, and MadNLP.
-using Gurobi
-using MosekTools
-using MadNLP
-using HydroPowerModels
-using JuMP
-using Statistics
-import SDDP: stopping_rule_status, convergence_test, PolicyGraph, AbstractStoppingRule, Log
-using Wandb, Dates, Logging
-
-using Random
-seed = 1221
-
-# Load case
-case = "bolivia"
-case_dir = joinpath(dirname(@__FILE__), case)
-alldata = HydroPowerModels.parse_folder(case_dir);
-for load in values(alldata[1]["powersystem"]["load"])
-    load["qd"] = load["qd"] * 0.6
-    load["pd"] = load["pd"] * 0.6
-end
-rm_stages = 30
-num_stages = 96 + rm_stages
-formulation_b = SOCWRConicPowerModel
-formulation = ACPPowerModel
-
-# Wandb stopping rule
-mutable struct WandBLog <: SDDP.AbstractStoppingRule
-    lg
-end
-
-SDDP.stopping_rule_status(::WandBLog) = :not_solved
-
-save_file = "SDDP-$(case)-$(formulation)-$(formulation_b)-h$(num_stages)-$(now())"
-
-cuts_file = joinpath(
-    case_dir,
-    string(formulation),
-    string(formulation_b)*"-"*string(formulation)*".cuts.json",
-)
-
-function SDDP.convergence_test(
-    policy::SDDP.PolicyGraph, log::Vector{SDDP.Log}, rule::WandBLog
-)
-    SDDP.write_cuts_to_file(policy, cuts_file)
-    Wandb.log(
-        rule.lg,
-        Dict(
-            "iteration" => length(log),
-            "bound" => log[end].bound,
-            "metrics/loss" => log[end].simulation_value,
-        ),
-    )
-    return false
-end
-
-lg = WandbLogger(; project="HydroPowerModels", name=save_file, save_code=false)
-
-# Train: SOCWRConic backward pass, ACP forward pass via MadNLP
-Random.seed!(seed)
-
-params = create_param(;
-    stages=num_stages,
-    model_constructor_grid=formulation_b,
-    model_constructor_grid_forward=formulation,
-    post_method=PowerModels.build_opf,
-    optimizer=Mosek.Optimizer,
-    optimizer_forward=() -> MadNLP.Optimizer(; print_level=MadNLP.INFO),
-);
-
-m = hydro_thermal_operation(alldata, params);
-
-if isfile(cuts_file)
-    SDDP.read_cuts_from_file(m.forward_graph, cuts_file)
-end
-
-start_time = time()
-HydroPowerModels.train(
-    m;
-    iteration_limit=2000,
-    stopping_rules=[
-        WandBLog(lg); SDDP.Statistical(; num_replications=300, iteration_period=200)
-    ],
-);
-end_time = time() - start_time
-
-(SDDP.termination_status(m.forward_graph), end_time)
-
-SDDP.write_cuts_to_file(m.forward_graph, cuts_file)
-
-# Simulation
-using Random: Random
-Random.seed!(seed)
-results = HydroPowerModels.simulate(m, 300);
-
-# Objective
-objective_values = [
-    sum(results[:simulations][i][t][:stage_objective] for t in 1:(num_stages - rm_stages))
-    for i in 1:length(results[:simulations])
-]
-println("Mean Sim: ", mean(objective_values))
-
-Wandb.log(
-    lg,
-    Dict(
-        "bound" => SDDP.calculate_bound(m.forward_graph),
-        "metrics/final_loss" => mean(objective_values),
-    ),
-)
-
-close(lg)
diff --git a/examples/HydroPowerModels/sddp/Project.toml b/examples/HydroPowerModels/sddp/Project.toml
new file mode 100644
index 0000000..2aac97c
--- /dev/null
+++ b/examples/HydroPowerModels/sddp/Project.toml
@@ -0,0 +1,11 @@
+[deps]
+CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
+Clarabel = "61c947e1-3e6d-4ee4-985a-eec8c727bd6e"
+DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
+HydroPowerModels = "1bf2e10f-7293-4f36-bafb-f7584ca75eae"
+JuMP = "4076af6c-e467-56ae-b986-b466b2749572"
+MadNLP = "2621e9c9-9eb4-46b1-8089-e8c72242dfb6"
+Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
+PowerModels = "c36e90e8-916a-50a6-bd94-075b64ef4655"
+SDDP = "f4570300-c277-11e8-125c-4912f86ce65d"
+Wandb = "ad70616a-06c9-5745-b1f1-6a5f42545108"
diff --git a/examples/HydroPowerModels/sddp/run_sddp.jl b/examples/HydroPowerModels/sddp/run_sddp.jl
new file mode 100644
index 0000000..6643640
--- /dev/null
+++ b/examples/HydroPowerModels/sddp/run_sddp.jl
@@ -0,0 +1,168 @@
+# SDDP baseline: train and simulate SDDP policy on the Bolivia LTHD problem
+# using a consistent convex SOCWRConic formulation.
+
+using Clarabel
+using HydroPowerModels
+using JuMP
+using Logging
+using PowerModels
+using Random
+using SDDP
+using Statistics
+using Wandb, Dates
+
+const SEED = parse(Int, get(ENV, "DR_SDDP_SEED", "1221"))
+const CASE = get(ENV, "DR_SDDP_CASE", "bolivia")
+const HYDRO_DIR = dirname(@__DIR__)
+const CASE_DIR = joinpath(HYDRO_DIR, CASE)
+const RM_STAGES = parse(Int, get(ENV, "DR_SDDP_RM_STAGES", "30"))
+const NUM_STAGES = parse(Int, get(ENV, "DR_SDDP_NUM_STAGES", string(96 + RM_STAGES)))
+const ITERATION_LIMIT = parse(Int, get(ENV, "DR_SDDP_ITERATION_LIMIT", "200"))
+const NUM_SIMULATIONS = parse(Int, get(ENV, "DR_SDDP_SIMULATIONS", "300"))
+const STAT_REPLICATIONS = parse(Int, get(ENV, "DR_SDDP_STAT_REPLICATIONS", "300"))
+const STAT_PERIOD = parse(Int, get(ENV, "DR_SDDP_STAT_PERIOD", "50"))
+const FORMULATION = SOCWRConicPowerModel
+const save_file = "SDDP-$(CASE)-$(FORMULATION)-$(FORMULATION)-h$(NUM_STAGES)-$(Dates.now())"
+const CUTS_FILE = joinpath(
+    CASE_DIR,
+    string(FORMULATION),
+    string(FORMULATION) * "-" * string(FORMULATION) * ".cuts.json",
+)
+
+# Build a quiet Clarabel optimizer; iteration cap and tolerance are read from ENV.
+function clarabel_optimizer()
+    iter_cap = parse(Int, get(ENV, "DR_SDDP_CLARABEL_MAX_ITER", "1000"))
+    # A single tolerance is applied to the gap (abs/rel) and feasibility criteria.
+    tol = parse(Float64, get(ENV, "DR_SDDP_CLARABEL_TOL", "1e-7"))
+    return Clarabel.Optimizer(;
+        verbose=false, max_iter=iter_cap, tol_gap_abs=tol, tol_gap_rel=tol, tol_feas=tol
+    )
+end
+
+mutable struct WandBLog <: SDDP.AbstractStoppingRule  # logging/checkpoint hook
+    cuts_file::String  # file the cuts are written to by `SDDP.convergence_test`
+    lg  # logger handle passed to `Wandb.log`
+end
+
+SDDP.stopping_rule_status(::WandBLog) = :not_solved
+
+# Side-effect-only stopping rule: checkpoint the cuts, log progress, never stop.
+function SDDP.convergence_test(
+    policy::SDDP.PolicyGraph,
+    log::Vector{SDDP.Log},
+    rule::WandBLog,
+)
+    mkpath(dirname(rule.cuts_file))
+    SDDP.write_cuts_to_file(policy, rule.cuts_file)
+    iteration = length(log)
+    bound = log[end].bound
+    simulation_value = log[end].simulation_value
+    metrics = Dict(
+        "batch" => iteration,
+        "metrics/loss" => bound,
+        "metrics/rollout_realized_objective_no_deficit" => simulation_value,
+    )
+    Wandb.log(rule.lg, metrics)
+    println("iteration=$(iteration) bound=$(bound) simulation_value=$(simulation_value)")
+    flush(stdout)
+    # Returning false means this rule never asks SDDP to terminate.
+    return false
+end
+
+# Parse the case folder and scale every load's `qd` and `pd` entries by 0.6.
+function load_case_data()
+    case_data = HydroPowerModels.parse_folder(CASE_DIR)
+    for bus_load in values(case_data[1]["powersystem"]["load"]), key in ("qd", "pd")
+        bus_load[key] *= 0.6
+    end
+    return case_data
+end
+
+function main()  # train SDDP, save the cuts, simulate the policy, log the mean cost
+    println("Run: ", save_file)
+    println("Case directory: ", CASE_DIR)
+    println("Formulation: ", FORMULATION, " with Clarabel")
+    # Seed once up front; the RNG is re-seeded again before the final simulation.
+    Random.seed!(SEED)
+    mkpath(dirname(CUTS_FILE))
+    alldata = load_case_data()
+    lg = WandbLogger(;
+        project="RL",
+        name=save_file,
+        save_code=false,
+        config=Dict(
+            "case_name" => CASE,
+            "training_method" => "sddp_consistent",
+            "formulation" => string(FORMULATION),
+            "solver" => "Clarabel",
+            "num_stages" => NUM_STAGES,
+            "rm_stages" => RM_STAGES,
+            "iteration_limit" => ITERATION_LIMIT,
+            "num_simulations" => NUM_SIMULATIONS,
+            "stat_replications" => STAT_REPLICATIONS,
+            "stat_period" => STAT_PERIOD,
+            "seed" => SEED,
+        ),
+    )
+    params = create_param(;
+        stages=NUM_STAGES,
+        model_constructor_grid=FORMULATION,
+        post_method=PowerModels.build_opf,
+        optimizer=clarabel_optimizer,
+    )
+    model = hydro_thermal_operation(alldata, params)
+    # Warm-start from cuts saved by a previous run, if the file exists.
+    if isfile(CUTS_FILE)
+        println("Loading existing cuts: ", CUTS_FILE)
+        SDDP.read_cuts_from_file(model.forward_graph, CUTS_FILE)
+    end
+    # WandBLog only logs/checkpoints; Statistical is added only when enabled.
+    stopping_rules = SDDP.AbstractStoppingRule[WandBLog(CUTS_FILE, lg)]
+    if STAT_REPLICATIONS > 0
+        push!(
+            stopping_rules,
+            SDDP.Statistical(;
+                num_replications=STAT_REPLICATIONS,
+                iteration_period=STAT_PERIOD,
+            ),
+        )
+    end
+    # Wall-clock time of the training call only.
+    start_time = time()
+    HydroPowerModels.train(
+        model;
+        iteration_limit=ITERATION_LIMIT,
+        stopping_rules=stopping_rules,
+    )
+    elapsed = time() - start_time
+    bound = SDDP.calculate_bound(model.forward_graph)
+    println("Termination status: ", SDDP.termination_status(model.forward_graph))
+    println("Elapsed seconds: ", elapsed)
+    println("Bound: ", bound)
+    # Persist the final cuts to the same file the WandBLog rule writes to.
+    SDDP.write_cuts_to_file(model.forward_graph, CUTS_FILE)
+    println("Saved cuts: ", CUTS_FILE)
+    # Re-seed before sampling the evaluation simulations.
+    Random.seed!(SEED)
+    results = HydroPowerModels.simulate(model, NUM_SIMULATIONS)
+    objective_values = [  # per simulation: cost of the first NUM_STAGES - RM_STAGES stages
+        sum(results[:simulations][i][t][:stage_objective] for t in 1:(NUM_STAGES - RM_STAGES))
+        for i in 1:length(results[:simulations])
+    ]
+    final_loss = mean(objective_values)
+    println("Mean Sim: ", final_loss)
+    Wandb.log(
+        lg,
+        Dict(
+            "batch" => ITERATION_LIMIT,  # NOTE(review): configured limit, not iterations run
+            "metrics/loss" => bound,
+            "metrics/final_loss" => final_loss,
+            "metrics/rollout_realized_objective_no_deficit" => final_loss,
+            "metrics/final_rollout_realized_objective_no_deficit" => final_loss,
+            "metrics/elapsed_seconds" => elapsed,
+        ),
+    )
+    close(lg)
+end
+
+main()
diff --git a/examples/HydroPowerModels/sddp/run_sddp_inconsistent.jl b/examples/HydroPowerModels/sddp/run_sddp_inconsistent.jl
new file mode 100644
index 0000000..0035912
--- /dev/null
+++ b/examples/HydroPowerModels/sddp/run_sddp_inconsistent.jl
@@ -0,0 +1,196 @@
+# SDDP baseline with inconsistent formulations: train SDDP with a convex
+# SOCWRConic backward-pass formulation and an AC forward-pass formulation,
+# then simulate the resulting policy under the AC model.
+#
+# Environment overrides for local smoke tests:
+#   DR_SDDP_ITERATION_LIMIT=2
+#   DR_SDDP_SIMULATIONS=2
+#   DR_SDDP_STAT_REPLICATIONS=2
+#   DR_SDDP_STAT_PERIOD=1
+
+using Clarabel
+using HydroPowerModels
+using JuMP
+using Logging
+using MadNLP
+using PowerModels
+using Random
+using SDDP
+using Statistics
+using Wandb, Dates
+
+const SEED = parse(Int, get(ENV, "DR_SDDP_SEED", "1221"))
+const CASE = get(ENV, "DR_SDDP_CASE", "bolivia")
+const HYDRO_DIR = dirname(@__DIR__)
+const CASE_DIR = joinpath(HYDRO_DIR, CASE)
+const RM_STAGES = parse(Int, get(ENV, "DR_SDDP_RM_STAGES", "30"))
+const NUM_STAGES = parse(Int, get(ENV, "DR_SDDP_NUM_STAGES", string(96 + RM_STAGES)))
+const ITERATION_LIMIT = parse(Int, get(ENV, "DR_SDDP_ITERATION_LIMIT", "2000"))
+const NUM_SIMULATIONS = parse(Int, get(ENV, "DR_SDDP_SIMULATIONS", "300"))
+const STAT_REPLICATIONS = parse(Int, get(ENV, "DR_SDDP_STAT_REPLICATIONS", "300"))
+const STAT_PERIOD = parse(Int, get(ENV, "DR_SDDP_STAT_PERIOD", "200"))
+
+const FORMULATION_BACKWARD = SOCWRConicPowerModel
+const FORMULATION_FORWARD = ACPPowerModel
+const save_file = "SDDP-$(CASE)-$(FORMULATION_FORWARD)-$(FORMULATION_BACKWARD)-h$(NUM_STAGES)-$(Dates.now())"
+const CUTS_DIR = joinpath(CASE_DIR, string(FORMULATION_FORWARD))
+const CUTS_FILE = joinpath(
+    CUTS_DIR,
+    string(FORMULATION_BACKWARD) * "-" * string(FORMULATION_FORWARD) * ".cuts.json",
+)
+
+# Build a quiet Clarabel optimizer; iteration cap and tolerance are read from ENV.
+function clarabel_optimizer()
+    iter_cap = parse(Int, get(ENV, "DR_SDDP_CLARABEL_MAX_ITER", "1000"))
+    # A single tolerance is applied to the gap (abs/rel) and feasibility criteria.
+    tol = parse(Float64, get(ENV, "DR_SDDP_CLARABEL_TOL", "1e-7"))
+    return Clarabel.Optimizer(;
+        verbose=false, max_iter=iter_cap, tol_gap_abs=tol, tol_gap_rel=tol, tol_feas=tol
+    )
+end
+
+# Forward-pass MadNLP factory. NOTE(review): confirm MadNLP accepts an Int print_level.
+function madnlp_optimizer()
+    level = parse(Int, get(ENV, "DR_SDDP_MADNLP_PRINT_LEVEL", "0"))
+    return MadNLP.Optimizer(; print_level=level)
+end
+
+mutable struct WandBLog <: SDDP.AbstractStoppingRule  # logging/checkpoint hook
+    cuts_file::String  # file the cuts are written to by `SDDP.convergence_test`
+    lg  # logger handle passed to `Wandb.log`
+end
+
+SDDP.stopping_rule_status(::WandBLog) = :not_solved
+
+# Side-effect-only stopping rule: checkpoint the cuts, log progress, never stop.
+function SDDP.convergence_test(
+    policy::SDDP.PolicyGraph,
+    log::Vector{SDDP.Log},
+    rule::WandBLog,
+)
+    mkpath(dirname(rule.cuts_file))
+    SDDP.write_cuts_to_file(policy, rule.cuts_file)
+    iteration = length(log)
+    bound = log[end].bound
+    simulation_value = log[end].simulation_value
+    metrics = Dict(
+        "batch" => iteration,
+        "metrics/loss" => bound,
+        "metrics/rollout_realized_objective_no_deficit" => simulation_value,
+    )
+    Wandb.log(rule.lg, metrics)
+    println("iteration=$(iteration) bound=$(bound) simulation_value=$(simulation_value)")
+    flush(stdout)
+    # Returning false means this rule never asks SDDP to terminate.
+    return false
+end
+
+# Parse the case folder and scale every load's `qd` and `pd` entries by 0.6.
+function load_case_data()
+    case_data = HydroPowerModels.parse_folder(CASE_DIR)
+    for bus_load in values(case_data[1]["powersystem"]["load"]), key in ("qd", "pd")
+        bus_load[key] *= 0.6
+    end
+    return case_data
+end
+
+function main()  # train SDDP, save the cuts, simulate the policy, log the mean cost
+    println("Run: ", save_file)
+    println("Case directory: ", CASE_DIR)
+    println("Stages: ", NUM_STAGES, " (reporting first ", NUM_STAGES - RM_STAGES, ")")
+    println("Backward formulation: ", FORMULATION_BACKWARD, " with Clarabel")
+    println("Forward formulation: ", FORMULATION_FORWARD, " with MadNLP")
+    println("Iteration limit: ", ITERATION_LIMIT)
+    println("Simulations: ", NUM_SIMULATIONS)
+    # Seed once up front; the RNG is re-seeded again before the final simulation.
+    Random.seed!(SEED)
+    mkpath(CUTS_DIR)
+    alldata = load_case_data()
+    lg = WandbLogger(;
+        project="RL",
+        name=save_file,
+        save_code=false,
+        config=Dict(
+            "case_name" => CASE,
+            "training_method" => "sddp_inconsistent",
+            "backward_formulation" => string(FORMULATION_BACKWARD),
+            "forward_formulation" => string(FORMULATION_FORWARD),
+            "backward_solver" => "Clarabel",
+            "forward_solver" => "MadNLP",
+            "num_stages" => NUM_STAGES,
+            "rm_stages" => RM_STAGES,
+            "iteration_limit" => ITERATION_LIMIT,
+            "num_simulations" => NUM_SIMULATIONS,
+            "stat_replications" => STAT_REPLICATIONS,
+            "stat_period" => STAT_PERIOD,
+            "seed" => SEED,
+        ),
+    )
+    # Backward formulation is solved with Clarabel, forward formulation with MadNLP.
+    params = create_param(;
+        stages=NUM_STAGES,
+        model_constructor_grid=FORMULATION_BACKWARD,
+        model_constructor_grid_forward=FORMULATION_FORWARD,
+        post_method=PowerModels.build_opf,
+        optimizer=clarabel_optimizer,
+        optimizer_forward=madnlp_optimizer,
+    )
+
+    model = hydro_thermal_operation(alldata, params)
+    # Warm-start from cuts saved by a previous run, if the file exists.
+    if isfile(CUTS_FILE)
+        println("Loading existing cuts: ", CUTS_FILE)
+        SDDP.read_cuts_from_file(model.forward_graph, CUTS_FILE)
+    end
+    # WandBLog only logs/checkpoints; Statistical is added only when enabled.
+    stopping_rules = SDDP.AbstractStoppingRule[WandBLog(CUTS_FILE, lg)]
+    if STAT_REPLICATIONS > 0
+        push!(
+            stopping_rules,
+            SDDP.Statistical(;
+                num_replications=STAT_REPLICATIONS,
+                iteration_period=STAT_PERIOD,
+            ),
+        )
+    end
+    # Wall-clock time of the training call only.
+    start_time = time()
+    HydroPowerModels.train(
+        model;
+        iteration_limit=ITERATION_LIMIT,
+        stopping_rules=stopping_rules,
+    )
+    elapsed = time() - start_time
+
+    status = SDDP.termination_status(model.forward_graph)
+    bound = SDDP.calculate_bound(model.forward_graph)
+    println("Termination status: ", status)
+    println("Elapsed seconds: ", elapsed)
+    println("Bound: ", bound)
+    # Persist the final cuts to the same file the WandBLog rule writes to.
+    SDDP.write_cuts_to_file(model.forward_graph, CUTS_FILE)
+    println("Saved cuts: ", CUTS_FILE)
+    # Re-seed before sampling the evaluation simulations.
+    Random.seed!(SEED)
+    results = HydroPowerModels.simulate(model, NUM_SIMULATIONS)
+    objective_values = [  # per simulation: cost of the first NUM_STAGES - RM_STAGES stages
+        sum(results[:simulations][i][t][:stage_objective] for t in 1:(NUM_STAGES - RM_STAGES))
+        for i in 1:length(results[:simulations])
+    ]
+    final_loss = mean(objective_values)
+    println("Mean Sim: ", final_loss)
+    Wandb.log(
+        lg,
+        Dict(
+            "batch" => ITERATION_LIMIT,  # NOTE(review): configured limit, not iterations run
+            "metrics/loss" => bound,
+            "metrics/final_loss" => final_loss,
+            "metrics/rollout_realized_objective_no_deficit" => final_loss,
+            "metrics/final_rollout_realized_objective_no_deficit" => final_loss,
+            "metrics/elapsed_seconds" => elapsed,
+        ),
+    )
+    close(lg)
+end
+
+main()
diff --git a/examples/HydroPowerModels/simulate_sddp_policy.jl b/examples/HydroPowerModels/sddp/simulate_sddp_policy.jl
similarity index 66%
rename from examples/HydroPowerModels/simulate_sddp_policy.jl
rename to examples/HydroPowerModels/sddp/simulate_sddp_policy.jl
index 76f8aca..a392de6 100644
--- a/examples/HydroPowerModels/simulate_sddp_policy.jl
+++ b/examples/HydroPowerModels/sddp/simulate_sddp_policy.jl
@@ -1,20 +1,18 @@
 # Simulate a pre-trained SDDP policy (cuts from run_sddp_inconsistent.jl) under
 # the ACP formulation and produce comparison plots/CSVs against TS-DDR baselines.
-# Requires HydroPowerModels.jl, MadNLP, Gurobi, Mosek.
-using MosekTools
 using MadNLP
 using HydroPowerModels
 using JuMP
+using PowerModels
 using Statistics
 using SDDP: SDDP
-using Gurobi
 
 using Random
 seed = 1221
 
 # Load case
 case = "bolivia"
-case_dir = joinpath(dirname(@__FILE__), case)
+case_dir = joinpath(dirname(@__DIR__), case)
 alldata = HydroPowerModels.parse_folder(case_dir);
 for load in values(alldata[1]["powersystem"]["load"])
     load["qd"] = load["qd"] * 0.6
@@ -29,7 +27,7 @@ params = create_param(;
     stages=num_stages,
     model_constructor_grid=formulation,
     post_method=PowerModels.build_opf,
-    optimizer=() -> MadNLP.Optimizer(; print_level=MadNLP.INFO),
+    optimizer=() -> MadNLP.Optimizer(; print_level=0),
 );
 
 m = hydro_thermal_operation(alldata, params);
@@ -56,9 +54,14 @@ using CSV
 using DataFrames
 volume_to_mw(volume, stage_hours; k=0.0036) = volume / (k * stage_hours)
 
-labels = ["TS-DDR"; "TS-LDR"; "SDDP-DCLL"]
-colors = [:black :purple :red]
-markers = [:hline :+ :pixel]
+const SDDP_COL = "SDDP-SOC"
+labels = ["TS-DDR"; "TS-LDR"; "SDDP-DCLL"; SDDP_COL]
+colors = [:black :purple :red :orange]
+markers = [:hline :+ :pixel :diamond]
+
+const DOCS_ASSETS = joinpath(dirname(@__DIR__), "..", "..", "docs", "src", "assets")
+mkpath(DOCS_ASSETS)
+out_dir = joinpath(case_dir, string(formulation))
 
 # Volume trajectory
 hydro_gen = [
@@ -78,30 +81,24 @@ savefig(
         ylabel="Volume (Hm3)",
         title="$(case)-$(formulation_b)-$(formulation)",
     ),
-    joinpath(
-        case_dir,
-        string(formulation),
-        "SDDP-$(case)-$(formulation_b)-$(formulation)-Volume.png",
-    ),
-)
-
-df = CSV.read(
-    joinpath(case_dir, string(formulation), "MeanVolume.csv"), DataFrame; header=true
+    joinpath(out_dir, "SDDP-$(case)-$(formulation_b)-$(formulation)-Volume.png"),
 )
-df[!, "$(string(formulation_b))"] = hydro_gen
 
-CSV.write(joinpath(case_dir, string(formulation), "MeanVolume.csv"), df)
+df = CSV.read(joinpath(out_dir, "MeanVolume.csv"), DataFrame; header=true)
+df[!, SDDP_COL] = hydro_gen
+CSV.write(joinpath(out_dir, "MeanVolume.csv"), df)
 
 savefig(
     plot(
         Matrix(df[!, labels]);
-        labels=permutedims(names(df[!, labels])),
+        labels=permutedims(labels),
         xlabel="Stage",
         ylabel="Expected Volume (MWh)",
         color=colors,
         shape=markers,
+        title="Reservoir Volume Comparison",
     ),
-    joinpath(case_dir, string(formulation), "DCLL-Comparison-$(case)-Volume.png"),
+    joinpath(DOCS_ASSETS, "hydro_volume_comparison.png"),
 )
 
 # Thermal generation
@@ -125,28 +122,24 @@ savefig(
         ylabel="Mwh",
         title="Thermal-Generation $(case)-$(formulation_b)-$(formulation)",
     ),
-    joinpath(
-        case_dir,
-        string(formulation),
-        "SDDP-$(case)-$(formulation_b)-$(formulation)-thermal.png",
-    ),
+    joinpath(out_dir, "SDDP-$(case)-$(formulation_b)-$(formulation)-thermal.png"),
 )
 
-df = CSV.read(joinpath(case_dir, string(formulation), "MeanGeneration.csv"), DataFrame)
-df[!, "SOC"] = thermal_gen
-
-CSV.write(joinpath(case_dir, string(formulation), "MeanGeneration.csv"), df)
+df = CSV.read(joinpath(out_dir, "MeanGeneration.csv"), DataFrame)
+df[!, SDDP_COL] = thermal_gen
+CSV.write(joinpath(out_dir, "MeanGeneration.csv"), df)
 
 savefig(
     plot(
         Matrix(df[!, labels]);
-        labels=permutedims(names(df[!, labels])),
+        labels=permutedims(labels),
         xlabel="Stage",
         ylabel="Expected Thermal Generation (MWh)",
         color=colors,
         shape=markers,
+        title="Thermal Generation Comparison",
     ),
-    joinpath(case_dir, string(formulation), "DCLL-Comparison-$(case)-thermal.png"),
+    joinpath(DOCS_ASSETS, "hydro_generation_comparison.png"),
 )
 
 # Objective costs
@@ -155,9 +148,14 @@ objective_values = [
     for i in 1:length(results[:simulations])
 ]
 
-df = CSV.read(joinpath(case_dir, string(formulation), "costs.csv"), DataFrame)
-df[!, "SDDP_SOC"] = objective_values
-
-CSV.write(joinpath(case_dir, string(formulation), "costs.csv"), df)
+costs_file = joinpath(out_dir, "costs.csv")
+# Reuse the existing table when present so its other columns are kept;
+# otherwise start from an empty frame.
+df = isfile(costs_file) ? CSV.read(costs_file, DataFrame) : DataFrame()
+# Adds the SDDP column, or replaces it when the file already contains one.
+df[!, SDDP_COL] = objective_values
+
+CSV.write(costs_file, df)
 
 println("Mean Sim: ", mean(objective_values))
+println("Std  Sim: ", std(objective_values))
diff --git a/examples/HydroPowerModels/test_dr_hydropowermodels.jl b/examples/HydroPowerModels/test_dr_hydropowermodels.jl
deleted file mode 100644
index 79aebf8..0000000
--- a/examples/HydroPowerModels/test_dr_hydropowermodels.jl
+++ /dev/null
@@ -1,264 +0,0 @@
-# Evaluate a pre-trained TS-DDR policy on the Bolivia LTHD problem and produce
-# comparison plots (volume, generation, cost) against SDDP baselines.
-# Requires a trained model .jld2 in the case/formulation/models/ directory.
-using Statistics
-using Random
-using Flux
-using DecisionRules
-using Gurobi
-using MosekTools
-using Ipopt
-using MathOptSymbolicAD: MathOptSymbolicAD
-using JLD2
-using HydroPowerModels
-using DiffOpt
-
-HydroPowerModels_dir = dirname(@__FILE__)
-include(joinpath(HydroPowerModels_dir, "load_hydropowermodels.jl"))
-
-function non_ensurance(x_out, x_in, uncertainty, max_volume)
-    return x_out
-end
-
-# Parameters
-case_name = "bolivia"
-formulation = "DCPPowerModel"
-num_stages = 96
-model_dir = joinpath(HydroPowerModels_dir, case_name, formulation, "models")
-model_file = readdir(model_dir; join=true)[end]
-save_name = split(split(model_file, "/")[end], ".")[1]
-formulation_file = formulation * ".mof.json"
-dense = Dense
-activation = DecisionRules.identity
-layers = Int64[32, 32]
-ensure_feasibility = non_ensurance
-optimizer = Flux.Adam(0.01)
-
-data = HydroPowerModels.parse_folder(joinpath(HydroPowerModels_dir, case_name))[1];
-HydroPowerModels.gather_useful_info!(data)
-# Build MSP
-
-subproblems, state_params_in, state_params_out, uncertainty_samples, initial_state, max_volume = build_hydropowermodels(
-    joinpath(HydroPowerModels_dir, case_name), formulation_file; num_stages=num_stages
-)
-
-det_equivalent = DiffOpt.nonlinear_diff_model(
-    optimizer_with_attributes(
-        Ipopt.Optimizer,
-        "print_level" => 0,
-        "linear_solver" => "mumps",
-    ),
-)
-
-det_equivalent, uncertainty_samples = DecisionRules.deterministic_equivalent!(
-    det_equivalent,
-    subproblems,
-    state_params_in,
-    state_params_out,
-    initial_state,
-    uncertainty_samples,
-)
-
-num_hydro = length(initial_state)
-
-# Build Model
-models = dense_multilayer_nn(
-    num_hydro, num_hydro, layers; activation=activation, dense=dense
-)
-model = models
-opt_state = Flux.setup(optimizer, model)
-x = randn(num_hydro, 1)
-y = rand(num_hydro, 1)
-train_set = [(x, y)]
-Flux.train!(model, train_set, opt_state) do m, x, y
-    return Flux.mse(m(x), y)
-end
-models = model
-model_state = JLD2.load(model_file, "model_state")
-Flux.loadmodel!(model, model_state)
-
-Random.seed!(1221)
-num_samples = 100
-objective_values = Vector{Float64}(undef, num_samples)
-states = Vector{Any}(undef, num_samples)
-inflows = Array{Float64,3}(undef, num_samples, num_hydro, num_stages)
-record_variables_names = ["0_pg", "norm_deficit"]
-record_variables = Dict{String,Any}()
-record = Dict{String,Array{Float64,3}}()
-for _var in record_variables_names
-    num_vars = length(find_variables(det_equivalent, [_var; r"#1$"]))
-    record[_var] = Array{Float64,3}(undef, num_samples, num_vars, num_stages)
-    record_variables[_var] = [
-        find_variables(det_equivalent, [_var; Regex("#$i\$")]) for i in 1:num_stages
-    ]
-end
-for i in 1:num_samples
-    Flux.reset!(models)
-    uncertainty_s = sample(uncertainty_samples)
-    for j in 1:num_hydro, t in 1:num_stages
-        inflow_var = collect(keys(uncertainty_s[t]))
-        inflow_var = inflow_var[findfirst(
-            x -> occursin("_inflow[$j]", JuMP.name(x)), inflow_var
-        )]
-        inflows[i, j, t] = uncertainty_s[t][inflow_var]
-    end
-    simulate_multistage(
-        det_equivalent,
-        state_params_in,
-        state_params_out,
-        initial_state,
-        uncertainty_s,
-        models;
-        ensure_feasibility=(x_out, x_in, _sa) ->
-            ensure_feasibility(x_out, x_in, _sa, max_volume),
-    )
-    objective_values[i] = DecisionRules.get_objective_no_target_deficit(det_equivalent)
-    for _var in record_variables_names
-        num_vars = length(find_variables(det_equivalent, [_var; r"#1$"]))
-        for j in 1:num_vars, t in 1:num_stages
-            record[_var][i, j, t] = value(record_variables[_var][t][j])
-        end
-    end
-    states[i] = Vector{Vector{Float64}}(undef, num_hydro)
-    for j in 1:num_hydro
-        states[i][j] = Vector{Float64}(undef, num_stages+1)
-        states[i][j][1] = initial_state[j]
-        for t in 1:num_stages
-            states[i][j][t + 1] = value(state_params_out[t][j][2])
-        end
-    end
-end
-
-# Plot Volumes
-
-using Plots
-using Statistics
-using DataFrames
-using CSV
-
-plt = plot(
-    1:(num_stages + 1),
-    [sum([states[1][j][t] for j in 1:num_hydro]) for t in 1:(num_stages + 1)];
-    legend=false,
-    xlabel="Stage",
-    ylabel="Volume (Hm3)",
-    title="$(case_name)-$(formulation)",
-);
-for i in 2:num_samples
-    plot!(
-        plt,
-        1:(num_stages + 1),
-        [sum([states[i][j][t] for j in 1:num_hydro]) for t in 1:(num_stages + 1)],
-    );
-end
-savefig(
-    plt, joinpath(HydroPowerModels_dir, case_name, formulation, save_name * "Volume.png")
-)
-
-# Plot Mean Volume
-volume_to_mw(volume, stage_hours; k=0.0036) = volume / (k * stage_hours)
-
-plt = plot(
-    1:(num_stages + 1),
-    [
-        mean(sum([states[i][j][t] for j in 1:num_hydro]) for i in 1:num_samples) for
-        t in 1:(num_stages + 1)
-    ];
-    xlabel="Stage",
-    ylabel="Volume (Hm3)",
-    label="Mean Volume",
-    title="$(case_name)-$(formulation)",
-);
-savefig(
-    plt,
-    joinpath(HydroPowerModels_dir, case_name, formulation, save_name * "MeanVolume.png"),
-)
-
-df = DataFrame(;
-    ML_Rule=[
-        mean(
-            sum([volume_to_mw(states[i][j][t], 1) for j in 1:num_hydro]) for
-            i in 1:num_samples
-        ) for t in 2:(num_stages + 1)
-    ],
-)
-
-df = CSV.read(
-    joinpath(HydroPowerModels_dir, case_name, formulation, "MeanVolume.csv"),
-    DataFrame;
-    header=true,
-)
-df[!, "TS-LDR"] = [
-    mean(
-        sum([volume_to_mw(states[i][j][t], 1) for j in 1:num_hydro]) for i in 1:num_samples
-    ) for t in 2:(num_stages + 1)
-]
-
-CSV.write(joinpath(HydroPowerModels_dir, case_name, formulation, "MeanVolume.csv"), df)
-
-# Plot Mean Inflows
-
-plt = plot(
-    1:num_stages,
-    [
-        mean(sum(inflows[i, j, t] for j in 1:num_hydro) for i in 1:num_samples) for
-        t in 1:num_stages
-    ];
-    xlabel="Stage",
-    ylabel="Inflow (Hm3)",
-    label="Mean Inflow",
-    title="$(case_name)-$(formulation)",
-);
-savefig(
-    plt,
-    joinpath(HydroPowerModels_dir, case_name, formulation, save_name * "MeanInflow.png"),
-)
-
-# Plot Generation
-num_gen = size(record["0_pg"], 2)
-hydro_idx = HydroPowerModels.idx_hydro(data)
-
-thermal_gen = [
-    mean(
-        sum(record["0_pg"][i, j, t] * 100 for j in 1:num_gen if !(j in hydro_idx)) for
-        i in 1:num_samples
-    ) for t in 1:num_stages
-]
-
-plt = plot(
-    1:num_stages,
-    thermal_gen;
-    xlabel="Stage",
-    ylabel="Generation (MW)",
-    label="Mean Generation",
-    title="$(case_name)-$(formulation)",
-);
-savefig(
-    plt,
-    joinpath(
-        HydroPowerModels_dir, case_name, formulation, save_name * "MeanGeneration.png"
-    ),
-)
-
-df = DataFrame(; ML_Rule=thermal_gen)
-
-df = CSV.read(
-    joinpath(HydroPowerModels_dir, case_name, formulation, "MeanGeneration.csv"),
-    DataFrame;
-    header=true,
-)
-df[!, "TS-LDR"] = thermal_gen
-
-CSV.write(joinpath(HydroPowerModels_dir, case_name, formulation, "MeanGeneration.csv"), df)
-
-# objective
-df = CSV.read(
-    joinpath(HydroPowerModels_dir, case_name, formulation, "costs.csv"),
-    DataFrame;
-    header=true,
-)
-df[!, "TS-LDR"] = objective_values
-
-mean((df[!, "SDDP_SOC"] .- df[!, "TS-DDR"]) * 100 ./ df[!, "TS-DDR"])
-
-CSV.write(joinpath(HydroPowerModels_dir, case_name, formulation, "costs.csv"), df)
diff --git a/examples/HydroPowerModels/test_sampling_consistency.jl b/examples/HydroPowerModels/test_sampling_consistency.jl
new file mode 100644
index 0000000..0bbe997
--- /dev/null
+++ b/examples/HydroPowerModels/test_sampling_consistency.jl
@@ -0,0 +1,166 @@
+# test_sampling_consistency.jl
+#
+# Verifies that DecisionRules.jl and the ExaModels companion package
+# (DecisionRulesExa.jl) sample from the exact same inflow distribution
+# as SDDP.jl for the Bolivia HydroPowerModels case.
+#
+# All three systems read the same inflows.csv and hydro.json files.
+# The sampling contract is:
+#
+#   At each stage t, draw one scenario index ω ∈ {1, …, nScenarios}
+#   uniformly at random. All hydro reservoirs receive the inflow from
+#   column ω of the historical data for their respective row t.
+#   Stages are sampled independently (no temporal correlation).
+#
+# This is SDDP.jl's `SDDP.parameterize` semantics: one ω per node,
+# applied to all random variables in that node.
+#
+# What this script checks:
+#   1. Both loaders parse inflows.csv into identical per-reservoir matrices.
+#   2. Both samplers produce draws from the same support (only historically
+#      observed joint vectors, never cross-scenario combinations).
+#   3. With the same RNG seed, both produce identical trajectories.
+#
+# Usage:
+#   julia --project=. test_sampling_consistency.jl           (from this dir)
+#   julia --project=. test_sampling_consistency.jl /path/to/DecisionRulesExa.jl/examples/HydroPowerModels
+
+using Test
+using Random
+using CSV, Tables, JSON
+
+# ── Paths ────────────────────────────────────────────────────────────────────
+
+const SCRIPT_DIR = dirname(@__FILE__)  # directory containing this script
+const CASE_DIR = joinpath(SCRIPT_DIR, "bolivia")
+const INFLOW_FILE = joinpath(CASE_DIR, "inflows.csv")  # parsed by both inflow readers below
+const HYDRO_FILE = joinpath(CASE_DIR, "hydro.json")  # its "Hydrogenerators" list gives nHyd
+# NOTE(review): EXA_DIR is never read in this script (both loaders are inlined copies) - confirm it is still needed.
+const EXA_DIR = length(ARGS) >= 1 ? ARGS[1] :
+    joinpath(dirname(dirname(dirname(SCRIPT_DIR))),
+             "..", "DecisionRulesExa.jl", "examples", "HydroPowerModels")
+
+# ── 1. Parse inflows with both loaders ───────────────────────────────────────
+
+# DecisionRules.jl loader (load_hydropowermodels.jl::read_inflow)
+function dr_read_inflow(file, nHyd; num_stages=nothing)
+    # Rows are stages; columns are grouped per reservoir, nCen scenario columns each.
+    raw = CSV.read(file, Tables.matrix; header=false)
+    rows, cols = size(raw)
+    nCen = Int(floor(cols / nHyd))
+    if isnothing(num_stages)
+        num_stages = rows
+    elseif num_stages > rows
+        # Horizon longer than the record: stack whole copies until it is covered.
+        raw = repeat(raw, div(num_stages, rows) + 1)
+    end
+    return [raw[1:num_stages, ((i - 1) * nCen + 1):(i * nCen)] for i in 1:nHyd], nCen, num_stages
+end
+
+# ExaModels loader (hydro_power_data.jl::load_hydro_data, inflow portion only)
+function exa_read_inflow(file, nHyd; num_stages=nothing)
+    table = CSV.read(file, Tables.matrix; header=false)
+    nrows, ncols = size(table)
+    nScenarios = div(ncols, nHyd)
+    nStagesSample = something(num_stages, nrows)  # default to every row in the file
+    if nStagesSample > nrows
+        # Requested horizon exceeds the record: stack whole copies, then trim below.
+        table = repeat(table, div(nStagesSample, nrows) + 1)
+    end
+    table = table[1:nStagesSample, :]
+    blocks = (((r - 1) * nScenarios + 1):(r * nScenarios) for r in 1:nHyd)
+    return [Float64.(table[:, cols]) for cols in blocks], nScenarios, nStagesSample
+end
+
+# ── 2. Sampler implementations (extracted, no package dependencies) ──────────
+
+function dr_sample_joint(vector_inflows, nCen, T)
+    path = Vector{Vector{Float64}}(undef, T)
+    for stage in eachindex(path)
+        # One scenario column per stage, shared by every reservoir matrix.
+        col = rand(1:nCen)
+        path[stage] = [inflow[stage, col] for inflow in vector_inflows]
+    end
+    return path
+end
+
+function exa_sample_scenario(scenario_inflows, nScenarios, T)
+    nHyd = length(scenario_inflows)
+    period = size(scenario_inflows[1], 1)
+    flat = Vector{Float64}(undef, T * nHyd)
+    for t in 1:T
+        row = mod1(t, period)       # wrap around when T exceeds the stored stages
+        scen = rand(1:nScenarios)   # one scenario per stage, shared by all reservoirs
+        for (r, inflow) in enumerate(scenario_inflows)
+            flat[(t - 1) * nHyd + r] = inflow[row, scen]  # stage-major flat layout
+        end
+    end
+    return flat
+end
+
+# ── Tests ────────────────────────────────────────────────────────────────────
+
+hydro_json = JSON.parsefile(HYDRO_FILE)["Hydrogenerators"]
+nHyd = length(hydro_json)
+T = 96
+
+@testset "Sampling consistency: DecisionRules vs Exa vs SDDP" begin
+    dr_inflows, dr_nCen, dr_T = dr_read_inflow(INFLOW_FILE, nHyd; num_stages=T)
+    exa_inflows, exa_nScen, exa_T = exa_read_inflow(INFLOW_FILE, nHyd; num_stages=T)
+
+    @testset "identical inflow matrices" begin
+        @test dr_nCen == exa_nScen
+        @test dr_T == exa_T
+        for r in 1:nHyd
+            @test dr_inflows[r] == exa_inflows[r]
+        end
+    end
+
+    @testset "same seed → identical trajectories" begin
+        for seed in [42, 123, 9999]
+            Random.seed!(seed)
+            dr_traj = dr_sample_joint(dr_inflows, dr_nCen, T)
+
+            Random.seed!(seed)
+            exa_flat = exa_sample_scenario(exa_inflows, exa_nScen, T)
+
+            for t in 1:T
+                for r in 1:nHyd
+                    @test dr_traj[t][r] == exa_flat[(t-1)*nHyd + r]
+                end
+            end
+        end
+    end
+
+    @testset "samples are always from historical scenarios (joint)" begin
+        valid_vectors = Set{Vector{Float64}}()
+        for t in 1:T, ω in 1:dr_nCen
+            push!(valid_vectors, [dr_inflows[r][t, ω] for r in 1:nHyd])
+        end
+
+        Random.seed!(42)
+        for _ in 1:500
+            traj = dr_sample_joint(dr_inflows, dr_nCen, T)
+            for stage_vec in traj
+                @test stage_vec in valid_vectors
+            end
+        end
+    end
+
+    @testset "uniform coverage of all scenarios" begin
+        Random.seed!(42)
+        N = 10_000
+        counts = zeros(Int, dr_nCen)
+        for _ in 1:N
+            counts[rand(1:dr_nCen)] += 1
+        end
+        expected = 1.0 / dr_nCen
+        # 5-sigma binomial band: a fixed 0.03 lets a never-drawn scenario pass once 1/nCen <= 0.03.
+        tol = 5 * sqrt(expected * (1 - expected) / N)
+        for ω in 1:dr_nCen
+            @test abs(counts[ω] / N - expected) <= tol  # `<=` keeps the nCen == 1 case (tol == 0) passing
+        end
+    end
+end
+
+println("\nAll sampling consistency tests passed.")
diff --git a/examples/HydroPowerModels/train_dr_hydropowermodels.jl b/examples/HydroPowerModels/train_dr_hydropowermodels.jl
index 7b3f2e3..ad6876c 100644
--- a/examples/HydroPowerModels/train_dr_hydropowermodels.jl
+++ b/examples/HydroPowerModels/train_dr_hydropowermodels.jl
@@ -32,30 +32,36 @@ end
 # Parameters
 case_name = "bolivia"                    # bolivia, case3
 formulation = "ACPPowerModel"            # SOCWRConicPowerModel, DCPPowerModel, ACPPowerModel
-num_stages = 96                          # 96, 48
+num_stages = parse(Int, get(ENV, "DR_NUM_STAGES", "126"))
 model_dir = joinpath(HydroPowerModels_dir, case_name, formulation, "models")
 mkpath(model_dir)
 solver_tag = USE_GPU ? "gpu" : "cpu"
-save_file = "$(case_name)-$(formulation)-h$(num_stages)-deteq-$(solver_tag)-$(now())"
 formulation_file = formulation * ".mof.json"
 
 # Training parameters
-num_epochs = 40
+num_epochs = parse(Int, get(ENV, "DR_NUM_EPOCHS", "80"))
 num_batches = 100
 _num_train_per_batch = 1
 activation = sigmoid                     # tanh, identity, relu, sigmoid
 layers = Int64[128, 128]
 ensure_feasibility = non_ensurance
-optimizers = [Flux.Adam()]
+grad_clip = parse(Float32, get(ENV, "DR_GRAD_CLIP", "0"))
+optimizers = if grad_clip > 0
+    [Flux.Optimisers.OptimiserChain(Flux.Optimisers.ClipGrad(grad_clip), Flux.Adam())]
+else
+    [Flux.Adam()]
+end
 pre_trained_model = nothing
 penalty_l2 = :auto
 penalty_l1 = :auto
-penalty_schedule = [
-    (1, 100, 0.1),
-    (101, 210, 1.0),
-    (211, 300, 10.0),
-    (301, num_epochs * num_batches, 30.0),
-]
+penalty_schedule = if get(ENV, "DR_PENALTY_SCHEDULE", "annealed") == "annealed"
+    :default_annealed
+else
+    nothing
+end
+clip_tag = grad_clip > 0 ? "-clip$(isinteger(grad_clip) ? Int(grad_clip) : grad_clip)" : ""  # Int(0.5f0) would throw InexactError
+sched_tag = isnothing(penalty_schedule) ? "-const" : "-anneal"  # marks whether a penalty schedule is active
+save_file = "$(case_name)-$(formulation)-h$(num_stages)-deteq-$(solver_tag)$(clip_tag)$(sched_tag)-$(now())"
 num_eval_scenarios = 4
 eval_every = 25
 
@@ -121,6 +127,7 @@ lg = WandbLogger(;
         "encoder_type" => "LSTM",
         "ensure_feasibility" => string(ensure_feasibility),
         "optimizer" => string(optimizers),
+        "grad_clip" => grad_clip,
         "training_method" => "deterministic_equivalent",
         "solver" => USE_GPU ? "MadNLP+CUDSS (GPU)" : "MadNLP (CPU)",
         "penalty_l1" => string(penalty_l1),
@@ -136,7 +143,7 @@ lg = WandbLogger(;
 )
 
 # Define Model
-num_uncertainties = length(uncertainty_samples[1])
+num_uncertainties = length(uncertainty_samples[1][1])
 models = state_conditioned_policy(
     num_uncertainties,
     num_hydro,
@@ -169,8 +176,8 @@ best_obj = mean(objective_values)
 
 model_path = joinpath(model_dir, save_file * ".jld2")
 save_control = SaveBest(best_obj, model_path)
-stall_train = StallingCriterium(100, best_obj, 0)
-stall_rollout = StallingCriterium(5, best_obj, 0)
+stall_train = StallingCriterium(num_epochs * num_batches, best_obj, 0)
+stall_rollout = StallingCriterium(num_epochs * num_batches, best_obj, 0)
 
 
 # Rollout evaluation (stage-wise subproblems, CPU)
diff --git a/examples/HydroPowerModels/train_dr_hydropowermodels_multipleshooting.jl b/examples/HydroPowerModels/train_dr_hydropowermodels_multipleshooting.jl
index 0f41213..906f8e4 100644
--- a/examples/HydroPowerModels/train_dr_hydropowermodels_multipleshooting.jl
+++ b/examples/HydroPowerModels/train_dr_hydropowermodels_multipleshooting.jl
@@ -21,27 +21,32 @@ end
 # Parameters
 case_name = "bolivia"                    # bolivia, case3
 formulation = "ACPPowerModel"            # SOCWRConicPowerModel, DCPPowerModel, ACPPowerModel
-num_stages = 96                          # 96, 48
+num_stages = parse(Int, get(ENV, "DR_NUM_STAGES", "126"))
 window_size = 12                       # 12, 6
 model_dir = joinpath(HydroPowerModels_dir, case_name, formulation, "models")
 mkpath(model_dir)
-save_file = "$(case_name)-$(formulation)-h$(num_stages)-shooting-w$(window_size)-$(now())"
 formulation_file = formulation * ".mof.json"
 
 # Training parameters
-num_epochs = 30
+num_epochs = parse(Int, get(ENV, "DR_NUM_EPOCHS", "80"))
 num_batches = 100
 _num_train_per_batch = 1
 activation = sigmoid                     # tanh, identity, relu, sigmoid
 layers = Int64[128, 128]
 ensure_feasibility = non_ensurance
-optimizers = [Flux.Adam()]
+grad_clip = parse(Float32, get(ENV, "DR_GRAD_CLIP", "0"))
+optimizers = if grad_clip > 0
+    [Flux.Optimisers.OptimiserChain(Flux.Optimisers.ClipGrad(grad_clip), Flux.Adam())]
+else
+    [Flux.Adam()]
+end
 pre_trained_model = nothing
 penalty_l2 = :auto
-penalty_l1 = nothing
-# Annealed target-penalty multipliers (relative to the :auto base above); set to `nothing`
-# to train with the constant penalties the models were built with.
-penalty_schedule = :default_annealed
+penalty_l1 = :auto
+penalty_schedule = get(ENV, "DR_PENALTY_SCHEDULE", "annealed") == "annealed" ? :default_annealed : nothing
+clip_tag = grad_clip > 0 ? "-clip$(isinteger(grad_clip) ? Int(grad_clip) : grad_clip)" : ""  # Int(0.5f0) would throw InexactError
+sched_tag = isnothing(penalty_schedule) ? "-const" : "-anneal"  # marks whether a penalty schedule is active
+save_file = "$(case_name)-$(formulation)-h$(num_stages)-shooting-w$(window_size)$(clip_tag)$(sched_tag)-$(now())"
 num_eval_scenarios = 4
 eval_every = 25
 
@@ -87,6 +92,7 @@ lg = WandbLogger(;
         "activation" => string(activation),
         "ensure_feasibility" => string(ensure_feasibility),
         "optimizer" => string(optimizers),
+        "grad_clip" => grad_clip,
         "training_method" => "multiple_shooting",
         "window_size" => string(window_size),
         "penalty_l1" => string(penalty_l1),
@@ -100,7 +106,7 @@ lg = WandbLogger(;
 
 # Define Model
 # Policy architecture: LSTM processes uncertainty, Dense combines with previous state
-num_uncertainties = length(uncertainty_samples[1])
+num_uncertainties = length(uncertainty_samples[1][1])
 models = state_conditioned_policy(
     num_uncertainties,
     num_hydro,
@@ -145,7 +151,7 @@ best_obj = mean(objective_values)
 
 model_path = joinpath(model_dir, save_file * ".jld2")
 save_control = SaveBest(best_obj, model_path)
-convergence_criterium = StallingCriterium(200, best_obj, 0)
+convergence_criterium = StallingCriterium(num_epochs * num_batches, best_obj, 0)
 
 Random.seed!(8789)
 eval_scenarios = [DecisionRules.sample(uncertainty_samples) for _ in 1:num_eval_scenarios]
@@ -170,7 +176,7 @@ train_multiple_shooting(
     models,
     initial_state,
     windows,
-    () -> uncertainty_samples;
+    uncertainty_samples;
     num_batches=num_epochs * num_batches,
     num_train_per_batch=_num_train_per_batch,
     optimizer=first(optimizers),
diff --git a/examples/HydroPowerModels/train_dr_hydropowermodels_subproblems.jl b/examples/HydroPowerModels/train_dr_hydropowermodels_subproblems.jl
index d98c575..7428293 100644
--- a/examples/HydroPowerModels/train_dr_hydropowermodels_subproblems.jl
+++ b/examples/HydroPowerModels/train_dr_hydropowermodels_subproblems.jl
@@ -21,24 +21,29 @@ end
 # Parameters
 case_name = "bolivia"
 formulation = "ACPPowerModel"
-num_stages = 96
+num_stages = parse(Int, get(ENV, "DR_NUM_STAGES", "126"))
 model_dir = joinpath(HydroPowerModels_dir, case_name, formulation, "models")
 mkpath(model_dir)
-save_file = "$(case_name)-$(formulation)-h$(num_stages)-subproblems-$(now())"
 formulation_file = formulation * ".mof.json"
-num_epochs = 30
+num_epochs = parse(Int, get(ENV, "DR_NUM_EPOCHS", "80"))
 num_batches = 100
 _num_train_per_batch = 1
 activation = sigmoid
 layers = Int64[128, 128]
 ensure_feasibility = non_ensurance
-optimizers = [Flux.Adam()]
+grad_clip = parse(Float32, get(ENV, "DR_GRAD_CLIP", "0"))
+optimizers = if grad_clip > 0
+    [Flux.Optimisers.OptimiserChain(Flux.Optimisers.ClipGrad(grad_clip), Flux.Adam())]
+else
+    [Flux.Adam()]
+end
 pre_trained_model = nothing
 penalty_l2 = :auto
 penalty_l1 = :auto
-# Annealed target-penalty multipliers (relative to the :auto base above); set to `nothing`
-# to train with the constant penalties the models were built with.
-penalty_schedule = :default_annealed
+penalty_schedule = get(ENV, "DR_PENALTY_SCHEDULE", "constant") == "annealed" ? :default_annealed : nothing
+clip_tag = grad_clip > 0 ? "-clip$(isinteger(grad_clip) ? Int(grad_clip) : grad_clip)" : ""  # Int(0.5f0) would throw InexactError
+sched_tag = isnothing(penalty_schedule) ? "-const" : "-anneal"  # marks whether a penalty schedule is active
+save_file = "$(case_name)-$(formulation)-h$(num_stages)-subproblems$(clip_tag)$(sched_tag)-$(now())"
 num_eval_scenarios = 4                   # fixed held-out scenarios for the rollout evaluation
 eval_every = 25                          # rollout-evaluate every eval_every batches
 
@@ -76,6 +81,7 @@ lg = WandbLogger(;
         "activation" => string(activation),
         "ensure_feasibility" => string(ensure_feasibility),
         "optimizer" => string(optimizers),
+        "grad_clip" => grad_clip,
         "training_method" => "subproblems",
         "penalty_l1" => string(penalty_l1),
         "penalty_l2" => string(penalty_l2),
@@ -88,7 +94,7 @@ lg = WandbLogger(;
 
 # Define Model
 # Policy architecture: LSTM processes uncertainty, Dense combines with previous state
-num_uncertainties = length(uncertainty_samples[1])
+num_uncertainties = length(uncertainty_samples[1][1])
 models = state_conditioned_policy(
     num_uncertainties,
     num_hydro,
@@ -120,7 +126,7 @@ best_obj = mean(objective_values)
 
 model_path = joinpath(model_dir, save_file * ".jld2")
 save_control = SaveBest(best_obj, model_path)
-convergence_criterium = StallingCriterium(100, best_obj, 0)
+convergence_criterium = StallingCriterium(num_epochs * num_batches, best_obj, 0)
 
 # Fixed held-out scenarios, materialized once so every evaluation uses the same set.
 # The rollout evaluation executes the policy stage by stage (deployment semantics) and
diff --git a/examples/HydroPowerModels/train_ldr_hydropowermodels.jl b/examples/HydroPowerModels/train_ldr_hydropowermodels.jl
new file mode 100644
index 0000000..c296ce2
--- /dev/null
+++ b/examples/HydroPowerModels/train_ldr_hydropowermodels.jl
@@ -0,0 +1,264 @@
+# Train a TS-LDR (Linear Decision Rule) policy on the Bolivia LTHD problem.
+#
+# TS-LDR uses the same target-setting framework as TS-DDR but replaces the
+# deep neural network with a linear map:
+#
+#   x̂_t = W [w_{1:t}; x_{t-1}] + b
+#
+# where W, b are the trainable parameters.  This is a `dense_multilayer_nn`
+# with identity activation — a composition of linear layers is still linear,
+# so the result is a standard linear decision rule.
+#
+# Training uses the Deterministic Equivalent pipeline (all stages coupled in one NLP) of
+# train_dr_hydropowermodels.jl with a linear policy architecture; the settings below are
+# hard-coded, not read from DR_* environment variables.  The saved model is evaluated by evaluate_hydro_policies.jl.
+#
+# Usage:
+#   julia --project=. train_ldr_hydropowermodels.jl
+
+using DecisionRules
+using Statistics
+using Random
+using Flux
+
+using Ipopt
+using Wandb, Dates, Logging
+using JLD2
+using DiffOpt
+using JuMP
+using MadNLP
+
+USE_GPU = try
+    using CUDA, CUDSS, MadNLPGPU
+    CUDA.functional()  # the try block's value: whether CUDA reports itself usable
+catch err
+    @warn "GPU packages not available — running on CPU" exception = (err, catch_backtrace())  # keep the reason visible
+    false
+end
+@info "GPU status" USE_GPU
+
+HydroPowerModels_dir = dirname(@__FILE__)
+include(joinpath(HydroPowerModels_dir, "load_hydropowermodels.jl"))
+
+function non_ensurance(x_out, x_in, uncertainty, max_volume)
+    return x_out  # pass-through: x_out is returned unchanged; the other arguments are ignored
+end
+
+# ── Parameters ───────────────────────────────────────────────────────────────
+
+case_name = "bolivia"
+formulation = "ACPPowerModel"
+num_stages = 96
+model_dir = joinpath(HydroPowerModels_dir, case_name, formulation, "models")
+mkpath(model_dir)
+solver_tag = USE_GPU ? "gpu" : "cpu"
+save_file = "$(case_name)-$(formulation)-h$(num_stages)-ldr-$(solver_tag)-$(now())"
+formulation_file = formulation * ".mof.json"
+
+num_epochs = 40
+num_batches = 100
+_num_train_per_batch = 1
+activation = identity
+layers = Int64[64, 64]
+ensure_feasibility = non_ensurance
+optimizers = [Flux.Adam()]
+pre_trained_model = nothing
+penalty_l2 = :auto
+penalty_l1 = :auto
+penalty_schedule = [
+    (1, 100, 0.1),
+    (101, 210, 1.0),
+    (211, 300, 10.0),
+    (301, num_epochs * num_batches, 30.0),
+]
+num_eval_scenarios = 4
+eval_every = 25
+
+# ── Build MSP: subproblems for rollout evaluation ────────────────────────────
+
+diff_optimizer =
+    () -> DiffOpt.diff_optimizer(
+        optimizer_with_attributes(
+            Ipopt.Optimizer,
+            "print_level" => 0,
+            "linear_solver" => "mumps",
+        ),
+    )
+subproblems, state_params_in_sub, state_params_out_sub, uncertainty_samples_sub, initial_state, max_volume = build_hydropowermodels(
+    joinpath(HydroPowerModels_dir, case_name),
+    formulation_file;
+    num_stages=num_stages,
+    optimizer=diff_optimizer,
+    penalty_l1=penalty_l1,
+    penalty_l2=penalty_l2,
+)
+
+# ── Build det-eq for training ────────────────────────────────────────────────
+
+subproblems_de, state_params_in, state_params_out, uncertainty_samples, _, _ = build_hydropowermodels(
+    joinpath(HydroPowerModels_dir, case_name),
+    formulation_file;
+    num_stages=num_stages,
+    penalty_l1=penalty_l1,
+    penalty_l2=penalty_l2,
+)
+
+det_equivalent = Model(MadNLP.Optimizer)
+
+if USE_GPU
+    # GPU-only options: CuArray as the array type and CUDSS as the linear solver.
+    set_optimizer_attribute(det_equivalent, "array_type", CUDA.CuArray)
+    set_optimizer_attribute(det_equivalent, "linear_solver", MadNLPGPU.CUDSSSolver)
+end
+# Common to the CPU and GPU runs: print level MadNLP.ERROR and the
+# MadNLP.LOQOUpdate barrier option.
+set_optimizer_attribute(det_equivalent, "print_level", MadNLP.ERROR)
+set_optimizer_attribute(det_equivalent, "barrier", MadNLP.LOQOUpdate())
+
+det_equivalent, uncertainty_samples = DecisionRules.deterministic_equivalent!(
+    det_equivalent,
+    subproblems_de,
+    state_params_in,
+    state_params_out,
+    initial_state,
+    uncertainty_samples,
+)
+
+num_hydro = length(initial_state)
+
+# ── Logging ──────────────────────────────────────────────────────────────────
+
+lg = WandbLogger(;
+    project="RL",
+    name=save_file,
+    save_code=false,
+    config=Dict(
+        "layers" => layers,
+        "activation" => "identity (LDR)",
+        "policy_type" => "dense_multilayer_nn",
+        "ensure_feasibility" => string(ensure_feasibility),
+        "optimizer" => string(optimizers),
+        "training_method" => "deterministic_equivalent",
+        "solver" => USE_GPU ? "MadNLP+CUDSS (GPU)" : "MadNLP (CPU)",
+        "penalty_l1" => string(penalty_l1),
+        "penalty_l2" => string(penalty_l2),
+        "penalty_schedule" => string(penalty_schedule),
+        "num_epochs" => string(num_epochs),
+        "num_batches" => string(num_batches),
+        "num_train_per_batch" => string(_num_train_per_batch),
+        "num_eval_scenarios" => num_eval_scenarios,
+        "eval_every" => eval_every,
+        "use_gpu" => USE_GPU,
+    ),
+)
+
+# ── Define linear policy ─────────────────────────────────────────────────────
+
+num_uncertainties = length(uncertainty_samples[1][1])
+num_inputs = DecisionRules.policy_input_dim(num_uncertainties, num_hydro)
+models = dense_multilayer_nn(
+    num_inputs, num_hydro, layers;
+    activation=activation,
+)
+
+if !isnothing(pre_trained_model)
+    model_save = JLD2.load(pre_trained_model)
+    model_state = model_save["model_state"]
+    Flux.loadmodel!(models, model_state)
+end
+
+# ── Initial evaluation ───────────────────────────────────────────────────────
+
+Random.seed!(8788)
+@time objective_values = [
+    simulate_multistage(
+        det_equivalent,
+        state_params_in,
+        state_params_out,
+        initial_state,
+        DecisionRules.sample(uncertainty_samples),
+        models;
+    ) for _ in 1:2
+]
+best_obj = mean(objective_values)
+
+model_path = joinpath(model_dir, save_file * ".jld2")
+save_control = SaveBest(best_obj, model_path)
+stall_train = StallingCriterium(100, best_obj, 0)
+stall_rollout = StallingCriterium(5, best_obj, 0)
+
+# ── Rollout evaluation (stage-wise subproblems, CPU) ─────────────────────────
+
+Random.seed!(8789)
+eval_scenarios = [
+    DecisionRules.sample(uncertainty_samples_sub) for _ in 1:num_eval_scenarios
+]
+rollout_evaluation = RolloutEvaluation(
+    subproblems,
+    state_params_in_sub,
+    state_params_out_sub,
+    initial_state,
+    eval_scenarios;
+    stride=eval_every,
+    policy_state=:target,
+)
+realized_rollout_evaluation = RolloutEvaluation(
+    subproblems,
+    state_params_in_sub,
+    state_params_out_sub,
+    initial_state,
+    eval_scenarios;
+    stride=eval_every,
+    policy_state=:realized,
+)
+resolved_penalty_schedule = isnothing(penalty_schedule) ? nothing :
+    DecisionRules._resolve_penalty_schedule(penalty_schedule, num_epochs * num_batches)
+
+# ── Train ────────────────────────────────────────────────────────────────────
+
+train_multistage(
+    models,
+    initial_state,
+    det_equivalent,
+    state_params_in,
+    state_params_out,
+    uncertainty_samples;
+    num_batches=num_epochs * num_batches,
+    num_train_per_batch=_num_train_per_batch,
+    optimizer=first(optimizers),
+    record=(sample_log, iter, model) -> begin
+        training_loss = mean(sample_log.objectives)
+        loss_no_deficit = mean(sample_log.objectives_no_deficit)
+        metrics = Dict(
+            "metrics/loss" => loss_no_deficit,
+            "metrics/training_loss" => training_loss,
+        )
+        rollout_evaluation(iter, model)
+        realized_rollout_evaluation(iter, model)
+        converged_training = stall_train(iter, model, training_loss)
+        converged_rollout = false
+        if iter % eval_every == 0
+            converged_rollout = stall_rollout(
+                iter, model, rollout_evaluation.last_objective_no_deficit
+            )
+            metrics["metrics/rollout_objective_no_deficit"] =
+                rollout_evaluation.last_objective_no_deficit
+            metrics["metrics/rollout_target_violation_share"] =
+                rollout_evaluation.last_violation_share
+            metrics["metrics/rollout_realized_objective_no_deficit"] =
+                realized_rollout_evaluation.last_objective_no_deficit
+            metrics["metrics/rollout_realized_target_violation_share"] =
+                realized_rollout_evaluation.last_violation_share
+        end
+        if !isnothing(resolved_penalty_schedule)
+            metrics["metrics/target_penalty_multiplier"] =
+                DecisionRules._penalty_multiplier_for(resolved_penalty_schedule, iter)
+        end
+        Wandb.log(lg, metrics)
+        save_control(iter, model, training_loss)
+        return converged_training && converged_rollout && isapprox(training_loss, rollout_evaluation.last_objective_no_deficit; rtol=0.01)
+    end,
+    penalty_schedule=penalty_schedule,
+)
+
+close(lg)
diff --git a/examples/README.md b/examples/README.md
index 6bf37dc..23e4d16 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -8,7 +8,8 @@ and additional experiments.
 
 | Directory | Application | Paper section |
 |-----------|------------|---------------|
-| [`HydroPowerModels/`](HydroPowerModels/) | Bolivia Long-Term Hydrothermal Dispatching (10 hydro units, AC/SOC/DC OPF, 96 stages) | §4, Extension §1–§4 |
+| [`HydroPowerModels/`](HydroPowerModels/) | Bolivia Long-Term Hydrothermal Dispatching (10 hydro units, AC/SOC/DC OPF, 96 stages). Trains TS-DDR (LSTM) and TS-LDR (linear) policies. | §4, Extension §1–§4 |
+| [`inventory_control/`](inventory_control/) | Stochastic lot-sizing with fixed ordering costs (relaxed LP and integer MIP). Demonstrates score-function (REINFORCE) gradient mixing for integer variables. | §3 |
 | [`rocket_control/`](rocket_control/) | Goddard rocket altitude maximization with stochastic wind | §3 |
 | [`RL/`](RL/) | Reinforcement learning baselines (REINFORCE, PPO, DDPG, TD3, SAC) on Bolivia LTHD | Beyond paper |
 | `Experimental/` | Work-in-progress experiments (not documented) | — |
@@ -44,6 +45,7 @@ same problem:
 2. **Stage-wise / Single Shooting** — solve one subproblem per stage, backpropagate through the chain (Extension §2)
 3. **Windowed / Multiple Shooting** — partition stages into windows, parallelize window solves (Extension §3)
 
-The HydroPowerModels directory contains a training script for each strategy
-and a consistency check (`check_consistent_state_paths.jl`) verifying they
-produce identical trajectories.
+The HydroPowerModels directory contains a training script for each strategy,
+a TS-LDR training script (linear policy baseline), and an evaluation script
+(`evaluate_hydro_policies.jl`) that runs all trained policies on a common
+out-of-sample scenario set.
diff --git a/examples/inventory_control/README.md b/examples/inventory_control/README.md
index c6e3132..a4ae4be 100644
--- a/examples/inventory_control/README.md
+++ b/examples/inventory_control/README.md
@@ -68,6 +68,26 @@ subproblems. Two strategies are available:
 For the relaxed formulation (no integer variables), `NoIntegerStrategy`
 is used — subproblems are solved and duals read as-is.
 
+## Score-Function Gradient Mixing
+
+`ScoreFunctionConfig` adds a REINFORCE-style correction to the dual
+gradient, enabling TS-DDR to capture discrete transitions that LP duals
+miss. Stage-wise rollouts with Gaussian-perturbed targets estimate the
+gradient of the true integer cost, and the two signals are mixed:
+
+    g = α · g_dual + (1-α) · g_score_function
+
+There are two solves in the mixed-gradient training loop:
+
+- `train_multistage(...; integer_strategy=...)` controls the
+  deterministic-equivalent solve used to read the local dual-gradient term.
+- `ScoreFunctionConfig(subproblems, ...)` uses its rollout subproblems exactly
+  as built. If those subproblems contain binary setup variables, the
+  score-function term measures true MIP rollout costs.
+
+So the integer strategy is not duplicated: it belongs to the dual path. The
+score-function path measures costs from the rollout models you pass in.
+
 ## Scripts
 
 Run from the repository root:
@@ -105,11 +125,14 @@ Figures are written to `docs/src/assets/`:
 - **TS-DDR** learns an ex-ante order target from inventory and demand history,
   using the same time-invariant neural policy at every period.
 - **SDDP** uses a PAR(1) demand approximation in a 24-stage order/demand graph.
-  For the integer case, it uses LP relaxation with integer rounding at rollout.
+  For the integer case, it uses `AlternativeForwardPass`: the forward pass solves
+  true MIP subproblems (`z ∈ {0,1}`), while the backward pass uses LP relaxation
+  (`z ∈ [0,1]`) to compute cuts with valid duals.
 - **Base-stock** is a tuned constant order-up-to policy.
 - **Random** is an untrained ex-ante neural policy.
 
 The expected qualitative result is:
 - **Relaxed**: SDDP dominates (near-optimal for convex problems with Markov noise).
 - **Integer**: TS-DDR dominates (handles MIP subproblems natively via integer
-  postprocessing strategies, while SDDP's LP relaxation underestimates fixed costs).
+  postprocessing strategies, whereas SDDP with `AlternativeForwardPass` visits
+  MIP-realistic trial points but still builds its cuts from LP-relaxation duals).
diff --git a/examples/inventory_control/build_inventory_problem.jl b/examples/inventory_control/build_inventory_problem.jl
index 4c5ebdf..b6137b8 100644
--- a/examples/inventory_control/build_inventory_problem.jl
+++ b/examples/inventory_control/build_inventory_problem.jl
@@ -39,40 +39,123 @@ const D_HI = Float64[44, 55, 81, 112, 153, 185, 164, 122, 88, 65, 49, 40]
 # ---------------------------------------------------------------------------
 # Latent demand process (stronger structure than original)
 # ---------------------------------------------------------------------------
+
+"""
+    sample_inventory_demand_path(rng = Random.default_rng()) -> Vector{Float64}
+
+Draw one realization of the latent demand process over `INVENTORY_T` periods.
+
+The process has three hidden components:
+- a seasonal phase shift, drawn once per path uniformly from `0:INVENTORY_T-1`;
+- a regime in {-1, 0, 1}, redrawn uniformly with probability 0.04 each period;
+- an AR(1) shock with coefficient 0.92 and innovation scale 0.35.
+
+None of these are observed by the policy — only realized demand values.
+
+# Arguments
+- `rng::AbstractRNG`: random number generator; every draw is taken from it.
+
+# Examples
+```julia
+path = sample_inventory_demand_path(Random.MersenneTwister(1))
+```
+"""
 function sample_inventory_demand_path(rng::AbstractRNG=Random.default_rng())
+    # Draw the seasonal phase offset once; it shifts the seasonal index of every period.
     phase_shift = rand(rng, 0:(INVENTORY_T - 1))
+
+    # Draw the initial demand regime from {-1, 0, 1}.
     regime = rand(rng, (-1.0, 0.0, 1.0))
+
+    # Initialize the AR(1) shock ε₀ = 0.
     shock = 0.0
+
+    # Pre-allocate the demand path vector.
     path = Vector{Float64}(undef, INVENTORY_T)
+
     for t in 1:INVENTORY_T
+        # Shifted seasonal index κ_t = mod1(t + φ, T), i.e. 1 + ((t+φ-1) mod T).
         seasonal_t = mod1(t + phase_shift, INVENTORY_T)
+
+        # With probability 0.04, redraw the regime (the draw may repeat the old value).
         if rand(rng) < 0.04
             regime = rand(rng, (-1.0, 0.0, 1.0))
         end
+
+        # Update the AR(1) shock: ε_t = 0.92 ε_{t-1} + 0.35 η_t.
         shock = 0.92 * shock + 0.35 * randn(rng)
+
+        # Compute the seasonal center μ_κ = (D_LO[κ] + D_HI[κ]) / 2.
         center = (D_LO[seasonal_t] + D_HI[seasonal_t]) / 2
+
+        # Compute the seasonal half-width w_κ = (D_HI[κ] - D_LO[κ]) / 2.
         half_width = (D_HI[seasonal_t] - D_LO[seasonal_t]) / 2
+
+        # Demand: d_t = μ_κ + w_κ · (0.85·regime + 0.42·shock + 0.12·noise).
         demand = center + half_width * (0.85 * regime + 0.42 * shock + 0.12 * randn(rng))
+
+        # Clamp demand to [5, D_HI[κ] + 0.55·w_κ] before storing it.
         path[t] = clamp(demand, 5.0, D_HI[seasonal_t] + 0.55 * half_width)
     end
+
     return path
 end
 
 # ---------------------------------------------------------------------------
 # DecisionRules sampler
 # ---------------------------------------------------------------------------
+
+"""
+    InventoryProcessSampler
+
+Hold the per-stage JuMP demand parameters of an inventory model. The
+`DecisionRules.sample` method for this type pairs each stored parameter with
+a freshly sampled demand value.
+
+# Fields
+- `params::Vector{VariableRef}`: one demand parameter per stage.
+"""
 struct InventoryProcessSampler
     params::Vector{VariableRef}
 end
 
+"""
+    DecisionRules.sample(sampler::InventoryProcessSampler)
+
+Draw one demand path; return one `[(param, value)]` vector per stored parameter.
+"""
 function DecisionRules.sample(sampler::InventoryProcessSampler)
+    # Draw a fresh demand path from the latent process.
     demand_path = sample_inventory_demand_path()
+
+    # Follow the sampler's own stage count so models built with a shorter `T` work.
-    return [[(sampler.params[t], demand_path[t])] for t in 1:INVENTORY_T]
+    return [[(sampler.params[t], demand_path[t])] for t in eachindex(sampler.params)]
 end
 
 # ---------------------------------------------------------------------------
 # Stage-wise subproblems
 # ---------------------------------------------------------------------------
+
+"""
+    build_inventory_subproblems(; kwargs...) -> (subproblems, state_in, state_out, sampler, x0)
+
+Build `T` independent JuMP stage models for stage-wise rollout evaluation.
+
+Each model has demand as a parameter, input state `(inventory, d_{t-1},
+d_{t-2})`, and a target constraint on mid-period inventory `s_mid` using
+`create_deficit!`.
+
+Returns the five-tuple expected by `simulate_multistage` and
+`train_multistage`.
+
+# Keyword Arguments
+- `T`, `K`, `c`, `h`, `p`, `Q_max`: problem parameters.
+- `I_0`: initial inventory.
+- `num_scenarios`: number of uncertainty samples per SGD batch.
+- `penalty`: target-deficit penalty λ.
+- `seed`: RNG seed for demand sampling.
+- `integer`: whether to include binary setup variable z.
+"""
 function build_inventory_subproblems(;
     T = INVENTORY_T,
     K = INVENTORY_K,
@@ -86,49 +169,84 @@ function build_inventory_subproblems(;
     seed = 42,
     integer = true,
 )
+    # Fix the random seed so demand samples are reproducible.
     Random.seed!(seed)
+
+    # Pre-allocate one JuMP model per stage.
     subproblems = Vector{JuMP.Model}(undef, T)
+
+    # Each stage has 3 input-state parameters: (inventory, d_{t-1}, d_{t-2}).
     state_params_in = Vector{Vector{Any}}(undef, T)
+
+    # Each stage has 3 output pairs: (target_param, realized_variable).
     state_params_out = Vector{Vector{Tuple{Any,VariableRef}}}(undef, T)
+
+    # One demand parameter per stage.
     uncertainty_params = Vector{VariableRef}(undef, T)
 
     for t in 1:T
+        # Create a HiGHS LP/MIP model for this stage.
         m = Model(optimizer_with_attributes(HiGHS.Optimizer, "output_flag" => false))
         set_silent(m)
 
+        # --- Decision variables ---
+        # q: order quantity, bounded by capacity Q_max.
         @variable(m, 0 <= q <= Q_max)
+        # s_mid: mid-period inventory after order arrives but before demand.
         @variable(m, s_mid)
+        # s_out: end-of-period inventory after demand realizes.
         @variable(m, s_out)
+        # inv_hold: positive part of s_out (holding cost component).
         @variable(m, inv_hold >= 0)
+        # back: negative part of s_out (backlog cost component).
         @variable(m, back >= 0)
+        # Demand pass-through to the next stage state.
         @variable(m, last_demand_out)
         @variable(m, prev_demand_out)
 
+        # --- Parametric inputs (set before each solve) ---
+        # s_in: incoming inventory from the previous stage.
         @variable(m, s_in in MOI.Parameter(I_0))
+        # last_demand_in: demand observed one period ago (part of state).
         @variable(m, last_demand_in in MOI.Parameter(INVENTORY_LAST_DEMAND0))
+        # prev_demand_in: demand observed two periods ago (part of state).
         @variable(m, prev_demand_in in MOI.Parameter(INVENTORY_PREV_DEMAND0))
+        # demand: current-period demand realization (uncertainty).
         @variable(m, demand in MOI.Parameter((D_LO[t] + D_HI[t]) / 2))
+        # s_target: target mid-period inventory from the policy.
         @variable(m, s_target in MOI.Parameter(I_0))
+        # Target pass-throughs for demand state entries.
         @variable(m, last_demand_target in MOI.Parameter(INVENTORY_LAST_DEMAND0))
         @variable(m, prev_demand_target in MOI.Parameter(INVENTORY_PREV_DEMAND0))
 
         if integer
+            # z ∈ {0,1}: binary setup decision.
             @variable(m, z, Bin)
+            # If z = 0, no order is allowed: q ≤ Q_max · z.
             @constraint(m, q <= Q_max * z)
+            # Objective: K·z + c·q + h·hold + p·backlog.
             @objective(m, Min, K * z + c * q + h * inv_hold + p * back)
         else
+            # Relaxed objective: no setup cost or binary variable.
             @objective(m, Min, c * q + h * inv_hold + p * back)
         end
 
+        # s_mid = s_in + q: order arrives before demand.
         @constraint(m, s_mid == s_in + q)
+        # s_out = s_mid - demand: demand subtracts from inventory.
         @constraint(m, s_out == s_mid - demand)
+        # Pass current demand to next stage as "last demand".
         @constraint(m, last_demand_out == demand)
+        # Pass previous "last demand" to next stage as "prev demand".
         @constraint(m, prev_demand_out == last_demand_in)
+        # Split end-of-period inventory into holding and backlog parts.
         @constraint(m, inv_hold - back == s_out)
 
+        # L1 target-deficit penalty: λ · |s_mid - ŝ_target|.
         _, deficit = create_deficit!(m, 1; penalty_l1=penalty)
         @constraint(m, deficit[1] == s_mid - s_target)
 
+        # Store the model and parameter mappings.
         subproblems[t] = m
         state_params_in[t] = Any[s_in, last_demand_in, prev_demand_in]
         state_params_out[t] = [
@@ -139,6 +257,7 @@ function build_inventory_subproblems(;
         uncertainty_params[t] = demand
     end
 
+    # Return the five-tuple: (models, state_in, state_out, sampler, x0).
     return subproblems, state_params_in, state_params_out,
            InventoryProcessSampler(uncertainty_params),
            [I_0, INVENTORY_LAST_DEMAND0, INVENTORY_PREV_DEMAND0]
@@ -147,6 +266,39 @@ end
 # ---------------------------------------------------------------------------
 # Deterministic equivalent (full-horizon)
 # ---------------------------------------------------------------------------
+
+"""
+    build_inventory_det_equivalent(; kwargs...) -> (model, state_in, state_out, sampler, x0)
+
+Build a single JuMP model coupling all `T` stages for direct transcription
+training.
+
+The deterministic equivalent jointly optimizes over the full horizon. Target
+constraints appear as NormOneCone (L1) penalties so the training gradient
+captures inter-stage cost coupling that stage-wise rollouts miss.
+
+The penalty term is
+
+```math
+\\lambda \\sum_{t=1}^{T} |s_t^{mid} - \\hat{s}_t| .
+```
+
+# Keyword Arguments
+- `T`, `K`, `c`, `h`, `p`, `Q_max`: problem parameters.
+- `I_0`: initial inventory.
+- `num_scenarios`: number of uncertainty samples per SGD batch.
+- `penalty`: target-deficit penalty ``\\lambda``.
+- `seed`: RNG seed for demand sampling.
+- `integer`: whether to include binary setup variable z.
+
+# Examples
+```julia
+model, spi, spo, sampler, x0 = build_inventory_det_equivalent(;
+    num_scenarios = 50,
+    integer = true,
+)
+```
+"""
 function build_inventory_det_equivalent(;
     T = INVENTORY_T,
     K = INVENTORY_K,
@@ -160,44 +312,68 @@ function build_inventory_det_equivalent(;
     seed = 42,
     integer = true,
 )
+    # Fix the random seed so demand samples are reproducible.
     Random.seed!(seed)
+
+    # One monolithic model for the entire T-period horizon.
     m = Model(optimizer_with_attributes(HiGHS.Optimizer, "output_flag" => false))
     set_silent(m)
 
+    # --- Decision variables (one per stage, indexed 1:T) ---
+    # q[t]: order quantity in period t.
     @variable(m, 0 <= q[1:T] <= Q_max)
+    # s_mid[t]: mid-period inventory after order arrives.
     @variable(m, s_mid[1:T])
+    # s_out[t]: end-of-period inventory after demand.
     @variable(m, s_out[1:T])
+    # Demand pass-through state variables.
     @variable(m, last_demand_out[1:T])
     @variable(m, prev_demand_out[1:T])
+    # Holding and backlog split of s_out.
     @variable(m, inv_hold[1:T] >= 0)
     @variable(m, back[1:T] >= 0)
 
+    # --- Parametric inputs (set before each DE solve) ---
+    # Initial state at t = 0.
     @variable(m, s_init in MOI.Parameter(I_0))
     @variable(m, last_demand_init in MOI.Parameter(INVENTORY_LAST_DEMAND0))
     @variable(m, prev_demand_init in MOI.Parameter(INVENTORY_PREV_DEMAND0))
+    # Demand realizations for each period (set per scenario).
     @variable(m, demand[t=1:T] in MOI.Parameter((D_LO[t] + D_HI[t]) / 2))
+    # Target state from the policy (set per scenario).
     @variable(m, s_target[t=1:T] in MOI.Parameter(I_0))
     @variable(m, last_demand_target[t=1:T] in MOI.Parameter(INVENTORY_LAST_DEMAND0))
     @variable(m, prev_demand_target[t=1:T] in MOI.Parameter(INVENTORY_PREV_DEMAND0))
 
     if integer
+        # z[t] ∈ {0,1}: binary setup decision.
         @variable(m, z[1:T], Bin)
+        # q[t] ≤ Q_max · z[t]: no order if setup is off.
         @constraint(m, [t=1:T], q[t] <= Q_max * z[t])
     end
 
+    # --- Dynamics ---
+    # First stage links to the initial inventory parameter.
     @constraint(m, s_mid[1] == s_init + q[1])
+    # Subsequent stages chain from the previous end-of-period inventory.
     @constraint(m, [t=2:T], s_mid[t] == s_out[t-1] + q[t])
+    # Demand subtracts from mid-period inventory.
     @constraint(m, [t=1:T], s_out[t] == s_mid[t] - demand[t])
+    # Pass demand through to state; stage 1 lags `last_demand_init`, as the stage models do.
     @constraint(m, [t=1:T], last_demand_out[t] == demand[t])
-    @constraint(m, prev_demand_out[1] == prev_demand_init)
+    @constraint(m, prev_demand_out[1] == last_demand_init)
     @constraint(m, [t=2:T], prev_demand_out[t] == last_demand_out[t-1])
+    # Split end-of-period inventory into holding and backlog.
     @constraint(m, [t=1:T], inv_hold[t] - back[t] == s_out[t])
 
+    # --- Target-deficit penalty via NormOneCone ---
+    # norm_deficit_arr[t] ≥ |s_mid[t] - s_target[t]| (L1 norm).
     @variable(m, norm_deficit_arr[1:T] >= 0.0)
     @variable(m, deficit_arr[1:T])
     @constraint(m, [t=1:T], deficit_arr[t] == s_mid[t] - s_target[t])
     @constraint(m, [t=1:T], [norm_deficit_arr[t]; deficit_arr[t:t]] in MOI.NormOneCone(2))
 
+    # --- Objective: operational cost + target penalty ---
     if integer
         @objective(m, Min,
             sum(K * z[t] + c * q[t] + h * inv_hold[t] + p * back[t] for t in 1:T) +
@@ -208,14 +384,20 @@ function build_inventory_det_equivalent(;
             penalty * sum(norm_deficit_arr))
     end
 
+    # --- Build parameter mappings for DecisionRules interface ---
     state_params_in = Vector{Vector{Any}}(undef, T)
     state_params_out = Vector{Vector{Tuple{Any,VariableRef}}}(undef, T)
     uncertainty_params = Vector{VariableRef}(undef, T)
 
+    # Stage 1 reads from the initial-state parameters.
     state_params_in[1] = Any[s_init, last_demand_init, prev_demand_init]
+
+    # Stages 2..T read from the previous stage's realized output variables.
     for t in 2:T
         state_params_in[t] = Any[s_out[t-1], last_demand_out[t-1], prev_demand_out[t-1]]
     end
+
+    # Each stage maps (target_parameter → realized_variable) for gradient reading.
     for t in 1:T
         state_params_out[t] = [
             (s_target[t], s_out[t]),
@@ -225,6 +407,7 @@ function build_inventory_det_equivalent(;
         uncertainty_params[t] = demand[t]
     end
 
+    # Return the five-tuple: (model, state_in, state_out, sampler, x0).
     return m, state_params_in, state_params_out,
            InventoryProcessSampler(uncertainty_params),
            [I_0, INVENTORY_LAST_DEMAND0, INVENTORY_PREV_DEMAND0]
@@ -233,28 +416,89 @@ end
 # ---------------------------------------------------------------------------
 # Policies
 # ---------------------------------------------------------------------------
+
+"""
+    base_stock_policy(S_star) -> Function
+
+Return a constant-target base-stock policy.
+
+The returned closure maps `[d_t, inventory, d_{t-1}, d_{t-2}]` to
+`Float32[S_star, d_t, d_{t-1}]`: a fixed target plus pass-through demand.
+
+# Arguments
+- `S_star::Float64`: order-up-to level.
+
+# Examples
+```julia
+policy = base_stock_policy(160.0)
+target = policy(Float32[80, 20, 75, 60])  # Float32[160, 80, 75]
+```
+"""
 function base_stock_policy(S_star::Float64)
+    # Input is [d_t, inventory, d_{t-1}, d_{t-2}]: pass through d_t and d_{t-1}.
-    return x -> Float32[S_star, x[1], x[2]]
+    return x -> Float32[S_star, x[1], x[3]]
 end
 
+"""
+    ExAnteInventoryPolicy{N}
+
+Feedforward ex-ante inventory policy.
+
+Input: `[d_t, inventory, d_{t-1}, d_{t-2}]`. The target is computed without
+the current demand `d_t`, which is only copied to the output. Features passed
+to the network are `[inventory/100, d_{t-1}/100, d_{t-2}/100]`.
+
+Output: `[500 σ(net(features)), d_t, d_{t-1}]` — a target for mid-period
+inventory, plus pass-through state entries.
+
+# Fields
+- `net::N`: network called on the 3-element feature vector; its first output is used.
+"""
 struct ExAnteInventoryPolicy{N}
     net::N
 end
 
 Functors.@functor ExAnteInventoryPolicy (net,)
 
+Flux.reset!(::ExAnteInventoryPolicy) = nothing
+
 function (policy::ExAnteInventoryPolicy)(x)
+    # Unpack the input vector: [d_t, inventory, d_{t-1}, d_{t-2}].
     current_demand = Float32(x[1])
     inventory = Float32(x[2])
     last_demand = Float32(x[3])
     prev_demand = Float32(x[4])
+
+    # Divide by 100 to rescale the network inputs; d_t is deliberately left out.
     order_features = Float32[inventory / 100, last_demand / 100, prev_demand / 100]
+
+    # Apply the network, then map its output into (0, 500) with 500·sigmoid.
     target = 500f0 .* Flux.sigmoid.(policy.net(order_features))
+
+    # Return [target_s_mid, d_t, d_{t-1}] — target inventory + pass-through state.
     return Float32[target[1], current_demand, last_demand]
 end
 
+"""
+    build_exante_policy(; seed = 2024) -> ExAnteInventoryPolicy
+
+Construct the default feedforward ex-ante policy.
+
+Architecture: Dense(3 → 32, relu) → Dense(32 → 24, relu) → Dense(24 → 1).
+
+# Keyword Arguments
+- `seed::Int`: random seed for weight initialization.
+
+# Examples
+```julia
+policy = build_exante_policy(; seed = 2024)
+```
+"""
 function build_exante_policy(; seed=2024)
+    # Fix the random seed for reproducible weight initialization.
     Random.seed!(seed)
+
+    # Three-layer feedforward: 3 inputs → 32 hidden → 24 hidden → 1 output.
     net = Chain(
         Dense(3, 32, relu),
         Dense(32, 24, relu),
@@ -262,3 +506,120 @@ function build_exante_policy(; seed=2024)
     )
     return ExAnteInventoryPolicy(net)
 end
+
+# ---------------------------------------------------------------------------
+# LSTM ex-ante policy (temporal demand encoding, strictly ex-ante)
+# ---------------------------------------------------------------------------
+
+"""
+    LSTMExAntePolicy{E,C,S}
+
+Recurrent ex-ante inventory policy with temporal demand encoding.
+
+On each call the `encoder` is applied to the *lagged* demand ``d_{t-1}``
+(divided by 100) together with the stored `state`, and the returned state is
+written back to `state`. The `combiner` maps the encoder output concatenated
+with `[inventory/100, d_{t-2}/100]` to a single target value.
+
+The policy is strictly ex-ante: the current-period demand ``d_t`` is never
+fed to the encoder or combiner; it is only copied into the output vector.
+Temporal information comes from the state accumulated over earlier calls.
+
+Output parameterization is affine: `raw * 200 + 150`, centered on typical
+mid-period inventory and free from sigmoid saturation.
+
+# Fields
+- `encoder::E`: `Flux.LSTMCell` processing one demand value per stage.
+- `combiner::C`: `Dense` layer mapping encoded + state features to target.
+- `state::S`: current LSTM state; mutated on every call, restored by `Flux.reset!`.
+
+# Examples
+```julia
+policy = build_lstm_exante_policy(; seed = 2024, hidden = 16)
+Flux.reset!(policy)
+target = policy(Float32[80, 20, 75, 60])  # [d_t, inventory, d_{t-1}, d_{t-2}]
+```
+"""
+mutable struct LSTMExAntePolicy{E,C,S}
+    encoder::E
+    combiner::C
+    state::S
+end
+
+Functors.@functor LSTMExAntePolicy (encoder, combiner)
+
+function (policy::LSTMExAntePolicy)(x)
+    # Input layout: [d_t, inventory, d_{t-1}, d_{t-2}]. The current demand x[1] is
+    # never fed to the network; it is only copied into the returned vector.
+    stock = Float32(x[2])
+    lag1 = Float32(x[3])
+    lag2 = Float32(x[4])
+
+    # Build network inputs in the element type of the stored recurrent state.
+    F = eltype(first(policy.state))
+    encoder_input = F[lag1 / 100]
+    side_features = F[stock / 100, lag2 / 100]
+
+    # One LSTM step on the scaled lagged demand: the cell returns the encoded
+    # output together with the state to carry into the next call.
+    hidden, carried = policy.encoder(encoder_input, policy.state)
+
+    # Keep the carried state on the policy for the next stage of this scenario.
+    policy.state = carried
+
+    # Append the scaled inventory and d_{t-2}, then apply the Dense combiner.
+    score = policy.combiner(vcat(hidden, side_features))
+
+    # Affine output map: target = 200 * score + 150.
+    target_level = score[1] * 200f0 + 150f0
+
+    # Result: [target mid-period inventory, d_t, d_{t-1}].
+    # x[1] is converted to Float32 by the typed array literal.
+    return Float32[target_level, x[1], lag1]
+end
+
+"""
+    Flux.reset!(policy::LSTMExAntePolicy) -> Nothing
+
+Replace `policy.state` with `Flux.initialstates(policy.encoder)`.
+
+Call this before each scenario rollout so recurrent state accumulated on one
+scenario is not carried into the next.
+"""
+function Flux.reset!(policy::LSTMExAntePolicy)
+    fresh_state = Flux.initialstates(policy.encoder)
+    policy.state = fresh_state
+    return nothing
+end
+
+"""
+    build_lstm_exante_policy(; seed = 2024, hidden = 16) -> LSTMExAntePolicy
+
+Build an `LSTMExAntePolicy` with freshly initialized layers and state.
+
+Layers: `Flux.LSTMCell(1 => hidden)` encoder, `Dense(hidden + 2, 1)` combiner.
+
+# Keyword Arguments
+- `seed`: passed to `Random.seed!` (global RNG) before the layers are created.
+- `hidden`: LSTM hidden dimension.
+
+# Examples
+```julia
+policy = build_lstm_exante_policy(; seed = 2024, hidden = 16)
+```
+"""
+function build_lstm_exante_policy(; seed=2024, hidden=16)
+    # Seed the global RNG so the layer weights below are reproducible.
+    Random.seed!(seed)
+
+    # Encoder: one scaled lagged-demand input -> `hidden` recurrent features.
+    lstm_cell = Flux.LSTMCell(1 => hidden)
+
+    # Combiner input = `hidden` encoder outputs + inventory + two-period-lagged demand.
+    output_layer = Dense(hidden + 2, 1)
+
+    # Start from the encoder's initial recurrent state.
+    return LSTMExAntePolicy(
+        lstm_cell, output_layer, Flux.initialstates(lstm_cell)
+    )
+end
diff --git a/examples/inventory_control/compare_results.jl b/examples/inventory_control/compare_results.jl
index 1251e43..f9d6f2e 100644
--- a/examples/inventory_control/compare_results.jl
+++ b/examples/inventory_control/compare_results.jl
@@ -1,205 +1,1060 @@
 """
-Compare all policies for both relaxed and integer inventory problem variants.
-Produces two-section output: tables + compact 2×2 plot layouts.
+Compare inventory-control benchmark results and regenerate documentation plots.
+
+This script expects the CSV files written by:
+
+- `train_dr_inventory.jl`;
+- `evaluate_inventory.jl`; and
+- `solve_sddp.jl`.
+
+Results live in timestamped subdirectories under `results/`. Pass the run ID as
+the first CLI argument, or omit it to use the most recent run:
+
+```bash
+julia --project=. compare_results.jl                 # latest run
+julia --project=. compare_results.jl 20260619_231417  # specific run
+```
 """
 
-using CSV, DataFrames, Statistics, Printf, Random
-using Plots, StatsPlots
+using CSV
+using DataFrames
+using Plots
+using Printf
+using Random
+using Statistics
+using StatsPlots
 
 include(joinpath(@__DIR__, "build_inventory_problem.jl"))
 
-result_dir = joinpath(@__DIR__, "results")
-example_dir = @__DIR__
-docs_dir = normpath(joinpath(example_dir, "..", "..", "docs", "src", "assets"))
-mkpath(docs_dir)
+"""
+    resolve_result_dir(args) -> String
+
+Pick the results directory from CLI args or default to the most recent run.
+
+When the `results/` directory contains timestamped subdirectories
+(e.g. `results/20260619_231417/`), the most recent one is used. If no
+subdirectories exist, the flat `results/` directory itself is used for
+backward compatibility with older runs.
+
+# Arguments
+- `args`: `ARGS` from the script entry point.
+
+# Examples
+```julia
+result_dir = resolve_result_dir(ARGS)
+```
+"""
+function resolve_result_dir(args)
+    base = joinpath(@__DIR__, "results")
+
+    if !isempty(args)
+        dir = joinpath(base, args[1])
+        isdir(dir) || error("Run directory not found: $dir")
+        return dir
+    end
+
+    # A missing `results/` (fresh checkout) is treated as "no runs yet".
+    entries = isdir(base) ? readdir(base) : String[]
+    subdirs = filter(d -> isdir(joinpath(base, d)), entries)
+    isempty(subdirs) && return base
+
+    # Timestamped names sort chronologically, so the maximum is the newest run.
+    return joinpath(base, maximum(subdirs))
+end
+
+const RESULT_BASE = joinpath(@__DIR__, "results")
+const RESULT_DIR = resolve_result_dir(ARGS)
+println("Loading results from: ", RESULT_DIR)
+
+"""
+    resolve_file(filename::AbstractString) -> String
+
+Find a result file in `RESULT_DIR`, falling back to the base `results/`
+directory. This lets run-specific TS-DDR results coexist with shared
+baselines (SDDP, base-stock, random) that were generated once and live
+in the parent directory.
+
+# Arguments
+- `filename::AbstractString`: file name (not a full path).
+
+# Examples
+```julia
+path = resolve_file("relaxed_sddp_costs.csv")
+```
+"""
+function resolve_file(filename::AbstractString)
+    # Delegate the run-dir → base-dir search to `resolve_file_optional` so the
+    # strict and optional lookups can never disagree about precedence.
+    path = resolve_file_optional(filename)
+
+    isnothing(path) || return path
+
+    error("Result file \"$filename\" not found in $RESULT_DIR or $RESULT_BASE")
+end
+
+"""
+    resolve_file_optional(filename::AbstractString) -> Union{String, Nothing}
+
+Like `resolve_file`, but returns `nothing` when the file does not exist in
+either directory.
+
+# Arguments
+- `filename::AbstractString`: file name (not a full path).
+
+# Examples
+```julia
+path = resolve_file_optional("integer_sf_training_curve.csv")
+```
+"""
+function resolve_file_optional(filename::AbstractString)
+    # Search order matters: the selected run shadows the shared base directory.
+    for directory in (RESULT_DIR, RESULT_BASE)
+        candidate = joinpath(directory, filename)
+        isfile(candidate) && return candidate
+    end
+
+    return nothing
+end
+
+# Figures go to `docs/src/assets`, resolved two directories above this script.
+const DOCS_ASSET_DIR = normpath(joinpath(@__DIR__, "..", "..", "docs", "src", "assets"))
 
-function ci95(costs)
+# Runs when the script is loaded, before any figure is saved into the directory.
+mkpath(DOCS_ASSET_DIR)
+
+"""
+    MethodResult
+
+Costs and display metadata for one benchmark method.
+
+# Fields
+- `name::String`: label printed in tables.
+- `costs::Vector{Float64}`: operational costs, one per evaluation scenario.
+
+# Examples
+```julia
+result = MethodResult("TS-DDR", [1.0, 2.0, 3.0])
+```
+"""
+struct MethodResult
+    name::String            # table label; `timing_key`/`short_method_label` read it
+    costs::Vector{Float64}  # sampled operational costs for this method
+end
+
+"""
+    TimingRecord
+
+Training and evaluation timing for one benchmark method.
+
+# Fields
+- `fit_seconds::Float64`: total fitting time.
+- `eval_seconds::Float64`: average evaluation time per stage.
+
+# Examples
+```julia
+timing = TimingRecord(10.0, 0.01)
+```
+"""
+struct TimingRecord
+    fit_seconds::Float64   # `fit_seconds` column of a timing CSV (see `load_timing`)
+    eval_seconds::Float64  # `eval_seconds` column of a timing CSV (see `load_timing`)
+end
+
+"""
+    ci95(costs::AbstractVector{<:Real}) -> Float64
+
+Return the normal-approximation half-width of a 95% confidence interval.
+
+The reported value is
+
+```math
+1.96 \\frac{s}{\\sqrt{n}},
+```
+
+where ``s`` is the sample standard deviation and ``n`` is the number of costs.
+
+# Arguments
+- `costs::AbstractVector{<:Real}`: sampled operational costs.
+
+# Examples
+```julia
+half_width = ci95([10.0, 12.0, 11.0])
+```
+"""
+function ci95(costs::AbstractVector{<:Real})
+    # Half-width for the sample mean; `std` needs ≥ 2 costs, otherwise this is NaN.
     return 1.96 * std(costs) / sqrt(length(costs))
 end
 
-function load_costs(tag, method)
-    CSV.read(joinpath(result_dir, "$(tag)_$(method)_costs.csv"), DataFrame).operational_cost
+"""
+    percent_gap(costs, reference_costs) -> Float64
+
+Return the mean-cost percent gap relative to a reference method.
+
+The gap is
+
+```math
+100 \\frac{\\bar{c} - \\bar{c}_{ref}}{\\bar{c}_{ref}}.
+```
+
+# Arguments
+- `costs::AbstractVector{<:Real}`: candidate method costs.
+- `reference_costs::AbstractVector{<:Real}`: reference method costs.
+
+# Examples
+```julia
+gap = percent_gap(candidate.costs, reference.costs)
+```
+"""
+function percent_gap(costs, reference_costs)
+    reference_mean = mean(reference_costs)  # positive gap: candidate costs more
+    return 100.0 * (mean(costs) - reference_mean) / reference_mean
 end
 
+"""
+    load_costs(tag::AbstractString, method::AbstractString) -> Vector{Float64}
+
+Load one benchmark cost vector from `RESULT_DIR`.
+
+# Arguments
+- `tag::AbstractString`: problem prefix, such as `"relaxed"` or `"integer"`.
+- `method::AbstractString`: method suffix, such as `"dr"` or `"sddp"`.
+
+# Examples
+```julia
+costs = load_costs("integer", "dr")
+```
+"""
+function load_costs(tag::AbstractString, method::AbstractString)
+    # `resolve_file` looks in the run directory, then in the shared base directory.
+    cost_path = resolve_file("$(tag)_$(method)_costs.csv")
+    cost_column = CSV.read(cost_path, DataFrame).operational_cost
+    return Float64.(cost_column)
+end
+
+"""
+    optional_costs(tag::AbstractString, method::AbstractString)
+
+Load costs if a result file exists; otherwise return `nothing`.
+
+# Arguments
+- `tag::AbstractString`: problem prefix.
+- `method::AbstractString`: method suffix.
+
+# Examples
+```julia
+costs = optional_costs("integer_sf", "dr")
+```
+"""
+function optional_costs(tag::AbstractString, method::AbstractString)
+    # Reuse the shared lookup so the run-dir → base-dir fallback order cannot
+    # drift from `resolve_file` / `resolve_file_optional`.
+    path = resolve_file_optional("$(tag)_$(method)_costs.csv")
+
+    # No file in either directory means there are no costs for this variant.
+    isnothing(path) && return nothing
+
+    return Float64.(CSV.read(path, DataFrame).operational_cost)
+end
+
+"""
+    read_scalar(path::AbstractString) -> Float64
+
+Read a scalar floating-point value from a text file.
+
+# Arguments
+- `path::AbstractString`: text file containing one numeric value.
+
+# Examples
+```julia
+bound = read_scalar(resolve_file("integer_sddp_bound.txt"))
+```
+"""
+function read_scalar(path::AbstractString)
+    contents = strip(read(path, String))  # drop surrounding whitespace/newline
+    return parse(Float64, contents)
+end
+
+"""
+    timing_key(method_name::AbstractString) -> String
+
+Return the key used to look up timing rows.
+
+# Arguments
+- `method_name::AbstractString`: table label.
+
+# Examples
+```julia
+key = timing_key("Base-stock (S*=160)")
+```
+"""
+function timing_key(method_name::AbstractString)
+    # All "Base-stock (S*=…)" labels share the single timing row "Base-stock".
+    is_base_stock = startswith(method_name, "Base-stock")
+
+    return is_base_stock ? "Base-stock" : String(method_name)
+end
+
+"""
+    load_timing(tags) -> Dict{String,TimingRecord}
+
+Load timing rows for a set of result prefixes.
+
+# Arguments
+- `tags`: iterable of prefixes, such as `["integer", "integer_cr"]`.
+
+# Examples
+```julia
+timing = load_timing(["integer", "integer_cr"])
+```
+"""
 function load_timing(tags)
-    dfs = DataFrame[]
+    # These suffixes cover TS-DDR, SDDP, LP-relaxed SDDP, and baselines.
+    timing_suffixes = ["dr_timing", "sddp_timing", "sddp_lp_timing", "baseline_timing"]
+
+    # Accumulate all timing CSVs that exist for the requested tags.
+    rows = DataFrame[]
     for tag in tags
-        for f in ["dr_timing", "sddp_timing", "baseline_timing"]
-            path = joinpath(result_dir, "$(tag)_$(f).csv")
-            isfile(path) && push!(dfs, CSV.read(path, DataFrame))
+        for suffix in timing_suffixes
+            path = resolve_file_optional("$(tag)_$(suffix).csv")
+            !isnothing(path) && push!(rows, CSV.read(path, DataFrame))
         end
     end
-    df = vcat(dfs..., cols=:union)
-    return Dict(row.method => row for row in eachrow(df))
+    # With no timing file at all, fail here and name the tags that were searched.
+    isempty(rows) && error("No timing CSV files found for tags: $(join(tags, ", "))")
+    combined = reduce(vcat, rows; cols = :union)  # no splat; columns may differ
+
+    # Typed conversion is strict: a `missing` timing value raises instead of passing.
+    return Dict(
+        String(row.method) => TimingRecord(row.fit_seconds, row.eval_seconds)
+        for row in eachrow(combined)
+    )
 end
 
-# ═══════════════════════════════════════════════════════════════════════════════
-# Print comparison table
-# ═══════════════════════════════════════════════════════════════════════════════
-function print_table(entries, timing, sddp_bound; ref_idx=1)
-    ref_mean = mean(entries[ref_idx][2])
-    println("SDDP LP bound: $(@sprintf("%.1f", sddp_bound))")
+"""
+    print_table(results, timing, bound; reference_index = 1)
+
+Print a Markdown comparison table.
+
+# Arguments
+- `results::Vector{MethodResult}`: cost vectors and display names.
+- `timing::Dict{String,TimingRecord}`: timing rows by method name.
+- `bound::Real`: SDDP lower bound printed above the table.
+- `reference_index::Integer`: result used for percent-gap comparisons.
+
+# Examples
+```julia
+print_table(results, timing, bound)
+```
+"""
+function print_table(
+    results::Vector{MethodResult},
+    timing::Dict{String,TimingRecord},
+    bound::Real;
+    reference_index::Integer = 1,
+)
+    # Every gap in the "vs" column is measured against this method's mean cost.
+    reference = results[reference_index]
+
+    println("SDDP LP bound: $(@sprintf("%.1f", bound))")
     println()
-    println("| Method                   |   N | Mean cost |   Std | 95% CI | vs $(entries[ref_idx][1]) | Fit (s) | Eval (s) |")
+    println(
+        "| Method                   |   N | Mean cost |   Std | 95% CI | " *
+        "vs $(reference.name) | Fit (s) | Eval (s) |",
+    )
     println("|:-------------------------|----:|----------:|------:|-------:|----------:|--------:|---------:|")
-    for (name, costs) in entries
-        timing_key = startswith(name, "Base-stock") ? "Base-stock" : name
-        row = timing[timing_key]
-        gap = (mean(costs) - ref_mean) / ref_mean * 100
-        @printf("| %-24s | %3d | %9.1f | %5.1f | %6.1f | %+9.1f%% | %7.1f | %8.4f |\n",
-            name, length(costs), mean(costs), std(costs), ci95(costs),
-            gap, row.fit_seconds, row.eval_seconds)
+
+    for result in results
+        # Look up timing by the normalized label; a missing row raises KeyError.
+        row = timing[timing_key(result.name)]
+
+        # Summary statistics for this row; the gap is relative to `reference`.
+        mean_cost = mean(result.costs)
+        std_cost = std(result.costs)
+        confidence = ci95(result.costs)
+        gap = percent_gap(result.costs, reference.costs)
+
+        @printf(
+            "| %-24s | %3d | %9.1f | %5.1f | %6.1f | %+9.1f%% | %7.1f | %8.4f |\n",
+            result.name,
+            length(result.costs),
+            mean_cost,
+            std_cost,
+            confidence,
+            gap,
+            row.fit_seconds,
+            row.eval_seconds,
+        )
     end
+
     println()
+
+    return nothing
 end
 
-# ═══════════════════════════════════════════════════════════════════════════════
-# Compact 2×2 plot
-# ═══════════════════════════════════════════════════════════════════════════════
-function make_plots(tag, entries, S_star, title_suffix; sddp_tag=tag, dr_tag=tag)
-    time_cols = [Symbol("t$i") for i in 0:INVENTORY_T]
-
-    # (1,1) SDDP learning curve
-    sddp_log = CSV.read(joinpath(result_dir, "$(sddp_tag)_sddp_training_log.csv"), DataFrame)
-    valid = filter(row -> !ismissing(row.bound) && isfinite(row.bound), sddp_log)
-    p1 = plot(valid.iteration, valid.bound;
-        xlabel="Iteration", ylabel="Cost",
-        title="SDDP learning curve", label="LP bound",
-        linewidth=2, color=:darkgreen, legend=:right)
-    if "simulation_value" in names(valid)
-        sim_rows = filter(row -> !ismissing(row.simulation_value) && isfinite(row.simulation_value), valid)
-        if nrow(sim_rows) > 0
-            plot!(p1, sim_rows.iteration, sim_rows.simulation_value;
-                label="Simulation", linewidth=2, color=:darkorange)
-        end
-    end
+"""
+    short_method_label(name::AbstractString, base_stock_level::Real) -> String
 
-    # (1,2) TS-DDR training curve
-    curve_df = CSV.read(joinpath(result_dir, "$(dr_tag)_training_curve.csv"), DataFrame)
-    p2 = plot(curve_df.batch, curve_df.loss;
-        xlabel="Batch", ylabel="Mean operational cost",
-        title="TS-DDR training curve", legend=false,
-        linewidth=2, color=:steelblue)
-
-    # (2,1) Net-inventory trajectories
-    dr_traj = CSV.read(joinpath(result_dir, "$(dr_tag)_dr_trajectories.csv"), DataFrame)
-    bs_tag_file = sddp_tag  # baselines share the sddp tag prefix
-    bs_traj = CSV.read(joinpath(result_dir, "$(bs_tag_file)_basestock_trajectories.csv"), DataFrame)
-    n_show = min(20, nrow(dr_traj), nrow(bs_traj))
-    p3 = plot(; xlabel="Period", ylabel="Net inventory",
-        title="Inventory trajectories", legend=:topright)
-    for s in 1:n_show
-        plot!(p3, 0:INVENTORY_T, Vector(dr_traj[s, time_cols]);
-            color=:steelblue, alpha=0.35, label=s == 1 ? "TS-DDR" : false)
-    end
-    for s in 1:n_show
-        plot!(p3, 0:INVENTORY_T, Vector(bs_traj[s, time_cols]);
-            color=:darkorange, alpha=0.35, label=s == 1 ? "Base-stock" : false)
-    end
-    hline!(p3, [0.0]; linestyle=:dash, color=:black, label="Zero")
+Return compact labels for plot axes.
 
-    # (2,2) Cost distribution boxplot
-    labels = [e[1] for e in entries]
-    short_labels = replace.(labels,
+# Arguments
+- `name::AbstractString`: full table label.
+- `base_stock_level::Real`: base-stock order-up-to level.
+
+# Examples
+```julia
+label = short_method_label("TS-DDR (FixedDiscrete)", 160.0)
+```
+"""
+function short_method_label(name::AbstractString, base_stock_level::Real)
+    # Base-stock labels embed S*, so rebuild them instead of looking them up.
+    startswith(name, "Base-stock") &&
+        return "Base-stock\n(S*=$(round(Int, base_stock_level)))"
+
+    replacements = Dict(
         "TS-DDR (FixedDiscrete)" => "TS-DDR\n(FixedDisc)",
         "TS-DDR (ContRelax)" => "TS-DDR\n(ContRelax)",
+        "TS-DDR (MixedGrad)" => "TS-DDR\n(MixedGrad)",
+        "TS-DDR (HighPenalty)" => "TS-DDR\n(HighPen)",
+        "TS-DDR (LSTM)" => "TS-DDR\n(LSTM)",
+        "TS-DDR (LSTM+SF)" => "TS-DDR\n(LSTM+SF)",
         "TS-DDR (trained)" => "TS-DDR",
+        "TS-DDR Relaxed (LSTM)" => "TS-DDR\n(LSTM)",
+        "TS-DDR Relaxed (HighPenalty)" => "TS-DDR\n(HighPen)",
+        "TS-DDR Relaxed (LSTM+HP)" => "TS-DDR\n(LSTM+HP)",
         "SDDP (PAR)" => "SDDP",
-        "SDDP.jl integer rollout" => "SDDP",
-        "Random (untrained)" => "Random")
-    short_labels = [startswith(l, "Base-stock") ? "Base-stock\n(S*=$(round(Int,S_star)))" : l for l in short_labels]
-    data = [e[2] for e in entries]
-    n_methods = length(entries)
-    colors = if n_methods == 4
-        [:gold :darkgreen :steelblue :gray]
-    elseif n_methods == 5
-        [:gold :royalblue :darkgreen :steelblue :gray]
-    else
-        palette(:auto, n_methods)'
+        "SDDP (MIP fwd)" => "SDDP\n(MIP fwd)",
+        "SDDP (LP relax)" => "SDDP\n(LP relax)",
+        "Random (untrained)" => "Random",
+    )
+    # Names without an entry are returned unchanged.
+    return get(replacements, String(name), String(name))
+end
+
+"""
+    method_colors(num_methods::Integer)
+
+Return stable plot colors for the number of compared methods.
+
+# Arguments
+- `num_methods::Integer`: number of methods in the comparison.
+
+# Examples
+```julia
+colors = method_colors(length(results))
+```
+"""
+function method_colors(num_methods::Integer)
+    # A fixed bank keeps colors identical between documentation rebuilds.
+    # Colors are handed out by position, so a method's color depends on its
+    # index in the comparison rather than on its name.
+    color_bank = [
+        :steelblue, :royalblue, :darkorange, :mediumpurple, :coral,
+        :teal, :darkgreen, :seagreen, :gold, :gray,
+    ]
+
+    fits_bank = num_methods <= length(color_bank)
+
+    return fits_bank ? color_bank[1:num_methods] : palette(:auto, num_methods)
+end
+
+"""
+    plot_sddp_learning_curve(tag::AbstractString)
+
+Create the SDDP training-bound subplot.
+
+# Arguments
+- `tag::AbstractString`: result prefix used by SDDP output files.
+
+# Examples
+```julia
+plot_sddp_learning_curve("integer")
+```
+"""
+function plot_sddp_learning_curve(tag::AbstractString; start_fraction::Real = 0.5)
+    # SDDP logs may include failed or missing simulation rows.
+    log = CSV.read(resolve_file("$(tag)_sddp_training_log.csv"), DataFrame)
+
+    # Log-scale plots require strictly positive finite values.
+    valid_bound_rows = filter(
+        row -> !ismissing(row.bound) && isfinite(row.bound) && row.bound > 0,
+        log,
+    )
+
+    # `maximum` below cannot reduce an empty column; report the real cause.
+    nrow(valid_bound_rows) > 0 ||
+        error("No positive finite bounds in $(tag)_sddp_training_log.csv")
+
+    # Show only the converged portion of training: iterations at or after
+    # `start_fraction` of the last valid iteration.
+    start_iter = round(Int, start_fraction * maximum(valid_bound_rows.iteration))
+    converged = filter(row -> row.iteration >= start_iter, valid_bound_rows)
+
+    plot_handle = plot(
+        converged.iteration, converged.bound;
+        xlabel = "Iteration",
+        ylabel = "Cost (log scale)",
+        title = "SDDP learning curve (converged)",
+        label = "LP bound",
+        linewidth = 2,
+        color = :darkgreen,
+        legend = :right,
+        yscale = :log10,
+    )
+
+    if "simulation_value" in names(converged)
+        valid_sim_rows = filter(
+            row -> !ismissing(row.simulation_value) &&
+                isfinite(row.simulation_value) &&
+                row.simulation_value > 0,
+            converged,
+        )
+
+        if nrow(valid_sim_rows) > 0
+            plot!(
+                plot_handle, valid_sim_rows.iteration, valid_sim_rows.simulation_value;
+                label = "Simulation", linewidth = 2, color = :darkorange,
+            )
+        end
     end
-    p4 = boxplot(short_labels, data;
-        xlabel="Method", ylabel="Operational cost",
-        title="Cost comparison", legend=false,
-        fillcolor=colors, linecolor=:black)
 
+    return plot_handle
+end
+
+"""
+    plot_training_curves(curve_specs)
+
+Create the TS-DDR training-curve subplot.
+
+# Arguments
+- `curve_specs`: tuples `(tag, label, color)` for training-curve CSV files.
+
+# Examples
+```julia
+plot_training_curves([("integer", "FixedDiscrete", :steelblue)])
+```
+"""
+function plot_training_curves(curve_specs)
+    # An initially empty canvas means a spec with no CSV simply adds nothing.
+    canvas = plot(;
+        xlabel = "Batch",
+        ylabel = "Out-of-sample rollout cost",
+        title = "TS-DDR training curves",
+        legend = :topright,
+    )
+
+    for (tag, label, color) in curve_specs
+        curve_path = resolve_file_optional("$(tag)_training_curve.csv")
+        if isnothing(curve_path)
+            # No curve file for this variant; leave it off the panel.
+            continue
+        end
+
+        curve = CSV.read(curve_path, DataFrame)
+
+        # Use `rollout_cost` when the file has it (dropping missing and
+        # non-finite rows); otherwise fall back to the `loss` column.
+        batches, costs = if "rollout_cost" in names(curve)
+            present = dropmissing(curve, :rollout_cost)
+            finite = filter(row -> isfinite(row.rollout_cost), present)
+            (finite.batch, finite.rollout_cost)
+        else
+            (curve.batch, curve.loss)
+        end
+        plot!(canvas, batches, costs; label = label, linewidth = 2, color = color)
+    end
+
+    return canvas
+end
+
+"""
+    plot_inventory_trajectories(dr_tag, baseline_tag)
+
+Create the inventory-trajectory subplot.
+
+# Arguments
+- `dr_tag::AbstractString`: TS-DDR trajectory prefix.
+- `baseline_tag::AbstractString`: baseline trajectory prefix.
+
+# Examples
+```julia
+plot_inventory_trajectories("integer", "integer")
+```
+"""
+function plot_inventory_trajectories(dr_tag, baseline_tag)
+    # Trajectory tables hold one column per period: t0, t1, ..., tT.
+    periods = 0:INVENTORY_T
+    time_columns = [Symbol("t$(period)") for period in periods]
+
+    # Read the TS-DDR rollouts first, then the base-stock rollouts.
+    dr_paths = CSV.read(resolve_file("$(dr_tag)_dr_trajectories.csv"), DataFrame)
+    base_stock_file = resolve_file("$(baseline_tag)_basestock_trajectories.csv")
+    base_stock_paths = CSV.read(base_stock_file, DataFrame)
+
+    # Draw at most 20 paths per method, and the same number for both.
+    num_paths = min(20, nrow(dr_paths), nrow(base_stock_paths))
+
+    plot_handle = plot(;
+        xlabel = "Period",
+        ylabel = "Net inventory",
+        title = "Inventory trajectories",
+        legend = :topright,
+    )
+
+    # Each entry pairs a trajectory table with its line color and legend text.
+    # TS-DDR comes first so all of its paths are drawn before base-stock's.
+    series = (
+        (dr_paths, :steelblue, "TS-DDR"),
+        (base_stock_paths, :darkorange, "Base-stock"),
+    )
+
+    for (paths, line_color, legend_text) in series
+        for row in 1:num_paths
+            # Only the first path of a method adds a legend entry.
+            legend_entry = row == 1 ? legend_text : false
+
+            plot!(
+                plot_handle,
+                periods,
+                Vector(paths[row, time_columns]);
+                color = line_color,
+                alpha = 0.35,
+                label = legend_entry,
+            )
+        end
+    end
+
+    # Zero inventory separates holding from backlog.
+    hline!(plot_handle, [0.0]; linestyle = :dash, color = :black, label = "Zero")
+
+    return plot_handle
+end
+
+"""
+    plot_cost_distribution(results, base_stock_level)
+
+Create the cost-distribution subplot.
+
+# Arguments
+- `results::Vector{MethodResult}`: methods to compare.
+- `base_stock_level::Real`: base-stock order-up-to level.
+
+# Examples
+```julia
+plot_cost_distribution(results, 160.0)
+```
+"""
+function plot_cost_distribution(results::Vector{MethodResult}, base_stock_level::Real)
+    # One compact axis label and one fill color per compared method.
+    labels = map(result -> short_method_label(result.name, base_stock_level), results)
+    colors = method_colors(length(results))
+
+    plot_handle = plot(;
+        xlabel = "Method",
+        ylabel = "Operational cost",
+        title = "Cost comparison",
+        legend = false,
+        xrotation = 30,
+        bottom_margin = 8Plots.mm,
+        xtickfontsize = 7,
+    )
+
+    for (index, result) in enumerate(results)
+        # Each cost sample needs its own x entry, so repeat the label.
+        group = fill(labels[index], length(result.costs))
+        violin!(
+            plot_handle,
+            group,
+            result.costs;
+            fillcolor = colors[index],
+            linecolor = :black,
+            fillalpha = 0.7,
+        )
+    end
+
+    return plot_handle
+end
+
+"""
+    make_summary_plot(problem; kwargs...)
+
+Build the 2x2 documentation figure for one problem variant.
+
+# Arguments
+- `problem::AbstractString`: figure title.
+- `results::Vector{MethodResult}`: compared methods.
+- `base_stock_level::Real`: base-stock order-up-to level.
+- `sddp_tag::AbstractString`: SDDP result prefix.
+- `dr_tag::AbstractString`: TS-DDR trajectory prefix.
+- `curve_specs`: training-curve plot specifications.
+
+# Examples
+```julia
+plot = make_summary_plot(
+    "Integer problem",
+    results = integer_results,
+    base_stock_level = 160.0,
+    sddp_tag = "integer",
+    dr_tag = "integer",
+    curve_specs = [("integer", "FixedDiscrete", :steelblue)],
+)
+```
+"""
+function make_summary_plot(
+    problem::AbstractString;
+    results::Vector{MethodResult},
+    base_stock_level::Real,
+    sddp_tag::AbstractString,
+    dr_tag::AbstractString,
+    curve_specs,
+)
+    # One helper per panel; base-stock trajectories are looked up under sddp_tag.
+    sddp_panel = plot_sddp_learning_curve(sddp_tag)
+    training_panel = plot_training_curves(curve_specs)
+    trajectory_panel = plot_inventory_trajectories(dr_tag, sddp_tag)
+    distribution_panel = plot_cost_distribution(results, base_stock_level)
+
+    # Fixed 2x2 grid: SDDP and training curves on top, trajectories and costs below.
     layout = @layout [a b; c d]
-    combined = plot(p1, p2, p3, p4; layout=layout, size=(1100, 800),
-        plot_title=title_suffix, plot_titlefontsize=12, margin=5Plots.mm)
-    return combined
-end
-
-# ═══════════════════════════════════════════════════════════════════════════════
-# Demand process plot (shared)
-# ═══════════════════════════════════════════════════════════════════════════════
-periods = 1:INVENTORY_T
-demand_mid = (D_LO .+ D_HI) ./ 2
-plt_demand = plot(periods, demand_mid;
-    xlabel="Period", ylabel="Demand",
-    title="Latent demand process (random phase + regime + AR)",
-    label="Nominal seasonal center", linewidth=2, linestyle=:dash, color=:purple)
-rng_plot = MersenneTwister(1234)
-for k in 1:24
-    path = sample_inventory_demand_path(rng_plot)
-    plot!(plt_demand, periods, path; color=:gray, alpha=0.28, label=false)
-end
-savefig(plt_demand, joinpath(docs_dir, "inventory_demand_process.png"))
-println("Saved inventory_demand_process.png")
-
-# ═══════════════════════════════════════════════════════════════════════════════
-# Section 1: Relaxed
-# ═══════════════════════════════════════════════════════════════════════════════
-println("\n" * "=" ^ 60)
-println("SECTION 1: Relaxed (continuous) comparison")
-println("=" ^ 60)
-
-r_dr = load_costs("relaxed", "dr")
-r_sddp = load_costs("relaxed", "sddp")
-r_bs = load_costs("relaxed", "basestock")
-r_rand = load_costs("relaxed", "random")
-r_timing = load_timing(["relaxed"])
-r_S = parse(Float64, strip(read(joinpath(result_dir, "relaxed_basestock_S_star.txt"), String)))
-r_bound = parse(Float64, strip(read(joinpath(result_dir, "relaxed_sddp_bound.txt"), String)))
-
-r_entries = [
-    ("TS-DDR (trained)", r_dr),
-    ("SDDP (PAR)", r_sddp),
-    ("Base-stock (S*=$(round(Int, r_S)))", r_bs),
-    ("Random (untrained)", r_rand),
-]
-print_table(r_entries, r_timing, r_bound)
-
-plt_relaxed = make_plots("relaxed", r_entries, r_S, "Relaxed (continuous) problem")
-savefig(plt_relaxed, joinpath(docs_dir, "inventory_relaxed_results.png"))
-println("Saved inventory_relaxed_results.png")
-
-# ═══════════════════════════════════════════════════════════════════════════════
-# Section 2: Integer
-# ═══════════════════════════════════════════════════════════════════════════════
-println("\n" * "=" ^ 60)
-println("SECTION 2: Integer (MIP) comparison")
-println("=" ^ 60)
-
-i_dr = load_costs("integer", "dr")
-i_dr_cr = load_costs("integer_cr", "dr")
-i_sddp = load_costs("integer", "sddp")
-i_bs = load_costs("integer", "basestock")
-i_rand = load_costs("integer", "random")
-i_timing = load_timing(["integer", "integer_cr"])
-i_S = parse(Float64, strip(read(joinpath(result_dir, "integer_basestock_S_star.txt"), String)))
-i_bound = parse(Float64, strip(read(joinpath(result_dir, "integer_sddp_bound.txt"), String)))
-
-i_entries = [
-    ("TS-DDR (FixedDiscrete)", i_dr),
-    ("TS-DDR (ContRelax)", i_dr_cr),
-    ("SDDP.jl integer rollout", i_sddp),
-    ("Base-stock (S*=$(round(Int, i_S)))", i_bs),
-    ("Random (untrained)", i_rand),
-]
-print_table(i_entries, i_timing, i_bound)
-
-plt_integer = make_plots("integer", i_entries, i_S, "Integer (MIP) problem";
-    sddp_tag="integer", dr_tag="integer")
-savefig(plt_integer, joinpath(docs_dir, "inventory_integer_results.png"))
-println("Saved inventory_integer_results.png")
-
-println("\nAll assets saved to: $(relpath(docs_dir, example_dir))")
+
+    return plot(
+        sddp_panel,
+        training_panel,
+        trajectory_panel,
+        distribution_panel;
+        layout = layout,
+        size = (1200, 900),
+        plot_title = problem,
+        plot_titlefontsize = 12,
+        margin = 6Plots.mm,
+    )
+end
+
+"""
+    plot_demand_process() -> Nothing
+
+Regenerate the demand-process documentation figure.
+
+# Examples
+```julia
+plot_demand_process()
+```
+"""
+function plot_demand_process()
+    # Demand periods are numbered 1 through T.
+    periods = 1:INVENTORY_T
+
+    # Dashed reference line: elementwise midpoint of the D_LO/D_HI demand band.
+    band_center = (D_LO .+ D_HI) ./ 2
+
+    figure = plot(
+        periods,
+        band_center;
+        xlabel = "Period",
+        ylabel = "Demand",
+        title = "Latent demand process (random phase + regime + AR)",
+        label = "Nominal seasonal center",
+        linewidth = 2,
+        linestyle = :dash,
+        color = :purple,
+    )
+
+    # A fixed seed makes the sampled paths identical on every rebuild.
+    rng = MersenneTwister(1234)
+    num_sample_paths = 24
+
+    for _ in 1:num_sample_paths
+        # Paths draw from the shared RNG in sequence, so their order is fixed too.
+        sampled_demand = sample_inventory_demand_path(rng)
+
+        plot!(
+            figure,
+            periods,
+            sampled_demand;
+            color = :gray, alpha = 0.28, label = false,
+        )
+    end
+
+    output_path = joinpath(DOCS_ASSET_DIR, "inventory_demand_process.png")
+    savefig(figure, output_path)
+    println("Saved inventory_demand_process.png")
+
+    return nothing
+end
+
+"""
+    relaxed_results() -> (results, timing, base_stock_level, bound)
+
+Load all relaxed-problem comparison data.
+
+# Examples
+```julia
+results, timing, base_stock_level, bound = relaxed_results()
+```
+"""
+function relaxed_results()
+    # Required cost samples; `load_costs` raises when a file cannot be found.
+    dr_costs = load_costs("relaxed", "dr")
+    sddp_costs = load_costs("relaxed", "sddp")
+    base_stock_costs = load_costs("relaxed", "basestock")
+    random_costs = load_costs("relaxed", "random")
+
+    # Tuned variants are optional; an absent file yields `nothing`.
+    lstm_costs = optional_costs("relaxed_lstm", "dr")
+    hp_costs = optional_costs("relaxed_hp", "dr")
+    lstm_hp_costs = optional_costs("relaxed_lstm_hp", "dr")
+
+    # Scalar metadata stored as one-number text files.
+    base_stock_level = read_scalar(resolve_file("relaxed_basestock_S_star.txt"))
+    sddp_bound = read_scalar(resolve_file("relaxed_sddp_bound.txt"))
+
+    # Table order: the feedforward TS-DDR policy, then whichever tuned
+    # variants were found, then the non-TS-DDR baselines.
+    tuned_variants = (
+        ("TS-DDR Relaxed (HighPenalty)", hp_costs),
+        ("TS-DDR Relaxed (LSTM)", lstm_costs),
+        ("TS-DDR Relaxed (LSTM+HP)", lstm_hp_costs),
+    )
+
+    base_stock_label = "Base-stock (S*=$(round(Int, base_stock_level)))"
+
+    results = [MethodResult("TS-DDR (trained)", dr_costs)]
+    for (label, costs) in tuned_variants
+        isnothing(costs) || push!(results, MethodResult(label, costs))
+    end
+
+    push!(results, MethodResult("SDDP (PAR)", sddp_costs))
+    push!(results, MethodResult(base_stock_label, base_stock_costs))
+    push!(results, MethodResult("Random (untrained)", random_costs))
+
+    # Request a variant's timing tag only when its TS-DDR timing file exists,
+    # mirroring the optional cost files above.
+    timing_tags = ["relaxed"]
+    for tag in ("relaxed_lstm", "relaxed_hp", "relaxed_lstm_hp")
+        isnothing(resolve_file_optional("$(tag)_dr_timing.csv")) ||
+            push!(timing_tags, tag)
+    end
+
+    return results, load_timing(timing_tags), base_stock_level, sddp_bound
+end
+
+"""
+    integer_results() -> (results, timing, base_stock_level, bound)
+
+Assemble the result rows, timing data, and scalar baselines for the integer comparison.
+
+# Examples
+```julia
+results, timing, base_stock_level, bound = integer_results()
+```
+"""
+function integer_results()
+    # Required operational-cost samples for the integer benchmark.
+    fixed_discrete_costs = load_costs("integer", "dr")
+    cont_relax_costs = load_costs("integer_cr", "dr")
+    sddp_mip_costs = load_costs("integer", "sddp")
+    sddp_lp_costs = load_costs("integer", "sddp_lp")
+    base_stock_costs = load_costs("integer", "basestock")
+    random_costs = load_costs("integer", "random")
+
+    # Optional variants; each may be `nothing` and is then left out below.
+    mixed_grad_costs = optional_costs("integer_sf", "dr")
+    high_penalty_costs = optional_costs("integer_hp", "dr")
+    lstm_costs = optional_costs("integer_lstm", "dr")
+    lstm_sf_costs = optional_costs("integer_lstm_sf", "dr")
+
+    # Scalar baselines: the base-stock level and the SDDP bound.
+    base_stock_level = read_scalar(resolve_file("integer_basestock_S_star.txt"))
+    sddp_bound = read_scalar(resolve_file("integer_sddp_bound.txt"))
+
+    # The two always-present TS-DDR variants head the table.
+    results = [
+        MethodResult("TS-DDR (FixedDiscrete)", fixed_discrete_costs),
+        MethodResult("TS-DDR (ContRelax)", cont_relax_costs),
+    ]
+
+    # Optional TS-DDR variants follow in display order; absent ones are skipped.
+    optional_variants = (
+        "TS-DDR (MixedGrad)" => mixed_grad_costs,
+        "TS-DDR (HighPenalty)" => high_penalty_costs,
+        "TS-DDR (LSTM)" => lstm_costs,
+        "TS-DDR (LSTM+SF)" => lstm_sf_costs,
+    )
+    for (label, costs) in optional_variants
+        isnothing(costs) || push!(results, MethodResult(label, costs))
+    end
+
+    # Non-TS-DDR baselines close the table.
+    push!(results, MethodResult("SDDP (MIP fwd)", sddp_mip_costs))
+    push!(results, MethodResult("SDDP (LP relax)", sddp_lp_costs))
+    push!(results, MethodResult("Base-stock (S*=$(round(Int, base_stock_level)))", base_stock_costs))
+    push!(results, MethodResult("Random (untrained)", random_costs))
+
+    # A variant contributes timing only when its timing CSV resolves.
+    timing_tags = ["integer", "integer_cr"]
+    for tag in ("integer_sf", "integer_hp", "integer_lstm", "integer_lstm_sf")
+        isnothing(resolve_file_optional("$(tag)_dr_timing.csv")) || push!(timing_tags, tag)
+    end
+
+    return results, load_timing(timing_tags), base_stock_level, sddp_bound
+end
+
+"""
+    integer_curve_specs()
+
+Return the `(tag, label, color)` training-curve specs for the integer comparison.
+
+# Examples
+```julia
+curves = integer_curve_specs()
+```
+"""
+function integer_curve_specs()
+    # These two variants are listed unconditionally.
+    required = [
+        ("integer", "FixedDiscrete", :steelblue),
+        ("integer_cr", "ContRelax", :royalblue),
+    ]
+
+    # Candidates that are listed only when their training curve is on disk.
+    optional = [
+        ("integer_sf", "MixedGrad", :darkorange),
+        ("integer_hp", "HighPenalty", :mediumpurple),
+        ("integer_lstm", "LSTM", :coral),
+        ("integer_lstm_sf", "LSTM+SF", :teal),
+    ]
+
+    # A candidate is kept when `resolve_file_optional` finds the CSV named
+    # after its tag; the relative order of the candidates is preserved.
+    curve_file(spec) = "$(first(spec))_training_curve.csv"
+    has_curve(spec) = !isnothing(resolve_file_optional(curve_file(spec)))
+
+    return vcat(required, filter(has_curve, optional))
+end
+
+"""
+    relaxed_curve_specs()
+
+Return the `(tag, label, color)` training-curve specs for the relaxed comparison.
+
+# Examples
+```julia
+curves = relaxed_curve_specs()
+```
+"""
+function relaxed_curve_specs()
+    # The baseline feedforward curve is listed unconditionally.
+    required = [("relaxed", "Feedforward", :steelblue)]
+
+    # Tuned candidates that are listed only when their training curve is on disk.
+    optional = [
+        ("relaxed_hp", "HighPenalty", :mediumpurple),
+        ("relaxed_lstm", "LSTM", :coral),
+        ("relaxed_lstm_hp", "LSTM+HP", :teal),
+    ]
+
+    # A candidate is kept when `resolve_file_optional` finds the CSV named
+    # after its tag; the relative order of the candidates is preserved.
+    curve_file(spec) = "$(first(spec))_training_curve.csv"
+    has_curve(spec) = !isnothing(resolve_file_optional(curve_file(spec)))
+
+    return vcat(required, filter(has_curve, optional))
+end
+
+"""
+    run_relaxed_comparison() -> Nothing
+
+Print the relaxed (continuous) comparison table and save its summary figure.
+
+# Examples
+```julia
+run_relaxed_comparison()
+```
+"""
+function run_relaxed_comparison()
+    banner = "=" ^ 60
+    println("\n" * banner)
+    println("SECTION 1: Relaxed (continuous) comparison")
+    println(banner)
+
+    # Load the comparison data and print the summary table.
+    results, timing, base_stock_level, bound = relaxed_results()
+    print_table(results, timing, bound)
+
+    # Build the summary figure from the same results.
+    curve_specs = relaxed_curve_specs()
+    figure = make_summary_plot(
+        "Relaxed (continuous) problem";
+        results, base_stock_level, sddp_tag = "relaxed", dr_tag = "relaxed", curve_specs,
+    )
+
+    figure_path = joinpath(DOCS_ASSET_DIR, "inventory_relaxed_results.png")
+    savefig(figure, figure_path)
+    println("Saved inventory_relaxed_results.png")
+
+    return nothing
+end
+
+"""
+    run_integer_comparison() -> Nothing
+
+Print the integer (MIP) comparison table and save its summary figure.
+
+# Examples
+```julia
+run_integer_comparison()
+```
+"""
+function run_integer_comparison()
+    banner = "=" ^ 60
+    println("\n" * banner)
+    println("SECTION 2: Integer (MIP) comparison")
+    println(banner)
+
+    # Load the comparison data and print the summary table.
+    results, timing, base_stock_level, bound = integer_results()
+    print_table(results, timing, bound)
+
+    # Build the summary figure from the same results.
+    curve_specs = integer_curve_specs()
+    figure = make_summary_plot(
+        "Integer (MIP) problem";
+        results, base_stock_level, sddp_tag = "integer", dr_tag = "integer", curve_specs,
+    )
+
+    figure_path = joinpath(DOCS_ASSET_DIR, "inventory_integer_results.png")
+    savefig(figure, figure_path)
+    println("Saved inventory_integer_results.png")
+
+    return nothing
+end
+
+"""
+    main() -> Nothing
+
+Regenerate the demand-process figure, then run both benchmark comparisons.
+
+# Examples
+```julia
+main()
+```
+"""
+function main()
+    # Steps in execution order: the demand-process figure first, then the
+    # relaxed and the integer benchmark comparisons.
+    pipeline = (plot_demand_process, run_relaxed_comparison, run_integer_comparison)
+
+    # Each step takes no arguments and reports its own progress.
+    for step in pipeline
+        step()
+    end
+
+    println("\nAll assets saved to: $(relpath(DOCS_ASSET_DIR, @__DIR__))")
+
+    return nothing
+end
+
+# Run main() only when this file is the program entry point, not when it is included.
+if abspath(PROGRAM_FILE) == @__FILE__
+    main()
+end
diff --git a/examples/inventory_control/evaluate_inventory.jl b/examples/inventory_control/evaluate_inventory.jl
index 9fd1f6a..a0e42f2 100644
--- a/examples/inventory_control/evaluate_inventory.jl
+++ b/examples/inventory_control/evaluate_inventory.jl
@@ -2,8 +2,16 @@
 Evaluate non-neural baselines for the inventory control problem.
 
 Evaluates base-stock and random policies for both relaxed and integer cases.
+
+Pass a run ID as the first CLI argument, or omit to generate one from the
+current timestamp:
+
+```bash
+julia --project=. evaluate_inventory.jl 20260619_231417
+```
 """
 
+using Dates
 using DecisionRules
 using JuMP
 using Flux
@@ -12,6 +20,9 @@ using Random, Statistics
 
 include(joinpath(@__DIR__, "build_inventory_problem.jl"))
 
+const RUN_ID = isempty(ARGS) ?
+    Dates.format(Dates.now(), "yyyymmdd_HHMMSS") : ARGS[1]  # `S` = seconds; `s` is milliseconds
+
 const N_EVAL = 300
 
 # ═══════════════════════════════════════════════════════════════════════════════
@@ -156,7 +167,7 @@ function evaluate_baselines(; tag::String, integer::Bool)
     println("  Mean: $(round(mean(rand_costs), digits=1)) ± $(round(std(rand_costs), digits=1))")
 
     # --- Save results ---
-    result_dir = joinpath(@__DIR__, "results")
+    result_dir = joinpath(@__DIR__, "results", RUN_ID)
     mkpath(result_dir)
 
     df_bs_traj = DataFrame(bs_traj, [Symbol("t$i") for i in 0:INVENTORY_T])
@@ -184,4 +195,4 @@ end
 evaluate_baselines(tag="relaxed", integer=false)
 println()
 evaluate_baselines(tag="integer", integer=true)
-println("\nAll baseline results saved to results/")
+println("\nAll baseline results saved to results/$RUN_ID/")
diff --git a/examples/inventory_control/solve_sddp.jl b/examples/inventory_control/solve_sddp.jl
index 3760612..c3058bf 100644
--- a/examples/inventory_control/solve_sddp.jl
+++ b/examples/inventory_control/solve_sddp.jl
@@ -7,9 +7,19 @@ where μ_t, α, and Ω are fitted from simulated demand paths.
 
 Two cases:
 1. Relaxed: no binary z, SDDP is near-optimal for convex problems
-2. Integer: z ∈ [0,1] relaxation + integer rounding at rollout
+2. Integer: AlternativeForwardPass — forward pass solves true MIP (z ∈ {0,1}),
+   backward pass uses LP relaxation (z ∈ [0,1]) to compute cuts with valid duals.
+   Both models share the same PAR(1) demand structure.
+
+Pass a run ID as the first CLI argument, or omit to generate one from the
+current timestamp:
+
+```bash
+julia --project=. solve_sddp.jl 20260619_231417
+```
 """
 
+using Dates
 using SDDP
 using JuMP
 using HiGHS
@@ -19,6 +29,9 @@ using Random
 
 include(joinpath(@__DIR__, "build_inventory_problem.jl"))
 
+const RUN_ID = isempty(ARGS) ?
+    Dates.format(Dates.now(), "yyyymmdd_HHMMSS") : ARGS[1]  # `S` = seconds; `s` is milliseconds
+
 const N_SIM = 300
 const ITERATION_LIMIT = 500
 
@@ -66,7 +79,8 @@ println("  Ω (innovations, $(length(par_omega)) points): $(round.(par_omega, di
 # ═══════════════════════════════════════════════════════════════════════════════
 # Build SDDP model with PAR(1) demand approximation
 # ═══════════════════════════════════════════════════════════════════════════════
-function build_sddp_model(; integer::Bool=false, mu=par_mu, alpha=par_alpha, omega=par_omega)
+function build_sddp_model(; integer::Bool=false, binary::Bool=false,
+                            mu=par_mu, alpha=par_alpha, omega=par_omega)
     d_lag_init = mu[1]
     SDDP.LinearPolicyGraph(
         stages=2 * INVENTORY_T,
@@ -81,7 +95,11 @@ function build_sddp_model(; integer::Bool=false, mu=par_mu, alpha=par_alpha, ome
         if isodd(stage)
             @variable(sp, 0 <= q <= INVENTORY_Q_MAX)
             if integer
-                @variable(sp, 0 <= z <= 1)
+                if binary
+                    @variable(sp, z, Bin)
+                else
+                    @variable(sp, 0 <= z <= 1)
+                end
                 @constraint(sp, q <= INVENTORY_Q_MAX * z)
                 @stageobjective(sp, INVENTORY_K * z + INVENTORY_C * q)
             else
@@ -171,7 +189,7 @@ function rollout_sddp(model, n_sim; integer_round::Bool=false, mu=par_mu)
     return costs, traj_inv
 end
 
-result_dir = joinpath(@__DIR__, "results")
+result_dir = joinpath(@__DIR__, "results", RUN_ID)
 mkpath(result_dir)
 
 # ═══════════════════════════════════════════════════════════════════════════════
@@ -219,44 +237,97 @@ open(joinpath(result_dir, "relaxed_sddp_bound.txt"), "w") do io
 end
 
 # ═══════════════════════════════════════════════════════════════════════════════
-# Section 2: Integer SDDP (LP relaxation + integer rounding rollout)
+# Section 2: Integer SDDP (MIP forward pass + LP cuts via AlternativeForwardPass)
+#
+# Two-phase training:
+#   Phase 1 — LP forward + LP backward until convergence (warm-start cuts)
+#   Phase 2 — MIP forward + LP backward (refine at MIP-realistic trial points)
+# Rollout on true MIP model (z ∈ {0,1}) with all accumulated cuts.
 # ═══════════════════════════════════════════════════════════════════════════════
 println("\n" * "=" ^ 60)
-println("SECTION 2: SDDP — Integer (LP relax + integer rollout)")
+println("SECTION 2: SDDP — Integer (MIP forward + LP cuts)")
 println("=" ^ 60)
 
-model_integer = build_sddp_model(; integer=true)
-println("Training integer-relaxed SDDP ($(2*INVENTORY_T) stages)...")
+model_lp = build_sddp_model(; integer=true, binary=false)
+model_mip = build_sddp_model(; integer=true, binary=true)
+
+# --- Phase 1: LP warm-start ---
+println("Phase 1: LP warm-start ($(2*INVENTORY_T) stages)...")
 sddp_int_start = time()
 SDDP.train(
-    model_integer;
+    model_lp;
     duality_handler=SDDP.ContinuousConicDuality(),
     iteration_limit=ITERATION_LIMIT,
     stopping_rules=[SDDP.BoundStalling(100, 1e-3)],
     print_level=1,
 )
+lp_bound = SDDP.calculate_bound(model_lp)
+println("  LP warm-start bound: $(round(lp_bound, digits=1))")
+
+phase1_log = training_log_dataframe(model_lp)
+sddp_lp_seconds = time() - sddp_int_start
+
+# --- LP rollout (default SDDP baseline: LP decisions + integer rounding) ---
+println("LP rollout (default SDDP) on $N_SIM fresh scenarios...")
+Random.seed!(555)
+lp_eval_start = time()
+lp_costs, lp_traj = rollout_sddp(model_lp, N_SIM; integer_round=true)
+lp_eval_seconds = time() - lp_eval_start
+μ_lp = mean(lp_costs)
+println("  Default SDDP (LP rollout) — mean: $(round(μ_lp, digits=1)) ± $(round(std(lp_costs), digits=1))")
+
+CSV.write(joinpath(result_dir, "integer_sddp_lp_costs.csv"), DataFrame(operational_cost=lp_costs))
+CSV.write(joinpath(result_dir, "integer_sddp_lp_trajectories.csv"),
+    DataFrame(lp_traj, [Symbol("t$i") for i in 0:INVENTORY_T]))
+CSV.write(joinpath(result_dir, "integer_sddp_lp_training_log.csv"), phase1_log)
+CSV.write(joinpath(result_dir, "integer_sddp_lp_timing.csv"),
+    DataFrame(method=["SDDP (LP relax)"], fit_seconds=[0.0],
+              eval_seconds=[sddp_lp_seconds], n_eval=[N_SIM]))
+
+cuts_file = joinpath(result_dir, "integer_lp_cuts.json")
+SDDP.write_cuts_to_file(model_lp, cuts_file)
+SDDP.read_cuts_from_file(model_mip, cuts_file)
+println("  Exported LP cuts to MIP model")
+
+# --- Phase 2: MIP forward + LP backward ---
+println("\nPhase 2: AlternativeForwardPass — MIP forward + LP cuts...")
+println("  Forward pass: true MIP (z ∈ {0,1})")
+println("  Backward pass: LP relaxation (z ∈ [0,1]) for cuts")
+SDDP.train(
+    model_lp;
+    forward_pass=SDDP.AlternativeForwardPass(model_mip),
+    post_iteration_callback=SDDP.AlternativePostIterationCallback(model_mip),
+    duality_handler=SDDP.ContinuousConicDuality(),
+    iteration_limit=ITERATION_LIMIT,
+    add_to_existing_cuts=true,
+    print_level=1,
+)
 sddp_int_seconds = time() - sddp_int_start
 
-int_bound = SDDP.calculate_bound(model_integer)
-println("\nInteger-relaxed SDDP bound: $(round(int_bound, digits=1))")
+phase2_log = training_log_dataframe(model_lp)
+phase2_log.iteration .+= maximum(phase1_log.iteration)
+combined_log = vcat(phase1_log, phase2_log)
+
+int_bound = SDDP.calculate_bound(model_lp)
+println("\nLP relaxation bound (after both phases): $(round(int_bound, digits=1))")
 
-println("Integer rollout on $N_SIM fresh scenarios (TRUE demand)...")
+println("MIP rollout on $N_SIM fresh scenarios (TRUE demand)...")
 Random.seed!(555)
 int_eval_start = time()
-int_costs, int_traj = rollout_sddp(model_integer, N_SIM; integer_round=true)
+int_costs, int_traj = rollout_sddp(model_mip, N_SIM; integer_round=true)
 int_eval_seconds = time() - int_eval_start
 
 μ_i = mean(int_costs)
 σ_i = std(int_costs)
-println("Integer SDDP — mean: $(round(μ_i, digits=1)) ± $(round(σ_i, digits=1))")
+println("Integer SDDP (MIP fwd) — mean: $(round(μ_i, digits=1)) ± $(round(σ_i, digits=1))")
 println("Gap to LP bound: $(round(100 * (μ_i - int_bound) / μ_i, digits=1))%")
 
 CSV.write(joinpath(result_dir, "integer_sddp_costs.csv"), DataFrame(operational_cost=int_costs))
 CSV.write(joinpath(result_dir, "integer_sddp_trajectories.csv"),
     DataFrame(int_traj, [Symbol("t$i") for i in 0:INVENTORY_T]))
-CSV.write(joinpath(result_dir, "integer_sddp_training_log.csv"), training_log_dataframe(model_integer))
+CSV.write(joinpath(result_dir, "integer_sddp_training_log.csv"), combined_log)
 CSV.write(joinpath(result_dir, "integer_sddp_timing.csv"),
-    DataFrame(method=["SDDP.jl integer rollout"], fit_seconds=[0.0],
+    DataFrame(method=["SDDP (MIP fwd)"], fit_seconds=[0.0],
               eval_seconds=[sddp_int_seconds], n_eval=[N_SIM]))
 open(joinpath(result_dir, "integer_sddp_bound.txt"), "w") do io
     println(io, int_bound)
diff --git a/examples/inventory_control/train_dr_inventory.jl b/examples/inventory_control/train_dr_inventory.jl
index eb2c387..7777e1b 100644
--- a/examples/inventory_control/train_dr_inventory.jl
+++ b/examples/inventory_control/train_dr_inventory.jl
@@ -1,207 +1,1149 @@
 """
-Train TS-DDR policies for the inventory control problem.
+Train TS-DDR policies for the inventory-control benchmark.
 
-Trains two policies:
-1. Relaxed (continuous LP subproblems, standard LP duals)
-2. Integer (MIP subproblems, FixedDiscreteIntegerStrategy) — uses more
-   batches and lower learning rate for stable convergence.
+The benchmark compares target-state decision-rule variants across two axes:
+
+**Gradient estimator** — how ∇_θ Q(w; θ) is computed for integer models:
+1. fixed-discrete local duals (solve MIP → fix z → re-solve LP → read duals);
+2. continuous-relaxation duals (relax z ∈ {0,1} → LP → read duals);
+3. mixed gradient (α · dual + (1-α) · score-function REINFORCE correction).
+
+**Policy architecture** — which function class maps observations to targets:
+- `ExAnteInventoryPolicy`: feedforward MLP, sigmoid output;
+- `LSTMExAntePolicy`: recurrent encoder on lagged demand, affine output.
+
+Each variant is independent and can be run via:
+
+    julia --project=. train_dr_inventory.jl <tag>
+
+where `<tag>` is one of `relaxed`, `integer`, `integer_cr`, `integer_sf`,
+`integer_hp`, `integer_lstm`, `integer_lstm_sf`.
 """
 
+using CSV
+using DataFrames
+using Dates
 using DecisionRules
-using JuMP
 using Flux
 using JLD2
-using CSV, DataFrames
-using Random, Statistics
+using JuMP
+using Random
+using Statistics
 
 include(joinpath(@__DIR__, "build_inventory_problem.jl"))
 
+# The script keeps generated models and CSV files out of the source directory.
+const EXAMPLE_DIR = @__DIR__
+
+# Each run writes to results/<RUN_ID>/ so concurrent or successive runs never
+# clobber each other. RUN_ID is read from ENV (set by launch_all.sh for batch
+# submissions) or generated as a yyyymmdd_HHMMSS timestamp (`S` = seconds).
+const RUN_ID = get(ENV, "RUN_ID", Dates.format(Dates.now(), "yyyymmdd_HHMMSS"))
+const RESULT_DIR = joinpath(EXAMPLE_DIR, "results", RUN_ID)
+const MODEL_DIR = joinpath(EXAMPLE_DIR, "models", RUN_ID)
+
+# Create output directories before any training run tries to write into them.
+mkpath(MODEL_DIR)
+mkpath(RESULT_DIR)
+
+println("Run ID: $RUN_ID")
+println("Results → $RESULT_DIR")
+
+# Use one fixed training sample size for every TS-DDR variant.
 const N_TRAIN_SCENARIOS = 50
-const N_TEST = 300
-
-example_dir = @__DIR__
-model_dir = joinpath(example_dir, "models")
-result_dir = joinpath(example_dir, "results")
-mkpath(model_dir)
-mkpath(result_dir)
-
-# ═══════════════════════════════════════════════════════════════════════════════
-# Rollout helper
-# ═══════════════════════════════════════════════════════════════════════════════
-function rollout_policy(policy, subproblems, spi, spo, unc_eval, init_state;
-    n_test=N_TEST, seed=555, integer=true)
+
+# Use one held-out evaluation size for every reported cost distribution.
+const N_TEST_SCENARIOS = 300
+
+"""
+    InventoryTrainingVariant
+
+Immutable configuration record for one TS-DDR inventory training run.
+
+Mathematically, every variant trains a policy ``\\pi_\\theta`` by stochastic
+gradient descent on sampled deterministic-equivalent objectives
+``Q(w; \\theta)``. The fields choose the model family and the gradient estimator:
+
+- `integer`: whether the JuMP model contains binary setup variables ``z_t``;
+- `training_integer_strategy`: how local dual information is read for
+  ``\\nabla_\\theta Q(w; \\theta)`` when the model is mixed-integer;
+- `score_function`: optional Monte Carlo correction using perturbed target
+  rollouts.
+
+# Fields
+- `tag::String`: prefix used for saved models and CSV files.
+- `integer::Bool`: whether to build the fixed-cost MIP formulation.
+- `num_batches::Int`: number of SGD updates.
+- `train_per_batch::Int`: sampled trajectories per SGD update.
+- `learning_rate::Float64`: Adam learning rate.
+- `warmup_batches::Int`: last batch of the low target-penalty phase.
+- `training_integer_strategy::AbstractIntegerStrategy`: dual-path strategy.
+- `score_function::Union{Nothing,ScoreFunctionConfig,ScoreFunctionSchedule}`:
+  optional score-function estimator.
+- `penalty::Float64`: target-deficit penalty λ.
+- `policy_builder::Function`: zero-argument callable returning a fresh policy.
+- `penalty_schedule_fn::Function`: `(variant) -> schedule` for target-penalty
+  multiplier ramp.
+
+The 8-argument convenience constructor fills the last three fields with
+`INVENTORY_PENALTY`, `() -> build_exante_policy(; seed = 2024)`, and
+`penalty_schedule_for`, in that order.
+
+# Examples
+```julia
+variant = InventoryTrainingVariant(
+    "integer",
+    true,
+    800,
+    10,
+    8.0e-4,
+    120,
+    FixedDiscreteIntegerStrategy(),
+    nothing,
+)
+```
+"""
+struct InventoryTrainingVariant
+    tag::String
+    integer::Bool
+    num_batches::Int
+    train_per_batch::Int
+    learning_rate::Float64
+    warmup_batches::Int
+    training_integer_strategy::AbstractIntegerStrategy
+    score_function::Union{Nothing,ScoreFunctionConfig,ScoreFunctionSchedule}
+    penalty::Float64  # INVENTORY_PENALTY when built through the 8-argument constructor
+    policy_builder::Function  # called with no arguments
+    penalty_schedule_fn::Function  # called with the variant itself
+end
+
+function InventoryTrainingVariant(
+    tag, integer, num_batches, train_per_batch, learning_rate, warmup_batches,
+    training_integer_strategy, score_function,
+)
+    # Benchmark defaults for the three trailing fields of the full constructor.
+    default_policy_builder = () -> build_exante_policy(; seed = 2024)
+    return InventoryTrainingVariant(
+        tag, integer, num_batches, train_per_batch, learning_rate, warmup_batches,
+        training_integer_strategy, score_function,
+        INVENTORY_PENALTY, default_policy_builder, penalty_schedule_for,
+    )
+end
+
+"""
+    penalty_schedule_for(variant::InventoryTrainingVariant)
+
+Build the two-phase target-penalty multiplier schedule for one inventory variant.
+
+The multiplier applied to the target penalty is `0.4` through the last warmup
+batch and `1.0` for every later batch:
+
+```math
+m_k =
+\\begin{cases}
+0.4, & 1 \\le k \\le k_{warm}, \\\\
+1.0, & k_{warm} < k \\le K.
+\\end{cases}
+```
+
+# Arguments
+- `variant::InventoryTrainingVariant`: supplies `warmup_batches` and `num_batches`.
+
+# Examples
+```julia
+schedule = penalty_schedule_for(variant)
+```
+"""
+function penalty_schedule_for(variant::InventoryTrainingVariant)
+    # Last batch that still uses the gentler warmup multiplier.
+    last_warmup_batch = variant.warmup_batches
+
+    # Each phase is a (first_batch, last_batch, multiplier) tuple.
+    return [
+        # Warmup: 40% of the nominal target penalty.
+        (1, last_warmup_batch, 0.4),
+
+        # Remainder of training: the nominal target penalty.
+        (last_warmup_batch + 1, variant.num_batches, 1.0),
+    ]
+end
+
+"""
+    method_label(variant::InventoryTrainingVariant) -> String
+
+Map one TS-DDR variant to the label shown for it in the results table.
+
+# Arguments
+- `variant::InventoryTrainingVariant`: training configuration.
+
+# Examples
+```julia
+label = method_label(variant)
+```
+"""
+function method_label(variant::InventoryTrainingVariant)
+    # Tuned variants are recognized purely by tag, before any estimator check.
+    tuned_labels = (
+        "relaxed_lstm" => "TS-DDR Relaxed (LSTM)",
+        "relaxed_hp" => "TS-DDR Relaxed (HighPenalty)",
+        "relaxed_lstm_hp" => "TS-DDR Relaxed (LSTM+HP)",
+        "integer_lstm" => "TS-DDR (LSTM)",
+        "integer_lstm_sf" => "TS-DDR (LSTM+SF)",
+        "integer_hp" => "TS-DDR (HighPenalty)",
+    )
+    for (tag, label) in tuned_labels
+        variant.tag == tag && return label
+    end
+
+    # Remaining variants are told apart by estimator; score function wins first.
+    if !isnothing(variant.score_function)
+        return "TS-DDR (MixedGrad)"
+    elseif variant.training_integer_strategy isa ContinuousRelaxationIntegerStrategy
+        return "TS-DDR (ContRelax)"
+    elseif variant.training_integer_strategy isa FixedDiscreteIntegerStrategy
+        return "TS-DDR (FixedDiscrete)"
+    end
+    return "TS-DDR (trained)"
+end
+
+"""
+    operational_stage_cost(model::JuMP.Model, integer::Bool) -> Float64
+
+Compute the realized operational cost of a single solved stage model.
+
+With `integer = true` the cost is
+
+```math
+K z_t + c q_t + h \\max(s_t,0) + p \\max(-s_t,0).
+```
+
+With `integer = false` the setup term ``K z_t`` is dropped.
+
+# Arguments
+- `model::JuMP.Model`: solved inventory stage model.
+- `integer::Bool`: whether `model` contains the binary setup variable `z`.
+
+# Examples
+```julia
+cost = operational_stage_cost(stage_model, true)
+```
+"""
+function operational_stage_cost(model::JuMP.Model, integer::Bool)
+    # Decisions read back from the solved stage: the order quantity and the
+    # net inventory left after demand.
+    order_quantity = value(model[:q])
+    next_inventory = value(model[:s_out])
+
+    # Holding cost applies to the positive part of inventory, backlog cost
+    # to the negative part; at most one of the two is nonzero.
+    holding_cost = INVENTORY_H * max(next_inventory, 0.0)
+    backlog_cost = INVENTORY_P * max(-next_inventory, 0.0)
+
+    # Cost shared by both formulations: variable ordering, holding, backlog.
+    variable_cost = INVENTORY_C * order_quantity + holding_cost + backlog_cost
+
+    # The relaxed formulation has no setup term.
+    if !integer
+        return variable_cost
+    end
+
+    # Rounding strips solver tolerance noise from the setup value before the
+    # fixed cost is charged.
+    setup_value = round(value(model[:z]))
+
+    return INVENTORY_K * setup_value + variable_cost
+end
+
+"""
+    rollout_policy(policy, subproblems, state_params_in, state_params_out,
+                   uncertainty_sampler, initial_state; kwargs...)
+
+Evaluate a trained policy by stage-wise rollout on held-out trajectories.
+
+At each stage the policy proposes a target state. The stage model then solves
+
+```math
+\\min f_t(x_t,y_t) + \\lambda |x_t^{mid} - \\hat{x}_t|
+```
+
+subject to the inventory transition and capacity constraints. The returned costs
+(first of four outputs: costs, inventory, setup, and order paths) count only the
+operational term ``f_t``: target slack is a training device, not a deployed cost.
+
+# Arguments
+- `policy`: Flux-compatible target-state policy.
+- `subproblems`: one solved-and-reused JuMP model per stage.
+- `state_params_in`: input-state parameters for each stage.
+- `state_params_out`: `(target_parameter, realized_state_variable)` pairs.
+- `uncertainty_sampler`: sampler for held-out demand trajectories.
+- `initial_state`: inventory state entering stage 1.
+
+# Keywords
+- `num_scenarios::Int`: number of held-out rollouts.
+- `seed::Int`: random seed for evaluation trajectories.
+- `integer::Bool`: whether to use the MIP operational cost formula.
+
+# Examples
+```julia
+costs, inventory, setup, order = rollout_policy(
+    policy,
+    subproblems,
+    state_params_in,
+    state_params_out,
+    uncertainty_sampler,
+    initial_state;
+    integer = true,
+)
+```
+"""
+function rollout_policy(
+    policy,
+    subproblems,
+    state_params_in,
+    state_params_out,
+    uncertainty_sampler,
+    initial_state;
+    num_scenarios::Int = N_TEST_SCENARIOS,
+    seed::Int = 555,
+    integer::Bool = true,
+)
+    # Fix the evaluation sample so variants see the same demand distribution.
     Random.seed!(seed)
-    traj_inv = Matrix{Float64}(undef, n_test, INVENTORY_T + 1)
-    traj_z = Matrix{Float64}(undef, n_test, INVENTORY_T)
-    traj_q = Matrix{Float64}(undef, n_test, INVENTORY_T)
-    op_costs = Vector{Float64}(undef, n_test)
-
-    for s in 1:n_test
-        unc_sample = sample(unc_eval)
-        state = Float64.(init_state)
-        traj_inv[s, 1] = state[1]
-        op_costs[s] = 0.0
-
-        for t in 1:INVENTORY_T
-            d_val = unc_sample[t][1][2]
-            target = Float64.(policy(Float32[d_val, state...]))
-
-            for i in eachindex(spi[t])
-                set_parameter_value(spi[t][i], state[i])
-            end
-            for (param, value) in unc_sample[t]
-                set_parameter_value(param, value)
+
+    # Store net-inventory trajectories, including the initial inventory at t=0.
+    inventory_paths = Matrix{Float64}(undef, num_scenarios, INVENTORY_T + 1)
+
+    # Store setup indicators for integer runs and order indicators for relaxed runs.
+    setup_paths = Matrix{Float64}(undef, num_scenarios, INVENTORY_T)
+
+    # Store order quantities for diagnostics.
+    order_paths = Matrix{Float64}(undef, num_scenarios, INVENTORY_T)
+
+    # Store operational cost for each scenario.
+    operational_costs = Vector{Float64}(undef, num_scenarios)
+
+    for scenario in 1:num_scenarios
+        # Reset recurrent state for LSTM policies.
+        Flux.reset!(policy)
+
+        # Draw one demand path for this rollout.
+        uncertainty_sample = sample(uncertainty_sampler)
+
+        # Start from the benchmark initial state.
+        state = Float64.(initial_state)
+
+        # Record inventory at t=0.
+        inventory_paths[scenario, 1] = state[1]
+
+        # Reset the scenario cost accumulator.
+        operational_costs[scenario] = 0.0
+
+        for stage in 1:INVENTORY_T
+            # Demand is the single uncertainty value in this inventory model.
+            demand_value = uncertainty_sample[stage][1][2]
+
+            # The policy maps observed demand plus current state to a target state.
+            target = Float64.(policy(Float32[demand_value, state...]))
+
+            # Input-state parameters receive the realized state entering this stage.
+            for index in eachindex(state_params_in[stage])
+                set_parameter_value(state_params_in[stage][index], state[index])
             end
-            for i in eachindex(spo[t])
-                set_parameter_value(spo[t][i][1], target[i])
+
+            # Uncertainty parameters receive this stage's realized demand.
+            for (parameter, value) in uncertainty_sample[stage]
+                set_parameter_value(parameter, value)
             end
-            optimize!(subproblems[t])
-
-            q_val = value(subproblems[t][:q])
-            s_val = value(subproblems[t][:s_out])
-
-            if integer
-                z_val = round(value(subproblems[t][:z]))
-                op_costs[s] += INVENTORY_K * z_val + INVENTORY_C * q_val +
-                               INVENTORY_H * max(s_val, 0.0) +
-                               INVENTORY_P * max(-s_val, 0.0)
-                traj_z[s, t] = z_val
-            else
-                op_costs[s] += INVENTORY_C * q_val +
-                               INVENTORY_H * max(s_val, 0.0) +
-                               INVENTORY_P * max(-s_val, 0.0)
-                traj_z[s, t] = q_val > 1e-7 ? 1.0 : 0.0
+
+            # Target parameters receive the policy output.
+            for index in eachindex(state_params_out[stage])
+                set_parameter_value(state_params_out[stage][index][1], target[index])
             end
-            traj_q[s, t] = q_val
-            traj_inv[s, t+1] = s_val
-            state = [s_val, d_val, state[2]]
+
+            # Solve the deployment stage exactly as modeled.
+            optimize!(subproblems[stage])
+
+            # Read decisions and realized inventory from the solved stage.
+            order_quantity = value(subproblems[stage][:q])
+            next_inventory = value(subproblems[stage][:s_out])
+
+            # Add the operational cost, excluding target-deficit penalty.
+            operational_costs[scenario] +=
+                operational_stage_cost(subproblems[stage], integer)
+
+            # Store setup or order activity for later diagnostics.
+            setup_paths[scenario, stage] = integer ?
+                round(value(subproblems[stage][:z])) :
+                Float64(order_quantity > 1.0e-7)
+
+            # Store order quantity and realized inventory trajectory.
+            order_paths[scenario, stage] = order_quantity
+            inventory_paths[scenario, stage + 1] = next_inventory
+
+            # Next state: [new inventory, this demand, old state[2]] (old state[2] presumably lagged demand).
+            state = [next_inventory, demand_value, state[2]]
         end
     end
-    return op_costs, traj_inv, traj_z, traj_q
-end
-
-# ═══════════════════════════════════════════════════════════════════════════════
-# Train + evaluate helper
-# ═══════════════════════════════════════════════════════════════════════════════
-function train_and_evaluate(;
-    tag::String,
-    integer::Bool,
-    num_batches::Int=400,
-    train_per_batch::Int=5,
-    lr::Float64=0.0015,
-    warmup_batches::Int=80,
-    int_strategy_override::Union{Nothing, AbstractIntegerStrategy}=nothing,
-)
-    println("=" ^ 60)
-    println("Training TS-DDR — $(tag) (integer=$integer)")
-    println("=" ^ 60)
 
-    model_path = joinpath(model_dir, "$(tag)_policy.jld2")
-    curve_path = joinpath(result_dir, "$(tag)_training_curve.csv")
+    return operational_costs, inventory_paths, setup_paths, order_paths
+end
 
-    println("Building deterministic equivalent...")
-    det_eq, spi_train, spo_train, unc_train, init_state = build_inventory_det_equivalent(;
-        num_scenarios=N_TRAIN_SCENARIOS, penalty=INVENTORY_PENALTY, seed=42, integer=integer)
+"""
+    build_training_problem(variant::InventoryTrainingVariant)
 
-    println("Building stage-wise subproblems...")
-    eval_subproblems, spi_eval, spo_eval, unc_eval, _ = build_inventory_subproblems(;
-        num_scenarios=N_TEST, penalty=INVENTORY_PENALTY, seed=99, integer=integer)
+Build the deterministic-equivalent model used for training a variant.
 
-    policy = build_exante_policy(; seed=2024)
+# Arguments
+- `variant::InventoryTrainingVariant`: variant configuration.
 
-    int_strategy = if int_strategy_override !== nothing
-        int_strategy_override
-    elseif integer
-        FixedDiscreteIntegerStrategy()
-    else
-        NoIntegerStrategy()
-    end
+# Examples
+```julia
+det_eq, state_in, state_out, sampler, initial_state =
+    build_training_problem(variant)
+```
+"""
+function build_training_problem(variant::InventoryTrainingVariant)
+    # The deterministic equivalent couples the stages, as target-dual gradients need.
+    training_options = (
+        num_scenarios = N_TRAIN_SCENARIOS, seed = 42,
+        penalty = variant.penalty, integer = variant.integer,
+    )
+
+    return build_inventory_det_equivalent(; training_options...)
+end
+
+"""
+    build_evaluation_problem(variant::InventoryTrainingVariant)
+
+Build the stage-wise models used for held-out rollout evaluation.
 
+# Arguments
+- `variant::InventoryTrainingVariant`: variant configuration.
+
+# Examples
+```julia
+subproblems, state_in, state_out, sampler, initial_state =
+    build_evaluation_problem(variant)
+```
+"""
+function build_evaluation_problem(variant::InventoryTrainingVariant)
+    # Held-out evaluation follows stage-wise deployment, not the training DE solve.
+    evaluation_options = (
+        num_scenarios = N_TEST_SCENARIOS, seed = 99,
+        penalty = variant.penalty, integer = variant.integer,
+    )
+
+    return build_inventory_subproblems(; evaluation_options...)
+end
+
+"""
+    estimate_initial_loss(policy, det_eq, state_params_in, state_params_out,
+                          uncertainty_sampler, initial_state, variant)
+
+Estimate pre-training deterministic-equivalent cost for checkpoint initialization.
+
+# Arguments
+- `policy`: policy evaluated before training.
+- `det_eq::JuMP.Model`: deterministic-equivalent training model.
+- `state_params_in`: input-state parameters.
+- `state_params_out`: target-state parameters.
+- `uncertainty_sampler`: training sampler.
+- `initial_state`: state entering stage 1.
+- `variant::InventoryTrainingVariant`: training configuration.
+
+# Examples
+```julia
+loss = estimate_initial_loss(policy, det_eq, spi, spo, sampler, x0, variant)
+```
+"""
+function estimate_initial_loss(
+    policy,
+    det_eq::JuMP.Model,
+    state_params_in,
+    state_params_out,
+    uncertainty_sampler,
+    initial_state,
+    variant::InventoryTrainingVariant,
+)
+    # Reseed the global RNG so the 12-draw baseline used to seed SaveBest is repeatable.
     Random.seed!(111)
-    pre_costs = [
-        let unc = sample(unc_train)
+
+    return mean(
+        let uncertainty_sample = sample(uncertainty_sampler)
+            # Roll the policy forward first: the DE solve takes the whole target path.
+            target_states = simulate_states(initial_state, uncertainty_sample, policy)
+
             simulate_multistage(
-                det_eq, spi_train, spo_train, unc,
-                simulate_states(init_state, unc, policy);
-                integer_strategy=int_strategy,
+                det_eq,
+                state_params_in,
+                state_params_out,
+                uncertainty_sample,
+                target_states;
+                integer_strategy = variant.training_integer_strategy,
             )
         end for _ in 1:12
-    ]
-    pre_mean = mean(pre_costs)
-    println("Pre-training mean cost: $(round(pre_mean, digits=2))")
+    )
+end
 
-    save_best = SaveBest(pre_mean, model_path)
-    training_log = DataFrame(batch=Int[], loss=Float64[])
+"""
+    train_variant!(policy, variant, det_eq, state_params_in, state_params_out,
+                   uncertainty_sampler, initial_state, model_path, curve_path;
+                   eval_subproblems, eval_state_in, eval_state_out, eval_sampler,
+                   eval_every, eval_scenarios)
+
+Train one policy and write its training curve.
+
+The training curve records the true out-of-sample stage-wise rollout cost
+(not the deterministic-equivalent training objective) every `eval_every`
+batches. This is the deployment-relevant metric and allows fair comparison
+across integer strategies and with SDDP.
+
+# Arguments
+- `policy`: mutable Flux policy updated in place.
+- `variant::InventoryTrainingVariant`: training configuration.
+- `det_eq::JuMP.Model`: deterministic-equivalent training model.
+- `state_params_in`: input-state parameters.
+- `state_params_out`: target-state parameters.
+- `uncertainty_sampler`: training sampler.
+- `initial_state`: state entering stage 1.
+- `model_path::String`: path for the best model checkpoint.
+- `curve_path::String`: path for the training-curve CSV.
+
+# Keywords
+- `eval_subproblems`: stage-wise models for out-of-sample rollout evaluation.
+- `eval_state_in`: input-state parameters for evaluation models.
+- `eval_state_out`: output-state parameters for evaluation models.
+- `eval_sampler`: uncertainty sampler for evaluation scenarios.
+- `eval_every::Int = 25`: evaluate rollout cost every this many batches.
+- `eval_scenarios::Int = 30`: number of scenarios per periodic evaluation.
+
+# Examples
+```julia
+train_variant!(policy, variant, det_eq, spi, spo, sampler, x0,
+               model_path, curve_path;
+               eval_subproblems = esub, eval_state_in = esi,
+               eval_state_out = eso, eval_sampler = esamp)
+```
+"""
+function train_variant!(
+    policy,
+    variant::InventoryTrainingVariant,
+    det_eq::JuMP.Model,
+    state_params_in,
+    state_params_out,
+    uncertainty_sampler,
+    initial_state,
+    model_path::String,
+    curve_path::String;
+    eval_subproblems,
+    eval_state_in,
+    eval_state_out,
+    eval_sampler,
+    eval_every::Int = 25,
+    eval_scenarios::Int = 30,
+)
+    # Estimate a baseline DE loss before any optimizer step.
+    initial_loss = estimate_initial_loss(
+        policy,
+        det_eq,
+        state_params_in,
+        state_params_out,
+        uncertainty_sampler,
+        initial_state,
+        variant,
+    )
 
-    println("Training ($num_batches batches × $train_per_batch scenarios, lr=$lr)...")
+    # SaveBest starts from the DE baseline; it is later fed rollout cost or DE loss.
+    save_best = SaveBest(initial_loss, model_path)
+
+    # One row per batch; `rollout_cost` is NaN on batches without an evaluation.
+    training_log = DataFrame(
+        batch = Int[], loss = Float64[], rollout_cost = Float64[],
+    )
+
+    println("=" ^ 60)
+    println("Training TS-DDR [$(variant.tag)]  integer=$(variant.integer)")
+    println("  $(variant.num_batches) batches x $(variant.train_per_batch) scenarios")
+    println("  learning rate: $(variant.learning_rate)")
+    println("  penalty: $(variant.penalty)")
+    println("  policy: $(typeof(policy))")
+    println("  training integer strategy: $(typeof(variant.training_integer_strategy))")
+    !isnothing(variant.score_function) &&
+        println("  score function: $(typeof(variant.score_function))")
+    println("  pre-training cost: $(round(initial_loss, digits = 1))")
+    println("  rollout eval every $(eval_every) batches on $(eval_scenarios) scenarios")
+    println("=" ^ 60)
+
+    # Reseed the global RNG before training for repeatability.
     Random.seed!(2024)
-    train_start = time()
-    train_multistage(
-        policy, init_state, det_eq, spi_train, spo_train, unc_train;
-        num_batches=num_batches,
-        num_train_per_batch=train_per_batch,
-        optimizer=Flux.Adam(lr),
-        integer_strategy=int_strategy,
-        penalty_schedule=[(1, warmup_batches, 0.4), (warmup_batches+1, num_batches, 1.0)],
-        record=(sample_log, iter, model) -> begin
-            loss = isempty(sample_log.objectives_no_deficit) ? NaN : mean(sample_log.objectives_no_deficit)
-            push!(training_log, (batch=iter, loss=loss))
-            if mod(iter, 20) == 0 || iter == 1
-                println("  Batch $(lpad(iter, 3)) / $num_batches  loss = $(round(loss, digits=2))")
+
+    elapsed_seconds = @elapsed train_multistage(
+        policy,
+        initial_state,
+        det_eq,
+        state_params_in,
+        state_params_out,
+        uncertainty_sampler;
+        num_batches = variant.num_batches,
+        num_train_per_batch = variant.train_per_batch,
+        optimizer = Flux.Adam(variant.learning_rate),
+        integer_strategy = variant.training_integer_strategy,
+        penalty_schedule = variant.penalty_schedule_fn(variant),
+        score_function = variant.score_function,
+        record = (sample_log, iteration, current_policy) -> begin
+            loss = isempty(sample_log.objectives_no_deficit) ?
+                NaN :
+                mean(sample_log.objectives_no_deficit)
+
+            # Periodically evaluate the out-of-sample stage-wise rollout cost.
+            # Any `state` property of the policy is snapshotted and restored after.
+            rollout_cost = NaN
+            if iteration == 1 || mod(iteration, eval_every) == 0
+                saved_state = hasproperty(current_policy, :state) ?
+                    deepcopy(current_policy.state) : nothing
+
+                eval_costs, _, _, _ = rollout_policy(
+                    current_policy,
+                    eval_subproblems,
+                    eval_state_in,
+                    eval_state_out,
+                    eval_sampler,
+                    initial_state;
+                    num_scenarios = eval_scenarios,
+                    seed = 777,
+                    integer = variant.integer,
+                )
+                rollout_cost = mean(eval_costs)
+
+                if !isnothing(saved_state)
+                    current_policy.state = saved_state
+                end
+            end
+
+            push!(training_log, (
+                batch = iteration, loss = loss, rollout_cost = rollout_cost,
+            ))
+
+            if iteration == 1 || mod(iteration, 50) == 0
+                cost_str = isnan(rollout_cost) ? "" :
+                    "  rollout=$(round(rollout_cost, digits = 1))"
+                println(
+                    "  batch $(lpad(iteration, 4))/$(variant.num_batches)  " *
+                    "loss=$(round(loss, digits = 1))$(cost_str)",
+                )
             end
-            save_best(iter, model, loss)
+
+            # Checkpoint on rollout cost when available, else on the DE loss.
+            # NOTE(review): the two metrics may differ in scale; confirm this mix.
+            save_metric = isnan(rollout_cost) ? loss : rollout_cost
+            save_best(iteration, current_policy, save_metric)
+
             return false
         end,
     )
-    train_seconds = time() - train_start
+
+    # Persist the training curve after training finishes.
     CSV.write(curve_path, training_log)
-    println("Training time: $(round(train_seconds, digits=1))s")
-
-    model_state = JLD2.load(model_path, "model_state")
-    Flux.loadmodel!(policy, model_state)
-
-    println("Evaluating on $N_TEST test scenarios...")
-    eval_start = time()
-    op_costs, traj_inv, traj_z, traj_q = rollout_policy(
-        policy, eval_subproblems, spi_eval, spo_eval, unc_eval, init_state;
-        n_test=N_TEST, seed=555, integer=integer)
-    eval_seconds = time() - eval_start
-
-    df_inv = DataFrame(traj_inv, [Symbol("t$i") for i in 0:INVENTORY_T])
-    df_inv[!, :scenario] = 1:N_TEST
-    CSV.write(joinpath(result_dir, "$(tag)_dr_trajectories.csv"), df_inv)
-    CSV.write(joinpath(result_dir, "$(tag)_dr_costs.csv"),
-        DataFrame(scenario=1:N_TEST, operational_cost=op_costs))
-    method_name = if int_strategy isa ContinuousRelaxationIntegerStrategy
-        "TS-DDR (ContRelax)"
-    elseif int_strategy isa FixedDiscreteIntegerStrategy
-        "TS-DDR (FixedDiscrete)"
+
+    println("Training time: $(round(elapsed_seconds, digits = 1))s")
+
+    return elapsed_seconds
+end
+
+"""
+    save_evaluation_outputs(variant, costs, inventory_paths, train_seconds, eval_seconds)
+
+Write rollout costs, inventory trajectories, and timing rows for one variant.
+
+The evaluated-scenario count is taken from `length(costs)`, so `n_eval` and the
+per-stage evaluation time always describe the rollout that produced `costs`.
+
+# Arguments
+- `variant::InventoryTrainingVariant`: variant configuration.
+- `costs::AbstractVector{<:Real}`: held-out operational costs, one per scenario.
+- `inventory_paths::AbstractMatrix{<:Real}`: inventory trajectory matrix.
+- `train_seconds::Real`: total training time.
+- `eval_seconds::Real`: total rollout evaluation time.
+
+# Examples
+```julia
+save_evaluation_outputs(variant, costs, inventory_paths, train_time, eval_time)
+```
+"""
+function save_evaluation_outputs(
+    variant::InventoryTrainingVariant,
+    costs::AbstractVector{<:Real},
+    inventory_paths::AbstractMatrix{<:Real},
+    train_seconds::Real,
+    eval_seconds::Real,
+)
+    # Size every derived quantity from the rollout that was actually evaluated.
+    n_scenarios = length(costs)
+    output_path(suffix) = joinpath(RESULT_DIR, "$(variant.tag)_dr_$(suffix).csv")
+
+    # Inventory paths for plotting, one column per period t=0,...,T.
+    time_columns = [Symbol("t$(period)") for period in 0:INVENTORY_T]
+    CSV.write(output_path("trajectories"), DataFrame(inventory_paths, time_columns))
+
+    # One operational-cost row per held-out scenario.
+    CSV.write(
+        output_path("costs"),
+        DataFrame(scenario = 1:n_scenarios, operational_cost = costs),
+    )
+
+    # Timing in the shared schema consumed by compare_results.jl; the evaluation
+    # time is reported per scenario-stage.
+    CSV.write(
+        output_path("timing"),
+        DataFrame(
+            method = [method_label(variant)],
+            fit_seconds = [train_seconds],
+            eval_seconds = [eval_seconds / (n_scenarios * INVENTORY_T)],
+            n_eval = [n_scenarios],
+        ),
+    )
+
+    return nothing
+end
+
+"""
+    train_and_evaluate(variant::InventoryTrainingVariant)
+
+Train one TS-DDR variant and evaluate it by stage-wise rollout.
+
+# Arguments
+- `variant::InventoryTrainingVariant`: variant configuration.
+
+# Examples
+```julia
+costs = train_and_evaluate(variant)
+```
+"""
+function train_and_evaluate(variant::InventoryTrainingVariant)
+    # Keep model and curve paths tied to the variant tag.
+    model_path = joinpath(MODEL_DIR, "$(variant.tag)_policy.jld2")
+    curve_path = joinpath(RESULT_DIR, "$(variant.tag)_training_curve.csv")
+
+    # Build the training deterministic equivalent.
+    det_eq, train_state_in, train_state_out, train_sampler, initial_state =
+        build_training_problem(variant)
+
+    # Build separate stage-wise models for deployment evaluation.
+    eval_subproblems, eval_state_in, eval_state_out, eval_sampler, _ =
+        build_evaluation_problem(variant)
+
+    # Start from the variant's chosen policy architecture.
+    policy = variant.policy_builder()
+
+    # Train the policy and save the best checkpoint. The eval subproblems
+    # are shared with the post-training evaluation — rollout_policy resets
+    # parameters each call, so reuse is safe.
+    train_seconds = train_variant!(
+        policy,
+        variant,
+        det_eq,
+        train_state_in,
+        train_state_out,
+        train_sampler,
+        initial_state,
+        model_path,
+        curve_path;
+        eval_subproblems = eval_subproblems,
+        eval_state_in = eval_state_in,
+        eval_state_out = eval_state_out,
+        eval_sampler = eval_sampler,
+    )
+
+    # Reload the best checkpoint before evaluation.
+    Flux.loadmodel!(policy, JLD2.load(model_path, "model_state"))
+
+    # Evaluate under deployment semantics and time only the rollout solve work.
+    # `@elapsed` opens no new scope inside a function, so `rollout` remains
+    # bound after the timed block and no placeholder values are needed.
+    eval_seconds = @elapsed begin
+        rollout = rollout_policy(
+            policy,
+            eval_subproblems,
+            eval_state_in,
+            eval_state_out,
+            eval_sampler,
+            initial_state;
+            integer = variant.integer,
+        )
+    end
+
+    # Setup and order paths are returned too but are not written by this script.
+    costs, inventory_paths, _setup_paths, _order_paths = rollout
+
+    # Write all result files after the elapsed time is known.
+    save_evaluation_outputs(
+        variant,
+        costs,
+        inventory_paths,
+        train_seconds,
+        eval_seconds,
+    )
+
+    # Normalise by the scenarios actually rolled out rather than a global
+    # constant, so the per-stage time matches `costs` whatever its length.
+    seconds_per_stage = eval_seconds / (length(costs) * INVENTORY_T)
+
+    # Print the headline cost distribution for this variant.
+    mean_cost = mean(costs)
+    std_cost = std(costs)
+    println(
+        "Result: $(round(mean_cost, digits = 1)) +- " *
+        "$(round(std_cost, digits = 1))  " *
+        "(eval/stage: $(round(seconds_per_stage, digits = 4))s)",
+    )
+
+    # Return the per-scenario operational costs.
+    return costs
+end
+
+"""
+    score_function_variant() -> InventoryTrainingVariant
+
+Build the mixed-gradient integer variant.
+
+The dual path uses `FixedDiscreteIntegerStrategy`. The score-function path uses
+separate integer rollout subproblems, so the Monte Carlo costs are true MIP
+rollout costs.
+
+# Examples
+```julia
+variant = score_function_variant()
+```
+"""
+function score_function_variant()
+    # Dedicated integer stage-wise models for the score-function rollouts, so
+    # those solves never mutate the deterministic-equivalent training model.
+    mip_stages, mip_state_in, mip_state_out, _unused_sampler, _ =
+        build_inventory_subproblems(;
+            num_scenarios = N_TRAIN_SCENARIOS,
+            penalty = INVENTORY_PENALTY,
+            seed = 77,
+            integer = true,
+        )
+
+    # Estimator settings in force once the schedule has fully ramped in.
+    final_estimator = ScoreFunctionConfig(
+        mip_stages,
+        mip_state_in,
+        mip_state_out;
+        dual_weight = 0.5,
+        perturbation_std = 1.0,
+        num_rollouts = 8,
+    )
+
+    # Dual-only warmup first; the Monte Carlo correction is then phased in.
+    phased_estimator = ScoreFunctionSchedule(
+        final_estimator;
+        sf_start = 200,
+        ramp_batches = 300,
+        perturbation_std_initial = 0.1,
+        num_rollouts_initial = 2,
+    )
+
+    # Same positional settings as the plain "integer" variant; only the last
+    # argument differs, carrying the score-function schedule.
+    return InventoryTrainingVariant(
+        "integer_sf",
+        true,
+        800, 10,
+        8.0e-4, 120,
+        FixedDiscreteIntegerStrategy(),
+        phased_estimator,
+    )
+end
+
+"""
+    three_phase_schedule(variant::InventoryTrainingVariant)
+
+Return a three-phase target-penalty multiplier schedule.
+
+The ramp starts gentle (0.2) so the optimizer sees smooth cost landscapes
+before the high penalty dominates:
+
+```math
+m_k =
+\\begin{cases}
+0.2, & 1 \\le k \\le K/6, \\\\
+0.6, & K/6 < k \\le K/2, \\\\
+1.0, & K/2 < k \\le K.
+\\end{cases}
+```
+
+# Arguments
+- `variant::InventoryTrainingVariant`: training configuration (uses
+  `num_batches` to compute phase boundaries).
+
+# Examples
+```julia
+schedule = three_phase_schedule(variant)
+```
+"""
+function three_phase_schedule(variant::InventoryTrainingVariant)
+    total = variant.num_batches
+
+    # Integer cut points: the last batch of phase 1 and the last batch of phase 2.
+    first_cut = div(total, 6)
+    second_cut = div(total, 2)
+
+    # Each entry is (first batch, last batch, penalty multiplier).
+    gentle = (1, first_cut, 0.2)
+    ramp = (first_cut + 1, second_cut, 0.6)
+    full = (second_cut + 1, total, 1.0)
+    return [gentle, ramp, full]
+end
+
+"""
+    lstm_score_function_variant() -> InventoryTrainingVariant
+
+Build the LSTM mixed-gradient variant with tuned score function.
+
+Compared to `score_function_variant()`, this variant:
+- uses `LSTMExAntePolicy` instead of `ExAnteInventoryPolicy`;
+- raises the target penalty to 250 (vs 75);
+- widens perturbation std to 15.0 (vs 1.0) so score-function rollouts are
+  large enough to flip the binary setup variable;
+- increases rollout count to 12 for lower REINFORCE variance.
+
+# Examples
+```julia
+variant = lstm_score_function_variant()
+```
+"""
+function lstm_score_function_variant()
+    # A penalty of 250 rather than 75 strengthens the dual signal to the optimizer.
+    target_penalty = 250.0
+
+    # Stage-wise MIP models reserved for score-function rollouts; they are
+    # solved with integrality enforced rather than relaxed.
+    mip_stages, mip_state_in, mip_state_out, _unused_sampler, _ =
+        build_inventory_subproblems(;
+            num_scenarios = N_TRAIN_SCENARIOS,
+            penalty = target_penalty,
+            seed = 77,
+            integer = true,
+        )
+
+    # Final estimator: dual weight 0.7, perturbation std 15, 12 rollouts. A std of
+    # 15 is about 10% of typical targets (~150), enough to flip z decisions.
+    final_estimator = ScoreFunctionConfig(
+        mip_stages,
+        mip_state_in,
+        mip_state_out;
+        dual_weight = 0.7,
+        perturbation_std = 15.0,
+        num_rollouts = 12,
+    )
+
+    # Dual-only warmup for the first 400 batches, then a linear ramp over the
+    # next 400 batches up to the final estimator settings.
+    phased_estimator = ScoreFunctionSchedule(
+        final_estimator;
+        sf_start = 400,
+        ramp_batches = 400,
+        perturbation_std_initial = 3.0,
+        num_rollouts_initial = 4,
+    )
+
+    return InventoryTrainingVariant(
+        "integer_lstm_sf",
+        true,
+        1200,
+        16,
+        5.0e-4,
+        200,
+        FixedDiscreteIntegerStrategy(),
+        phased_estimator,
+        target_penalty,
+        () -> build_lstm_exante_policy(; seed = 2024),
+        three_phase_schedule,
+    )
+end
+
+"""
+    inventory_training_variants() -> Vector{InventoryTrainingVariant}
+
+Return all TS-DDR variants used in the benchmark.
+
+# Examples
+```julia
+for variant in inventory_training_variants()
+    train_and_evaluate(variant)
+end
+```
+"""
+function inventory_training_variants()
+    return [
+        InventoryTrainingVariant(  # argument roles inferred from usage — confirm vs struct
+            "relaxed",  # tag (prefix of every output file name)
+            false,  # integer
+            400,  # num_batches
+            5,  # train_per_batch
+            1.5e-3,  # learning rate
+            80,  # presumably warmup batches — confirm
+            NoIntegerStrategy(),  # training integer strategy
+            nothing,  # score function (none: dual gradients only)
+        ),
+        InventoryTrainingVariant(
+            "integer",
+            true,
+            800,
+            10,
+            8.0e-4,
+            120,
+            FixedDiscreteIntegerStrategy(),
+            nothing,
+        ),
+        InventoryTrainingVariant(
+            "integer_cr",
+            true,
+            800,
+            10,
+            8.0e-4,
+            120,
+            ContinuousRelaxationIntegerStrategy(),
+            nothing,
+        ),
+        score_function_variant(),
+        # --- Tuned variants (relaxed) ---
+        # LSTM on the relaxed problem: isolates temporal encoding benefit
+        # without integer complexity.
+        InventoryTrainingVariant(
+            "relaxed_lstm",
+            false,
+            800,
+            10,
+            1.0e-3,
+            120,
+            NoIntegerStrategy(),
+            nothing,
+            INVENTORY_PENALTY,  # penalty
+            () -> build_lstm_exante_policy(; seed = 2024),  # policy builder
+            penalty_schedule_for,  # penalty schedule function
+        ),
+        # Higher penalty (250.0) with the feedforward policy on the relaxed problem.
+        InventoryTrainingVariant(
+            "relaxed_hp",
+            false,
+            800,
+            10,
+            1.0e-3,
+            120,
+            NoIntegerStrategy(),
+            nothing,
+            250.0,
+            () -> build_exante_policy(; seed = 2024),
+            penalty_schedule_for,
+        ),
+        # LSTM + high penalty on the relaxed problem.
+        InventoryTrainingVariant(
+            "relaxed_lstm_hp",
+            false,
+            800,
+            10,
+            1.0e-3,
+            120,
+            NoIntegerStrategy(),
+            nothing,
+            250.0,
+            () -> build_lstm_exante_policy(; seed = 2024),
+            penalty_schedule_for,
+        ),
+        # --- Tuned variants (integer) ---
+        # Feedforward policy with higher penalty and the three-phase schedule.
+        InventoryTrainingVariant(
+            "integer_hp",
+            true,
+            1200,
+            16,
+            5.0e-4,
+            200,
+            FixedDiscreteIntegerStrategy(),
+            nothing,
+            250.0,
+            () -> build_exante_policy(; seed = 2024),
+            three_phase_schedule,
+        ),
+        # Variant A: LSTM with high penalty, dual gradients only (no score function).
+        InventoryTrainingVariant(
+            "integer_lstm",
+            true,
+            1200,
+            16,
+            5.0e-4,
+            200,
+            FixedDiscreteIntegerStrategy(),
+            nothing,
+            250.0,
+            () -> build_lstm_exante_policy(; seed = 2024),
+            three_phase_schedule,
+        ),
+        # Variant B: LSTM with tuned score function
+        lstm_score_function_variant(),
+    ]
+end
+
+"""
+    run_variant(tag::AbstractString) -> Nothing
+
+Train and evaluate a single variant by tag name.
+
+This is the entry point used by SLURM jobs to run one variant at a time:
+
+```bash
+julia --project=. train_dr_inventory.jl integer_lstm
+```
+
+# Arguments
+- `tag::AbstractString`: one of the tags returned by
+  `inventory_training_variants()`.
+
+# Examples
+```julia
+run_variant("integer_lstm")
+```
+"""
+function run_variant(tag::AbstractString)
+    candidates = inventory_training_variants()
+    position = findfirst(candidate -> candidate.tag == tag, candidates)
+    if position === nothing
+        known_tags = join([candidate.tag for candidate in candidates], ", ")
+        error("Unknown variant tag \"$tag\". " * "Available: $(known_tags)")
+    end
+    train_and_evaluate(candidates[position])
+    return nothing
+end
+
+"""
+    main() -> Nothing
+
+Run the full inventory TS-DDR training benchmark.
+
+# Examples
+```julia
+main()
+```
+"""
+function main()
+    # Train every benchmark variant in order, with a blank line between logs.
+    foreach(inventory_training_variants()) do variant
+        train_and_evaluate(variant)
+        println()
+    end
+    output_location = relpath(RESULT_DIR, EXAMPLE_DIR)
+    println("All TS-DDR results saved to $(output_location)")
+    return nothing
+end
+
+# Direct invocation only (not on include): run all variants, or the one tagged ARGS[1].
+if abspath(PROGRAM_FILE) == @__FILE__
+    if isempty(ARGS)
+        main()
     else
-        "TS-DDR (trained)"
+        run_variant(ARGS[1])
     end
-    CSV.write(joinpath(result_dir, "$(tag)_dr_timing.csv"),
-        DataFrame(method=[method_name],
-                  fit_seconds=[train_seconds],
-                  eval_seconds=[eval_seconds / (N_TEST * INVENTORY_T)],
-                  n_eval=[N_TEST]))
-
-    μ = mean(op_costs)
-    σ = std(op_costs)
-    println("$(tag) TS-DDR — mean: $(round(μ, digits=1)) ± $(round(σ, digits=1))")
-    println("  Eval/stage: $(round(eval_seconds/(N_TEST*INVENTORY_T), digits=4))s")
-    return op_costs
-end
-
-# ═══════════════════════════════════════════════════════════════════════════════
-# Run both
-# ═══════════════════════════════════════════════════════════════════════════════
-train_and_evaluate(tag="relaxed", integer=false,
-    num_batches=400, train_per_batch=5, lr=0.0015, warmup_batches=80)
-println()
-train_and_evaluate(tag="integer", integer=true,
-    num_batches=800, train_per_batch=10, lr=0.0008, warmup_batches=120)
-println()
-train_and_evaluate(tag="integer_cr", integer=true,
-    num_batches=800, train_per_batch=10, lr=0.0008, warmup_batches=120,
-    int_strategy_override=ContinuousRelaxationIntegerStrategy())
-println("\nAll TS-DDR results saved to $(relpath(result_dir, example_dir))")
+end
diff --git a/src/DecisionRules.jl b/src/DecisionRules.jl
index a4e11ad..5f90a6a 100644
--- a/src/DecisionRules.jl
+++ b/src/DecisionRules.jl
@@ -9,6 +9,7 @@ using ChainRulesCore
 import ChainRulesCore.rrule
 using DiffOpt
 using Logging
+using Statistics: mean
 
 export simulate_multistage,
     sample,
@@ -35,13 +36,114 @@ export simulate_multistage,
     StateConditionedPolicy,
     state_conditioned_policy,
     materialize_tangent,
+    # Score-function gradient mixing
+    ScoreFunctionConfig,
+    ScoreFunctionSchedule,
+    sf_params,
     # Multiple shooting exports
     train_multiple_shooting,
     setup_shooting_windows,
     solve_window,
     predict_window_targets,
     simulate_multiple_shooting,
-    WindowData
+    WindowData,
+    # Gradient fallback
+    AbstractGradientFallback,
+    ZeroGradientFallback,
+    ErrorGradientFallback
+
+@doc raw"""
+    AbstractGradientFallback
+
+Abstract type governing what happens when a solver or differentiation error
+occurs during training.
+
+DecisionRules ships two concrete subtypes:
+
+| Type | Behavior |
+|------|----------|
+| [`ZeroGradientFallback`](@ref) | Log a warning, return zero gradients, continue training |
+| [`ErrorGradientFallback`](@ref) | Re-throw the error (useful in tests) |
+
+## Extending
+
+Implement your own subtype to customize recovery:
+
+```julia
+struct MyFallback <: DecisionRules.AbstractGradientFallback end
+
+function DecisionRules.handle_gradient_error(::MyFallback, e, n_state_in, n_state_out)
+    # e is the caught exception
+    # Return a tuple of cotangents (same shape as the rrule pullback) or rethrow
+    @error "Custom handler" exception=e
+    return DecisionRules._zero_cotangents(n_state_in, n_state_out)
+end
+
+function DecisionRules.handle_training_error(::MyFallback, e, iter)
+    # Return true to skip this iteration, false to rethrow
+    @error "Custom training handler" exception=e
+    return true  # skip
+end
+
+function DecisionRules.handle_rollout_error(::MyFallback, e, iter)
+    # Return true to skip this scenario, false to rethrow
+    return true
+end
+```
+
+Then pass `gradient_fallback=MyFallback()` to [`train_multistage`](@ref) or
+[`train_multiple_shooting`](@ref).
+"""
+abstract type AbstractGradientFallback end
+
+"""
+    ZeroGradientFallback()
+
+Default fallback: log a warning and return zero gradients when the solver or
+DiffOpt differentiation fails. Training continues with a skipped update for
+that iteration; a failed rollout scenario is logged and skipped the same way.
+"""
+struct ZeroGradientFallback <: AbstractGradientFallback end
+
+"""
+    ErrorGradientFallback()
+
+Strict fallback: re-throw any solver, differentiation, or rollout error. Use this
+in tests to ensure that controlled problems never silently produce zero gradients.
+"""
+struct ErrorGradientFallback <: AbstractGradientFallback end
+
+function _zero_cotangents(n_in, n_out)
+    skip = NoTangent()  # placeholder for every slot that carries no gradient
+    return (skip, skip, skip, skip, zeros(n_in), zeros(n_out), skip)
+end
+
+# Lenient mode: log the failed pullback and hand back zero cotangents instead.
+function handle_gradient_error(::ZeroGradientFallback, e, n_state_in, n_state_out)
+    @warn "get_next_state pullback failed — returning zero gradients" exception=(e, catch_backtrace())
+    return _zero_cotangents(n_state_in, n_state_out)
+end
+
+# Strict mode: propagate the failure to the caller.
+handle_gradient_error(::ErrorGradientFallback, e, n_state_in, n_state_out) = rethrow(e)
+
+# Lenient mode: log the failure and return `true`, the documented "skip" signal.
+function handle_training_error(::ZeroGradientFallback, e, iter)
+    @warn "Gradient computation failed at iter $iter — skipping update" exception=(e, catch_backtrace())
+    return true
+end
+
+# Strict mode: propagate the failure to the caller.
+handle_training_error(::ErrorGradientFallback, e, iter) = rethrow(e)
+
+# Lenient mode: log the failed scenario and return `true` so it is skipped.
+function handle_rollout_error(::ZeroGradientFallback, e, iter)
+    @warn "Rollout scenario failed at iter $iter — skipping" exception=(e, catch_backtrace())
+    return true
+end
+
+# Strict mode: propagate the failure to the caller.
+handle_rollout_error(::ErrorGradientFallback, e, iter) = rethrow(e)
 
 """
     STRICT_GRADIENTS
@@ -57,13 +159,22 @@ to verify that controlled test cases never silently fall through to zero
 gradients:
 
     DecisionRules.STRICT_GRADIENTS[] = true
+
+!!! note
+    This flag controls the **rrule-level** fallback for bad solver status.
+    For the **training-loop-level** fallback (DiffOpt assertion errors, etc.),
+    use the `gradient_fallback` keyword in [`train_multistage`](@ref) and
+    [`train_multiple_shooting`](@ref).
 """
 const STRICT_GRADIENTS = Ref(false)
 
+const _DEFAULT_GRADIENT_FALLBACK = ZeroGradientFallback()
+
 const _SUCCESSFUL_TERM_STATUSES = (MOI.OPTIMAL, MOI.ALMOST_OPTIMAL, MOI.LOCALLY_SOLVED)
 
 include("integer_strategies.jl")
 include("parameter_duals.jl")
+include("score_function.jl")
 include("simulate_multistage.jl")
 include("dense_multilayer_nn.jl")
 include("utils.jl")
diff --git a/src/integer_strategies.jl b/src/integer_strategies.jl
index 33f001e..046576a 100644
--- a/src/integer_strategies.jl
+++ b/src/integer_strategies.jl
@@ -1,46 +1,127 @@
 """
     AbstractIntegerStrategy
 
-Extension point for preparing models with discrete variables before reading duals
-or solver sensitivities.
+Abstract supertype for strategies that prepare a JuMP model before reading
+duals or solver sensitivities.
+
+# Arguments
+This abstract type has no fields. Concrete subtypes are passed as the
+`integer_strategy::AbstractIntegerStrategy` keyword to simulation and training
+functions.
+
+# Examples
+```julia
+simulate_multistage(
+    subproblems,
+    state_params_in,
+    state_params_out,
+    initial_state,
+    uncertainties,
+    policy;
+    integer_strategy = FixedDiscreteIntegerStrategy(),
+)
+```
 """
 abstract type AbstractIntegerStrategy end
 
 """
     NoIntegerStrategy()
 
-Default integer strategy. Solves the model exactly as-is and preserves the
-historical continuous-model behavior.
+Solve the model exactly as written before reading duals or sensitivities.
+
+Use this for continuous LP, conic, or nonlinear models whose derivative
+information is available directly from the solved model.
+
+# Arguments
+This type has no fields.
+
+# Examples
+```julia
+strategy = NoIntegerStrategy()
+```
 """
 struct NoIntegerStrategy <: AbstractIntegerStrategy end
 
 """
     FixedDiscreteIntegerStrategy()
 
-Solve the original model, fix binary/integer variables to their incumbent values,
-relax integrality, re-solve the fixed continuous model, and read duals or
-sensitivities in that fixed-incumbent continuous state.
+Solve a mixed-integer model, fix discrete variables to their incumbent values,
+relax integrality, re-solve, and read duals or sensitivities from the fixed
+continuous model.
 
-The returned derivative-like information is local to the incumbent integer
-assignment and should be interpreted as a postprocessing surrogate, not as a full
+If ``z^*`` is the incumbent binary/integer solution, this strategy reads
+derivative-like information from the continuous problem
+
+```math
+\\min_x f(x, z^*) \\quad \\text{subject to} \\quad g(x, z^*) \\le 0.
+```
+
+The result is local to the incumbent integer assignment. It is not a
 differentiable MIP method.
+
+# Arguments
+This type has no fields.
+
+# Examples
+```julia
+strategy = FixedDiscreteIntegerStrategy()
+```
 """
 struct FixedDiscreteIntegerStrategy <: AbstractIntegerStrategy end
 
 """
     discrete_variables(model::JuMP.Model)
 
-Return all binary or integer variables in `model`.
+Return the binary or integer variables in `model`.
+
+# Arguments
+- `model::JuMP.Model`: model to inspect.
+
+# Examples
+```julia
+vars = DecisionRules.discrete_variables(model)
+```
 """
 function discrete_variables(model::JuMP.Model)
+    # JuMP tracks binary and integer status on variables, not in one shared list.
     return filter(JuMP.all_variables(model)) do variable
         JuMP.is_binary(variable) || JuMP.is_integer(variable)
     end
 end
 
+"""
+    has_discrete_variables(model::JuMP.Model) -> Bool
+
+Return whether `model` contains at least one binary or integer variable.
+
+# Arguments
+- `model::JuMP.Model`: model to inspect.
+
+# Examples
+```julia
+if DecisionRules.has_discrete_variables(model)
+    @info "MIP model"
+end
+```
+"""
 has_discrete_variables(model::JuMP.Model) = !isempty(discrete_variables(model))
 
+"""
+    _assert_successful_solve(model::JuMP.Model; context::AbstractString = "solve")
+
+Throw an error unless `model` terminated with an accepted success status.
+
+# Arguments
+- `model::JuMP.Model`: model whose termination status is checked.
+- `context::AbstractString`: human-readable phrase included in the error.
+
+# Examples
+```julia
+DecisionRules._assert_successful_solve(model; context = "fixed LP solve")
+```
+"""
 function _assert_successful_solve(model::JuMP.Model; context::AbstractString="solve")
+    # Keep the accepted statuses centralized in DecisionRules.jl.
     status = JuMP.termination_status(model)
     status in _SUCCESSFUL_TERM_STATUSES && return status
     throw(
@@ -54,13 +135,26 @@ end
 """
     with_sensitivity_solution(f, model, integer_strategy)
 
-Run `f(model)` while `model` is in a state where duals or DiffOpt sensitivities
-can be read. Integer strategies that temporarily mutate the model must restore it
-before returning, including when `f` throws.
+Run `f(model)` while `model` is in a state suitable for reading duals or
+DiffOpt sensitivities.
+
+# Arguments
+- `f::Function`: callback that reads values, duals, or sensitivities.
+- `model::JuMP.Model`: model to solve and inspect.
+- `integer_strategy::AbstractIntegerStrategy`: strategy used to prepare models
+  with binary or integer variables.
+
+# Examples
+```julia
+objective = with_sensitivity_solution(model, FixedDiscreteIntegerStrategy()) do m
+    JuMP.objective_value(m)
+end
+```
 """
 function with_sensitivity_solution(
     f::Function, model::JuMP.Model, ::NoIntegerStrategy
 )
+    # Continuous models can be solved directly.
     optimize!(model)
     return f(model)
 end
@@ -68,21 +162,46 @@ end
 function with_sensitivity_solution(
     f::Function, model::JuMP.Model, ::FixedDiscreteIntegerStrategy
 )
+    # First solve the original MIP to obtain an incumbent integer assignment.
     optimize!(model)
     _assert_successful_solve(model; context="original integer solve")
 
+    # Models without discrete variables fall back to the direct solved state.
     has_discrete_variables(model) || return f(model)
 
+    # JuMP returns an undo callback that restores integrality and bounds.
     undo = JuMP.fix_discrete_variables(model)
     try
+        # Re-solve the fixed continuous problem before reading duals.
         optimize!(model)
         _assert_successful_solve(model; context="fixed-discrete sensitivity solve")
         return f(model)
     finally
+        # Always restore the original model, even when the callback fails.
         undo()
     end
 end
 
+"""
+    _with_current_or_sensitivity_solution(f, model, integer_strategy)
+
+Run `f(model)` directly for continuous models and through
+[`with_sensitivity_solution`](@ref) for integer strategies.
+
+# Arguments
+- `f::Function`: callback that reads values, duals, or sensitivities.
+- `model::JuMP.Model`: model to inspect.
+- `integer_strategy::AbstractIntegerStrategy`: current integer strategy.
+
+# Examples
+```julia
+value = DecisionRules._with_current_or_sensitivity_solution(
+    m -> JuMP.objective_value(m),
+    model,
+    strategy,
+)
+```
+"""
 _with_current_or_sensitivity_solution(
     f::Function, model::JuMP.Model, ::NoIntegerStrategy
 ) = f(model)
@@ -99,6 +218,10 @@ end
 Relax all binary/integer constraints to continuous bounds (binary → [0,1]),
 solve the resulting LP, and read duals in that relaxed state.
 
+Mathematically, this replaces ``z \\in \\{0,1\\}`` or integer restrictions with
+continuous bounds before solving. The derivative signal belongs to the relaxed
+problem, not to the original MIP.
+
 Compared to [`FixedDiscreteIntegerStrategy`](@ref):
 - **Faster**: one LP solve instead of MIP + LP.
 - **Smoother gradients**: no integer fixing means no zero-gradient dead zones.
@@ -109,31 +232,59 @@ A practical pattern is to train with `ContinuousRelaxationIntegerStrategy`
 during warmup (smooth landscape for initial learning) and switch to
 `FixedDiscreteIntegerStrategy` later (integer-accurate gradients for
 fine-tuning).
+
+# Arguments
+This type has no fields.
+
+# Examples
+```julia
+strategy = ContinuousRelaxationIntegerStrategy()
+```
 """
 struct ContinuousRelaxationIntegerStrategy <: AbstractIntegerStrategy end
 
 function with_sensitivity_solution(
     f::Function, model::JuMP.Model, ::ContinuousRelaxationIntegerStrategy
 )
+    # Continuous models need no relaxation step.
     has_discrete_variables(model) || begin
         optimize!(model)
         return f(model)
     end
+
+    # JuMP returns an undo callback that restores integrality after the solve.
     undo = JuMP.relax_integrality(model)
     try
+        # Solve the continuous relaxation before reading duals.
         optimize!(model)
         _assert_successful_solve(model; context="continuous relaxation sensitivity solve")
         return f(model)
     finally
+        # Restore integer declarations before returning control to the caller.
         undo()
     end
 end
 
+"""
+    _sensitivity_forward_status(model::JuMP.Model, strategy) -> MOI.TerminationStatusCode
+
+Return the termination status that an rrule should use for gradient fallback.
+
+# Arguments
+- `model::JuMP.Model`: model inspected after the forward pass.
+- `strategy::AbstractIntegerStrategy`: integer strategy used for the solve.
+
+# Examples
+```julia
+status = DecisionRules._sensitivity_forward_status(model, strategy)
+```
+"""
 _sensitivity_forward_status(model::JuMP.Model, ::NoIntegerStrategy) =
     JuMP.termination_status(model)
 
 function _sensitivity_forward_status(
     ::JuMP.Model, ::AbstractIntegerStrategy
 )
+    # Integer strategies do their own solve checks inside the sensitivity pass.
     return MOI.OPTIMAL
 end
diff --git a/src/multiple_shooting.jl b/src/multiple_shooting.jl
index afe93de..322d174 100644
--- a/src/multiple_shooting.jl
+++ b/src/multiple_shooting.jl
@@ -60,13 +60,16 @@ end
 =============================================================================#
 
 """
-    extract_uncertainty_params(window_uncertainties_new)
+    extract_uncertainty_params(window_uncertainties) -> Vector{Vector{VariableRef}}
 
-Normalize uncertainty data to a per-stage vector of parameter VariableRefs.
+Extract the JuMP parameter `VariableRef`s from each stage of an uncertainty pool.
 
-Accepts either:
-- Vector{Vector{Tuple{VariableRef, Any}}} (common in this package), or
-- Vector{Vector{VariableRef}}.
+Handles three possible input shapes (automatically detected):
+- Already-extracted `Vector{Vector{VariableRef}}` — returned as-is.
+- Per-unit pool `Vector{Vector{Tuple{VariableRef, Vector}}}` — extracts the first
+  element of each tuple.
+- Joint-scenario pool `Vector{Vector{Vector{Tuple{VariableRef, T}}}}` — extracts
+  params from the first scenario of each stage (all scenarios share the same params).
 """
 function extract_uncertainty_params(window_uncertainties)
     if isempty(window_uncertainties)
@@ -76,10 +79,15 @@ function extract_uncertainty_params(window_uncertainties)
     if isempty(first_stage)
         return [VariableRef[] for _ in 1:length(window_uncertainties)]
     end
-    if first(first_stage) isa VariableRef
+    elem = first(first_stage)
+    if elem isa VariableRef
         return window_uncertainties
+    elseif elem isa AbstractVector
+        # Joint-scenario format: each stage is [scenario₁, scenario₂, ...],
+        # each scenario is [(param, val), ...]. Extract params from the first scenario.
+        return [[pair[1] for pair in first(stage)] for stage in window_uncertainties]
     else
-        # assume tuples (param, something)
+        # Per-unit format: (param, support_vector)
         return [[u[1] for u in stage_u] for stage_u in window_uncertainties]
     end
 end
@@ -115,10 +123,20 @@ function _create_like_variable(
 end
 
 """
-    windows_equivalent!(model, subproblems, state_params_in, state_params_out, initial_state, uncertainties)
+    windows_equivalent!(model, subproblems, state_params_in, state_params_out,
+                         initial_state, uncertainties)
 
-Create a window equivalent without mutating the original subproblems and without
-adding extra variables/constraints beyond those already present in the subproblems.
+Build a coupled JuMP model for a contiguous window of stages by copying all variables,
+constraints, and objectives from `subproblems` into `model`. Stage coupling is enforced
+by identifying each stage's realized state variable with the next stage's incoming state
+parameter (same approach as [`deterministic_equivalent!`](@ref), but scoped to a
+window).
+
+`uncertainties` accepts both per-unit and joint-scenario pool formats
+(see [`sample`](@ref)). The returned `uncertainties_new` preserves the input format
+with variable refs remapped to the window model.
+
+Returns `(model, state_params_in_new, state_params_out_new, uncertainties_new)`.
 """
 function windows_equivalent!(
     model::JuMP.Model,
@@ -134,7 +152,10 @@ function windows_equivalent!(
     var_src_to_dest = Dict{VariableRef,VariableRef}()
     state_in_new = Vector{Vector{Any}}(undef, num_stages)
     state_out_new = Vector{Vector{Tuple{Any,VariableRef}}}(undef, num_stages)
-    uncertainties_new = Vector{Vector{Tuple{Any,Vector{Float64}}}}(undef, num_stages)
+    # Detect format: joint-scenario (Vector{Vector{Tuple}}) vs per-unit (Vector{Tuple{...,Vector}})
+    _is_joint = !isempty(uncertainties) && !isempty(uncertainties[1]) &&
+        first(uncertainties[1]) isa AbstractVector
+    uncertainties_new = Any[nothing for _ in 1:num_stages]
 
     for t in 1:num_stages
         subproblem = subproblems[t]
@@ -200,21 +221,42 @@ function windows_equivalent!(
             end
         end
 
-        # uncertainties
-        uncertainties_new[t] = Vector{Tuple{Any,Vector{Float64}}}(
-            undef, length(uncertainties[t])
-        )
-        for (i, tup) in enumerate(uncertainties[t])
-            u_src, u_vals = tup
-            if u_src isa VariableRef
-                dest = get(var_src_to_dest, u_src, nothing)
-                if dest === nothing
-                    dest = _create_like_variable(model, u_src, t; force_parameter=true)
-                    var_src_to_dest[u_src] = dest
+        # uncertainties — remap VariableRefs through var_src_to_dest
+        if _is_joint
+            # Joint-scenario format: each element is a scenario (Vector of Tuples)
+            uncertainties_new[t] = [
+                [begin
+                    u_src, u_val = pair
+                    if u_src isa VariableRef
+                        dest = get(var_src_to_dest, u_src, nothing)
+                        if dest === nothing
+                            dest = _create_like_variable(model, u_src, t; force_parameter=true)
+                            var_src_to_dest[u_src] = dest
+                        end
+                        (dest, Float64(u_val))
+                    else
+                        (u_src, Float64(u_val))
+                    end
+                end for pair in scenario]
+                for scenario in uncertainties[t]
+            ]
+        else
+            # Per-unit format: each element is (param, support_vector)
+            uncertainties_new[t] = Vector{Tuple{Any,Vector{Float64}}}(
+                undef, length(uncertainties[t])
+            )
+            for (i, tup) in enumerate(uncertainties[t])
+                u_src, u_vals = tup
+                if u_src isa VariableRef
+                    dest = get(var_src_to_dest, u_src, nothing)
+                    if dest === nothing
+                        dest = _create_like_variable(model, u_src, t; force_parameter=true)
+                        var_src_to_dest[u_src] = dest
+                    end
+                    uncertainties_new[t][i] = (dest, _as_float64_vec(u_vals))
+                else
+                    uncertainties_new[t][i] = (u_src, _as_float64_vec(u_vals))
                 end
-                uncertainties_new[t][i] = (dest, _as_float64_vec(u_vals))
-            else
-                uncertainties_new[t][i] = (u_src, _as_float64_vec(u_vals))
             end
         end
 
@@ -266,10 +308,11 @@ end
 """
     set_window_uncertainties!(window, uncertainty_sample)
 
-Set sampled uncertainty values into the window model parameters.
+Set sampled (realized) uncertainty values into the window model's JuMP parameters.
 
-- `window.uncertainty_params[t][i]` is the parameter VariableRef in the window model
-- `uncertainty_sample[global_t][i][2]` is the sampled numeric value (original structure)
+`uncertainty_sample` is a **realized** trajectory (output of [`sample`](@ref)), so
+each stage is `Vector{Tuple{VariableRef, Float64}}` regardless of whether the
+original pool used independent or joint-scenario format.
 """
 function set_window_uncertainties!(
     window,
@@ -880,10 +923,19 @@ end
 """
     train_multiple_shooting(model, initial_state, windows, uncertainty_sampler; ...)
 
-This mirrors your other training loops:
-- Reuse pre-built window models.
-- For each SGD step, sample uncertainties, build uncertainties_vec for the policy,
-  evaluate simulate_multiple_shooting, and update parameters.
+Train a target-state policy with multiple-shooting decomposition (windowed).
+
+`uncertainty_sampler` controls how trajectories are drawn at each SGD step.
+Three formats are accepted (same API as [`train_multistage`](@ref)):
+1. **Per-unit pool** (`Vector{Vector{Tuple{VariableRef, Vector{T}}}}`):
+   independent sampling per parameter per stage.
+2. **Joint-scenario pool** (`Vector{Vector{Vector{Tuple{VariableRef, T}}}}`):
+   one scenario drawn per stage, preserving spatial correlation.
+3. **Callable** (`() -> Vector{Vector{Tuple{VariableRef, T}}}`): a zero-arg
+   function returning a realized trajectory. Use this for temporal correlation;
+   see [`sample`](@ref).
+
+See the [Uncertainty Sampling](@ref) documentation for details.
 """
 function train_multiple_shooting(
     model,
@@ -901,6 +953,7 @@ function train_multiple_shooting(
     get_objective_no_target_deficit=get_objective_no_target_deficit,
     penalty_schedule=nothing,
     integer_strategy::AbstractIntegerStrategy=NoIntegerStrategy(),
+    gradient_fallback::AbstractGradientFallback=ZeroGradientFallback(),
 )
     opt_state = Flux.setup(optimizer, model)
 
@@ -913,18 +966,7 @@ function train_multiple_shooting(
     end
     current_multiplier = NaN
 
-    # We only need the uncertainty *structure* here.
-    base_uncertainty = uncertainty_sampler()
-    # If uncertainty values are vectors (sample sets), draw realized values per iteration.
-    has_sample_sets =
-        !isempty(base_uncertainty) &&
-        !isempty(base_uncertainty[1]) &&
-        (base_uncertainty[1][1][2] isa AbstractVector)
-    draw_uncertainty = if has_sample_sets
-        (() -> DecisionRules.sample(base_uncertainty))
-    else
-        uncertainty_sampler
-    end
+    draw_uncertainty = () -> DecisionRules.sample(uncertainty_sampler)
 
     initial_state_f32 = Float32.(initial_state)
 
@@ -940,7 +982,8 @@ function train_multiple_shooting(
 
         objective = 0.0
 
-        grads = Flux.gradient(model) do m
+        grads = try
+        Flux.gradient(model) do m
             objective = 0.0
             for _ in 1:num_train_per_batch
                 @ignore_derivatives Flux.reset!(m)
@@ -962,6 +1005,11 @@ function train_multiple_shooting(
             objective /= num_train_per_batch
             return objective
         end
+        catch e
+            # `true` from the handler means skip this update; `false` must propagate.
+            handle_training_error(gradient_fallback, e, iter) || rethrow()
+            nothing
+        end
 
         eval_loss = @ignore_derivatives begin
             total = 0.0
@@ -1015,6 +1063,10 @@ function train_multiple_shooting(
         record_loss(iter, model, eval_loss, "metrics/loss") && break
         record_loss(iter, model, objective, "metrics/training_loss") && break
 
+        if isnothing(grads)
+            continue
+        end
+
         grad = materialize_tangent(grads[1])
         Flux.update!(opt_state, model, grad)
     end
diff --git a/src/score_function.jl b/src/score_function.jl
new file mode 100644
index 0000000..120d392
--- /dev/null
+++ b/src/score_function.jl
@@ -0,0 +1,638 @@
+"""
+    ScoreFunctionConfig(
+        subproblems::AbstractVector{<:JuMP.Model},
+        state_params_in::AbstractVector,
+        state_params_out::AbstractVector;
+        dual_weight::Real = 0.5,
+        perturbation_std::Real = 1.0,
+        num_rollouts::Integer = 8,
+        baseline::Symbol = :mean,
+    )
+
+Configure the score-function correction used by [`train_multistage`](@ref).
+
+The deterministic-equivalent training path differentiates the target policy
+through dual information. For mixed-integer subproblems, those duals are local
+to a fixed integer assignment. This configuration adds a REINFORCE-style
+correction estimated from stage-wise rollouts with perturbed targets.
+
+The rollout models are solved exactly as they are built. If `subproblems`
+contain binary variables, the score-function rollouts solve MIPs. If they
+contain relaxed variables, the score-function rollouts solve the relaxation.
+This is intentionally separate from the `integer_strategy` keyword of
+[`train_multistage`](@ref), which controls only how the differentiable
+dual-gradient path reads local sensitivity information from the deterministic
+equivalent.
+
+If ``\\hat{x}_{t+1}(\\theta)`` is the target emitted by the policy and
+``\\delta_t \\sim \\mathcal{N}(0, \\sigma^2 I)``, the perturbed rollout solves
+with target ``\\hat{x}_{t+1}(\\theta) + \\delta_t``. The score-function
+surrogate loss is
+
+```math
+L_{sf}(\\theta)
+    = \\frac{1}{M} \\sum_{m=1}^{M}
+      (R_m - b)
+      \\sum_{t=1}^{T}
+      \\left\\langle \\frac{\\delta_{m,t}}{\\sigma^2},
+      \\hat{x}_{t+1}(\\theta) \\right\\rangle ,
+```
+
+and the mixed gradient is
+
+```math
+\\nabla L
+    = \\alpha \\nabla L_{dual}
+      + (1 - \\alpha) \\nabla L_{sf}.
+```
+
+# Arguments
+- `subproblems::AbstractVector{<:JuMP.Model}`: stage-wise rollout models used to
+  estimate realized costs under perturbed targets.
+- `state_params_in::AbstractVector`: stage input-state parameters.
+- `state_params_out::AbstractVector`: pairs `(target_parameter, state_variable)`
+  for every stage output state.
+
+# Keywords
+- `dual_weight::Real`: mixing weight ``\\alpha`` on the dual-gradient term.
+- `perturbation_std::Real`: Gaussian standard deviation ``\\sigma``.
+- `num_rollouts::Integer`: number of perturbed rollouts ``M`` per sample.
+- `baseline::Symbol`: either `:mean` for mean-centering costs or `:none`.
+
+# Examples
+```julia
+score_function = ScoreFunctionConfig(
+    rollout_subproblems,
+    state_params_in,
+    state_params_out;
+    dual_weight = 0.5,
+    perturbation_std = 1.0,
+    num_rollouts = 8,
+)
+
+train_multistage(
+    policy,
+    initial_state,
+    det_equivalent,
+    state_params_in,
+    state_params_out,
+    uncertainty_sampler;
+    score_function,
+)
+```
+"""
+struct ScoreFunctionConfig
+    subproblems::Vector{JuMP.Model}   # stage-wise rollout models
+    state_params_in::AbstractVector   # one entry per subproblem: input-state parameters
+    state_params_out::AbstractVector  # one entry per subproblem: (target_parameter, state_variable) pairs
+    dual_weight::Float64              # mixing weight α on the dual-gradient term
+    perturbation_std::Float64         # Gaussian standard deviation σ of target perturbations
+    num_rollouts::Int                 # perturbed rollouts M per sample
+    baseline::Symbol                  # :mean or :none (checked by the keyword constructor)
+end
+
+function ScoreFunctionConfig(
+    subproblems::AbstractVector{<:JuMP.Model},
+    state_params_in::AbstractVector,
+    state_params_out::AbstractVector;
+    dual_weight::Real = 0.5,
+    perturbation_std::Real = 1.0,
+    num_rollouts::Integer = 8,
+    baseline::Symbol = :mean,
+)
+    # Shape checks come first: both parameter collections are indexed by stage.
+    num_stages = length(subproblems)
+    if length(state_params_in) != num_stages
+        throw(ArgumentError("state_params_in must have one entry per subproblem."))
+    end
+    if length(state_params_out) != num_stages
+        throw(ArgumentError("state_params_out must have one entry per subproblem."))
+    end
+
+    # Normalize the scalar settings to the concrete field types before checking them.
+    alpha = Float64(dual_weight)
+    sigma = Float64(perturbation_std)
+    rollout_count = Int(num_rollouts)
+
+    # Range checks, reported with the public keyword names.
+    if !(0.0 <= alpha <= 1.0)
+        throw(ArgumentError("dual_weight must lie in [0, 1]."))
+    elseif !(sigma > 0.0)
+        throw(ArgumentError("perturbation_std must be positive."))
+    elseif !(rollout_count >= 1)
+        throw(ArgumentError("num_rollouts must be at least 1."))
+    elseif !(baseline in (:mean, :none))
+        throw(ArgumentError("baseline must be either :mean or :none."))
+    end
+
+    # `collect` materializes the models into the plain `Vector` the struct stores.
+    models = collect(subproblems)
+    return ScoreFunctionConfig(
+        models, state_params_in, state_params_out,
+        alpha, sigma, rollout_count, baseline,
+    )
+end
+
+"""
+    ScoreFunctionSchedule(config::ScoreFunctionConfig; <keyword arguments>)
+
+Ramp a [`ScoreFunctionConfig`](@ref) into training after a pure-dual warmup.
+
+Before iteration `sf_start` the score-function term is inactive. From `sf_start` on,
+the score-function weight, perturbation scale, and rollout count are interpolated
+linearly until the final values stored in `config` are reached.
+
+Let ``k`` be the current iteration, ``k_0`` = `sf_start`, ``r`` = `ramp_batches`, and
+``\\rho_k = \\operatorname{clip}((k - k_0) / r, 0, 1)``. With
+``\\alpha`` = `config.dual_weight`, the effective score-function weight is
+``\\rho_k (1 - \\alpha)``. The effective dual weight is one minus that value.
+
+# Arguments
+- `config::ScoreFunctionConfig`: final score-function configuration.
+
+# Keywords
+- `sf_start::Integer = 200`: first iteration at which score-function rollouts are
+  considered; must be at least 1.
+- `ramp_batches::Integer = 200`: number of iterations in the linear ramp; at least 1.
+- `perturbation_std_initial::Real = 0.1`: initial ``\\sigma`` at ramp start; positive.
+- `num_rollouts_initial::Integer = 2`: initial rollout count at ramp start; at least 1.
+
+# Examples
+```julia
+schedule = ScoreFunctionSchedule(
+    score_function;
+    sf_start = 200,
+    ramp_batches = 300,
+    perturbation_std_initial = 0.1,
+    num_rollouts_initial = 2,
+)
+```
+"""
+struct ScoreFunctionSchedule
+    config::ScoreFunctionConfig        # final (post-ramp) configuration
+    sf_start::Int                      # first iteration with the score-function term active
+    ramp_batches::Int                  # length of the linear ramp, in iterations
+    final_dual_weight::Float64         # keyword constructor copies `config.dual_weight`
+    initial_perturbation_std::Float64  # σ at ramp start
+    final_perturbation_std::Float64    # keyword constructor copies `config.perturbation_std`
+    initial_num_rollouts::Int          # rollout count at ramp start
+    final_num_rollouts::Int            # keyword constructor copies `config.num_rollouts`
+end
+
+function ScoreFunctionSchedule(
+    config::ScoreFunctionConfig;
+    sf_start::Integer = 200,
+    ramp_batches::Integer = 200,
+    perturbation_std_initial::Real = 0.1,
+    num_rollouts_initial::Integer = 2,
+)
+    # Normalize every keyword to the concrete field type up front.
+    start_iter = Int(sf_start)
+    ramp_length = Int(ramp_batches)
+    sigma_start = Float64(perturbation_std_initial)
+    rollouts_start = Int(num_rollouts_initial)
+
+    # Validate in keyword order; the first violated rule is the one reported.
+    if !(start_iter >= 1)
+        throw(ArgumentError("sf_start must be at least 1."))
+    elseif !(ramp_length >= 1)
+        throw(ArgumentError("ramp_batches must be at least 1."))
+    elseif !(sigma_start > 0.0)
+        throw(ArgumentError("perturbation_std_initial must be positive."))
+    elseif !(rollouts_start >= 1)
+        throw(ArgumentError("num_rollouts_initial must be at least 1."))
+    end
+
+    # The ramp end points are read from `config`; only the start points are
+    # supplied through this constructor's keywords.
+    return ScoreFunctionSchedule(
+        config,
+        start_iter, ramp_length,
+        config.dual_weight,
+        sigma_start, config.perturbation_std,
+        rollouts_start, config.num_rollouts,
+    )
+end
+
+const _ScoreFunctionParameters = @NamedTuple{
+    alpha::Float64,             # weight on the dual-gradient term
+    score_weight::Float64,      # weight on the score-function term
+    perturbation_std::Float64,  # Gaussian standard deviation of target perturbations
+    num_rollouts::Int,          # number of perturbed rollouts
+    active::Bool,               # whether rollout estimation should run
+}
+
+"""
+    sf_params(config::ScoreFunctionConfig, iteration::Integer)
+    sf_params(schedule::ScoreFunctionSchedule, iteration::Integer)
+
+Return the effective score-function parameters for `iteration`.
+
+# Arguments
+- `config::ScoreFunctionConfig`: unscheduled score-function configuration.
+- `schedule::ScoreFunctionSchedule`: scheduled score-function configuration.
+- `iteration::Integer`: one-based training iteration.
+
+# Returns
+A named tuple with fields:
+- `alpha::Float64`: weight on the dual-gradient term.
+- `score_weight::Float64`: weight on the score-function term.
+- `perturbation_std::Float64`: Gaussian standard deviation ``\\sigma``.
+- `num_rollouts::Int`: number of perturbed rollouts.
+- `active::Bool`: whether rollout estimation should run.
+
+# Examples
+```julia
+params = sf_params(schedule, 250)
+params.active && @show params.score_weight
+```
+"""
+function sf_params(
+    config::ScoreFunctionConfig,
+    ::Integer,
+)::_ScoreFunctionParameters
+    # No schedule: the iteration argument is ignored and the mix never changes.
+    alpha = config.dual_weight
+    score_weight = 1.0 - alpha
+    perturbation_std = config.perturbation_std
+    num_rollouts = config.num_rollouts
+    active = true
+
+    return (; alpha, score_weight, perturbation_std, num_rollouts, active)
+end
+
+function sf_params(
+    schedule::ScoreFunctionSchedule,
+    iteration::Integer,
+)::_ScoreFunctionParameters
+    # Warmup: before `sf_start` only the dual gradient is used and no rollouts are
+    # requested, so the rollout-related fields are zeroed.
+    iteration < schedule.sf_start && return (
+        alpha = 1.0,
+        score_weight = 0.0,
+        perturbation_std = 0.0,
+        num_rollouts = 0,
+        active = false,
+    )
+
+    # Position inside the ramp: 0.0 at `sf_start`, 1.0 from `sf_start + ramp_batches` on.
+    elapsed = iteration - schedule.sf_start
+    ramp_fraction = clamp(elapsed / schedule.ramp_batches, 0.0, 1.0)
+
+    # Linear interpolation between a ramp-start value and its final value.
+    blend(from, to) = from + ramp_fraction * (to - from)
+
+    # The score-function weight starts at zero and ends at `1 - final_dual_weight`.
+    score_weight = ramp_fraction * (1.0 - schedule.final_dual_weight)
+
+    # Perturbation scale and rollout count follow the same ramp; the count is
+    # rounded to an integer afterwards.
+    sigma = blend(
+        schedule.initial_perturbation_std, schedule.final_perturbation_std
+    )
+    rollouts = round(
+        Int, blend(schedule.initial_num_rollouts, schedule.final_num_rollouts)
+    )
+
+    # `alpha` and `score_weight` always sum to one.
+    return (
+        alpha = 1.0 - score_weight,
+        score_weight = score_weight,
+        perturbation_std = sigma,
+        # Never report fewer than one rollout while the term is active,
+        # whatever the rounded interpolation produced.
+        num_rollouts = max(1, rollouts),
+        active = true,
+    )
+end
+
+"""
+    _sf_config(score_function) -> Union{Nothing,ScoreFunctionConfig}
+
+Extract the underlying [`ScoreFunctionConfig`](@ref), if one exists.
+
+# Arguments
+- `score_function::Nothing`: score-function correction is disabled.
+- `score_function::ScoreFunctionConfig`: returned as-is.
+- `score_function::ScoreFunctionSchedule`: unwraps `score_function.config`.
+
+# Examples
+```julia
+config = DecisionRules._sf_config(score_function)
+```
+"""
+_sf_config(::Nothing) = nothing
+_sf_config(config::ScoreFunctionConfig) = config
+_sf_config(schedule::ScoreFunctionSchedule) = schedule.config
+
+"""
+    _set_score_function_stage_parameters!(
+        state_params_in,
+        state_params_out,
+        uncertainties,
+        state,
+        target,
+    ) -> Nothing
+
+Write the inputs of one perturbed rollout stage into the stage parameters.
+
+# Arguments
+- `state_params_in`: indexable parameters; entry `i` is set to `state[i]`.
+- `state_params_out`: indexable pairs whose first entry is the target
+  parameter; the first entry of pair `i` is set to `target[i]`.
+- `uncertainties`: iterable of `(parameter, value)` pairs; each parameter is
+  set to its value.
+- `state::AbstractVector{<:Real}`: state entering this stage.
+- `target::AbstractVector{<:Real}`: target values for the output state.
+
+# Examples
+```julia
+DecisionRules._set_score_function_stage_parameters!(
+    spi[t],
+    spo[t],
+    uncertainty_sample[t],
+    state,
+    target,
+)
+```
+"""
+function _set_score_function_stage_parameters!(
+    state_params_in,
+    state_params_out,
+    uncertainties,
+    state::AbstractVector{<:Real},
+    target::AbstractVector{<:Real},
+)
+    # Step 1: the incoming-state parameters take the entries of `state`.
+    foreach(eachindex(state_params_in)) do index
+        set_parameter_value(state_params_in[index], state[index])
+    end
+
+    # Step 2: every uncertainty parameter takes its sampled value.
+    foreach(uncertainties) do (parameter, value)
+        set_parameter_value(parameter, value)
+    end
+
+    # Step 3: the target parameter of each output pair takes `target`.
+    foreach(eachindex(state_params_out)) do index
+        set_parameter_value(state_params_out[index][1], target[index])
+    end
+
+    return nothing
+end
+
+"""
+    rollout_with_perturbation(
+        config::ScoreFunctionConfig,
+        initial_state::AbstractVector,
+        uncertainties,
+        targets,
+        perturbations,
+    ) -> Float64
+
+Solve the stage subproblems in sequence under perturbed targets; return the cost.
+
+Stage `t` is solved with target `targets[t + 1] + perturbations[t]`, and its
+realized output state becomes the input state of stage `t + 1`. Stage costs are
+read with `get_objective_no_target_deficit` and summed over all stages.
+
+# Arguments
+- `config::ScoreFunctionConfig`: rollout subproblems and parameter mappings.
+- `initial_state::AbstractVector`: state entering the first stage.
+- `uncertainties`: per-stage collections of `(parameter, value)` pairs.
+- `targets`: target trajectory; index 1 holds the initial state.
+- `perturbations`: per-stage vectors added to the stage targets.
+
+# Examples
+```julia
+cost = DecisionRules.rollout_with_perturbation(
+    score_function,
+    initial_state,
+    uncertainty_sample,
+    targets,
+    perturbations,
+)
+```
+"""
+function rollout_with_perturbation(
+    config::ScoreFunctionConfig,
+    initial_state::AbstractVector,
+    uncertainties,
+    targets,
+    perturbations,
+)::Float64
+    # Work on a Float64 copy of the initial state; it is rebound every stage.
+    current_state = Float64.(initial_state)
+
+    # Running sum of the stage costs read after each solve.
+    accumulated_cost = 0.0
+
+    for stage in eachindex(config.subproblems)
+        # Bind the stage model and its output pairs once; both are reused below.
+        subproblem = config.subproblems[stage]
+        output_pairs = config.state_params_out[stage]
+
+        # `targets[1]` is the initial state, hence the `stage + 1` offset.
+        perturbed_target = Float64.(targets[stage + 1]) .+ perturbations[stage]
+
+        # Load state, uncertainty, and target parameters before the solve.
+        _set_score_function_stage_parameters!(
+            config.state_params_in[stage],
+            output_pairs,
+            uncertainties[stage],
+            current_state,
+            perturbed_target,
+        )
+
+        # Only realized costs are needed here (no duals), so the model is
+        # solved exactly as it was built.
+        optimize!(subproblem)
+
+        # Check the solve status before any result is read.
+        _assert_successful_solve(
+            subproblem;
+            context = "score-function rollout solve",
+        )
+
+        # Add this stage's cost without the target-deficit term.
+        accumulated_cost += get_objective_no_target_deficit(subproblem)
+
+        # The second entry of each output pair holds the realized state, which
+        # becomes the input state of the next stage.
+        current_state = Float64[
+            JuMP.value(output_pairs[index][2]) for index in eachindex(output_pairs)
+        ]
+    end
+
+    return accumulated_cost
+end
+
+"""
+    _sample_target_perturbations(num_stages::Integer, state_dimension::Integer, sigma::Real)
+
+Sample one zero-mean Gaussian perturbation vector per stage target.
+
+# Arguments
+- `num_stages::Integer`: how many perturbation vectors to draw.
+- `state_dimension::Integer`: length of every perturbation vector.
+- `sigma::Real`: standard deviation ``\\sigma`` applied to each standard normal draw.
+
+# Examples
+```julia
+perturbations = DecisionRules._sample_target_perturbations(3, 2, 0.5)
+```
+"""
+function _sample_target_perturbations(
+    num_stages::Integer,
+    state_dimension::Integer,
+    sigma::Real,
+)
+    draws = (Float64(sigma) .* randn(Int(state_dimension)) for _ in 1:Int(num_stages))
+    return collect(draws)
+end
+
+"""
+    _center_rollout_costs(costs::AbstractVector{<:Real}, baseline::Symbol)
+
+Convert rollout costs into advantages by subtracting the chosen baseline.
+
+# Arguments
+- `costs::AbstractVector{<:Real}`: operational costs from perturbed rollouts.
+- `baseline::Symbol`: `:mean` or `:none`; other values throw an `ArgumentError`.
+
+# Examples
+```julia
+advantages = DecisionRules._center_rollout_costs([10.0, 12.0], :mean)
+```
+"""
+function _center_rollout_costs(
+    costs::AbstractVector{<:Real},
+    baseline::Symbol,
+)
+    baseline in (:mean, :none) ||
+        throw(ArgumentError("baseline must be :mean or :none, got :$baseline"))
+    # A mean baseline reduces variance without changing the expected gradient.
+    return Float64.(costs) .- (baseline === :mean ? mean(costs) : 0.0)
+end
+
+"""
+    _score_function_rollouts(
+        config::ScoreFunctionConfig,
+        initial_state::AbstractVector,
+        uncertainties,
+        targets;
+        perturbation_std = config.perturbation_std,
+        num_rollouts = config.num_rollouts,
+    ) -> (advantages, perturbations)
+
+Run perturbed rollouts and return their centered costs with the sampled noise.
+
+# Arguments
+- `config::ScoreFunctionConfig`: rollout models, defaults, and `baseline`.
+- `initial_state::AbstractVector`: state entering the first stage.
+- `uncertainties`: uncertainty trajectory shared by all rollouts.
+- `targets`: target trajectory; `targets[2]` fixes the perturbation length.
+- `perturbation_std::Real`: Gaussian standard deviation ``\\sigma``.
+- `num_rollouts::Integer`: number of perturbed rollouts to evaluate.
+
+# Examples
+```julia
+advantages, perturbations = DecisionRules._score_function_rollouts(
+    score_function,
+    initial_state,
+    uncertainty_sample,
+    targets;
+    perturbation_std = 0.5,
+    num_rollouts = 4,
+)
+```
+"""
+function _score_function_rollouts(
+    config::ScoreFunctionConfig,
+    initial_state::AbstractVector,
+    uncertainties,
+    targets;
+    perturbation_std::Real = config.perturbation_std,
+    num_rollouts::Integer = config.num_rollouts,
+)
+    # `targets[1]` is the initial state; `targets[2]` sets the noise length.
+    target_length = length(targets[2])
+    stage_count = length(config.subproblems)
+    rollout_count = Int(num_rollouts)
+
+    # One cost and one per-stage perturbation set per rollout.
+    rollout_costs = Vector{Float64}(undef, rollout_count)
+    perturbations = Vector{Vector{Vector{Float64}}}(undef, rollout_count)
+
+    for rollout in 1:rollout_count
+        # The sampled noise is stored and returned alongside the advantages.
+        noise = _sample_target_perturbations(stage_count, target_length, perturbation_std)
+        perturbations[rollout] = noise
+
+        # Cost realized when the targets are shifted by the stored noise.
+        rollout_costs[rollout] = rollout_with_perturbation(
+            config,
+            initial_state,
+            uncertainties,
+            targets,
+            perturbations[rollout],
+        )
+    end
+
+    # Center the costs with the baseline selected in `config`.
+    advantages = _center_rollout_costs(rollout_costs, config.baseline)
+    return advantages, perturbations
+end
+
+"""
+    _score_function_surrogate(
+        advantage::Real,
+        perturbations,
+        targets,
+        perturbation_std::Real,
+    ) -> Real
+
+Build the scalar whose gradient in the targets is the Gaussian score estimate.
+
+With advantage ``A``, perturbations ``\\delta_t``, and standard deviation
+``\\sigma``, the returned value is
+
+```math
+A \\sum_t \\left\\langle
+    \\delta_t / \\sigma^2, \\hat{x}_{t+1}(\\theta)
+\\right\\rangle .
+```
+
+# Arguments
+- `advantage::Real`: centered rollout cost ``R - b``; converted to `Float32`.
+- `perturbations`: stage perturbations ``\\delta_t``; converted to `Float32`.
+- `targets`: target trajectory; entry `t + 1` is paired with perturbation `t`.
+- `perturbation_std::Real`: Gaussian standard deviation ``\\sigma``.
+
+# Examples
+```julia
+loss = DecisionRules._score_function_surrogate(
+    3.0,
+    perturbations,
+    targets,
+    0.5,
+)
+```
+"""
+function _score_function_surrogate(
+    advantage::Real,
+    perturbations,
+    targets,
+    perturbation_std::Real,
+)
+    # Gaussian location score: perturbations are scaled by 1 / sigma^2.
+    inverse_sigma_squared = inv(Float32(perturbation_std)^2)
+
+    # `targets[1]` is the initial state, so stage t pairs with targets[t + 1].
+    function stage_score(stage)
+        noise = Float32.(perturbations[stage])
+        return sum(noise .* targets[stage + 1]) * inverse_sigma_squared
+    end
+    score = sum(stage_score, eachindex(perturbations))
+    return Float32(advantage) * score
+end
diff --git a/src/simulate_multistage.jl b/src/simulate_multistage.jl
index f7d1801..a5116e7 100644
--- a/src/simulate_multistage.jl
+++ b/src/simulate_multistage.jl
@@ -301,16 +301,8 @@ function ChainRulesCore.rrule(
                 )
             end
         catch e
-            msg = sprint(showerror, e)
-            throw(
-                ArgumentError(
-                    "Differentiating get_next_state requires a DiffOpt-enabled model " *
-                    "because the closed-loop rollout needs solution sensitivities of the " *
-                    "realized state variables. Use an appropriate DiffOpt wrapper for the " *
-                    "stage subproblems (for target-slack conic models, " *
-                    "`DiffOpt.conic_diff_model(...)`), or use the deterministic-equivalent " *
-                    "training path when only target duals are needed. Original error: $msg",
-                ),
+            return handle_gradient_error(
+                _DEFAULT_GRADIENT_FALLBACK, e, length(state_in), length(state_out_target)
             )
         end
     end
@@ -347,6 +339,9 @@ end
 function get_objective_no_target_deficit(
     subproblem::JuMP.Model; norm_deficit::AbstractString="norm_deficit"
 )
+    if subproblem.is_model_dirty
+        return get(subproblem.ext, :_last_obj_no_deficit, 0.0)
+    end
     try
         obj = JuMP.objective_function(subproblem)
         objective_val = objective_value(subproblem)
@@ -902,6 +897,99 @@ function ChainRulesCore.rrule(
     return y, public_pullback
 end
 
+@doc raw"""
+    sample(uncertainty_pool) -> Vector{Vector{Tuple{VariableRef, T}}}
+
+Draw one full uncertainty trajectory from a DecisionRules uncertainty pool.
+
+The returned trajectory is a length-``T`` vector where each element is
+`Vector{Tuple{VariableRef, Float64}}` — one realized value per uncertain
+parameter for that stage. This is the format consumed by `simulate_multistage`,
+`train_multistage`, and all other training/evaluation functions.
+
+Three pool formats are supported, offering increasing levels of correlation:
+
+## 1. Independent sampling (per-unit pools)
+
+Each uncertain parameter has its own finite support; sampling draws
+independently from each support at each stage.
+
+    sample(multistage_pool::Vector{Vector{Tuple{VariableRef, Vector{T}}}})
+
+`multistage_pool[t]` is `[(param₁, [v₁₁, v₁₂, …]), (param₂, [v₂₁, v₂₂, …]), …]`.
+Each parameter picks one value uniformly at random from its own support.
+**No spatial or temporal correlation is preserved.**
+
+## 2. Joint-scenario sampling (spatial correlation)
+
+Scenarios are pre-defined joint realizations across all parameters at each
+stage. Sampling picks one complete scenario per stage uniformly, preserving
+cross-parameter correlations (e.g., spatially correlated inflows across
+hydro reservoirs). Stages are still drawn independently.
+
+    sample(multistage_joint::Vector{Vector{Vector{Tuple{VariableRef, T}}}})
+
+`multistage_joint[t]` is `[scenario₁, scenario₂, …]` where each scenario
+is `[(param₁, val₁), (param₂, val₂), …]`.
+
+## 3. Trajectory sampler (spatial + temporal correlation)
+
+A callable `sampler(t, past) -> Vector{Tuple{VariableRef, T}}` that generates
+stage `t`'s realization given the realized values from stages `1:t-1`. This
+enables autoregressive, Markovian, or any custom temporal dependence.
+
+    sample(sampler::Function, T::Int)
+
+The callable receives:
+- `t::Int` — the current stage (1-indexed)
+- `past::Vector{Vector{Tuple{VariableRef, Float64}}}` — realized samples from
+  stages `1:t-1` (empty vector for `t=1`)
+
+and must return `Vector{Tuple{VariableRef, Float64}}` — the realized sample for
+stage `t`. Here `T` is the number of stages, not a value type.
+
+## Output format
+
+All three methods return `Vector{Vector{Tuple{VariableRef, T}}}` — a length-``T``
+vector of per-stage realized samples. This is the universal input to
+`simulate_multistage`, `train_multistage`, `simulate_multiple_shooting`, and all
+evaluation functions.
+
+# Examples
+```julia
+# 1. Independent sampling (each unit draws independently):
+independent_pool = [
+    [(inflow_1, [10.0, 15.0, 12.0]), (inflow_2, [8.0, 12.0, 9.0])],
+    [(inflow_1, [11.0, 14.0, 13.0]), (inflow_2, [7.0, 11.0, 10.0])],
+]
+path = sample(independent_pool)
+
+# 2. Joint-scenario sampling (preserves spatial correlation):
+joint_pool = [
+    [[(inflow_1, 10.0), (inflow_2, 8.0)],   # scenario 1
+     [(inflow_1, 15.0), (inflow_2, 12.0)]],  # scenario 2 — stage 1
+    [[(inflow_1, 11.0), (inflow_2, 7.0)],
+     [(inflow_1, 14.0), (inflow_2, 11.0)]],  # stage 2
+]
+path = sample(joint_pool)
+
+# 3. Trajectory sampler (preserves temporal + spatial correlation):
+function my_sampler(t, past)
+    if t == 1
+        ω = rand(1:nScenarios)
+        return [(inflow_params[t][r], data[r][t, ω]) for r in 1:nHyd]
+    else
+        # AR(1): next inflow depends on previous realized inflow
+        prev_values = [pair[2] for pair in past[end]]
+        noise = randn(nHyd) .* σ
+        return [(inflow_params[t][r], ρ * prev_values[r] + noise[r]) for r in 1:nHyd]
+    end
+end
+path = sample(my_sampler, T)
+```
+
+See the [Uncertainty Sampling](@ref) documentation page for a complete guide.
+"""
 function sample(uncertainty_samples::Vector{Tuple{VariableRef,Vector{T}}}) where {T<:Real}
     uncertainty_sample = Vector{Tuple{VariableRef,T}}(undef, length(uncertainty_samples))
     for i in 1:length(uncertainty_samples)
@@ -910,20 +998,168 @@ function sample(uncertainty_samples::Vector{Tuple{VariableRef,Vector{T}}}) where
     return uncertainty_sample
 end
 
+# Single-stage joint pool: pick one complete scenario uniformly at random.
+sample(joint_scenarios::Vector{Vector{Tuple{VariableRef,T}}}) where {T<:Real} =
+    rand(joint_scenarios)
+
 function sample(
     uncertainty_samples::Vector{Vector{Tuple{VariableRef,Vector{T}}}}
 ) where {T<:Real}
     return [sample(uncertainty_samples[t]) for t in 1:length(uncertainty_samples)]
 end
 
+function sample(
+    uncertainty_samples::Vector{Vector{Vector{Tuple{VariableRef,T}}}}
+) where {T<:Real}
+    return [sample(stage_scenarios) for stage_scenarios in uncertainty_samples]
+end
+
 """
-    train_multistage(model, initial_state, subproblems, state_params_in,
-                     state_params_out, uncertainty_sampler; kwargs...)
+    sample(sampler::Function, T::Int)
+
+Draw a full trajectory using a callable trajectory sampler with temporal dependence.
+
+`sampler(t, past)` receives the current stage `t` and a vector of all previously
+realized samples `past[1:t-1]`, and returns the realized sample for stage `t`.
+
+This enables autoregressive, Markovian, or any custom temporal correlation between
+stages — something the data-based pool formats cannot express.
 
-Train a policy with **stage-wise decomposition** (single shooting, Extension §2).
-Each SGD step samples `num_train_per_batch` uncertainty trajectories, rolls out the
-policy through `simulate_multistage` (stage-wise overload), and updates `model` via
-the Flux optimizer.
+See [`sample`](@ref) for the full API and examples.
+"""
+function sample(sampler::Function, T::Int)
+    history = Vector{Tuple{VariableRef,Float64}}[]  # stages realized so far
+    trajectory = similar(history, T)  # one slot per stage, filled in order
+    for stage in eachindex(trajectory)
+        trajectory[stage] = sampler(stage, history)
+        push!(history, trajectory[stage])
+    end
+    return trajectory
+end
+
+"""
+    sample(sampler::Function)
+
+Invoke a zero-argument sampler and return its result unchanged.
+
+`train_multistage` and `train_multiple_shooting` dispatch here when
+`uncertainty_sampler` is a callable. A stage-wise sampler `(t, past) -> ...`
+takes two arguments, so wrap it in a zero-argument closure that fixes the
+horizon `T`:
+
+```julia
+uncertainty_sampler = () -> sample(my_stage_sampler, T)
+```
+"""
+sample(sampler::Function) = sampler()
+
+@doc raw"""
+    train_multistage(model, initial_state, subproblems::Vector{JuMP.Model},
+                     state_params_in, state_params_out, uncertainty_sampler;
+                     kwargs...)
+
+Train a target-state policy with stage-wise decomposition (single shooting).
+
+For one sampled uncertainty trajectory ``w_{1:T}``, this overload solves one
+optimization problem per stage. At stage ``t``, given the realized incoming
+state ``x_{t-1}``, the policy predicts a target
+``\hat{x}_t = \pi_\theta(w_t, x_{t-1})`` and the stage problem is
+
+```math
+\begin{aligned}
+q_t(x_{t-1}, w_t; \hat{x}_t)
+    = \min_{x_t, y_t, \delta_t}
+    \quad & f_t(x_t, y_t) + C_\delta \|\delta_t\| \\
+\text{s.t.}\quad
+    & x_t = T_t(w_t, y_t, x_{t-1})                 && : \mu_t, \\
+    & x_t + \delta_t = \hat{x}_t                   && : \lambda_t, \\
+    & h_t(x_t, y_t) \ge 0 .
+\end{aligned}
+```
+
+The rollout objective is the sum of stage values,
+
+```math
+Q(\theta; w) =
+    \sum_{t=1}^{T} q_t(x_{t-1}, w_t; \hat{x}_t),
+```
+
+where each realized ``x_t`` is read from the previous stage solve. The gradient
+therefore contains both the target duals ``\lambda_t`` and the sensitivity of
+later realized states with respect to earlier targets. In the notation of the
+extension note,
+
+```math
+\nabla_\theta Q(\theta; w)
+=
+\sum_{t=1}^{T}
+\left[
+    \frac{\partial q_t}{\partial \hat{x}_t}
+    +
+    \sum_{k=t+1}^{T}
+    \frac{\partial q_k}{\partial x_{k-1}}
+    \prod_{j=t+1}^{k-1}
+    \frac{\partial x_j}{\partial x_{j-1}}
+    \frac{\partial x_t}{\partial \hat{x}_t}
+\right]
+\nabla_\theta \pi_\theta(w_t, x_{t-1}).
+```
+
+The dual terms come from target and transition constraints; the state
+sensitivities are computed through DiffOpt in the rrules for
+[`simulate_stage`](@ref) and [`get_next_state`](@ref).
+
+# Arguments
+- `model`: differentiable Flux-compatible policy. It receives
+  `vcat(stage_uncertainty, realized_state)` and returns the next target state.
+- `initial_state::AbstractVector{<:Real}`: state ``x_0`` entering stage 1.
+- `subproblems::Vector{JuMP.Model}`: one JuMP model per stage.
+- `state_params_in`: stage input-state parameters.
+- `state_params_out`: `(target_parameter, realized_state_variable)` pairs for
+  each stage output state.
+- `uncertainty_sampler`: source of uncertainty trajectories, passed to
+  [`sample`](@ref). Three formats are accepted:
+  1. **Per-unit pool** (`Vector{Vector{Tuple{VariableRef, Vector{T}}}}`):
+     independent sampling per parameter per stage.
+  2. **Joint-scenario pool** (`Vector{Vector{Vector{Tuple{VariableRef, T}}}}`):
+     one scenario drawn per stage, preserving spatial correlation.
+  3. **Callable** (`() -> Vector{Vector{Tuple{VariableRef, T}}}`): a zero-arg
+     function returning a full trajectory. Use this for temporal correlation
+     by wrapping a trajectory sampler:
+     `() -> sample(my_stage_sampler, T)` where `my_stage_sampler(t, past)`
+     generates stage `t` conditioned on past realizations.
+
+# Keywords
+- `num_batches::Integer`: number of SGD batches.
+- `num_train_per_batch::Integer`: sampled trajectories per batch.
+- `optimizer`: Flux optimizer used to update `model`.
+- `adjust_hyperparameters::Function`: optional hook returning the batch size for the current iteration.
+- `record_loss`: legacy logging callback.
+- `sample_log::SampleLog`: per-batch objective cache.
+- `record::Function`: callback called as `record(sample_log, iter, model)`.
+- `penalty_schedule`: optional multiplier schedule for target-penalty terms.
+- `integer_strategy::AbstractIntegerStrategy`: strategy used when a stage model
+  has discrete variables and derivative information must be read.
+- `gradient_fallback::AbstractGradientFallback`: error policy for the batch gradient; `ZeroGradientFallback()` (default) skips the batch, `ErrorGradientFallback()` throws.
+
+# Examples
+```julia
+# With data pool (independent or joint):
+train_multistage(
+    policy, initial_state, subproblems,
+    state_params_in, state_params_out, uncertainty_pool;
+    num_batches=200, optimizer=Flux.Adam(1e-3),
+)
+
+# With trajectory sampler (temporal correlation):
+ar_sampler(t, past) = my_ar1_model(t, past, inflow_params)
+train_multistage(
+    policy, initial_state, subproblems,
+    state_params_in, state_params_out,
+    () -> sample(ar_sampler, T);
+    num_batches=200, optimizer=Flux.Adam(1e-3),
+)
+```
 """
 function train_multistage(
     model,
@@ -942,9 +1178,16 @@ function train_multistage(
     record=default_record,
     penalty_schedule=nothing,
     integer_strategy::AbstractIntegerStrategy=NoIntegerStrategy(),
+    gradient_fallback::AbstractGradientFallback=ZeroGradientFallback(),
 )
+    if gradient_fallback isa ZeroGradientFallback
+        @info "Training with ZeroGradientFallback: solver/differentiation errors will be " *
+              "caught and the iteration skipped (zero gradient). Pass " *
+              "`gradient_fallback=ErrorGradientFallback()` to throw instead, or implement " *
+              "a custom `AbstractGradientFallback` subtype."
+    end
+
     record = _resolve_record(record, record_loss)
-    # Initialise the optimiser for this model:
     opt_state = Flux.setup(optimizer, model)
 
     schedule = _resolve_penalty_schedule(penalty_schedule, num_batches)
@@ -964,32 +1207,40 @@ function train_multistage(
             end
         end
         num_train_per_batch = adjust_hyperparameters(iter, opt_state, num_train_per_batch)
-        # Sample uncertainties
+
         uncertainty_samples = [sample(uncertainty_sampler) for _ in 1:num_train_per_batch]
+
         objective = 0.0
         _reset_sample_log!(sample_log)
-        grads = Flux.gradient(model) do m
-            for s in 1:num_train_per_batch
-                Flux.reset!(m)
-                objective += simulate_multistage(
-                    subproblems,
-                    state_params_in,
-                    state_params_out,
-                    initial_state,
-                    uncertainty_samples[s],
-                    m;
-                    integer_strategy=integer_strategy,
-                )
-                @ignore_derivatives sample_log(s, subproblems)
+        grads = try
+            Flux.gradient(model) do m
+                for s in 1:num_train_per_batch
+                    Flux.reset!(m)
+                    objective += simulate_multistage(
+                        subproblems,
+                        state_params_in,
+                        state_params_out,
+                        initial_state,
+                        uncertainty_samples[s],
+                        m;
+                        integer_strategy=integer_strategy,
+                    )
+                    @ignore_derivatives sample_log(s, subproblems)
+                end
+                objective /= num_train_per_batch
+                return objective
+            end
+        catch e
+            if handle_training_error(gradient_fallback, e, iter)
+                nothing
             end
-            objective /= num_train_per_batch
-            return objective
         end
         record(sample_log, iter, model) && break
 
-        # Update the parameters so as to reduce the objective,
-        # according the chosen optimisation rule:
-        # Convert gradients from MutableTangent to plain NamedTuples for Flux.update!
+        if isnothing(grads)
+            continue
+        end
+
         grad = materialize_tangent(grads[1])
         Flux.update!(opt_state, model, grad)
     end
@@ -1010,14 +1261,121 @@ function sim_states(t, m, initial_state, uncertainty_sample_vec, prev_states)
     end
 end
 
-"""
+@doc raw"""
     train_multistage(model, initial_state, det_equivalent::JuMP.Model,
-                     state_params_in, state_params_out, uncertainty_sampler; kwargs...)
-
-Train a policy with the **deterministic equivalent** (direct transcription,
-Extension §1).  Each SGD step samples uncertainty trajectories, rolls out target
-states with `Base.accumulate`, solves the coupled `det_equivalent`, and updates
-`model`.  Gradient: Eq. 1.2, ``λ^s ⊙ ∇_θ π``.
+                     state_params_in, state_params_out, uncertainty_sampler;
+                     score_function=nothing, kwargs...)
+
+Train a target-state policy with a deterministic equivalent (direct transcription).
+
+For one sampled trajectory ``w_{1:T}``, the policy first produces the full target
+trajectory
+
+```math
+\hat{x}_{1:T}(\theta) = \pi_\theta(w_{1:T}, x_0).
+```
+
+The coupled implementation problem is
+
+```math
+\begin{aligned}
+Q(w; \theta)
+    =
+    \min_{\{x_t, y_t, \delta_t\}_{t=1}^{T}}
+    \quad &
+    \sum_{t=1}^{T} f_t(x_t, y_t)
+    + C_\delta \sum_{t=1}^{T} \|\delta_t\| \\
+\text{s.t.}\quad
+    & x_t = T_t(w_t, y_t, x_{t-1})        && t=1,\ldots,T, \\
+    & x_t + \delta_t = \hat{x}_t(\theta)  && : \lambda_t,\quad t=1,\ldots,T, \\
+    & h_t(x_t, y_t) \ge 0                 && t=1,\ldots,T .
+\end{aligned}
+```
+
+The target trajectory appears as right-hand-side parameters. If
+``\lambda_t`` is the dual multiplier of the target constraint, the envelope
+gradient used by this overload is
+
+```math
+\nabla_\theta \mathbb{E}[Q(w; \theta)]
+\approx
+\frac{1}{S}
+\sum_{s=1}^{S}
+\sum_{t=1}^{T}
+\lambda_t^s \odot
+\nabla_\theta \hat{x}_t^s(\theta),
+```
+
+where ``S`` is `num_train_per_batch` and ``\odot`` denotes componentwise
+multiplication.
+
+Pass a [`ScoreFunctionConfig`](@ref) or [`ScoreFunctionSchedule`](@ref) via
+`score_function` to mix the dual gradient with a REINFORCE correction
+estimated from rollouts under perturbed targets.
+
+When `score_function` is used, there are two separate solve paths:
+
+1. `integer_strategy` applies to `det_equivalent` and controls how local dual
+   information is read for the differentiable dual-gradient term.
+2. `score_function` owns separate rollout subproblems. Those models are solved
+   exactly as they are built, and their realized costs define the Monte Carlo
+   score-function term.
+
+For a mixed-integer model, this usually means
+`integer_strategy = FixedDiscreteIntegerStrategy()` for the dual path and
+MIP rollout subproblems inside `ScoreFunctionConfig` for the score-function
+path.
+
+# Arguments
+- `model`: differentiable Flux-compatible policy. It is rolled forward over
+  uncertainty values to produce ``\hat{x}_{1:T}``.
+- `initial_state::AbstractVector{<:Real}`: state ``x_0``.
+- `det_equivalent::JuMP.Model`: full-horizon JuMP model for one sampled
+  trajectory.
+- `state_params_in`: input-state parameters in the deterministic equivalent.
+- `state_params_out`: `(target_parameter, realized_state_variable)` pairs for
+  each target state.
+- `uncertainty_sampler`: source of uncertainty trajectories, passed to
+  [`sample`](@ref). Three formats are accepted:
+  1. **Per-unit pool** (`Vector{Vector{Tuple{VariableRef, Vector{T}}}}`):
+     independent sampling per parameter per stage.
+  2. **Joint-scenario pool** (`Vector{Vector{Vector{Tuple{VariableRef, T}}}}`):
+     one scenario drawn per stage, preserving spatial correlation.
+  3. **Callable** (`() -> Vector{Vector{Tuple{VariableRef, T}}}`): a zero-arg
+     function returning a full trajectory. Use this for temporal correlation;
+     see [`sample`](@ref).
+
+# Keywords
+- `num_batches::Integer`: number of SGD batches.
+- `num_train_per_batch::Integer`: sampled trajectories per batch ``S``.
+- `optimizer`: Flux optimizer used to update `model`.
+- `adjust_hyperparameters::Function`: optional hook returning the batch size for the current iteration.
+- `record_loss`: legacy logging callback.
+- `sample_log::SampleLog`: per-batch objective cache.
+- `record::Function`: callback called as `record(sample_log, iter, model)`.
+- `penalty_schedule`: optional multiplier schedule for target-penalty terms.
+- `integer_strategy::AbstractIntegerStrategy`: strategy used to read local dual
+  information from `det_equivalent` when it has discrete variables.
+- `score_function`: optional [`ScoreFunctionConfig`](@ref) or
+  [`ScoreFunctionSchedule`](@ref) for mixed dual/score-function gradients.
+- `gradient_fallback::AbstractGradientFallback`: policy for gradient-computation errors; defaults to `ZeroGradientFallback()`.
+
+# Examples
+```julia
+train_multistage(
+    policy,
+    initial_state,
+    det_equivalent,
+    state_params_in,
+    state_params_out,
+    uncertainty_sampler;
+    num_batches = 200,
+    num_train_per_batch = 16,
+    optimizer = Flux.Adam(1.0e-3),
+    integer_strategy = FixedDiscreteIntegerStrategy(),
+    score_function = nothing,
+)
+```
 """
 function train_multistage(
     model,
@@ -1036,75 +1394,110 @@ function train_multistage(
     record=default_record,
     penalty_schedule=nothing,
     integer_strategy::AbstractIntegerStrategy=NoIntegerStrategy(),
+    score_function::Union{Nothing,ScoreFunctionConfig,ScoreFunctionSchedule}=nothing,
+    gradient_fallback::AbstractGradientFallback=ZeroGradientFallback(),
 )
     record = _resolve_record(record, record_loss)
-    # Initialise the optimiser for this model:
     opt_state = Flux.setup(optimizer, model)
     num_stages = length(state_params_in)
 
     schedule = _resolve_penalty_schedule(penalty_schedule, num_batches)
-    penalty_bases = if isnothing(schedule)
-        nothing
-    else
+    penalty_bases = isnothing(schedule) ? nothing :
         _check_deficit_penalty_bases(_deficit_penalty_bases(det_equivalent))
-    end
     current_multiplier = NaN
 
+    sf_cfg = _sf_config(score_function)
+    use_sf = !isnothing(sf_cfg)
+
     for iter in 1:num_batches
         if !isnothing(schedule)
             multiplier = _penalty_multiplier_for(schedule, iter)
             if multiplier != current_multiplier
                 _apply_deficit_penalty_multiplier!(
-                    det_equivalent, penalty_bases, multiplier
-                )
+                    det_equivalent, penalty_bases, multiplier)
                 current_multiplier = multiplier
             end
         end
         num_train_per_batch = adjust_hyperparameters(iter, opt_state, num_train_per_batch)
-        # Sample uncertainties
+
+        score_params = use_sf ? sf_params(score_function, iter) :
+            (
+                alpha = 1.0,
+                score_weight = 0.0,
+                perturbation_std = 0.0,
+                num_rollouts = 0,
+                active = false,
+            )
+
         uncertainty_samples = [sample(uncertainty_sampler) for _ in 1:num_train_per_batch]
         num_uncertainties = length(uncertainty_samples[1][1])
         uncertainty_samples_vec = [
             [
-                [uncertainty_samples[s][stage][i][2] for i in 1:num_uncertainties] for
-                stage in 1:length(uncertainty_samples[1])
+                [uncertainty_samples[s][stage][i][2] for i in 1:num_uncertainties]
+                for stage in 1:num_stages
             ] for s in 1:num_train_per_batch
         ]
 
-        # Calculate the gradient of the objective
-        # with respect to the parameters within the model:
         objective = 0.0
         _reset_sample_log!(sample_log)
-        grads = Flux.gradient(model) do m
-            for s in 1:num_train_per_batch
-                Flux.reset!(m)
-                init_state = Float32.(initial_state)
-                predicted_states = accumulate(
-                    uncertainty_samples_vec[s]; init=init_state
-                ) do prev_state, uncertainties_t
-                    return m(vcat(uncertainties_t, prev_state))
+        grads = try
+            Flux.gradient(model) do m
+                for s in 1:num_train_per_batch
+                    Flux.reset!(m)
+                    x0 = Float32.(initial_state)
+                    states = vcat([x0], accumulate(
+                        uncertainty_samples_vec[s]; init=x0
+                    ) do prev, ξ
+                        m(vcat(ξ, prev))
+                    end)
+
+                    dual_obj = simulate_multistage(
+                        det_equivalent, state_params_in, state_params_out,
+                        uncertainty_samples[s], states;
+                        integer_strategy=integer_strategy)
+                    @ignore_derivatives sample_log(s, det_equivalent)
+                    objective += score_params.alpha * dual_obj
+
+                    if score_params.active
+                        advantages, perturbations = @ignore_derivatives(
+                            _score_function_rollouts(
+                                sf_cfg,
+                                initial_state,
+                                uncertainty_samples[s],
+                                states;
+                                perturbation_std = score_params.perturbation_std,
+                                num_rollouts = score_params.num_rollouts,
+                            )
+                        )
+                        for rollout in 1:score_params.num_rollouts
+                            advantage = @ignore_derivatives advantages[rollout]
+                            perturbation = @ignore_derivatives perturbations[rollout]
+                            surrogate = _score_function_surrogate(
+                                advantage,
+                                perturbation,
+                                states,
+                                score_params.perturbation_std,
+                            )
+                            objective += score_params.score_weight *
+                                surrogate / Float32(score_params.num_rollouts)
+                        end
+                    end
                 end
-                states = vcat([init_state], predicted_states)
-                objective += simulate_multistage(
-                    det_equivalent,
-                    state_params_in,
-                    state_params_out,
-                    uncertainty_samples[s],
-                    states;
-                    integer_strategy=integer_strategy,
-                )
-                @ignore_derivatives sample_log(s, det_equivalent)
+                objective /= num_train_per_batch
+                return objective
+            end
+        catch e
+            if handle_training_error(gradient_fallback, e, iter)
+                nothing
             end
-            objective /= num_train_per_batch
-            return objective
         end
         record(sample_log, iter, model) && break
 
-        # Update the parameters so as to reduce the objective,
-        # according the chosen optimisation rule:
-        # Convert gradients from MutableTangent to plain NamedTuples for Flux.update!
-        grad = materialize_tangent(grads[1])
-        Flux.update!(opt_state, model, grad)
+        if isnothing(grads)
+            continue
+        end
+
+        Flux.update!(opt_state, model, materialize_tangent(grads[1]))
     end
 
     return model
diff --git a/src/utils.jl b/src/utils.jl
index f8306a3..6dc07e5 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -323,6 +323,30 @@ function _apply_deficit_penalty_multiplier!(
     return models
 end
 
+"""
+    SaveBest(best_loss::Float64, model_path::String)
+
+Callback that saves the best policy state seen during training.
+
+`SaveBest` is a small callable object used as a training callback. When called
+as `callback(iter, model, loss)`, it compares `loss` with the best loss stored
+so far. If the new loss is smaller, it copies `model` to CPU, normalizes any
+recurrent layer state, and writes the Flux state to `model_path` with JLD2.
+It returns `false`, so it records checkpoints without stopping training.
+
+# Arguments
+- `best_loss::Float64`: incumbent loss. Use `Inf` to save the first observed
+  model.
+- `model_path::String`: path of the JLD2 file that receives the best model
+  state.
+
+# Examples
+```julia
+callback = SaveBest(Inf, "best_policy.jld2")
+train_multistage(policy, x0, subproblems, state_in, state_out, sampler;
+    record = (log, iter, model) -> callback(iter, model, mean(log.losses)))
+```
+"""
 mutable struct SaveBest <: Function
     best_loss::Float64
     model_path::String
@@ -428,6 +452,9 @@ _reset_sample_log!(sample_log::SampleLog) = empty!(sample_log)
 _reset_sample_log!(sample_log) = sample_log
 
 function _total_objective_value(model::JuMP.Model)
+    if model.is_model_dirty
+        return get(model.ext, :_last_obj, 0.0)
+    end
     try
         return objective_value(model)
     catch
@@ -575,6 +602,7 @@ mutable struct RolloutEvaluation <: Function
     stride::Int
     policy_state::Symbol
     integer_strategy::AbstractIntegerStrategy
+    gradient_fallback::AbstractGradientFallback
     last_objective_no_deficit::Float64
     last_violation_share::Float64
 end
@@ -588,6 +616,7 @@ function RolloutEvaluation(
     stride=1,
     policy_state::Symbol=:realized,
     integer_strategy::AbstractIntegerStrategy=NoIntegerStrategy(),
+    gradient_fallback::AbstractGradientFallback=ZeroGradientFallback(),
 )
     isempty(scenarios) && throw(
         ArgumentError(
@@ -607,6 +636,7 @@ function RolloutEvaluation(
         stride,
         policy_state,
         integer_strategy,
+        gradient_fallback,
         NaN,
         NaN,
     )
@@ -658,32 +688,45 @@ function (evaluation::RolloutEvaluation)(iter, model)
     iter % evaluation.stride == 0 || return nothing
     total = 0.0
     total_no_deficit = 0.0
+    n_success = 0
     for scenario in evaluation.scenarios
-        total += if evaluation.policy_state === :realized
-            simulate_multistage(
-                evaluation.subproblems,
-                evaluation.state_params_in,
-                evaluation.state_params_out,
-                evaluation.initial_state,
-                scenario,
-                model;
-                integer_strategy=evaluation.integer_strategy,
-            )
-        else
-            _simulate_multistage_target_feedback(
-                evaluation.subproblems,
-                evaluation.state_params_in,
-                evaluation.state_params_out,
-                evaluation.initial_state,
-                scenario,
-                model,
-                evaluation.integer_strategy,
-            )
+        obj = try
+            if evaluation.policy_state === :realized
+                simulate_multistage(
+                    evaluation.subproblems,
+                    evaluation.state_params_in,
+                    evaluation.state_params_out,
+                    evaluation.initial_state,
+                    scenario,
+                    model;
+                    integer_strategy=evaluation.integer_strategy,
+                )
+            else
+                _simulate_multistage_target_feedback(
+                    evaluation.subproblems,
+                    evaluation.state_params_in,
+                    evaluation.state_params_out,
+                    evaluation.initial_state,
+                    scenario,
+                    model,
+                    evaluation.integer_strategy,
+                )
+            end
+        catch e
+            handle_rollout_error(evaluation.gradient_fallback, e, iter)
+            nothing
         end
+        isnothing(obj) && continue
+        total += obj
         total_no_deficit += get_objective_no_target_deficit(evaluation.subproblems)
+        n_success += 1
     end
-    objective = total / length(evaluation.scenarios)
-    evaluation.last_objective_no_deficit = total_no_deficit / length(evaluation.scenarios)
+    if n_success == 0
+        @warn "All rollout scenarios failed at iter $iter"
+        return nothing
+    end
+    objective = total / n_success
+    evaluation.last_objective_no_deficit = total_no_deficit / n_success
     evaluation.last_violation_share = _target_violation_share(
         objective, evaluation.last_objective_no_deficit
     )
@@ -909,8 +952,15 @@ stage subproblems into `model`.  Variables are renamed with a `#t` suffix to avo
 conflicts.  Stage coupling is enforced by identifying the realized state variable of
 stage `t` with the incoming state parameter of stage `t+1`.
 
-Returns `(model, uncertainties_new)` where `uncertainties_new` maps the original
-uncertainty parameter refs to the new refs in the combined model.
+`uncertainties` accepts both sampling formats (see [`sample`](@ref)):
+
+- **Per-unit pools**: `Vector{Vector{Tuple{VariableRef, Vector{T}}}}` — one pool per
+  parameter, drawing independently per parameter.
+- **Joint-scenario pools**: `Vector{Vector{Vector{Tuple{VariableRef, T}}}}` — pre-built
+  joint scenarios preserving cross-parameter correlations.
+
+Returns `(model, uncertainties_new)` where `uncertainties_new` has the same format as
+the input but with variable refs remapped to the deterministic-equivalent model.
 """
 function deterministic_equivalent!(
     model::JuMP.Model,
@@ -918,12 +968,9 @@ function deterministic_equivalent!(
     state_params_in::Vector{Vector{Any}},
     state_params_out::Vector{Vector{Tuple{Any,VariableRef}}},
     initial_state::Vector{Float64},
-    uncertainties::Vector{Vector{Tuple{VariableRef,Vector{Float64}}}},
+    uncertainties,
 )
     set_objective_sense(model, objective_sense(subproblems[1]))
-    uncertainties_new = Vector{Vector{Tuple{VariableRef,Vector{Float64}}}}(
-        undef, length(uncertainties)
-    )
     var_src_to_dest = Dict{VariableRef,VariableRef}()
     for t in 1:length(subproblems)
         DecisionRules.add_child_model_vars!(
@@ -944,31 +991,52 @@ function deterministic_equivalent!(
         )
     end
 
-    if uncertainties[1][1][1] isa VariableRef
-        # use var_src_to_dest
-        for t in 1:length(subproblems)
-            uncertainties_new[t] = Vector{Tuple{VariableRef,Vector{Float64}}}(
-                undef, length(uncertainties[t])
-            )
-            for (i, tup) in enumerate(uncertainties[t])
-                ky, val = tup
-                uncertainties_new[t][i] = (var_src_to_dest[ky], val)
-            end
-        end
+    uncertainties_new = _remap_uncertainties(uncertainties, var_src_to_dest, cons_to_cons)
+    return model, uncertainties_new
+end
+
+"""
+    _remap_uncertainties(uncertainties, var_src_to_dest, cons_to_cons)
+
+Replace source-model `VariableRef` keys in an uncertainty pool with their
+destination-model counterparts (using the variable or constraint mapping built
+by [`deterministic_equivalent!`](@ref)).
+
+Two methods dispatch on the pool format:
+
+- **Per-unit pools** (`Vector{Vector{Tuple{VariableRef, Vector{T}}}}`):
+  each stage maps `[(param₁, [v₁, …]), …]` independently.
+- **Joint-scenario pools** (`Vector{Vector{Vector{Tuple{VariableRef, T}}}}`):
+  each stage maps `[[scenario₁…], [scenario₂…], …]` preserving the grouped
+  structure.
+
+This is an internal helper; users interact with it indirectly through
+[`deterministic_equivalent!`](@ref).
+"""
+function _remap_uncertainties(
+    uncertainties::Vector{Vector{Tuple{VariableRef,Vector{T}}}},
+    var_src_to_dest, cons_to_cons,
+) where {T<:Real}
+    remap = if uncertainties[1][1][1] isa VariableRef
+        (t, ky) -> var_src_to_dest[ky]  # variable map is shared by all stages
     else
-        # use cons_to_cons
-        for t in 1:length(subproblems)
-            uncertainties_new[t] = Vector{Tuple{VariableRef,Vector{Float64}}}(
-                undef, length(uncertainties[t])
-            )
-            for (i, tup) in enumerate(uncertainties[t])
-                ky, val = tup
-                uncertainties_new[t] = (cons_to_cons[t][ky], val)
-            end
-        end
+        (t, ky) -> cons_to_cons[t][ky]  # constraint maps are per stage: index by `t`
     end
+    return [
+        [(remap(t, ky), val) for (ky, val) in uncertainties[t]]
+        for t in eachindex(uncertainties)
+    ]
+end
 
-    return model, uncertainties_new
+function _remap_uncertainties(
+    uncertainties::Vector{Vector{Vector{Tuple{VariableRef,T}}}},
+    var_src_to_dest, cons_to_cons,
+) where {T<:Real}
+    # Fall back to stage `t`'s own constraint map (not stage 1's) for unmapped keys.
+    remap(t, ky) = haskey(var_src_to_dest, ky) ? var_src_to_dest[ky] : cons_to_cons[t][ky]
+    return [
+        [[(remap(t, ky), val) for (ky, val) in scenario] for scenario in uncertainties[t]]
+        for t in eachindex(uncertainties)]
 end
 
 function find_variables(model::JuMP.Model, variable_name_parts::Vector{S}) where {S}
diff --git a/test/runtests.jl b/test/runtests.jl
index 3919e78..84814a4 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -44,6 +44,8 @@ function build_subproblem(
     return subproblem, state_in, state_out, state_out_var, uncertainty
 end
 
+include("test_score_function.jl")
+
 @testset "DecisionRules.jl" begin
     @testset "pdual at infeasibility" begin
         subproblem1, state_in_1, state_out_1, state_out_var_1, uncertainty_1 = build_subproblem(
@@ -756,7 +758,7 @@ end
                 model,
                 [5.0],
                 windows,
-                () -> usamples;
+                usamples;
                 num_batches=4,
                 num_train_per_batch=1,
                 optimizer=Flux.Descent(0.0),
@@ -1732,7 +1734,7 @@ end
                 model,
                 [1.0],
                 windows,
-                () -> uncertainty_samples;
+                uncertainty_samples;
                 num_batches=1,
                 num_train_per_batch=1,
                 optimizer=Flux.Descent(0.0),
@@ -1933,6 +1935,186 @@ end
         end
     end
 
+    @testset "sample (independent vs joint-scenario)" begin
+        # Build a two-stage problem with 2 uncertain parameters per stage
+        sp1 = quiet_conic_ipopt_model()
+        @variable(sp1, u1_1 in MOI.Parameter(0.0))
+        @variable(sp1, u2_1 in MOI.Parameter(0.0))
+        sp2 = quiet_conic_ipopt_model()
+        @variable(sp2, u1_2 in MOI.Parameter(0.0))
+        @variable(sp2, u2_2 in MOI.Parameter(0.0))
+
+        # -- Independent (per-unit) format --
+        indep_pool = [
+            [(u1_1, [10.0, 20.0, 30.0]), (u2_1, [100.0, 200.0])],
+            [(u1_2, [40.0, 50.0]),        (u2_2, [300.0, 400.0, 500.0])],
+        ]
+        Random.seed!(42)
+        indep_sample = sample(indep_pool)
+        @test length(indep_sample) == 2
+        @test length(indep_sample[1]) == 2
+        @test length(indep_sample[2]) == 2
+        @test indep_sample[1][1][1] === u1_1
+        @test indep_sample[1][1][2] in [10.0, 20.0, 30.0]
+        @test indep_sample[1][2][2] in [100.0, 200.0]
+
+        # -- Joint-scenario format --
+        # 3 scenarios per stage, 2 parameters each
+        joint_pool = [
+            [
+                [(u1_1, 10.0), (u2_1, 100.0)],
+                [(u1_1, 20.0), (u2_1, 200.0)],
+                [(u1_1, 30.0), (u2_1, 300.0)],
+            ],
+            [
+                [(u1_2, 40.0), (u2_2, 400.0)],
+                [(u1_2, 50.0), (u2_2, 500.0)],
+                [(u1_2, 60.0), (u2_2, 600.0)],
+            ],
+        ]
+        Random.seed!(42)
+        joint_sample = sample(joint_pool)
+        @test length(joint_sample) == 2
+        @test length(joint_sample[1]) == 2  # 2 params per stage
+        @test joint_sample[1][1][1] === u1_1
+        # Verify the sample is one of the pre-defined scenarios (not mixed)
+        @test joint_sample[1] in joint_pool[1]
+        @test joint_sample[2] in joint_pool[2]
+
+        # Key property: joint sampling preserves correlation within each stage
+        Random.seed!(999)
+        n_draws = 50
+        for _ in 1:n_draws
+            s = sample(joint_pool)
+            @test s[1] in joint_pool[1]
+            @test s[2] in joint_pool[2]
+        end
+    end
+
+    @testset "deterministic_equivalent with joint-scenario format" begin
+        sp1, si1, so1, sov1, u1 = build_subproblem(10)
+        sp2, si2, so2, sov2, u2 = build_subproblem(
+            10; state_i_val=4.0, state_out_val=3.0, uncertainty_val=1.0
+        )
+
+        sps = [sp1, sp2]
+        spi = Vector{Vector{Any}}(undef, 2)
+        spo = Vector{Vector{Tuple{Any,VariableRef}}}(undef, 2)
+        spi .= [[si1], [si2]]
+        spo .= [[(so1, sov1)], [(so2, sov2)]]
+
+        # Build joint-scenario format: 1 scenario per stage (deterministic)
+        joint_pool = [
+            [[(u1, 2.0)]],  # stage 1: one scenario with inflow=2.0
+            [[(u2, 1.0)]],  # stage 2: one scenario with inflow=1.0
+        ]
+
+        det_equivalent, joint_new = DecisionRules.deterministic_equivalent!(
+            quiet_nonlinear_ipopt_model(), sps, spi, spo, [5.0], joint_pool
+        )
+        # Remapped pool should have the same structure
+        @test length(joint_new) == 2
+        @test length(joint_new[1]) == 1  # 1 scenario
+        @test length(joint_new[1][1]) == 1  # 1 param
+
+        # Sample and simulate
+        s = sample(joint_new)
+        obj = DecisionRules.simulate_multistage(
+            det_equivalent, spi, spo, s, [[9.0], [7.0], [4.0]]
+        )
+        @test obj ≈ 359 rtol=1.0e-1
+    end
+
+    @testset "simulate_multistage with joint-scenario sampling" begin
+        sp1, si1, so1, sov1, u1 = build_subproblem(
+            10; subproblem=quiet_conic_ipopt_model()
+        )
+        sp2, si2, so2, sov2, u2 = build_subproblem(
+            10;
+            state_i_val=1.0, state_out_val=9.0, uncertainty_val=2.0,
+            subproblem=quiet_conic_ipopt_model(),
+        )
+
+        sps = [sp1, sp2]
+        spi = Vector{Vector{Any}}(undef, 2)
+        spo = Vector{Vector{Tuple{Any,VariableRef}}}(undef, 2)
+        spi .= [[si1], [si2]]
+        spo .= [[(so1, sov1)], [(so2, sov2)]]
+
+        joint_pool = [
+            [[(u1, 2.0)]],  # stage 1
+            [[(u2, 1.0)]],  # stage 2
+        ]
+
+        Random.seed!(222)
+        m = Chain(Dense(2, 10), Dense(10, 1))
+        obj_before = simulate_multistage(
+            sps, spi, spo, [5.0], sample(joint_pool), m
+        )
+
+        train_multistage(
+            m, [5.0], sps, spi, spo, joint_pool;
+            num_batches=100, num_train_per_batch=1,
+        )
+
+        obj_after = simulate_multistage(
+            sps, spi, spo, [5.0], sample(joint_pool), m
+        )
+        @test obj_after < obj_before
+    end
+
+    @testset "multiple_shooting with joint-scenario sampling" begin
+        num_stages = 2
+        subproblems = Vector{JuMP.Model}(undef, num_stages)
+        state_params_in = Vector{Vector{Any}}(undef, num_stages)
+        state_params_out = Vector{Vector{Tuple{Any,VariableRef}}}(undef, num_stages)
+
+        for t in 1:num_stages
+            subproblems[t] = quiet_diffopt_ipopt_model()
+            @variable(subproblems[t], x[1:4] >= 0)
+            @variable(subproblems[t], state_in in MOI.Parameter(1.0))
+            @variable(subproblems[t], uncertainty in MOI.Parameter(0.5))
+            @variable(subproblems[t], state_out in MOI.Parameter(1.0))
+            @variable(subproblems[t], state_out_var)
+            @constraint(subproblems[t], sum(x) >= state_in + uncertainty)
+            @constraint(subproblems[t], state_out_var == sum(x[1:2]))
+            @constraint(subproblems[t], state_out_var >= state_out - 5.0)
+            @constraint(subproblems[t], state_out_var <= state_out + 5.0)
+            @objective(subproblems[t], Min, sum(x) + 10 * (state_out - state_out_var)^2)
+
+            state_params_in[t] = [state_in]
+            state_params_out[t] = [(state_out, state_out_var)]
+        end
+
+        # Joint-scenario pool: 3 scenarios per stage
+        joint_pool = [
+            [[(subproblems[t][:uncertainty], v)] for v in [0.3, 0.5, 0.7]]
+            for t in 1:num_stages
+        ]
+
+        windows = DecisionRules.setup_shooting_windows(
+            subproblems,
+            state_params_in,
+            state_params_out,
+            [1.5],
+            joint_pool;
+            window_size=2,
+            model_factory=() -> quiet_nonlinear_ipopt_model(),
+        )
+        @test length(windows) == 1
+
+        decision_rule(x) = x[2:2] .+ 0.1f0
+        uncertainty_sample = sample(joint_pool)
+        uncertainties_vec = [
+            [Float32(u[2]) for u in stage_u] for stage_u in uncertainty_sample
+        ]
+
+        obj = DecisionRules.simulate_multiple_shooting(
+            windows, decision_rule, Float32[1.5], uncertainty_sample, uncertainties_vec
+        )
+        @test obj > 0
+    end
+
     @testset "dense_multilayer_nn" begin
         # Dense layers
         m = dense_multilayer_nn(3, 2, [8, 4]; activation=relu, dense=Dense)
@@ -2668,4 +2850,227 @@ end
             @test_skip false
         end
     end
+
+    @testset "GradientFallback" begin
+        @testset "type hierarchy and exports" begin
+            @test ZeroGradientFallback() isa AbstractGradientFallback
+            @test ErrorGradientFallback() isa AbstractGradientFallback
+        end
+
+        @testset "ZeroGradientFallback returns zero cotangents" begin
+            fb = ZeroGradientFallback()
+            result = @test_logs (:warn,) DecisionRules.handle_gradient_error(
+                fb, ErrorException("test"), 3, 2
+            )
+            @test result[5] == zeros(3)
+            @test result[6] == zeros(2)
+            @test result[1] == ChainRulesCore.NoTangent()
+        end
+
+        @testset "ErrorGradientFallback rethrows" begin
+            fb = ErrorGradientFallback()
+            @test_throws ErrorException DecisionRules.handle_gradient_error(
+                fb, ErrorException("test"), 3, 2
+            )
+        end
+
+        @testset "handle_training_error" begin  # assert the returned flag, not only the log
+            @test (@test_logs (:warn,) DecisionRules.handle_training_error(
+                ZeroGradientFallback(), ErrorException("test"), 1
+            )) == true
+            @test_throws ErrorException DecisionRules.handle_training_error(
+                ErrorGradientFallback(), ErrorException("test"), 1
+            )
+        end
+
+        @testset "handle_rollout_error" begin  # assert the returned flag, not only the log
+            @test (@test_logs (:warn,) DecisionRules.handle_rollout_error(
+                ZeroGradientFallback(), ErrorException("test"), 1
+            )) == true
+            @test_throws ErrorException DecisionRules.handle_rollout_error(
+                ErrorGradientFallback(), ErrorException("test"), 1
+            )
+        end
+
+        @testset "custom fallback subtype" begin
+            @eval struct CountingFallback <: AbstractGradientFallback  # `struct` is illegal in the testset's local scope
+                count::Ref{Int}
+            end
+            function DecisionRules.handle_gradient_error(fb::CountingFallback, e, n_in, n_out)
+                fb.count[] += 1
+                return DecisionRules._zero_cotangents(n_in, n_out)
+            end
+            function DecisionRules.handle_training_error(fb::CountingFallback, e, iter)
+                fb.count[] += 1
+                return true
+            end
+            function DecisionRules.handle_rollout_error(fb::CountingFallback, e, iter)
+                fb.count[] += 1
+                return true
+            end
+
+            fb = CountingFallback(Ref(0))
+            DecisionRules.handle_gradient_error(fb, ErrorException("x"), 2, 2)
+            @test fb.count[] == 1
+            DecisionRules.handle_training_error(fb, ErrorException("x"), 1)
+            @test fb.count[] == 2
+            DecisionRules.handle_rollout_error(fb, ErrorException("x"), 1)
+            @test fb.count[] == 3
+        end
+
+        @testset "train_multistage accepts gradient_fallback kwarg" begin
+            subproblems, spi, spo, usamples, _ = let
+                subs = JuMP.Model[]
+                spi_vec = Vector{Any}[]
+                spo_vec = Vector{Tuple{Any,VariableRef}}[]
+                us_vec = Vector{Tuple{VariableRef,Vector{Float64}}}[]
+                for d in [4.0, 5.0]
+                    s, si, so, sov, u = build_subproblem(d; subproblem=quiet_conic_ipopt_model())
+                    push!(subs, s)
+                    push!(spi_vec, [si])
+                    push!(spo_vec, [(so, sov)])
+                    push!(us_vec, [(u, [1.0, 2.0, 3.0])])
+                end
+                subs, spi_vec, spo_vec, us_vec, [5.0]
+            end
+            policy = dense_multilayer_nn(2, 1, [8]; activation=Flux.relu)
+
+            model_out = train_multistage(
+                policy, [5.0], subproblems, spi, spo, usamples;
+                num_batches=2,
+                num_train_per_batch=1,
+                optimizer=Flux.Adam(0.01),
+                gradient_fallback=ErrorGradientFallback(),
+            )
+            @test model_out isa Any
+
+            model_out2 = train_multistage(
+                policy, [5.0], subproblems, spi, spo, usamples;
+                num_batches=2,
+                num_train_per_batch=1,
+                optimizer=Flux.Adam(0.01),
+                gradient_fallback=ZeroGradientFallback(),
+            )
+            @test model_out2 isa Any
+        end
+
+        @testset "RolloutEvaluation accepts gradient_fallback kwarg" begin
+            subproblems, spi, spo, usamples, _ = let
+                subs = JuMP.Model[]
+                spi_vec = Vector{Any}[]
+                spo_vec = Vector{Tuple{Any,VariableRef}}[]
+                us_vec = Vector{Tuple{VariableRef,Vector{Float64}}}[]
+                for d in [4.0, 5.0]
+                    s, si, so, sov, u = build_subproblem(d; subproblem=quiet_conic_ipopt_model())
+                    push!(subs, s)
+                    push!(spi_vec, [si])
+                    push!(spo_vec, [(so, sov)])
+                    push!(us_vec, [(u, [1.0, 2.0, 3.0])])
+                end
+                subs, spi_vec, spo_vec, us_vec, [5.0]
+            end
+            policy = dense_multilayer_nn(2, 1, [8]; activation=Flux.relu)
+
+            eval_scenarios = [sample(usamples) for _ in 1:2]
+            re = RolloutEvaluation(
+                subproblems, spi, spo, [5.0], eval_scenarios;
+                stride=1, policy_state=:realized,
+                gradient_fallback=ErrorGradientFallback(),
+            )
+            re(1, policy)
+            @test isfinite(re.last_objective_no_deficit)
+
+            re2 = RolloutEvaluation(
+                subproblems, spi, spo, [5.0], eval_scenarios;
+                stride=1, policy_state=:realized,
+                gradient_fallback=ZeroGradientFallback(),
+            )
+            re2(1, policy)
+            @test isfinite(re2.last_objective_no_deficit)
+        end
+    end
+
+    @testset "Uncertainty sampling" begin
+        m = quiet_highs_model()
+        @variable(m, p1 in MOI.Parameter(0.0))
+        @variable(m, p2 in MOI.Parameter(0.0))
+        T = 3
+
+        @testset "independent pool: each param drawn independently" begin
+            indep_pool = [
+                [(p1, [1.0, 2.0, 3.0]), (p2, [10.0, 20.0, 30.0])]
+                for _ in 1:T
+            ]
+            Random.seed!(42)
+            N = 3000
+            trajectories = [sample(indep_pool) for _ in 1:N]
+
+            @test length(trajectories[1]) == T
+            @test length(trajectories[1][1]) == 2
+
+            # Values always come from the declared support
+            for traj in trajectories, stage in traj
+                @test stage[1][2] in [1.0, 2.0, 3.0]
+                @test stage[2][2] in [10.0, 20.0, 30.0]
+            end
+
+            # k^n = 9 combinations possible; independent draws break correlation
+            combos = Set((s[1][2], s[2][2]) for traj in trajectories for s in traj)
+            @test length(combos) == 9
+        end
+
+        @testset "joint pool: all params from same scenario" begin
+            joint_pool = [
+                [[(p1, 1.0), (p2, 10.0)],
+                 [(p1, 2.0), (p2, 20.0)],
+                 [(p1, 3.0), (p2, 30.0)]]
+                for _ in 1:T
+            ]
+            Random.seed!(42)
+            N = 3000
+            trajectories = [sample(joint_pool) for _ in 1:N]
+
+            @test length(trajectories[1]) == T
+            @test length(trajectories[1][1]) == 2
+
+            # Only k=3 combinations possible (never cross-scenario combos)
+            combos = Set((s[1][2], s[2][2]) for traj in trajectories for s in traj)
+            @test combos == Set([(1.0, 10.0), (2.0, 20.0), (3.0, 30.0)])
+        end
+
+        @testset "trajectory sampler: temporal conditioning" begin
+            calls = Tuple{Int, Vector}[]
+            function my_sampler(t, past)
+                push!(calls, (t, copy(past)))
+                prev = isempty(past) ? 0.0 : past[end][1][2]
+                return [(p1, prev + 1.0)]
+            end
+
+            Random.seed!(42)
+            traj = sample(my_sampler, 3)
+
+            @test length(traj) == 3
+            @test traj[1][1][2] == 1.0   # 0 + 1
+            @test traj[2][1][2] == 2.0   # 1 + 1
+            @test traj[3][1][2] == 3.0   # 2 + 1
+
+            # Sampler received correct past at each stage
+            @test calls[1] == (1, [])
+            @test length(calls[2][2]) == 1
+            @test length(calls[3][2]) == 2
+        end
+
+        @testset "all formats produce same trajectory type" begin
+            indep_pool = [[(p1, [1.0]), (p2, [10.0])] for _ in 1:T]
+            joint_pool = [[[(p1, 1.0), (p2, 10.0)]] for _ in 1:T]
+            sampler_fn = (t, past) -> [(p1, 1.0), (p2, 10.0)]
+
+            t1 = sample(indep_pool)
+            t2 = sample(joint_pool)
+            t3 = sample(sampler_fn, T)
+
+            @test typeof(t1) == typeof(t2) == typeof(t3)
+            @test t1[1][1][2] == t2[1][1][2] == t3[1][1][2] == 1.0
+        end
+    end
 end
diff --git a/test/test_score_function.jl b/test/test_score_function.jl
new file mode 100644
index 0000000..e4ecf9d
--- /dev/null
+++ b/test/test_score_function.jl
@@ -0,0 +1,382 @@
+using Statistics
+
+@doc raw"""
+    _score_function_stage_model(; kwargs...)
+
+Build a one-dimensional linear stage model for score-function tests.
+
+The continuous version is
+
+```math
+\begin{aligned}
+\min_{u,x,\delta}\quad
+    & 30u + 10^4 |\delta| \\
+\text{s.t.}\quad
+    & x = x^{in} + \xi - u, \\
+    & u \ge d, \\
+    & \delta = x - \hat{x}.
+\end{aligned}
+```
+
+When `integer = true`, the model adds a binary setup variable `z`,
+
+```math
+u \le 10 z,\qquad z \in \{0,1\},
+```
+
+and the objective becomes ``5z + 30u + 10^4|\delta|``.
+
+# Keywords
+- `state_value::Real`: initial value for the input-state parameter.
+- `target_value::Real`: initial value for the target parameter.
+- `uncertainty_value::Real`: initial value for the uncertainty parameter.
+- `demand::Real`: lower bound that forces positive ordering cost.
+- `integer::Bool`: whether to include a binary setup decision.
+
+# Returns
+The tuple `(model, state_in, target, state_out, uncertainty)`.
+
+# Examples
+```julia
+model, state_in, target, state_out, uncertainty =
+    _score_function_stage_model(; integer = true)
+```
+"""
+function _score_function_stage_model(;
+    state_value::Real = 5.0,
+    target_value::Real = 4.0,
+    uncertainty_value::Real = 2.0,
+    demand::Real = 1.0,
+    integer::Bool = false,
+)
+    # HiGHS keeps these tests fast and supports both LP and small MIP cases.
+    model = quiet_highs_model()
+
+    # The order quantity is the operational decision whose cost we measure.
+    @variable(model, order >= 0.0)
+
+    # The output state is what the policy target tries to guide.
+    @variable(model, state_out >= 0.0)
+
+    # Parameters are updated by rollout helpers before every stage solve.
+    @variable(model, state_in in MOI.Parameter(Float64(state_value)))
+    @variable(model, target in MOI.Parameter(Float64(target_value)))
+    @variable(model, uncertainty in MOI.Parameter(Float64(uncertainty_value)))
+
+    # The transition is intentionally simple so expected states are easy to audit.
+    @constraint(model, state_out == state_in + uncertainty - order)
+
+    # A positive lower bound prevents the zero-cost solution from hiding bugs.
+    @constraint(model, order >= Float64(demand))
+
+    # The setup charge stays zero unless the integer variant is requested.
+    setup_cost = 0.0
+    if integer
+        # The binary setup gates ordering and contributes a fixed charge.
+        @variable(model, setup, Bin)
+        @constraint(model, order <= 10.0 * setup)
+        setup_cost = 5.0 * setup
+    end
+
+    # Both variants share one deficit block, so it is built exactly once here.
+    _norm_deficit, deficit = create_deficit!(model, 1; penalty = 1.0e4)
+    @constraint(model, deficit[1] == state_out - target)
+    # The objective adds operational cost to the generated deficit cost.
+    @objective(model, Min, setup_cost + 30.0 * order + objective_function(model))
+
+    return model, state_in, target, state_out, uncertainty
+end
+
+@doc raw"""
+    _two_stage_score_function_fixture(; integer = false)
+
+Create a reusable two-stage score-function fixture.
+
+The fixture represents a two-stage rollout
+
+```math
+Q(\hat{x}_{1:2})
+    =
+    q_1(x_0,\xi_1;\hat{x}_1)
+    +
+    q_2(x_1,\xi_2;\hat{x}_2),
+```
+
+where each ``q_t`` is the one-dimensional stage model built by
+[`_score_function_stage_model`](@ref). The two stages intentionally use
+different parameter defaults so indexing mistakes change the solved model.
+
+# Keywords
+- `integer::Bool`: whether stage models contain binary setup variables.
+
+# Returns
+The tuple `(config, initial_state, uncertainties, targets)`.
+
+# Examples
+```julia
+config, initial_state, uncertainties, targets =
+    _two_stage_score_function_fixture()
+```
+"""
+function _two_stage_score_function_fixture(; integer::Bool = false)
+    # Stage 1 starts from inventory 5 and observes uncertainty 2.
+    stage_1, state_in_1, target_1, state_out_1, uncertainty_1 =
+        _score_function_stage_model(; integer)
+
+    # Stage 2 uses different parameter defaults to catch stage-index mistakes.
+    stage_2, state_in_2, target_2, state_out_2, uncertainty_2 =
+        _score_function_stage_model(;
+            state_value = 4.0,
+            target_value = 3.0,
+            uncertainty_value = 1.0,
+            integer,
+        )
+
+    # The config mirrors the shape used by train_multistage.
+    state_params_in = [[state_in_1], [state_in_2]]
+    state_params_out = [[(target_1, state_out_1)], [(target_2, state_out_2)]]
+    config = ScoreFunctionConfig(
+        [stage_1, stage_2],
+        state_params_in,
+        state_params_out;
+        num_rollouts = 4,
+        perturbation_std = 0.5,
+    )
+
+    # Targets include the initial state at index 1.
+    initial_state = [5.0]
+    targets = [[5.0], [4.0], [3.0]]
+    uncertainties = [[(uncertainty_1, 2.0)], [(uncertainty_2, 1.0)]]
+
+    return config, initial_state, uncertainties, targets
+end
+
+@testset "Score-function gradient mixing" begin
+    @testset "ScoreFunctionConfig validates public arguments" begin
+        config, _, _, _ = _two_stage_score_function_fixture()
+        stages = config.subproblems
+        params_in = config.state_params_in
+        params_out = config.state_params_out
+
+        # The fixture sets only num_rollouts and perturbation_std explicitly,
+        # so dual_weight and baseline are checked against constructor defaults.
+        @test config.dual_weight == 0.5
+        @test config.perturbation_std == 0.5
+        @test config.num_rollouts == 4
+        @test config.baseline == :mean
+
+        # Supplying one input-state group for two stages must throw.
+        @test_throws ArgumentError ScoreFunctionConfig(
+            stages,
+            params_in[1:1],
+            params_out,
+        )
+
+        # Every keyword set below must be rejected on its own; each one is
+        # splatted into an otherwise valid constructor call.
+        invalid_keywords = (
+            (; dual_weight = -0.1),
+            (; perturbation_std = 0.0),
+            (; num_rollouts = 0),
+            (; baseline = :median),
+        )
+
+        for keywords in invalid_keywords
+            @test_throws ArgumentError ScoreFunctionConfig(
+                stages,
+                params_in,
+                params_out;
+                keywords...,
+            )
+        end
+    end
+
+    @testset "rollout_with_perturbation returns operational cost" begin
+        config, x0, noise, plan = _two_stage_score_function_fixture()
+
+        # Only the perturbation differs between the two rollouts below, so a
+        # small closure pins every other argument.
+        function rollout_cost(perturbation)
+            return DecisionRules.rollout_with_perturbation(
+                config,
+                x0,
+                noise,
+                plan,
+                perturbation,
+            )
+        end
+
+        # Zero perturbation should still solve the staged rollout successfully.
+        baseline_cost = rollout_cost([[0.0], [0.0]])
+
+        # A nonzero perturbation exercises target parameter updates.
+        shifted_cost = rollout_cost([[0.1], [-0.2]])
+
+        # Both rollouts must report a finite, strictly positive cost.
+        @test isfinite(baseline_cost)
+        @test isfinite(shifted_cost)
+        @test baseline_cost > 0.0
+        @test shifted_cost > 0.0
+    end
+
+    @testset "_score_function_rollouts samples centered advantages" begin
+        config, x0, noise, plan = _two_stage_score_function_fixture()
+        rollout_count = 6
+        # Seed the global RNG right before the rollouts are drawn.
+        Random.seed!(42)
+        advantages, perturbations = DecisionRules._score_function_rollouts(
+            config, x0, noise, plan;
+            perturbation_std = 0.5,
+            num_rollouts = rollout_count,
+        )
+
+        # One advantage and one perturbation trajectory per rollout.
+        @test length(advantages) == rollout_count
+        @test length(perturbations) == rollout_count
+        @test all(isfinite, advantages)
+
+        # Every trajectory holds two stages with a single entry each.
+        @test all(rollout -> length(rollout) == 2, perturbations)
+        @test all(rollout -> all(stage -> length(stage) == 1, rollout), perturbations)
+
+        # The default :mean baseline centers advantages by construction.
+        @test isapprox(sum(advantages), 0.0; atol = 1.0e-8)
+    end
+
+    @testset "_score_function_surrogate matches Gaussian location score" begin
+        # These targets are differentiable arrays in the training loop.
+        targets = [[1.0f0], [2.0f0], [4.0f0]]
+
+        # Perturbations are actual target perturbations, not standard normals.
+        perturbations = [[0.5], [-0.25]]
+
+        surrogate = DecisionRules._score_function_surrogate(
+            3.0, perturbations, targets, 0.5,
+        )
+
+        # 3 * ((0.5 / 0.25) * 2 + (-0.25 / 0.25) * 4) == 0.
+        # `≈` against zero without `atol` degenerates to exact equality, so an
+        # absolute tolerance sized for Float32 round-off keeps this check from
+        # depending on how the two cancelling terms happen to be accumulated.
+        @test surrogate ≈ 0.0f0 atol = 1.0f-5
+    end
+
+    @testset "sf_params reports scheduled ASCII-named fields" begin
+        config, _, _, _ = _two_stage_score_function_fixture()
+        ramp = (; sf_start = 10, ramp_batches = 20)
+        warmup = (; perturbation_std_initial = 0.1, num_rollouts_initial = 2)
+        schedule = ScoreFunctionSchedule(config; ramp..., warmup...)
+
+        # One batch before sf_start.
+        pre_start = sf_params(schedule, ramp.sf_start - 1)
+        @test pre_start.active == false
+        @test pre_start.alpha == 1.0
+        @test pre_start.score_weight == 0.0
+
+        on_start = sf_params(schedule, ramp.sf_start)
+        @test on_start.active == true
+        @test on_start.alpha == 1.0
+        @test on_start.num_rollouts == 2
+
+        # Halfway through the ramp.
+        mid_ramp = sf_params(schedule, ramp.sf_start + ramp.ramp_batches ÷ 2)
+        @test mid_ramp.active == true
+        @test 0.0 < mid_ramp.score_weight < 0.5
+        @test mid_ramp.perturbation_std > 0.1
+
+        # At sf_start + ramp_batches the schedule must match the config values.
+        post_ramp = sf_params(schedule, ramp.sf_start + ramp.ramp_batches)
+        @test post_ramp.alpha ≈ config.dual_weight
+        @test post_ramp.score_weight ≈ 1.0 - config.dual_weight
+        @test post_ramp.perturbation_std ≈ config.perturbation_std
+        @test post_ramp.num_rollouts == config.num_rollouts
+
+        # Passing the config itself, with no schedule wrapped around it.
+        unscheduled = sf_params(config, 1)
+        @test unscheduled.active == true
+        @test unscheduled.alpha == config.dual_weight
+    end
+
+    @testset "rollout solves integer models exactly as written" begin
+        config, x0, noise, plan = _two_stage_score_function_fixture(; integer = true)
+
+        # Only the first perturbation entry is nonzero.
+        perturbation = [[0.1], [0.0]]
+        cost = DecisionRules.rollout_with_perturbation(
+            config, x0, noise, plan, perturbation,
+        )
+        @test isfinite(cost)
+
+        # The first stage model must expose at least one binary variable.
+        first_stage = first(config.subproblems)
+        stage_variables = JuMP.all_variables(first_stage)
+        @test any(JuMP.is_binary, stage_variables)
+    end
+
+    @testset "train_multistage accepts ScoreFunctionConfig on deterministic equivalent" begin
+        # Build the two stage models that get merged into the deterministic
+        # equivalent used by the dual path.
+        first_stage, x_in_1, x_hat_1, x_out_1, xi_1 =
+            build_subproblem(10; subproblem = quiet_highs_model())
+        second_stage, x_in_2, x_hat_2, x_out_2, xi_2 = build_subproblem(
+            10;
+            state_i_val = 4.0,
+            state_out_val = 3.0,
+            uncertainty_val = 1.0,
+            subproblem = quiet_highs_model(),
+        )
+
+        # Typed array literals convert each entry to the declared element type.
+        params_in = Vector{Any}[[x_in_1], [x_in_2]]
+        params_out = Vector{Tuple{Any,VariableRef}}[
+            [(x_hat_1, x_out_1)],
+            [(x_hat_2, x_out_2)],
+        ]
+        noise_pool = [[(xi_1, [2.0])], [(xi_2, [1.0])]]
+
+        det_equivalent, deterministic_sampler = DecisionRules.deterministic_equivalent!(
+            quiet_highs_model(),
+            [first_stage, second_stage],
+            params_in,
+            params_out,
+            [5.0],
+            noise_pool,
+        )
+
+        # Build separate rollout models so the score-function solves do not
+        # mutate the deterministic-equivalent model.
+        fixture_config, _, _, _ = _two_stage_score_function_fixture()
+        score_config = ScoreFunctionConfig(
+            fixture_config.subproblems,
+            fixture_config.state_params_in,
+            fixture_config.state_params_out;
+            dual_weight = 0.5,
+            perturbation_std = 0.3,
+            num_rollouts = 2,
+        )
+
+        # Reseed before building the policy and again before training.
+        Random.seed!(222)
+        policy = Chain(Dense(2, 8, relu), Dense(8, 1))
+
+        Random.seed!(42)
+        train_multistage(
+            policy,
+            [5.0],
+            det_equivalent,
+            params_in,
+            params_out,
+            deterministic_sampler;
+            num_batches = 4,
+            num_train_per_batch = 2,
+            score_function = score_config,
+        )
+
+        # The trained policy must still simulate to a finite objective.
+        sampled_path = sample(deterministic_sampler)
+        objective = simulate_multistage(
+            det_equivalent, params_in, params_out, [5.0], sampled_path, policy,
+        )
+        @test isfinite(objective)
+    end
+end