diff --git a/.gitignore b/.gitignore index 67fda7c..c019973 100644 --- a/.gitignore +++ b/.gitignore @@ -51,4 +51,7 @@ examples/**/.CondaPkg/* *.err *.tsv *.pdf -plan.md +plan/ +*_cuts.json +settings.json +*.sh diff --git a/Project.toml b/Project.toml index bf7226f..5e9b71c 100644 --- a/Project.toml +++ b/Project.toml @@ -16,6 +16,7 @@ Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" MathOptInterface = "b8f27783-ece8-5eb3-8dc8-9495eed66fee" ParametricOptInterface = "0ce4ce61-57bf-432b-a095-efac525d185e" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [compat] @@ -35,6 +36,7 @@ MadNLP = "0.8, 0.9, 0.10" MadNLPGPU = "0.7, 0.8, 0.9, 0.10" MathOptInterface = "1.48.0" ParametricOptInterface = "0.14.1, 0.15, 0.16" +Statistics = "1.10, 1.11" Zygote = "0.6.77, 0.7" julia = "1.10, 1.11, 1.12" diff --git a/README.md b/README.md index 8fb6b16..710b710 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ DecisionRules.jl implements this workflow in three flavors: ```julia using Pkg -Pkg.add(url="https://github.com/LearningToOptimize/DecisionRules.jl.git") +Pkg.add("DecisionRules") ``` ## What you need to provide @@ -202,6 +202,18 @@ Each evaluation reports (a) the rollout objective **excluding** the target-slack Per-sample debugging hooks can be attached with `SampleLog(on_sample=(s, models, log) -> ...)`; the training loop calls the hook after each sample's solve with the live JuMP model(s). The previous `record_loss=(iter, model, loss, tag) -> ...` keyword keeps working as a deprecated adapter. 
+## GPU acceleration with DecisionRulesExa.jl + +For large-scale problems where the inner NLP solve is the bottleneck (e.g., AC-OPF with hundreds of buses), [DecisionRulesExa.jl](https://github.com/LearningToOptimize/DecisionRulesExa.jl) provides a GPU-accelerated backend that replaces JuMP with [ExaModels.jl](https://github.com/exanauts/ExaModels.jl) and solves with [MadNLP.jl](https://github.com/MadNLP/MadNLP.jl) + CUDSS on GPU. + +DecisionRulesExa.jl implements the same TS-DDR algorithm (deterministic-equivalent mode) with the same envelope-theorem gradient computation but formulates the NLP in ExaModels' SIMD-compatible modeling layer. This enables: + +- **GPU-native interior-point solves** via MadNLP + CUDSS +- **Parallel GPU solves** for multiple training samples per gradient step +- **Runtime parameter updates** via `ExaModels.set_parameter!` (no model reconstruction) + +See the [GPU Acceleration](https://LearningToOptimize.github.io/DecisionRules.jl/dev/gpu_acceleration/) page in the documentation for a tutorial on getting started with DecisionRulesExa.jl. + ## Examples and tests Examples live in `examples/`. 
Run tests with: diff --git a/docs/Project.toml b/docs/Project.toml index 6829622..cadd9b3 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -3,9 +3,12 @@ DecisionRules = "47937410-f832-486f-8300-12c95b225dfc" DiffOpt = "930fe3bc-9c6b-11ea-2d94-6184641e85e7" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" +Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196" +HiGHS = "87dc4568-4c63-4d18-b0c0-bb2238e4078b" Ipopt = "b6b21f68-93f8-5de0-b562-5493be1d77c9" JuMP = "4076af6c-e467-56ae-b986-b466b2749572" Literate = "98b081ad-f1c9-55d3-8b20-4c87d4299306" +MathOptInterface = "b8f27783-ece8-5eb3-8dc8-9495eed66fee" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" diff --git a/docs/make.jl b/docs/make.jl index 9f169c8..490de9b 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -24,6 +24,9 @@ makedocs(; pages=[ "Home" => "index.md", "Algorithm" => "algorithm.md", + "Gradient Fallback" => "gradient_fallback.md", + "Uncertainty Sampling" => "sampling.md", + "GPU Acceleration" => "gpu_acceleration.md", "Examples" => [ "Hydropower Scheduling" => "examples/hydro.md", "Rocket Control" => "examples/rocket.md", diff --git a/docs/src/algorithm.md b/docs/src/algorithm.md index 22cbe1e..36c3d24 100644 --- a/docs/src/algorithm.md +++ b/docs/src/algorithm.md @@ -108,6 +108,86 @@ for k = 1, ..., ⌈T/W⌉: **Pros**: balances coupling (within windows) with tractability; parallelizable windows. **Cons**: continuity gaps between windows require penalty tuning. +## Mixed gradient: score-function (REINFORCE) correction + +For problems with integer variables or non-smooth subproblems, the dual +gradient can be biased — it is local to a fixed integer assignment and cannot +see the effect of discrete switches (e.g., opening a setup variable). 
+ +DecisionRules provides a **score-function (REINFORCE)** correction that mixes +the dual gradient with a model-free policy gradient estimated from stage-wise +rollouts under perturbed targets. + +### How the score-function estimator works + +1. **Perturb**: add Gaussian noise to the policy targets: + ``\tilde{x}_t = \hat{x}_t(\theta) + \delta_t``, where + ``\delta_t \sim \mathcal{N}(0, \sigma^2 I)``. + +2. **Rollout**: solve the stage-wise subproblems with the perturbed targets to + obtain realized costs ``R_m`` for ``m = 1, \ldots, M`` rollouts. These + rollouts solve the models exactly as built (MIPs stay MIPs), so the costs + reflect true integer-feasible decisions. + +3. **Advantage**: center the costs ``A_m = R_m - \bar{R}`` (mean baseline + reduces variance without changing the expected gradient). + +4. **Surrogate loss**: the differentiable scalar whose gradient recovers the + REINFORCE estimate: + +```math +L_{\text{sf}}(\theta) +\;=\; +\frac{1}{M} \sum_{m=1}^{M} + A_m + \sum_{t=1}^{T} + \left\langle + \frac{\delta_{m,t}}{\sigma^2},\; + \hat{x}_t(\theta) + \right\rangle. +``` + +This is the standard score-function estimator for Gaussian perturbations. +The key identity is +``\nabla_{\hat{x}_t} \log p(\tilde{x}_t \mid \hat{x}_t) = \delta_t / \sigma^2`` +for a Gaussian centered at ``\hat{x}_t(\theta)``. + +### Mixed gradient + +The final training gradient combines both signals: + +```math +\nabla L +\;=\; +\alpha\, \nabla L_{\text{dual}} ++ (1 - \alpha)\, \nabla L_{\text{sf}}, +``` + +where ``\alpha \in [0, 1]`` is the `dual_weight`. + +There are two separate solve paths in the mixed-gradient training loop: + +- **Dual path**: controlled by `integer_strategy`, which determines how local + dual information is read from the deterministic equivalent + (e.g., [`FixedDiscreteIntegerStrategy`](@ref) solves the MIP, fixes integers, + re-solves the LP, and reads LP duals).
+- **Score-function path**: controlled by [`ScoreFunctionConfig`](@ref), which + owns separate rollout subproblems. These are solved exactly as built, and + their realized costs define the Monte Carlo score-function term. + +### Scheduled ramp-in + +A [`ScoreFunctionSchedule`](@ref) can ramp ``\alpha`` from 1 (pure dual) to +its final value over a warmup period. Let ``k`` be the current iteration and +``\rho_k = \operatorname{clip}((k - k_0) / r,\, 0,\, 1)``, so that ``\rho_k`` is 0 up to iteration ``k_0`` and reaches 1 at iteration ``k_0 + r``. The effective +score-function weight is ``\rho_k (1 - \alpha)``. + +This lets the DE dual gradient establish a good initial policy before +introducing the higher-variance REINFORCE signal. + +See the [Stochastic Lot-Sizing with Fixed Ordering Costs](@ref) example for a +complete worked example with integer variables and mixed gradients. + ## Penalty annealing The target penalty ``\lambda`` is critical: too small and the optimizer ignores diff --git a/docs/src/api.md b/docs/src/api.md index 02d432c..c231205 100644 --- a/docs/src/api.md +++ b/docs/src/api.md @@ -16,4 +16,5 @@ Private = false ```@autodocs Modules = [DecisionRules] Public = false +Filter = t -> t != DecisionRules ``` diff --git a/docs/src/assets/hydro_generation_comparison.png b/docs/src/assets/hydro_generation_comparison.png new file mode 100644 index 0000000..b601e59 Binary files /dev/null and b/docs/src/assets/hydro_generation_comparison.png differ diff --git a/docs/src/assets/hydro_volume_comparison.png b/docs/src/assets/hydro_volume_comparison.png new file mode 100644 index 0000000..4e0f863 Binary files /dev/null and b/docs/src/assets/hydro_volume_comparison.png differ diff --git a/docs/src/assets/inventory_integer_results.png b/docs/src/assets/inventory_integer_results.png index e905f7a..0f26dd2 100644 Binary files a/docs/src/assets/inventory_integer_results.png and b/docs/src/assets/inventory_integer_results.png differ diff --git a/docs/src/assets/inventory_relaxed_results.png b/docs/src/assets/inventory_relaxed_results.png index
73e5dab..8a16f85 100644 Binary files a/docs/src/assets/inventory_relaxed_results.png and b/docs/src/assets/inventory_relaxed_results.png differ diff --git a/docs/src/examples/hydro.jl b/docs/src/examples/hydro.jl index c5e476f..e7bf3fd 100644 --- a/docs/src/examples/hydro.jl +++ b/docs/src/examples/hydro.jl @@ -1,16 +1,86 @@ # # Hydropower Scheduling # -# This example trains TS-DDR policies for the Bolivia long-term hydrothermal -# dispatch (LTHD) problem using all three formulations: deterministic equivalent, -# stage-wise subproblem decomposition, and multiple shooting. +# This example trains target-setting decision rules for the Bolivia +# long-term hydrothermal dispatch (LTHD) problem — both **TS-DDR** (deep, +# LSTM-based) and **TS-LDR** (linear) — and compares them against an SDDP +# baseline with inconsistent formulations. # -# The Bolivia system has 10 hydro plants, 96 monthly stages, and AC power flow -# constraints. Inflow uncertainty is sampled from historical scenarios. +# The Bolivia system has **10 hydro plants**, **96 monthly stages**, and +# **AC power flow** constraints. Inflow uncertainty is sampled from 47 +# historical scenarios. # +# ## Overview of the TS-DDR approach +# +# Classical stochastic programming (e.g., SDDP) constructs piecewise-linear +# value-function approximations. TS-DDR takes a different route: a neural +# network policy ``\pi_\theta`` maps observations to **target states**, and a +# projection subproblem at each stage enforces physical feasibility while +# tracking those targets as closely as possible. +# +# The key insight is that the gradient of the projection subproblem with +# respect to the target parameters is available through Lagrange duality +# (or equivalently, implicit differentiation of the KKT conditions). +# This avoids differentiating through the full optimization solver. +# +# ## Problem formulation +# +# At each stage ``t``, the operator observes inflows ``w_t`` and the current +# reservoir state ``x_{t-1}``. 
The policy predicts target volumes: +# +# ```math +# \hat{x}_t = \pi_\theta(w_{1:t},\, x_{t-1}). +# ``` +# +# A stage subproblem projects onto the feasible set: +# +# ```math +# \begin{aligned} +# q_t(x_{t-1},\, w_t;\; \hat{x}_t) +# \;=\; +# \min_{x_t, u_t, \delta_t} +# \quad & +# c_t(x_t, u_t) + C_\delta\, \|\delta_t\| \\ +# \text{s.t.}\quad +# & x_t = x_{t-1} + w_t - \text{turbined}_t - \text{spilled}_t, +# && \text{(reservoir balance)} \\ +# & x_t + \delta_t = \hat{x}_t, +# && : \lambda_t \quad \text{(target constraint)} \\ +# & \text{AC-OPF}(u_t), +# && \text{(power flow)} \\ +# & x_t \in [0, \bar{x}],\; u_t \ge 0. +# \end{aligned} +# ``` +# +# The slack variable ``\delta_t`` absorbs infeasible targets; ``\lambda_t`` is +# the dual multiplier that provides the gradient signal. +# +# ## Gradient computation: the envelope theorem +# +# By the envelope theorem, the sensitivity of the optimal value with respect +# to the target parameter is simply the dual: +# +# ```math +# \frac{\partial q_t}{\partial \hat{x}_t} +# \;=\; \lambda_t. +# ``` +# +# Combined with backpropagation through the policy network, the full gradient +# of the expected cost is: +# +# ```math +# \nabla_\theta \mathbb{E}[Q] +# \;\approx\; +# \frac{1}{S} \sum_{s=1}^{S} \sum_{t=1}^{T} +# \lambda_t^s \odot \nabla_\theta \hat{x}_t^s(\theta), +# ``` +# +# where ``S`` is the number of sampled trajectories per batch and ``\odot`` +# denotes elementwise multiplication. + # ## Problem setup # # The JuMP subproblems are built from a MOF file (exported from PowerModels.jl) -# plus hydro data (reservoir limits, inflow scenarios).
Each subproblem contains: # - AC optimal power flow constraints # - Reservoir balance: `vol_out = vol_in + inflow - turbined - spilled` # - Target-slack deficit variables penalizing deviation from the policy's targets @@ -25,7 +95,7 @@ using Flux using Statistics, Random # Load the problem builder (reads MOF + hydro JSON + inflow CSV). - +# # ```julia # include("load_hydropowermodels.jl") # ``` @@ -51,8 +121,28 @@ using Statistics, Random # ## Policy architecture # -# The policy is a `StateConditionedPolicy` with an LSTM encoder. At each stage it -# receives `[inflow_t; reservoir_state_{t-1}]` and outputs target reservoir volumes: +# The policy is a [`StateConditionedPolicy`](@ref) with two components: +# +# 1. **Encoder** — a stack of LSTM cells that processes only the uncertainty +# (inflow) sequence, capturing temporal dependencies across stages. +# 2. **Combiner** — a Dense layer that merges the encoded uncertainty with the +# previous state to produce the next target. +# +# At each stage the policy receives ``[w_t;\; x_{t-1}]`` and outputs +# target reservoir volumes ``\hat{x}_t``: +# +# ``` +# ┌─────────┐ ┌────────────────┐ ┌──────────────┐ +# │ w_t │─────▶│ LSTM encoder │─────▶│ │ +# └─────────┘ └────────────────┘ │ Dense │──▶ x̂_t +# ┌─────────┐ │ combiner │ +# │ x_{t-1} │─────────────────────────────▶│ │ +# └─────────┘ └──────────────┘ +# ``` +# +# The LSTM carries hidden state across stages, giving the policy memory of +# past inflows. The activation is `sigmoid` (bounding outputs to ``[0,1]``, +# which is then scaled by the feasibility mapping). # ```julia # models = state_conditioned_policy( @@ -61,11 +151,86 @@ using Statistics, Random # ) # ``` -# ## Training: Deterministic Equivalent +# ## TS-LDR: Linear Decision Rules +# +# As a baseline, we also train a **linear** policy (TS-LDR). 
This uses +# `dense_multilayer_nn` with identity activation — a composition of linear +# layers equivalent to a single affine map: # -# The deterministic equivalent couples all 96 stages into a single NLP. The policy -# generates targets in one forward pass; the coupled solve determines realized states. -# This gives the strongest gradient signal but requires solving the largest subproblem. +# ```math +# \hat{x}_t = W [w_{1:t};\; x_{t-1}] + b. +# ``` +# +# TS-LDR uses the same target-setting framework and training pipeline as +# TS-DDR. The only difference is the policy class: linear maps have fewer +# parameters and cannot capture nonlinear inflow patterns, but they are a +# natural baseline from the classical LDR literature. + +# ```julia +# num_inputs = DecisionRules.policy_input_dim(num_uncertainties, num_hydro) +# models = dense_multilayer_nn(num_inputs, num_hydro, [64, 64]; activation=identity) +# ``` + +# ## Training pipeline 1: Deterministic Equivalent +# +# The deterministic equivalent (DE) couples all 96 stages into a **single NLP** +# for each sampled trajectory. This is the most direct formulation: the policy +# generates the full target trajectory ``\hat{x}_{1:T}`` in one forward pass, +# and a single coupled solve determines all realized states simultaneously. +# +# ### How it works +# +# ``` +# ┌──────────────────────────────────────────────────────────┐ +# │ For each sampled trajectory w_{1:T}: │ +# │ │ +# │ 1. Forward pass: x̂_{1:T} = π_θ(w_{1:T}, x_0) │ +# │ │ +# │ 2. Solve coupled NLP: │ +# │ min Σ_t c_t(x_t, u_t) + C_δ Σ_t ‖δ_t‖ │ +# │ s.t. dynamics + AC-OPF for ALL stages simultaneously │ +# │ x_t + δ_t = x̂_t(θ) ∀t (target constraint) │ +# │ │ +# │ 3. 
Read duals λ_t of target constraints │ +# │ Gradient: Σ_t λ_t ⊙ ∇_θ x̂_t(θ) │ +# └──────────────────────────────────────────────────────────┘ +# ``` +# +# ### Mathematical formulation +# +# ```math +# \begin{aligned} +# Q(w;\, \theta) +# \;=\; +# \min_{\{x_t, u_t, \delta_t\}_{t=1}^{T}} +# \quad & +# \sum_{t=1}^{T} c_t(x_t, u_t) +# + C_\delta \sum_{t=1}^{T} \|\delta_t\| \\ +# \text{s.t.}\quad +# & x_t = T_t(w_t,\, u_t,\, x_{t-1}), +# && t=1,\ldots,T \\ +# & x_t + \delta_t = \hat{x}_t(\theta), +# && : \lambda_t,\quad t=1,\ldots,T \\ +# & h_t(x_t, u_t) \ge 0, +# && t=1,\ldots,T +# \end{aligned} +# ``` +# +# The gradient is exact by the envelope theorem: +# +# ```math +# \nabla_\theta Q +# \;=\; +# \sum_{t=1}^{T} +# \lambda_t \odot \nabla_\theta \hat{x}_t(\theta). +# ``` +# +# **Advantages**: strongest gradient signal — full cross-stage coupling +# captures how a target at stage 3 affects costs at stage 50. +# +# **Disadvantage**: the NLP has ``96 \times (\text{AC-OPF variables})`` +# decision variables; the policy generates targets without seeing realized +# states (open-loop target generation). # ```julia # det_equivalent, uncertainty_samples_det = DecisionRules.deterministic_equivalent!( @@ -76,31 +241,129 @@ using Statistics, Random # train_multistage( # models, initial_state, det_equivalent, # state_params_in, state_params_out, uncertainty_samples; -# num_batches=2000, optimizer=Flux.Adam(), -# penalty_schedule=:default_annealed, +# num_batches=4000, optimizer=Flux.Adam(), +# penalty_schedule=[(1,100,0.1), (101,210,1.0), (211,300,10.0), (301,4000,30.0)], # ) # ``` -# ## Training: Stage-wise Subproblems +# ## Training pipeline 2: Stage-wise Decomposition (Single Shooting) +# +# Stage-wise decomposition solves one subproblem per stage sequentially. +# Unlike the DE, the policy operates in **closed loop**: after each stage +# solve, the realized state ``x_t`` (not the predicted target) is fed back +# as input to the next stage. 
+# +# ### How it works +# +# ``` +# ┌─────────────────────────────────────────────────────────────┐ +# │ For each sampled trajectory w_{1:T}: │ +# │ │ +# │ x_0 = initial state │ +# │ for t = 1, ..., T: │ +# │ x̂_t = π_θ(w_t, x_{t-1}) ← predict target │ +# │ solve stage-t subproblem ← project to feasible│ +# │ x_t = realized state from solver ← closed-loop │ +# │ accumulate c_t + C_δ ‖δ_t‖ │ +# │ │ +# │ Gradient: chain rule through all stage solves │ +# └─────────────────────────────────────────────────────────────┘ +# ``` +# +# ### Gradient chain +# +# The gradient must account for how the realized state at stage ``t`` +# depends on the targets at all earlier stages. By the chain rule: # -# Stage-wise decomposition solves one subproblem per stage sequentially. The policy -# receives the realized state from the previous stage (closed-loop). Gradients -# combine dual information with DiffOpt sensitivities along the rollout. +# ```math +# \frac{\partial Q}{\partial \hat{x}_t} +# \;=\; +# \lambda_t +# + \sum_{k>t} +# \frac{\partial q_k}{\partial x_{k-1}} +# \cdot \prod_{j=t+1}^{k-1} +# \frac{\partial x_j}{\partial x_{j-1}} +# \cdot \frac{\partial x_t}{\partial \hat{x}_t}. +# ``` +# +# In practice, automatic differentiation (Zygote + ChainRules `rrule`s +# defined on each stage solve) handles this chain automatically. +# The `rrule` for each stage solve reads the dual ``\lambda_t`` for the +# target constraint and uses DiffOpt's implicit differentiation for the +# state-transition sensitivities. +# +# **Advantages**: closed-loop — the policy sees realized states, matching +# deployment semantics. Each solve is small (single-stage AC-OPF). +# +# **Disadvantage**: gradients weaken over long horizons because the +# chain rule multiplies many Jacobians; sequential solve prevents +# parallelism. 
# ```julia # train_multistage( # models, initial_state, subproblems, # state_params_in, state_params_out, uncertainty_samples; -# num_batches=2000, optimizer=Flux.Adam(), +# num_batches=3000, optimizer=Flux.Adam(), # penalty_schedule=:default_annealed, # ) # ``` -# ## Training: Multiple Shooting +# ## Training pipeline 3: Multiple Shooting +# +# Multiple shooting partitions the ``T``-stage horizon into ``K`` windows of +# ``W`` stages each. Within each window, a local deterministic equivalent +# couples the stages (strong gradient signal). Between windows, the realized +# end-state is passed to the next window (closed-loop continuity). +# +# ### How it works +# +# ``` +# ┌────────────────────────────────────────────────────────────────┐ +# │ Partition T=96 stages into K=⌈96/12⌉=8 windows of W=12 │ +# │ │ +# │ x_0 = initial state │ +# │ for k = 1, ..., K: │ +# │ stages = [(k-1)W+1, ..., kW] │ +# │ x̂_{stages} = π_θ(w_{stages}, x_{start_k}) │ +# │ solve window-k DE (12-stage coupled NLP) │ +# │ x_{end_k} = realized end-state from window solve │ +# │ x_{start_{k+1}} = x_{end_k} │ +# │ │ +# │ Gradient: │ +# │ Within window: duals from the coupled solve (like full DE) │ +# │ Across windows: DiffOpt chain rule through end-states │ +# └────────────────────────────────────────────────────────────────┘ +# ``` +# +# ### Gradient structure +# +# Let ``Q_k`` be the cost of window ``k``. The total cost is +# ``Q = \sum_k Q_k``. Within a window, the gradient is identical to the +# DE case (duals of the target constraints in the coupled model). Across +# windows, the chain rule threads through the realized end-state: # -# Multiple shooting partitions the 96-stage horizon into windows (e.g., 12 stages -# each). Each window solves a local deterministic equivalent, then passes the -# realized end-state to the next window. 
+# ```math +# \frac{dQ}{d\theta} +# \;=\; +# \sum_{k=1}^{K} +# \left( +# \frac{\partial Q_k}{\partial \hat{x}_k} +# \cdot \frac{\partial \hat{x}_k}{\partial \theta} +# \;+\; +# \frac{\partial Q_k}{\partial x_{\text{start}_k}} +# \cdot \frac{d x_{\text{start}_k}}{d\theta} +# \right), +# ``` +# +# where ``\frac{d x_{\text{start}_k}}{d\theta}`` involves the chain +# through all prior windows via ``x_{\text{end}_{k-1}}``. +# +# **Advantages**: balances gradient quality (12-stage coupling) with +# tractability (8 small DEs instead of one large one); inter-window +# chain provides some closed-loop signal. +# +# **Disadvantage**: window boundaries introduce gradient discontinuities; +# the full-horizon coupling is weaker than the single DE. # ```julia # windows = DecisionRules.setup_shooting_windows( @@ -112,20 +375,40 @@ using Statistics, Random # # train_multiple_shooting( # models, initial_state, windows, () -> uncertainty_samples; -# num_batches=2000, optimizer=Flux.Adam(), +# num_batches=3000, optimizer=Flux.Adam(), # penalty_schedule=:default_annealed, # ) # ``` +# ## Penalty annealing +# +# The target penalty ``C_\delta`` controls the trade-off between following +# the policy's targets and minimizing operational cost. DecisionRules +# supports a **penalty annealing schedule** that ramps the penalty multiplier +# during training: +# +# | Phase | Multiplier | Purpose | +# |:------|:----------:|:--------| +# | Warmup | ``0.1 \times C_\delta`` | Let the policy explore freely | +# | Nominal | ``1.0 \times C_\delta`` | Standard training | +# | Tighten | ``10.0 \times C_\delta`` | Sharpen target tracking | +# | Lock | ``30.0 \times C_\delta`` | Final precision | +# +# This is activated with `penalty_schedule=:default_annealed` or by passing +# an explicit list of `(start_iter, end_iter, multiplier)` tuples. + # ## Evaluation # # After training, we evaluate the policy using stage-wise rollout on held-out -# scenarios. 
Two modes: -# - **Target feedback** (`policy_state=:target`): matches DE training semantics -# - **Realized feedback** (`policy_state=:realized`): deployment/closed-loop semantics +# scenarios. Two modes: +# - **Target feedback** (`policy_state=:target`): the policy receives its own +# predicted target as input, matching DE training semantics. +# - **Realized feedback** (`policy_state=:realized`): the policy receives the +# realized state from the solver, matching deployment semantics. # -# The target-violation share measures how much cost comes from the slack penalty -# rather than actual operations — it should be small (≤ 5%) for a well-trained policy. +# The **target-violation share** measures how much cost comes from the slack +# penalty rather than actual operations — it should be small (``\le 5\%``) for +# a well-trained policy. # ```julia # rollout_eval = RolloutEvaluation( @@ -137,20 +420,62 @@ using Statistics, Random # println("Violation share: ", rollout_eval.last_violation_share) # ``` +# ## SDDP baseline +# +# For comparison, we also train an SDDP policy using +# [SDDP.jl](https://github.com/odow/SDDP.jl) with **inconsistent +# formulations**: a convex SOC-WR relaxation for the backward pass +# (cut generation) and the nonconvex ACP formulation for the forward +# pass (simulation). This is a pragmatic approach when the true problem +# (AC-OPF) is nonconvex — SDDP requires convexity for valid cuts, so a +# convex relaxation approximates the value function while the forward pass +# evaluates under the true physics. +# +# The SDDP policy is trained for up to 2000 iterations and the learned +# cuts are saved to a JSON file, which can be loaded to simulate the +# policy under the ACP formulation. + # ## Results # -# The plots below compare all three training formulations on the Bolivia case. -# Training curves, out-of-sample cost distributions, and reservoir trajectories -# are generated from full training runs (20 epochs × 100 batches each). 
+# The plots below compare the TS-DDR and TS-LDR training formulations and +# the SDDP baseline on the Bolivia case. Training curves, out-of-sample +# cost distributions, reservoir volume trajectories, and thermal generation +# profiles are shown. +# +# ### Training convergence (TS-DDR methods) # # ![Training convergence](../assets/hydro_training_convergence.png) # +# ### Out-of-sample cost (TS-DDR methods) +# # ![Out-of-sample cost comparison](../assets/hydro_cost_comparison.png) # -# ![Reservoir trajectories](../assets/hydro_trajectories.png) +# ### Target-violation share (TS-DDR methods) +# +# ![Violation share](../assets/hydro_violation_share.png) +# +# ### Reservoir volume comparison (all methods) +# +# ![Volume comparison](../assets/hydro_volume_comparison.png) +# +# ### Thermal generation comparison (all methods) +# +# ![Generation comparison](../assets/hydro_generation_comparison.png) +# +# ### Summary +# +# | Method | Policy | Mean Cost | Std | N | +# |:-------|:------:|----------:|----:|--:| +# | TS-DDR (DE) | LSTM | 325 540 | 6 266 | 100 | +# | TS-DDR (DE, anneal) | LSTM | 324 445 | 6 134 | 100 | +# | TS-DDR (shooting w=12) | LSTM | 323 289 | 5 593 | 100 | +# | TS-DDR (shooting w=12, anneal) | LSTM | 322 812 | 6 081 | 100 | +# | TS-DDR (stage-wise, anneal) | LSTM | 321 543 | 6 214 | 100 | +# | SDDP (SOC-WR / ACP) | cuts | 303 684 | — | 100 | +# +# All three TS-DDR methods with penalty annealing converge to similar +# costs (321K–325K). SDDP trains on 126 stages (96 + 30 margin). # -# | Method | Mean Cost | Std | Violation % | Train Time | -# |:---|---:|---:|---:|---:| -# | Deterministic Equivalent | 321189.0 | — | 48.66% | 158 steps | -# | Stage-wise Subproblems | 364110.0 | — | 0.59% | 159 steps | -# | Multiple Shooting | 319462.0 | — | 36.18% | 236 steps | +# !!! note "Preliminary results" +# These numbers reflect the current default training scripts. +# They will be updated as the package evolves. 
diff --git a/docs/src/examples/hydro.md b/docs/src/examples/hydro.md index 5f43985..6c23019 100644 --- a/docs/src/examples/hydro.md +++ b/docs/src/examples/hydro.md @@ -4,17 +4,87 @@ EditURL = "hydro.jl" # Hydropower Scheduling -This example trains TS-DDR policies for the Bolivia long-term hydrothermal -dispatch (LTHD) problem using all three formulations: deterministic equivalent, -stage-wise subproblem decomposition, and multiple shooting. +This example trains target-setting decision rules for the Bolivia +long-term hydrothermal dispatch (LTHD) problem — both **TS-DDR** (deep, +LSTM-based) and **TS-LDR** (linear) — and compares them against an SDDP +baseline with inconsistent formulations. -The Bolivia system has 10 hydro plants, 96 monthly stages, and AC power flow -constraints. Inflow uncertainty is sampled from historical scenarios. +The Bolivia system has **10 hydro plants**, **96 monthly stages**, and +**AC power flow** constraints. Inflow uncertainty is sampled from 47 +historical scenarios. + +## Overview of the TS-DDR approach + +Classical stochastic programming (e.g., SDDP) constructs piecewise-linear +value-function approximations. TS-DDR takes a different route: a neural +network policy ``\pi_\theta`` maps observations to **target states**, and a +projection subproblem at each stage enforces physical feasibility while +tracking those targets as closely as possible. + +The key insight is that the gradient of the projection subproblem with +respect to the target parameters is available through Lagrange duality +(or equivalently, implicit differentiation of the KKT conditions). +This avoids differentiating through the full optimization solver. + +## Problem formulation + +At each stage ``t``, the operator observes inflows ``w_t`` and the current +reservoir state ``x_{t-1}``. The policy predicts target volumes: + +```math +\hat{x}_t = \pi_\theta(w_{1:t},\, x_{t-1}). 
+``` + +A stage subproblem projects onto the feasible set: + +```math +\begin{aligned} +q_t(x_{t-1},\, w_t;\; \hat{x}_t) + \;=\; + \min_{x_t, u_t, \delta_t} + \quad & + c_t(x_t, u_t) + C_\delta\, \|\delta_t\| \\ +\text{s.t.}\quad + & x_t = x_{t-1} + w_t - \text{turbined}_t - \text{spilled}_t, + && \text{(reservoir balance)} \\ + & x_t + \delta_t = \hat{x}_t, + && : \lambda_t \quad \text{(target constraint)} \\ + & \text{AC-OPF}(u_t), + && \text{(power flow)} \\ + & x_t \in [0, \bar{x}],\; u_t \ge 0. +\end{aligned} +``` + +The slack variable ``\delta_t`` absorbs infeasible targets; ``\lambda_t`` is +the dual multiplier that provides the gradient signal. + +## Gradient computation: the envelope theorem + +By the envelope theorem, the sensitivity of the optimal value with respect +to the target parameter is simply the dual: + +```math +\frac{\partial q_t}{\partial \hat{x}_t} +\;=\; \lambda_t. +``` + +Combined with backpropagation through the policy network, the full gradient +of the expected cost is: + +```math +\nabla_\theta \mathbb{E}[Q] +\;\approx\; +\frac{1}{S} \sum_{s=1}^{S} \sum_{t=1}^{T} + \lambda_t^s \odot \nabla_\theta \hat{x}_t^s(\theta), +``` + +where ``S`` is the number of sampled trajectories per batch and ``\odot`` +denotes elementwise multiplication. ## Problem setup The JuMP subproblems are built from a MOF file (exported from PowerModels.jl) -plus hydro data (reservoir limits, inflow scenarios). Each subproblem contains: +plus hydro data (reservoir limits, inflow scenarios). Each subproblem contains: - AC optimal power flow constraints - Reservoir balance: `vol_out = vol_in + inflow - turbined - spilled` - Target-slack deficit variables penalizing deviation from the policy's targets @@ -57,8 +127,28 @@ subproblems, state_params_in, state_params_out, uncertainty_samples, initial_sta ## Policy architecture -The policy is a `StateConditionedPolicy` with an LSTM encoder.
At each stage it -receives `[inflow_t; reservoir_state_{t-1}]` and outputs target reservoir volumes: +The policy is a [`StateConditionedPolicy`](@ref) with two components: + +1. **Encoder** — a stack of LSTM cells that processes only the uncertainty + (inflow) sequence, capturing temporal dependencies across stages. +2. **Combiner** — a Dense layer that merges the encoded uncertainty with the + previous state to produce the next target. + +At each stage the policy receives ``[w_t;\; x_{t-1}]`` and outputs +target reservoir volumes ``\hat{x}_t``: + +``` + ┌─────────┐ ┌────────────────┐ ┌──────────────┐ + │ w_t │─────▶│ LSTM encoder │─────▶│ │ + └─────────┘ └────────────────┘ │ Dense │──▶ x̂_t + ┌─────────┐ │ combiner │ + │ x_{t-1} │─────────────────────────────▶│ │ + └─────────┘ └──────────────┘ +``` + +The LSTM carries hidden state across stages, giving the policy memory of +past inflows. The activation is `sigmoid` (bounding outputs to ``[0,1]``, +which is then scaled by the feasibility mapping). ```julia models = state_conditioned_policy( @@ -67,11 +157,86 @@ models = state_conditioned_policy( ) ``` -## Training: Deterministic Equivalent +## TS-LDR: Linear Decision Rules + +As a baseline, we also train a **linear** policy (TS-LDR). This uses +`dense_multilayer_nn` with identity activation — a composition of linear +layers equivalent to a single affine map: + +```math +\hat{x}_t = W [w_{1:t};\; x_{t-1}] + b. +``` + +TS-LDR uses the same target-setting framework and training pipeline as +TS-DDR. The only difference is the policy class: linear maps have fewer +parameters and cannot capture nonlinear inflow patterns, but they are a +natural baseline from the classical LDR literature. 
+ +```julia +num_inputs = DecisionRules.policy_input_dim(num_uncertainties, num_hydro) +models = dense_multilayer_nn(num_inputs, num_hydro, [64, 64]; activation=identity) +``` + +## Training pipeline 1: Deterministic Equivalent + +The deterministic equivalent (DE) couples all 96 stages into a **single NLP** +for each sampled trajectory. This is the most direct formulation: the policy +generates the full target trajectory ``\hat{x}_{1:T}`` in one forward pass, +and a single coupled solve determines all realized states simultaneously. + +### How it works + +``` + ┌──────────────────────────────────────────────────────────┐ + │ For each sampled trajectory w_{1:T}: │ + │ │ + │ 1. Forward pass: x̂_{1:T} = π_θ(w_{1:T}, x_0) │ + │ │ + │ 2. Solve coupled NLP: │ + │ min Σ_t c_t(x_t, u_t) + C_δ Σ_t ‖δ_t‖ │ + │ s.t. dynamics + AC-OPF for ALL stages simultaneously │ + │ x_t + δ_t = x̂_t(θ) ∀t (target constraint) │ + │ │ + │ 3. Read duals λ_t of target constraints │ + │ Gradient: Σ_t λ_t ⊙ ∇_θ x̂_t(θ) │ + └──────────────────────────────────────────────────────────┘ +``` + +### Mathematical formulation + +```math +\begin{aligned} +Q(w;\, \theta) + \;=\; + \min_{\{x_t, u_t, \delta_t\}_{t=1}^{T}} + \quad & + \sum_{t=1}^{T} c_t(x_t, u_t) + + C_\delta \sum_{t=1}^{T} \|\delta_t\| \\ +\text{s.t.}\quad + & x_t = T_t(w_t,\, u_t,\, x_{t-1}), + && t=1,\ldots,T \\ + & x_t + \delta_t = \hat{x}_t(\theta), + && : \lambda_t,\quad t=1,\ldots,T \\ + & h_t(x_t, u_t) \ge 0, + && t=1,\ldots,T +\end{aligned} +``` + +The gradient is exact by the envelope theorem: + +```math +\nabla_\theta Q +\;=\; +\sum_{t=1}^{T} +\lambda_t \odot \nabla_\theta \hat{x}_t(\theta). +``` + +**Advantages**: strongest gradient signal — full cross-stage coupling +captures how a target at stage 3 affects costs at stage 50. -The deterministic equivalent couples all 96 stages into a single NLP. The policy -generates targets in one forward pass; the coupled solve determines realized states. 
-This gives the strongest gradient signal but requires solving the largest subproblem. +**Disadvantage**: the NLP has ``96 \times (\text{AC-OPF variables})`` +decision variables; the policy generates targets without seeing realized +states (open-loop target generation). ```julia det_equivalent, uncertainty_samples_det = DecisionRules.deterministic_equivalent!( @@ -82,31 +247,129 @@ det_equivalent, uncertainty_samples_det = DecisionRules.deterministic_equivalent train_multistage( models, initial_state, det_equivalent, state_params_in, state_params_out, uncertainty_samples; - num_batches=2000, optimizer=Flux.Adam(), - penalty_schedule=:default_annealed, + num_batches=4000, optimizer=Flux.Adam(), + penalty_schedule=[(1,100,0.1), (101,210,1.0), (211,300,10.0), (301,4000,30.0)], ) ``` -## Training: Stage-wise Subproblems +## Training pipeline 2: Stage-wise Decomposition (Single Shooting) -Stage-wise decomposition solves one subproblem per stage sequentially. The policy -receives the realized state from the previous stage (closed-loop). Gradients -combine dual information with DiffOpt sensitivities along the rollout. +Stage-wise decomposition solves one subproblem per stage sequentially. +Unlike the DE, the policy operates in **closed loop**: after each stage +solve, the realized state ``x_t`` (not the predicted target) is fed back +as input to the next stage. 
+ +### How it works + +``` + ┌─────────────────────────────────────────────────────────────┐ + │ For each sampled trajectory w_{1:T}: │ + │ │ + │ x_0 = initial state │ + │ for t = 1, ..., T: │ + │ x̂_t = π_θ(w_t, x_{t-1}) ← predict target │ + │ solve stage-t subproblem ← project to feasible│ + │ x_t = realized state from solver ← closed-loop │ + │ accumulate c_t + C_δ ‖δ_t‖ │ + │ │ + │ Gradient: chain rule through all stage solves │ + └─────────────────────────────────────────────────────────────┘ +``` + +### Gradient chain + +The gradient must account for how the realized state at stage ``t`` +depends on the targets at all earlier stages. By the chain rule: + +```math +\frac{\partial Q}{\partial \hat{x}_t} +\;=\; +\lambda_t ++ \sum_{k>t} + \frac{\partial q_k}{\partial x_{k-1}} + \cdot \prod_{j=t+1}^{k-1} + \frac{\partial x_j}{\partial x_{j-1}} + \cdot \frac{\partial x_t}{\partial \hat{x}_t}. +``` + +In practice, automatic differentiation (Zygote + ChainRules `rrule`s +defined on each stage solve) handles this chain automatically. +The `rrule` for each stage solve reads the dual ``\lambda_t`` for the +target constraint and uses DiffOpt's implicit differentiation for the +state-transition sensitivities. + +**Advantages**: closed-loop — the policy sees realized states, matching +deployment semantics. Each solve is small (single-stage AC-OPF). + +**Disadvantage**: gradients weaken over long horizons because the +chain rule multiplies many Jacobians; sequential solve prevents +parallelism. ```julia train_multistage( models, initial_state, subproblems, state_params_in, state_params_out, uncertainty_samples; - num_batches=2000, optimizer=Flux.Adam(), + num_batches=3000, optimizer=Flux.Adam(), penalty_schedule=:default_annealed, ) ``` -## Training: Multiple Shooting +## Training pipeline 3: Multiple Shooting + +Multiple shooting partitions the ``T``-stage horizon into ``K`` windows of +``W`` stages each. 
Within each window, a local deterministic equivalent +couples the stages (strong gradient signal). Between windows, the realized +end-state is passed to the next window (closed-loop continuity). + +### How it works + +``` + ┌────────────────────────────────────────────────────────────────┐ + │ Partition T=96 stages into K=⌈96/12⌉=8 windows of W=12 │ + │ │ + │ x_0 = initial state │ + │ for k = 1, ..., K: │ + │ stages = [(k-1)W+1, ..., kW] │ + │ x̂_{stages} = π_θ(w_{stages}, x_{start_k}) │ + │ solve window-k DE (12-stage coupled NLP) │ + │ x_{end_k} = realized end-state from window solve │ + │ x_{start_{k+1}} = x_{end_k} │ + │ │ + │ Gradient: │ + │ Within window: duals from the coupled solve (like full DE) │ + │ Across windows: DiffOpt chain rule through end-states │ + └────────────────────────────────────────────────────────────────┘ +``` + +### Gradient structure + +Let ``Q_k`` be the cost of window ``k``. The total cost is +``Q = \sum_k Q_k``. Within a window, the gradient is identical to the +DE case (duals of the target constraints in the coupled model). Across +windows, the chain rule threads through the realized end-state: + +```math +\frac{dQ}{d\theta} +\;=\; +\sum_{k=1}^{K} +\left( + \frac{\partial Q_k}{\partial \hat{x}_k} + \cdot \frac{\partial \hat{x}_k}{\partial \theta} + \;+\; + \frac{\partial Q_k}{\partial x_{\text{start}_k}} + \cdot \frac{d x_{\text{start}_k}}{d\theta} +\right), +``` -Multiple shooting partitions the 96-stage horizon into windows (e.g., 12 stages -each). Each window solves a local deterministic equivalent, then passes the -realized end-state to the next window. +where ``\frac{d x_{\text{start}_k}}{d\theta}`` involves the chain +through all prior windows via ``x_{\text{end}_{k-1}}``. + +**Advantages**: balances gradient quality (12-stage coupling) with +tractability (8 small DEs instead of one large one); inter-window +chain provides some closed-loop signal. 
+ +**Disadvantage**: window boundaries introduce gradient discontinuities; +the full-horizon coupling is weaker than the single DE. ```julia windows = DecisionRules.setup_shooting_windows( @@ -118,20 +381,40 @@ windows = DecisionRules.setup_shooting_windows( train_multiple_shooting( models, initial_state, windows, () -> uncertainty_samples; - num_batches=2000, optimizer=Flux.Adam(), + num_batches=3000, optimizer=Flux.Adam(), penalty_schedule=:default_annealed, ) ``` +## Penalty annealing + +The target penalty ``C_\delta`` controls the trade-off between following +the policy's targets and minimizing operational cost. DecisionRules +supports a **penalty annealing schedule** that ramps the penalty multiplier +during training: + +| Phase | Multiplier | Purpose | +|:------|:----------:|:--------| +| Warmup | ``0.1 \times C_\delta`` | Let the policy explore freely | +| Nominal | ``1.0 \times C_\delta`` | Standard training | +| Tighten | ``10.0 \times C_\delta`` | Sharpen target tracking | +| Lock | ``30.0 \times C_\delta`` | Final precision | + +This is activated with `penalty_schedule=:default_annealed` or by passing +an explicit list of `(start_iter, end_iter, multiplier)` tuples. + ## Evaluation After training, we evaluate the policy using stage-wise rollout on held-out -scenarios. Two modes: -- **Target feedback** (`policy_state=:target`): matches DE training semantics -- **Realized feedback** (`policy_state=:realized`): deployment/closed-loop semantics +scenarios. Two modes: +- **Target feedback** (`policy_state=:target`): the policy receives its own + predicted target as input, matching DE training semantics. +- **Realized feedback** (`policy_state=:realized`): the policy receives the + realized state from the solver, matching deployment semantics. -The target-violation share measures how much cost comes from the slack penalty -rather than actual operations — it should be small (≤ 5%) for a well-trained policy. 
+The **target-violation share** measures how much cost comes from the slack +penalty rather than actual operations — it should be small (``\le 5\%``) for +a well-trained policy. ```julia rollout_eval = RolloutEvaluation( @@ -143,20 +426,63 @@ println("Operational cost: ", rollout_eval.last_objective_no_deficit) println("Violation share: ", rollout_eval.last_violation_share) ``` +## SDDP baseline + +For comparison, we also train an SDDP policy using +[SDDP.jl](https://github.com/odow/SDDP.jl) with **inconsistent +formulations**: a convex SOC-WR relaxation for the backward pass +(cut generation) and the nonconvex ACP formulation for the forward +pass (simulation). This is a pragmatic approach when the true problem +(AC-OPF) is nonconvex — SDDP requires convexity for valid cuts, so a +convex relaxation approximates the value function while the forward pass +evaluates under the true physics. + +The SDDP policy is trained for up to 2000 iterations and the learned +cuts are saved to a JSON file, which can be loaded to simulate the +policy under the ACP formulation. + ## Results -The plots below compare all three training formulations on the Bolivia case. -Training curves, out-of-sample cost distributions, and reservoir trajectories -are generated from full training runs (20 epochs × 100 batches each). +The plots below compare the TS-DDR and TS-LDR training formulations and +the SDDP baseline on the Bolivia case. Training curves, out-of-sample +cost distributions, reservoir volume trajectories, and thermal generation +profiles are shown. 
+ +### Training convergence (TS-DDR methods) ![Training convergence](../assets/hydro_training_convergence.png) +### Out-of-sample cost (TS-DDR methods) + ![Out-of-sample cost comparison](../assets/hydro_cost_comparison.png) -![Reservoir trajectories](../assets/hydro_trajectories.png) +### Target-violation share (TS-DDR methods) + +![Violation share](../assets/hydro_violation_share.png) + +### Reservoir volume comparison (all methods) + +![Volume comparison](../assets/hydro_volume_comparison.png) + +### Thermal generation comparison (all methods) + +![Generation comparison](../assets/hydro_generation_comparison.png) + +### Summary + +| Method | Policy | Mean Cost | Std | N | +|:-------|:------:|----------:|----:|--:| +| TS-DDR (DE) | LSTM | 325 540 | 6 266 | 100 | +| TS-DDR (DE, anneal) | LSTM | 324 445 | 6 134 | 100 | +| TS-DDR (shooting w=12) | LSTM | 323 289 | 5 593 | 100 | +| TS-DDR (shooting w=12, anneal) | LSTM | 322 812 | 6 081 | 100 | +| TS-DDR (stage-wise, anneal) | LSTM | 321 543 | 6 214 | 100 | +| SDDP (SOC-WR / ACP) | cuts | 303 684 | — | 100 | + +All three TS-DDR methods with penalty annealing converge to similar +costs (321K–325K). SDDP trains on 126 stages (96 + 30 margin). + +!!! note "Preliminary results" + These numbers reflect the current default training scripts. + They will be updated as the package evolves. 
-| Method | Mean Cost | Std | Violation % | Train Time | -|:---|---:|---:|---:|---:| -| Deterministic Equivalent | 321189.0 | — | 48.66% | 158 steps | -| Stage-wise Subproblems | 364110.0 | — | 0.59% | 159 steps | -| Multiple Shooting | 319462.0 | — | 36.18% | 236 steps | diff --git a/docs/src/examples/inventory.jl b/docs/src/examples/inventory.jl index efb3606..a3d0058 100644 --- a/docs/src/examples/inventory.jl +++ b/docs/src/examples/inventory.jl @@ -1,187 +1,410 @@ -# # Inventory Control with Ordering Costs +# # Stochastic Lot-Sizing with Fixed Ordering Costs # -# This example studies a 12-period stochastic lot-sizing problem with two -# formulations — a **relaxed** (continuous) case and an **integer** (MIP) case -# with fixed ordering costs. The comparison shows: +# This example shows how to train target-state decision rules for a stochastic +# inventory problem with ex-ante ordering decisions. # -# 1. **Relaxed problem**: SDDP with a PAR(1) demand approximation is -# near-optimal and outperforms TS-DDR. -# 2. **Integer problem**: TS-DDR with `FixedDiscreteIntegerStrategy` outperforms -# both SDDP and TS-DDR with `ContinuousRelaxationIntegerStrategy`, because -# SDDP and continuous relaxation both underestimate the fixed ordering cost. +# The example has two purposes: +# +# 1. show the complete optimization model before discussing implementation +# details; and +# 2. show the code in the same order a reader would run it. using DecisionRules -using JuMP, HiGHS using Flux -using Statistics, Random +using HiGHS +using JuMP +using Random +using Statistics + +# The runnable experiment lives outside the documentation tree. The file defines +# the demand process, JuMP builders, and policy architecture used below. +include(joinpath(@__DIR__, "..", "..", "..", "examples", "inventory_control", + "build_inventory_problem.jl")) # ## Information Pattern # -# At the beginning of a period, the controller observes current inventory and -# recent realized demand. 
It does **not** observe current demand before ordering. -# The order is therefore ex-ante. After ordering, demand is realized and becomes -# part of the state for the next period. +# At the beginning of period `t`, the controller knows # -# The state carried between periods is: -# -# ```julia -# [net_inventory, last_demand, previous_demand] +# ```math +# x_t = (s_{t-1}, d_{t-1}, d_{t-2}), # ``` # -# This lets a time-invariant policy infer the latent demand regime from recent -# observations without receiving a period counter or synthetic seasonal features. - -# ## Inventory Model +# where `s` is net inventory and `d` is realized demand. The controller chooses +# the order quantity before seeing current demand `d_t`. This is an ex-ante +# decision. # -# ### Relaxed formulation +# The neural policy receives `[d_t, x_t...]` during training because +# DecisionRules policies output target states after the stage uncertainty is +# sampled. The implementation below uses that target only to guide the +# optimization model; the actual order still respects the model's information +# pattern. + +# ## Complete Stage Model # -# The order quantity is continuous with no setup cost: +# For each period `t = 1, ..., T`, the stage model is # # ```math -# 0 \le q_t \le Q_{\max}, \qquad -# \text{cost}_t = c\,q_t + h\max(s_t,0) + p\max(-s_t,0). +# \begin{aligned} +# \min_{q_t,z_t,s_t^{mid},s_t,h_t,b_t} +# \quad & K z_t + c q_t + h h_t + p b_t +# + \lambda |s_t^{mid} - \hat{s}_t| \\ +# \text{s.t.}\quad +# & 0 \le q_t \le Q_{\max} z_t, && \text{(1) order capacity} \\ +# & z_t \in \{0,1\}, && \text{(2) setup decision} \\ +# & s_t^{mid} = s_{t-1} + q_t, && \text{(3) order arrives} \\ +# & s_t = s_t^{mid} - d_t, && \text{(4) demand realizes} \\ +# & h_t - b_t = s_t, && \text{(5) inventory split} \\ +# & h_t \ge 0,\; b_t \ge 0. 
 && \text{(6) split bounds} +# \end{aligned} # ``` # -# ### Integer formulation +# The relaxed model removes (2) and replaces (1) by +# ``0 \le q_t \le Q_{\max}``; it also removes the fixed cost `K z_t` from the +# objective. +# +# The target ``\hat{s}_t`` is not an operational requirement. It is the state +# target produced by the neural decision rule, and the penalty term gives the +# policy a gradient signal. + +# ## Parameters + +inventory_parameters = ( + T = INVENTORY_T, + setup_cost = INVENTORY_K, + unit_order_cost = INVENTORY_C, + holding_cost = INVENTORY_H, + backlog_cost = INVENTORY_P, + order_capacity = INVENTORY_Q_MAX, + initial_inventory = INVENTORY_I0, + target_penalty = INVENTORY_PENALTY, +) + +# ## Demand Process # -# A binary variable ``z_t \in \{0,1\}`` controls whether an order is placed. -# If ``z_t = 0``, then ``q_t`` must be zero; if ``z_t = 1``, the model pays a -# fixed setup cost ``K``: +# Demand has a hidden seasonal phase, a persistent hidden regime, and an AR(1) +# shock: # # ```math -# 0 \le q_t \le Q_{\max}\,z_t, \qquad -# \text{cost}_t = K\,z_t + c\,q_t + h\max(s_t,0) + p\max(-s_t,0). +# \epsilon_t = 0.92 \epsilon_{t-1} + 0.35 \eta_t, # ``` # -# In both cases, ordered units arrive before demand: -# # ```math -# s^{mid}_t = s_{t-1} + q_t, \qquad s_t = s^{mid}_t - d_t. +# d_t = +# \operatorname{clip}\!\left( +# m_{\kappa_t} +# + w_{\kappa_t}(0.85 r_t + 0.42 \epsilon_t + 0.12 \eta'_t) +# \right), # ``` # -# | Parameter | Value | Meaning | -# |:--|--:|:--| -# | ``T`` | 12 | periods | -# | ``K`` | 500 | fixed order/setup cost (integer case) | -# | ``c`` | 2 | unit ordering cost | -# | ``h`` | 1 | holding cost | -# | ``p`` | 25 | backlog penalty | -# | ``Q_{\max}`` | 350 | order capacity | -# | ``s_0`` | 30 | initial inventory | +# where `r_t` is the hidden regime and +# ``\kappa_t = 1 + ((t + \phi - 1) \bmod T)`` is the hidden seasonal index. 
-# ## Demand Process +Random.seed!(11) +demand_paths = [sample_inventory_demand_path() for _ in 1:3] + +# ## Build the Continuous and Integer Models +# +# The builders return the JuMP model(s), input-state parameters, output-target +# parameters, an uncertainty sampler, and the initial state. + +relaxed_subproblems, +relaxed_state_in, +relaxed_state_out, +relaxed_sampler, +initial_state = build_inventory_subproblems(; + num_scenarios = 100, + integer = false, +) + +integer_subproblems, +integer_state_in, +integer_state_out, +integer_sampler, +_ = build_inventory_subproblems(; + num_scenarios = 100, + integer = true, +) + +# The deterministic equivalent is the full-horizon model used by direct +# transcription training. + +integer_det_equivalent, +integer_det_state_in, +integer_det_state_out, +integer_det_sampler, +_ = build_inventory_det_equivalent(; + num_scenarios = 50, + integer = true, +) + +# ## Integer Sensitivity Strategies +# +# Mixed-integer models do not have ordinary LP duals. DecisionRules therefore +# makes the chosen postprocessing strategy explicit. + +fixed_discrete = FixedDiscreteIntegerStrategy() +continuous_relaxation = ContinuousRelaxationIntegerStrategy() + +# `FixedDiscreteIntegerStrategy` solves the MIP, fixes the incumbent integer +# variables, re-solves the fixed LP, and reads local dual information. # -# Each trajectory has a path-level phase shift ``\phi \sim \mathrm{Unif}\{0,\ldots,T-1\}``, -# a persistent latent regime ``r_t \in \{-1,0,1\}`` (switch probability 0.04), -# and an autoregressive shock ``\epsilon_t``: +# `ContinuousRelaxationIntegerStrategy` relaxes integer variables first and reads +# duals from the relaxed LP. This is smoother and faster, but the gradient is for +# the relaxation, not for an integer-feasible decision. + +# ## Score-Function Correction +# +# Local LP duals do not see a discrete switch such as "open the setup variable". 
+# A score-function correction estimates the effect of target changes by solving +# perturbed integer rollouts: # # ```math -# \epsilon_t = 0.92\,\epsilon_{t-1} + 0.35\,\eta_t, \qquad -# d_t = \operatorname{clip}\!\bigl( -# m_{\kappa_t} + w_{\kappa_t}(0.85\,r_t + 0.42\,\epsilon_t + 0.12\,\eta'_t) -# \bigr), +# \nabla L +# = +# \alpha \nabla L_{\mathrm{dual}} +# + (1-\alpha) +# \frac{1}{M} +# \sum_{m=1}^{M} +# (R_m - b) +# \nabla_\theta +# \sum_{t=1}^{T} +# \left\langle +# \delta_{m,t}/\sigma^2, +# \hat{x}_{t+1}(\theta) +# \right\rangle . # ``` # -# where ``\kappa_t = 1 + ((t + \phi - 1) \bmod T)`` is the shifted seasonal -# index, and ``m_{\kappa_t}`` and ``w_{\kappa_t}`` are the midpoint and -# half-width of the seasonal demand band. None of the latent variables are -# observed; the policy sees only inventory and realized demand history. +# There are two different solves in the mixed-gradient training loop: # -# The plot below shows 24 sampled demand paths. Because each trajectory has a -# different phase and persistent regime, the same calendar period can correspond -# to high, medium, or low demand across scenarios. +# - `train_multistage(...; integer_strategy = fixed_discrete)` controls the +# deterministic-equivalent solve used for the dual-gradient term +# ``\nabla L_{\mathrm{dual}}``. This solve needs a postprocessing strategy +# because duals are not directly defined for a MIP. +# - `ScoreFunctionConfig(integer_subproblems, ...)` controls the Monte Carlo +# rollout term. These rollout models are solved exactly as they are built. +# Because `integer_subproblems` contain binary setup variables, the rollout +# costs `R_m` are true MIP rollout costs. # -# ![Demand process](../assets/inventory_demand_process.png) +# In short: `integer_strategy` is for reading local duals; score-function +# rollouts are for measuring realized costs. 
-# ## Integer Postprocessing Strategies -# -# DecisionRules.jl provides two strategies for extracting gradient information -# from subproblems with discrete variables: +score_function = ScoreFunctionConfig( + integer_subproblems, + integer_state_in, + integer_state_out; + dual_weight = 0.5, + perturbation_std = 1.0, + num_rollouts = 8, +) + +score_schedule = ScoreFunctionSchedule( + score_function; + sf_start = 200, + ramp_batches = 300, + perturbation_std_initial = 0.1, + num_rollouts_initial = 2, +) + +# ## Policy # -# **`FixedDiscreteIntegerStrategy`**: (1) solve the MIP for incumbent binary -# values ``z^*_t``; (2) fix ``z_t = z^*_t`` and relax integrality; (3) re-solve -# the resulting LP; (4) read LP duals as gradient signal. This is the same -# principle as SDDP.jl's `FixedDiscreteDuality`. +# A DecisionRules policy is any callable `π(x) -> target` where `x` is the +# concatenation `[uncertainty..., state...]` and `target` is the desired +# next state. The only requirement is that it is differentiable via +# `Zygote.gradient` and registered with `Functors.@functor` so that +# `Flux.loadmodel!` can checkpoint its parameters. # -# **`ContinuousRelaxationIntegerStrategy`**: relax all binary/integer -# constraints to continuous bounds (binary → [0,1]), solve the resulting LP, -# and read duals directly. This is faster (one LP instead of MIP + LP) and -# gives smoother gradients, but the solution may have fractional integer -# variables — the gradient does not correspond to any feasible integer -# assignment. +# ### Feedforward policy # -# For the relaxed formulation (no integer variables), `NoIntegerStrategy` is -# used and subproblems are solved as-is. +# The simplest architecture is a feedforward MLP. This policy is ex-ante: +# it ignores the current demand `d_t` (index 1) and uses only the state +# entries `[inventory, d_{t-1}, d_{t-2}]`. 
+ +using Functors: @functor + +struct ExAntePolicy{N} + net::N +end -# ## Relaxed (Continuous) Problem +@functor ExAntePolicy (net,) + +# The callable normalizes features to ≈[0,1] and maps through the network. +# The sigmoid output bounds the target to `[0, 500]`. +function (p::ExAntePolicy)(x) + inventory = Float32(x[2]) + d_prev = Float32(x[3]) + d_prev2 = Float32(x[4]) + features = Float32[inventory / 100, d_prev / 100, d_prev2 / 100] + target = 500f0 .* Flux.sigmoid.(p.net(features)) + return Float32[target[1], x[1], d_prev] +end + +Random.seed!(2024) +policy = ExAntePolicy(Chain(Dense(3, 32, relu), Dense(32, 24, relu), Dense(24, 1))) + +# ### Recurrent (LSTM) policy # -# When there are no integer variables, SDDP can model the demand process -# exactly via a PAR(1) approximation that carries ``d_{t-1}`` as a state -# variable. This makes SDDP near-optimal for the relaxed problem. +# When the uncertainty process has temporal structure (regimes, trends, +# seasonality), a recurrent encoder can learn patterns that a feedforward +# MLP cannot detect from a fixed-length window. # -# SDDP uses a PAR(1) fit: ``d_t \approx \mu_t + \alpha(d_{t-1} - \mu_{t-1}) + \omega_t`` -# with per-stage means ``\mu_t``, autocorrelation ``\alpha \approx 0.86``, and -# 9 equiprobable innovation points fitted from 10,000 simulated demand paths. +# The design below uses `Flux.LSTMCell` to process one *lagged* demand +# value per stage. The LSTM hidden state accumulates across stages within +# a scenario, then resets between scenarios via `Flux.reset!`. # -# ![Relaxed results](../assets/inventory_relaxed_results.png) +# The affine output `raw × 200 + 150` avoids sigmoid saturation and +# centers the target on typical inventory levels. 
+ +mutable struct RecurrentExAntePolicy{E,C,S} + encoder::E + combiner::C + state::S +end + +@functor RecurrentExAntePolicy (encoder, combiner) + +function (p::RecurrentExAntePolicy)(x) + d_prev = Float32(x[3]) + inventory = Float32(x[2]) + d_prev2 = Float32(x[4]) + T = eltype(first(p.state)) + encoded, new_state = p.encoder(T[d_prev / 100], p.state) + p.state = new_state + raw = p.combiner(vcat(encoded, T[inventory / 100, d_prev2 / 100])) + target = raw[1] * 200f0 + 150f0 + return Float32[target, x[1], d_prev] +end + +function Flux.reset!(p::RecurrentExAntePolicy) + p.state = Flux.initialstates(p.encoder) + return nothing +end + +Random.seed!(2024) +lstm_encoder = Flux.LSTMCell(1 => 16) +lstm_policy = RecurrentExAntePolicy( + lstm_encoder, + Dense(16 + 2, 1), + Flux.initialstates(lstm_encoder), +) + +# ## Training Calls # -# | Method | N | Mean cost | Std | 95% CI | vs TS-DDR | Fit (s) | Eval (s) | -# |:-------------------------|----:|----------:|------:|-------:|----------:|--------:|---------:| -# | TS-DDR (trained) | 300 | 2667.3 | 594.5 | 67.3 | +0.0% | 54.6 | 0.0018 | -# | SDDP (PAR) | 300 | 2434.2 | 774.8 | 87.7 | -8.7% | 0.0 | 20.6455 | -# | Base-stock (S\*=160) | 300 | 3035.6 | 506.8 | 57.3 | +13.8% | 0.0 | 0.0002 | -# | Random (untrained) | 300 | 3751.7 | 221.7 | 25.1 | +40.7% | 0.0 | 0.0018 | +# The continuous problem uses ordinary dual information. + +# ```julia +# train_multistage( +# policy, +# initial_state, +# relaxed_subproblems, +# relaxed_state_in, +# relaxed_state_out, +# relaxed_sampler; +# num_batches = 400, +# num_train_per_batch = 5, +# optimizer = Flux.Adam(0.0015), +# integer_strategy = NoIntegerStrategy(), +# penalty_schedule = [(1, 80, 0.4), (81, 400, 1.0)], +# ) +# ``` + +# The integer deterministic-equivalent run uses the fixed-discrete local dual +# path plus the scheduled score-function correction. 
+ +# ```julia +# train_multistage( +# policy, +# initial_state, +# integer_det_equivalent, +# integer_det_state_in, +# integer_det_state_out, +# integer_det_sampler; +# num_batches = 800, +# num_train_per_batch = 10, +# optimizer = Flux.Adam(0.0008), +# integer_strategy = fixed_discrete, +# penalty_schedule = [(1, 120, 0.4), (121, 800, 1.0)], +# score_function = score_schedule, +# ) +# ``` + +# ## Evaluation # -# SDDP clearly dominates: 8.7% lower cost than TS-DDR, 40.7% lower than -# Random. The SDDP and Random cost distributions are non-overlapping, -# confirming that informed methods have a clear edge on this demand process. +# A trained policy should be evaluated by stage-wise rollout, because that is +# the deployment semantics: solve one period, observe the realized next state, +# then solve the next period. -# ## Integer (MIP) Problem +uncertainty_sample = sample(integer_sampler) +rollout_cost = simulate_multistage( + integer_subproblems, + integer_state_in, + integer_state_out, + initial_state, + uncertainty_sample, + policy; + integer_strategy = fixed_discrete, +) + +# ## Experiment Scripts # -# Introducing the binary ``z_t`` and fixed cost ``K=500`` changes the -# landscape. SDDP can only use LP relaxation for training (``z \in [0,1]``), -# which systematically underestimates ``K``: when the LP says ``z=0.3``, -# ``q=20``, the relaxed cost is ``0.3 \times 500 + 2 \times 20 = 190``, but -# the true integer cost with ``z=1`` is ``500 + 40 = 540``. +# Each variant can be trained independently via SLURM or directly: # -# TS-DDR with `FixedDiscreteIntegerStrategy` handles this correctly: it -# solves the full MIP, fixes the binary incumbent, and reads LP duals in -# that integer-consistent state. +# ```bash +# # Single variant +# julia --project=. 
train_dr_inventory.jl integer_lstm # -# ![Integer results](../assets/inventory_integer_results.png) +# # All variants in parallel via SLURM +# cd examples/inventory_control && bash launch_all.sh +# ``` # -# | Method | N | Mean cost | Std | 95% CI | vs TS-DDR (FD) | Fit (s) | Eval (s) | -# |:-------------------------|----:|----------:|------:|-------:|---------------:|--------:|---------:| -# | TS-DDR (FixedDiscrete) | 300 | 8015.8 | 719.5 | 81.4 | +0.0% | 339.2 | 0.0112 | -# | TS-DDR (ContRelax) | 300 | 8318.1 | 720.0 | 81.5 | +3.8% | 109.4 | 0.0117 | -# | SDDP integer rollout | 300 | 8274.2 | 912.5 | 103.3 | +3.2% | 0.0 | 7.9088 | -# | Base-stock (S\*=160) | 300 | 9035.6 | 506.8 | 57.3 | +12.7% | 0.0 | 0.0000 | -# | Random (untrained) | 300 | 9594.6 | 361.1 | 40.9 | +19.7% | 0.0 | 0.0120 | +# Available variant tags: `relaxed`, `relaxed_lstm`, `relaxed_hp`, +# `relaxed_lstm_hp`, `integer`, `integer_cr`, `integer_sf`, `integer_hp`, +# `integer_lstm`, `integer_lstm_sf`. # -# `FixedDiscreteIntegerStrategy` achieves the lowest cost (8016), beating both -# SDDP (8274, +3.2%) and `ContinuousRelaxationIntegerStrategy` (8318, +3.8%). -# The continuous relaxation strategy performs similarly to SDDP — both use LP -# relaxation and both underestimate the fixed ordering cost. +# After training, run the comparison script to regenerate tables and figures: # -# `ContinuousRelaxationIntegerStrategy` trains 3× faster (109s vs 339s) -# because it only solves LPs, but the resulting policy is less accurate on -# integer-constrained problems. - -# ## Runnable Scripts +# ```bash +# julia --project=. evaluate_inventory.jl +# julia --project=. solve_sddp.jl +# julia --project=. compare_results.jl +# ``` # -# The complete experiment lives in `examples/inventory_control/`: +# The figures used by this page are generated by `compare_results.jl`. 
+ +# ![Demand process](../assets/inventory_demand_process.png) # -# | Script | Purpose | -# |:-------|:--------| -# | `build_inventory_problem.jl` | JuMP subproblem and det-equivalent builders, demand process, policy architecture | -# | `train_dr_inventory.jl` | TS-DDR training (relaxed, FixedDiscrete, ContRelax) and trajectory evaluation | -# | `evaluate_inventory.jl` | Base-stock grid-search and random baseline evaluation | -# | `solve_sddp.jl` | SDDP (2T-stage PAR(1)) training and rollout | -# | `compare_results.jl` | Load all CSVs, print summary tables, save plots | +# ![Relaxed results](../assets/inventory_relaxed_results.png) # -# ```bash -# julia --project=examples/inventory_control examples/inventory_control/train_dr_inventory.jl -# julia --project=examples/inventory_control examples/inventory_control/evaluate_inventory.jl -# julia --project=examples/inventory_control examples/inventory_control/solve_sddp.jl -# julia --project=examples/inventory_control examples/inventory_control/compare_results.jl -# ``` +# ![Integer results](../assets/inventory_integer_results.png) + +# ### Relaxed (continuous) results +# +# SDDP uses a PAR(1) approximation of the true latent demand process, which +# is not exact for this problem. Despite this advantage for TS-DDR, SDDP still +# attains the lower cost: the best TS-DDR variant is ~7% more expensive. +# +# The LSTM encoder closes ~25% of the gap versus the feedforward baseline by +# learning temporal demand patterns from lagged observations. 
+# +# | Method | N | Mean cost | Std | vs SDDP | +# |:--------------------------------|----:|----------:|------:|--------:| +# | SDDP (PAR) | 300 | 2434.0 | — | 0.0% | +# | TS-DDR (LSTM) | 300 | 2610.6 | 540.3 | +7.3% | +# | TS-DDR (feedforward) | 300 | 2667.3 | 593.5 | +9.6% | +# | TS-DDR (HighPenalty) | 300 | 2677.5 | 547.0 | +10.0% | +# | TS-DDR (LSTM+HP) | 300 | 2712.0 | 554.6 | +11.4% | +# +# ### Integer (MIP) results +# +# SDDP uses an `AlternativeForwardPass`: MIP in the forward pass, LP +# relaxation in the backward pass for valid cuts. The best TS-DDR variant (FixedDiscrete) trails SDDP by ~36%. +# +# | Method | N | Mean cost | Std | vs SDDP | +# |:--------------------------------|----:|----------:|------:|--------:| +# | SDDP (MIP fwd) | 300 | 5871.6 |1087.4 | 0.0% | +# | TS-DDR (FixedDiscrete) | 300 | 8015.8 | 718.3 | +36.5% | +# | TS-DDR (MixedGrad) | 300 | 8268.0 | 715.3 | +40.8% | +# | TS-DDR (ContRelax) | 300 | 8318.1 | 718.8 | +41.7% | +# | TS-DDR (HighPenalty) | 300 | 8388.4 | 615.9 | +42.8% | +# | SDDP (LP relax) | 300 | 8274.2 | 912.5 | +40.9% | +# | Base-stock (S\*=160) | 300 | 9035.6 | 506.8 | +53.9% | +# | Random (untrained) | 300 | 9594.6 | 361.1 | +63.4% | \ No newline at end of file diff --git a/docs/src/examples/inventory.md b/docs/src/examples/inventory.md index ba10084..f962219 100644 --- a/docs/src/examples/inventory.md +++ b/docs/src/examples/inventory.md @@ -2,194 +2,440 @@ EditURL = "inventory.jl" ``` -# Inventory Control with Ordering Costs +# Stochastic Lot-Sizing with Fixed Ordering Costs -This example studies a 12-period stochastic lot-sizing problem with two -formulations — a **relaxed** (continuous) case and an **integer** (MIP) case -with fixed ordering costs. The comparison shows: +This example shows how to train target-state decision rules for a stochastic +inventory problem with ex-ante ordering decisions. -1. **Relaxed problem**: SDDP with a PAR(1) demand approximation is - near-optimal and outperforms TS-DDR. -2.
**Integer problem**: TS-DDR with `FixedDiscreteIntegerStrategy` outperforms - both SDDP and TS-DDR with `ContinuousRelaxationIntegerStrategy`, because - SDDP and continuous relaxation both underestimate the fixed ordering cost. +The example has two purposes: -## Information Pattern - -At the beginning of a period, the controller observes current inventory and -recent realized demand. It does **not** observe current demand before ordering. -The order is therefore ex-ante. After ordering, demand is realized and becomes -part of the state for the next period. +1. show the complete optimization model before discussing implementation + details; and +2. show the code in the same order a reader would run it. -The state carried between periods is: +````@example inventory +using DecisionRules +using Flux +using HiGHS +using JuMP +using Random +using Statistics +```` -```julia -[net_inventory, last_demand, previous_demand] -``` +The runnable experiment lives outside the documentation tree. The file defines +the demand process, JuMP builders, and policy architecture used below. -This lets a time-invariant policy infer the latent demand regime from recent -observations without receiving a period counter or synthetic seasonal features. +````@example inventory +include(joinpath(@__DIR__, "..", "..", "..", "examples", "inventory_control", + "build_inventory_problem.jl")) +```` -## Inventory Model - -### Relaxed formulation +## Information Pattern -The order quantity is continuous with no setup cost: +At the beginning of period `t`, the controller knows ```math -0 \le q_t \le Q_{\max}, \qquad -\text{cost}_t = c\,q_t + h\max(s_t,0) + p\max(-s_t,0). +x_t = (s_{t-1}, d_{t-1}, d_{t-2}), ``` -### Integer formulation +where `s` is net inventory and `d` is realized demand. The controller chooses +the order quantity before seeing current demand `d_t`. This is an ex-ante +decision. -A binary variable ``z_t \in \{0,1\}`` controls whether an order is placed. 
-If ``z_t = 0``, then ``q_t`` must be zero; if ``z_t = 1``, the model pays a -fixed setup cost ``K``: +The neural policy receives `[d_t, x_t...]` during training because +DecisionRules policies output target states after the stage uncertainty is +sampled. The implementation below uses that target only to guide the +optimization model; the actual order still respects the model's information +pattern. -```math -0 \le q_t \le Q_{\max}\,z_t, \qquad -\text{cost}_t = K\,z_t + c\,q_t + h\max(s_t,0) + p\max(-s_t,0). -``` +## Complete Stage Model -In both cases, ordered units arrive before demand: +For each period `t = 1, ..., T`, the stage model is ```math -s^{mid}_t = s_{t-1} + q_t, \qquad s_t = s^{mid}_t - d_t. +\begin{aligned} +\min_{q_t,z_t,s_t^{mid},s_t,h_t,b_t} + \quad & K z_t + c q_t + h h_t + p b_t + + \lambda |s_t^{mid} - \hat{s}_t| \\ +\text{s.t.}\quad + & 0 \le q_t \le Q_{\max} z_t, && \text{(1) order capacity} \\ + & z_t \in \{0,1\}, && \text{(2) setup decision} \\ + & s_t^{mid} = s_{t-1} + q_t, && \text{(3) order arrives} \\ + & s_t = s_t^{mid} - d_t, && \text{(4) demand realizes} \\ + & h_t - b_t = s_t, && \text{(5) inventory split} \\ + & h_t \ge 0,\; b_t \ge 0. && \text{(6) split bounds} +\end{aligned} ``` -| Parameter | Value | Meaning | -|:--|--:|:--| -| ``T`` | 12 | periods | -| ``K`` | 500 | fixed order/setup cost (integer case) | -| ``c`` | 2 | unit ordering cost | -| ``h`` | 1 | holding cost | -| ``p`` | 25 | backlog penalty | -| ``Q_{\max}`` | 350 | order capacity | -| ``s_0`` | 30 | initial inventory | +The relaxed model removes (2) and replaces (1) by +``0 \le q_t \le Q_{\max}``; it also removes the fixed cost ``K z_t`` from the +objective. + +The target ``\hat{s}_t`` is not an operational requirement. It is the state +target produced by the neural decision rule, and the penalty term gives the +policy a gradient signal.
+ +## Parameters + +````@example inventory +inventory_parameters = ( + T = INVENTORY_T, + setup_cost = INVENTORY_K, + unit_order_cost = INVENTORY_C, + holding_cost = INVENTORY_H, + backlog_cost = INVENTORY_P, + order_capacity = INVENTORY_Q_MAX, + initial_inventory = INVENTORY_I0, + target_penalty = INVENTORY_PENALTY, +) +```` ## Demand Process -Each trajectory has a path-level phase shift ``\phi \sim \mathrm{Unif}\{0,\ldots,T-1\}``, -a persistent latent regime ``r_t \in \{-1,0,1\}`` (switch probability 0.04), -and an autoregressive shock ``\epsilon_t``: +Demand has a hidden seasonal phase, a persistent hidden regime, and an AR(1) +shock: ```math -\epsilon_t = 0.92\,\epsilon_{t-1} + 0.35\,\eta_t, \qquad -d_t = \operatorname{clip}\!\bigl( - m_{\kappa_t} + w_{\kappa_t}(0.85\,r_t + 0.42\,\epsilon_t + 0.12\,\eta'_t) -\bigr), +\epsilon_t = 0.92 \epsilon_{t-1} + 0.35 \eta_t, ``` -where ``\kappa_t = 1 + ((t + \phi - 1) \bmod T)`` is the shifted seasonal -index, and ``m_{\kappa_t}`` and ``w_{\kappa_t}`` are the midpoint and -half-width of the seasonal demand band. None of the latent variables are -observed; the policy sees only inventory and realized demand history. +```math +d_t = +\operatorname{clip}\!\left( + m_{\kappa_t} + + w_{\kappa_t}(0.85 r_t + 0.42 \epsilon_t + 0.12 \eta'_t) +\right), +``` -The plot below shows 24 sampled demand paths. Because each trajectory has a -different phase and persistent regime, the same calendar period can correspond -to high, medium, or low demand across scenarios. +where `r_t` is the hidden regime and +``\kappa_t = 1 + ((t + \phi - 1) \bmod T)`` is the hidden seasonal index. + +````@example inventory +Random.seed!(11) +demand_paths = [sample_inventory_demand_path() for _ in 1:3] +```` + +## Build the Continuous and Integer Models + +The builders return the JuMP model(s), input-state parameters, output-target +parameters, an uncertainty sampler, and the initial state. 
+ +````@example inventory +relaxed_subproblems, +relaxed_state_in, +relaxed_state_out, +relaxed_sampler, +initial_state = build_inventory_subproblems(; + num_scenarios = 100, + integer = false, +) + +integer_subproblems, +integer_state_in, +integer_state_out, +integer_sampler, +_ = build_inventory_subproblems(; + num_scenarios = 100, + integer = true, +) +```` + +The deterministic equivalent is the full-horizon model used by direct +transcription training. + +````@example inventory +integer_det_equivalent, +integer_det_state_in, +integer_det_state_out, +integer_det_sampler, +_ = build_inventory_det_equivalent(; + num_scenarios = 50, + integer = true, +) +```` + +## Integer Sensitivity Strategies + +Mixed-integer models do not have ordinary LP duals. DecisionRules therefore +makes the chosen postprocessing strategy explicit. + +````@example inventory +fixed_discrete = FixedDiscreteIntegerStrategy() +continuous_relaxation = ContinuousRelaxationIntegerStrategy() +```` + +`FixedDiscreteIntegerStrategy` solves the MIP, fixes the incumbent integer +variables, re-solves the fixed LP, and reads local dual information. + +`ContinuousRelaxationIntegerStrategy` relaxes integer variables first and reads +duals from the relaxed LP. This is smoother and faster, but the gradient is for +the relaxation, not for an integer-feasible decision. + +## Score-Function Correction + +Local LP duals do not see a discrete switch such as "open the setup variable". +A score-function correction estimates the effect of target changes by solving +perturbed integer rollouts: -![Demand process](../assets/inventory_demand_process.png) +```math +\nabla L += +\alpha \nabla L_{\mathrm{dual}} ++ (1-\alpha) + \frac{1}{M} + \sum_{m=1}^{M} + (R_m - b) + \nabla_\theta + \sum_{t=1}^{T} + \left\langle + \delta_{m,t}/\sigma^2, + \hat{x}_{t+1}(\theta) + \right\rangle . 
+``` -## Integer Postprocessing Strategies +There are two different solves in the mixed-gradient training loop: + +- `train_multistage(...; integer_strategy = fixed_discrete)` controls the + deterministic-equivalent solve used for the dual-gradient term + ``\nabla L_{\mathrm{dual}}``. This solve needs a postprocessing strategy + because duals are not directly defined for a MIP. +- `ScoreFunctionConfig(integer_subproblems, ...)` controls the Monte Carlo + rollout term. These rollout models are solved exactly as they are built. + Because `integer_subproblems` contain binary setup variables, the rollout + costs `R_m` are true MIP rollout costs. + +In short: `integer_strategy` is for reading local duals; score-function +rollouts are for measuring realized costs. + +````@example inventory +score_function = ScoreFunctionConfig( + integer_subproblems, + integer_state_in, + integer_state_out; + dual_weight = 0.5, + perturbation_std = 1.0, + num_rollouts = 8, +) + +score_schedule = ScoreFunctionSchedule( + score_function; + sf_start = 200, + ramp_batches = 300, + perturbation_std_initial = 0.1, + num_rollouts_initial = 2, +) +```` + +## Policy + +A DecisionRules policy is any callable `π(x) -> target` where `x` is the +concatenation `[uncertainty..., state...]` and `target` is the desired +next state. The only requirement is that it is differentiable via +`Zygote.gradient` and registered with `Functors.@functor` so that +`Flux.loadmodel!` can checkpoint its parameters. + +### Feedforward policy + +The simplest architecture is a feedforward MLP. This policy is ex-ante: +it ignores the current demand `d_t` (index 1) and uses only the state +entries `[inventory, d_{t-1}, d_{t-2}]`. + +````@example inventory +using Functors: @functor + +struct ExAntePolicy{N} + net::N +end + +@functor ExAntePolicy (net,) +```` + +The callable normalizes features to ≈[0,1] and maps through the network. +The sigmoid output bounds the target to `[0, 500]`. 
+ +````@example inventory +function (p::ExAntePolicy)(x) + inventory = Float32(x[2]) + d_prev = Float32(x[3]) + d_prev2 = Float32(x[4]) + features = Float32[inventory / 100, d_prev / 100, d_prev2 / 100] + target = 500f0 .* Flux.sigmoid.(p.net(features)) + return Float32[target[1], x[1], d_prev] +end + +Random.seed!(2024) +policy = ExAntePolicy(Chain(Dense(3, 32, relu), Dense(32, 24, relu), Dense(24, 1))) +```` + +### Recurrent (LSTM) policy + +When the uncertainty process has temporal structure (regimes, trends, +seasonality), a recurrent encoder can learn patterns that a feedforward +MLP cannot detect from a fixed-length window. + +The design below uses `Flux.LSTMCell` to process one *lagged* demand +value per stage. The LSTM hidden state accumulates across stages within +a scenario, then resets between scenarios via `Flux.reset!`. + +The affine output `raw × 200 + 150` avoids sigmoid saturation and +centers the target on typical inventory levels. + +````@example inventory +mutable struct RecurrentExAntePolicy{E,C,S} + encoder::E + combiner::C + state::S +end + +@functor RecurrentExAntePolicy (encoder, combiner) + +function (p::RecurrentExAntePolicy)(x) + d_prev = Float32(x[3]) + inventory = Float32(x[2]) + d_prev2 = Float32(x[4]) + T = eltype(first(p.state)) + encoded, new_state = p.encoder(T[d_prev / 100], p.state) + p.state = new_state + raw = p.combiner(vcat(encoded, T[inventory / 100, d_prev2 / 100])) + target = raw[1] * 200f0 + 150f0 + return Float32[target, x[1], d_prev] +end + +function Flux.reset!(p::RecurrentExAntePolicy) + p.state = Flux.initialstates(p.encoder) + return nothing +end + +Random.seed!(2024) +lstm_encoder = Flux.LSTMCell(1 => 16) +lstm_policy = RecurrentExAntePolicy( + lstm_encoder, + Dense(16 + 2, 1), + Flux.initialstates(lstm_encoder), +) +```` + +## Training Calls + +The continuous problem uses ordinary dual information. 
-DecisionRules.jl provides two strategies for extracting gradient information -from subproblems with discrete variables: +```julia +train_multistage( + policy, + initial_state, + relaxed_subproblems, + relaxed_state_in, + relaxed_state_out, + relaxed_sampler; + num_batches = 400, + num_train_per_batch = 5, + optimizer = Flux.Adam(0.0015), + integer_strategy = NoIntegerStrategy(), + penalty_schedule = [(1, 80, 0.4), (81, 400, 1.0)], +) +``` -**`FixedDiscreteIntegerStrategy`**: (1) solve the MIP for incumbent binary -values ``z^*_t``; (2) fix ``z_t = z^*_t`` and relax integrality; (3) re-solve -the resulting LP; (4) read LP duals as gradient signal. This is the same -principle as SDDP.jl's `FixedDiscreteDuality`. +The integer deterministic-equivalent run uses the fixed-discrete local dual +path plus the scheduled score-function correction. -**`ContinuousRelaxationIntegerStrategy`**: relax all binary/integer -constraints to continuous bounds (binary → [0,1]), solve the resulting LP, -and read duals directly. This is faster (one LP instead of MIP + LP) and -gives smoother gradients, but the solution may have fractional integer -variables — the gradient does not correspond to any feasible integer -assignment. +```julia +train_multistage( + policy, + initial_state, + integer_det_equivalent, + integer_det_state_in, + integer_det_state_out, + integer_det_sampler; + num_batches = 800, + num_train_per_batch = 10, + optimizer = Flux.Adam(0.0008), + integer_strategy = fixed_discrete, + penalty_schedule = [(1, 120, 0.4), (121, 800, 1.0)], + score_function = score_schedule, +) +``` -For the relaxed formulation (no integer variables), `NoIntegerStrategy` is -used and subproblems are solved as-is. +## Evaluation -## Relaxed (Continuous) Problem +A trained policy should be evaluated by stage-wise rollout, because that is +the deployment semantics: solve one period, observe the realized next state, +then solve the next period. 
-When there are no integer variables, SDDP can model the demand process -exactly via a PAR(1) approximation that carries ``d_{t-1}`` as a state -variable. This makes SDDP near-optimal for the relaxed problem. +````@example inventory +uncertainty_sample = sample(integer_sampler) +rollout_cost = simulate_multistage( + integer_subproblems, + integer_state_in, + integer_state_out, + initial_state, + uncertainty_sample, + policy; + integer_strategy = fixed_discrete, +) +```` -SDDP uses a PAR(1) fit: ``d_t \approx \mu_t + \alpha(d_{t-1} - \mu_{t-1}) + \omega_t`` -with per-stage means ``\mu_t``, autocorrelation ``\alpha \approx 0.86``, and -9 equiprobable innovation points fitted from 10,000 simulated demand paths. +## Experiment Scripts -All costs below are out-of-sample operational costs evaluated on the same 300 -demand scenarios (seed 555). **Fit** is the one-time offline cost (training -or tuning). **Eval** is the online deployment cost per decision point. +Each variant can be trained independently via SLURM or directly: -![Relaxed results](../assets/inventory_relaxed_results.png) +```bash +# Single variant +julia --project=. train_dr_inventory.jl integer_lstm -SDDP LP bound: **2162.0** +# All variants in parallel via SLURM +cd examples/inventory_control && bash launch_all.sh +``` -| Method | N | Mean cost | Std | 95% CI | vs TS-DDR | Fit (s) | Eval (s) | -|:-------------------------|----:|----------:|------:|-------:|----------:|--------:|---------:| -| TS-DDR (trained) | 300 | 2667.3 | 594.5 | 67.3 | +0.0% | 54.6 | 0.0018 | -| SDDP (PAR) | 300 | 2434.2 | 774.8 | 87.7 | -8.7% | 0.0 | 20.6455 | -| Base-stock (S\*=160) | 300 | 3035.6 | 506.8 | 57.3 | +13.8% | 0.0 | 0.0002 | -| Random (untrained) | 300 | 3751.7 | 221.7 | 25.1 | +40.7% | 0.0 | 0.0018 | +Available variant tags: `relaxed`, `relaxed_lstm`, `relaxed_hp`, +`relaxed_lstm_hp`, `integer`, `integer_cr`, `integer_sf`, `integer_hp`, +`integer_lstm`, `integer_lstm_sf`. 
-SDDP clearly dominates: 8.7% lower cost than TS-DDR, and the SDDP and Random -cost distributions are non-overlapping. This is expected for a convex problem -where SDDP can represent the demand dynamics exactly through the PAR(1) state -variable. +After training, run the comparison script to regenerate tables and figures: -## Integer (MIP) Problem +```bash +julia --project=. evaluate_inventory.jl +julia --project=. solve_sddp.jl +julia --project=. compare_results.jl +``` + +The figures used by this page are generated by `compare_results.jl`. -Introducing the binary ``z_t`` and fixed cost ``K=500`` changes the -landscape. SDDP can only use LP relaxation for training (``z \in [0,1]``), -which systematically underestimates ``K``: when the LP says ``z=0.3``, -``q=20``, the relaxed cost is ``0.3 \times 500 + 2 \times 20 = 190``, but -the true integer cost with ``z=1`` is ``500 + 40 = 540``. +![Demand process](../assets/inventory_demand_process.png) -TS-DDR with `FixedDiscreteIntegerStrategy` handles this correctly: it -solves the full MIP, fixes the binary incumbent, and reads LP duals in -that integer-consistent state. 
+![Relaxed results](../assets/inventory_relaxed_results.png) ![Integer results](../assets/inventory_integer_results.png) -SDDP LP bound: **3346.6** +### Relaxed (continuous) results -| Method | N | Mean cost | Std | 95% CI | vs TS-DDR (FD) | Fit (s) | Eval (s) | -|:-------------------------|----:|----------:|------:|-------:|---------------:|--------:|---------:| -| TS-DDR (FixedDiscrete) | 300 | 8015.8 | 719.5 | 81.4 | +0.0% | 339.2 | 0.0112 | -| TS-DDR (ContRelax) | 300 | 8318.1 | 720.0 | 81.5 | +3.8% | 109.4 | 0.0117 | -| SDDP integer rollout | 300 | 8274.2 | 912.5 | 103.3 | +3.2% | 0.0 | 7.9088 | -| Base-stock (S\*=160) | 300 | 9035.6 | 506.8 | 57.3 | +12.7% | 0.0 | 0.0000 | -| Random (untrained) | 300 | 9594.6 | 361.1 | 40.9 | +19.7% | 0.0 | 0.0120 | +SDDP uses a PAR(1) approximation of the true latent demand process, which +is not exact for this problem. Although this modeling error favors TS-DDR, +SDDP is still ~7% cheaper than the best TS-DDR variant. -`FixedDiscreteIntegerStrategy` achieves the lowest cost (8016), beating both -SDDP (8274, +3.2%) and `ContinuousRelaxationIntegerStrategy` (8318, +3.8%). -The continuous relaxation strategy performs similarly to SDDP — both use LP -relaxation and both underestimate the fixed ordering cost. +The LSTM encoder closes ~25% of the feedforward policy's gap to SDDP (+9.6% +down to +7.3%) by learning temporal demand patterns from lagged observations. -`ContinuousRelaxationIntegerStrategy` trains 3× faster (109s vs 339s) -because it only solves LPs, but the resulting policy is less accurate on -integer-constrained problems.
+| Method | N | Mean cost | Std | vs SDDP | +|:--------------------------------|----:|----------:|------:|--------:| +| SDDP (PAR) | 300 | 2434.0 | — | 0.0% | +| TS-DDR (LSTM) | 300 | 2610.6 | 540.3 | +7.3% | +| TS-DDR (feedforward) | 300 | 2667.3 | 593.5 | +9.6% | +| TS-DDR (HighPenalty) | 300 | 2677.5 | 547.0 | +10.0% | +| TS-DDR (LSTM+HP) | 300 | 2712.0 | 554.6 | +11.4% | -## Runnable Scripts +### Integer (MIP) results -The complete experiment lives in `examples/inventory_control/`: +SDDP uses an `AlternativeForwardPass`: MIP in the forward pass, LP +relaxation in the backward pass for valid cuts. The best TS-DDR variant (FixedDiscrete) trails SDDP by ~36%. -| Script | Purpose | -|:-------|:--------| -| `build_inventory_problem.jl` | JuMP subproblem and det-equivalent builders, demand process, policy architecture | -| `train_dr_inventory.jl` | TS-DDR training (relaxed, FixedDiscrete, ContRelax) and trajectory evaluation | -| `evaluate_inventory.jl` | Base-stock grid-search and random baseline evaluation | -| `solve_sddp.jl` | SDDP (2T-stage PAR(1)) training and rollout | -| `compare_results.jl` | Load all CSVs, print summary tables, save plots | +| Method | N | Mean cost | Std | vs SDDP | +|:--------------------------------|----:|----------:|------:|--------:| +| SDDP (MIP fwd) | 300 | 5871.6 |1087.4 | 0.0% | +| TS-DDR (FixedDiscrete) | 300 | 8015.8 | 718.3 | +36.5% | +| TS-DDR (MixedGrad) | 300 | 8268.0 | 715.3 | +40.8% | +| TS-DDR (ContRelax) | 300 | 8318.1 | 718.8 | +41.7% | +| TS-DDR (HighPenalty) | 300 | 8388.4 | 615.9 | +42.8% | +| SDDP (LP relax) | 300 | 8274.2 | 912.5 | +40.9% | +| Base-stock (S\*=160) | 300 | 9035.6 | 506.8 | +53.9% | +| Random (untrained) | 300 | 9594.6 | 361.1 | +63.4% | -```bash -julia --project=examples/inventory_control examples/inventory_control/train_dr_inventory.jl -julia --project=examples/inventory_control examples/inventory_control/evaluate_inventory.jl -julia --project=examples/inventory_control examples/inventory_control/solve_sddp.jl -julia
--project=examples/inventory_control examples/inventory_control/compare_results.jl -``` diff --git a/docs/src/gpu_acceleration.md b/docs/src/gpu_acceleration.md new file mode 100644 index 0000000..5cbf498 --- /dev/null +++ b/docs/src/gpu_acceleration.md @@ -0,0 +1,253 @@ +# GPU Acceleration with DecisionRulesExa.jl + +```@meta +CurrentModule = DecisionRules +``` + +[DecisionRulesExa.jl](https://github.com/LearningToOptimize/DecisionRulesExa.jl) is a +companion package that implements the same TS-DDR algorithm using +[ExaModels.jl](https://github.com/exanauts/ExaModels.jl) instead of JuMP for the +optimization backend. It targets GPU-accelerated training via +[MadNLP.jl](https://github.com/MadNLP/MadNLP.jl) with CUDSS-backed interior-point +solves. + +## When to use DecisionRulesExa.jl + +| | DecisionRules.jl (JuMP) | DecisionRulesExa.jl (ExaModels) | +|:---|:---|:---| +| **Backend** | JuMP + DiffOpt | ExaModels + MadNLP | +| **Hardware** | CPU | CPU or GPU (CUDA) | +| **Training modes** | DE, stage-wise, multiple shooting | Deterministic equivalent | +| **Gradient source** | DiffOpt implicit diff + duals | Envelope theorem (duals only) | +| **Best for** | Moderate NLPs, integer variables, stage-wise decomposition | Large NLPs (AC-OPF), GPU speedup, many samples per batch | + +**Choose DecisionRulesExa.jl when** the inner NLP is large enough that GPU +acceleration matters (e.g., AC-OPF with hundreds of buses and thousands of +variables per stage) and you want to run many training samples per gradient +step on a single GPU. + +**Choose DecisionRules.jl when** you need stage-wise or multiple-shooting +decomposition, integer variable support, or DiffOpt-based solution sensitivities. 
+ +## Installation + +```julia +using Pkg +Pkg.add(url="https://github.com/LearningToOptimize/DecisionRulesExa.jl.git") +``` + +For GPU support, also install CUDA.jl and MadNLPGPU: + +```julia +Pkg.add(["CUDA", "MadNLPGPU"]) +``` + +## Quick start: CPU + +The simplest way to get started is with the built-in linear tracking problem: + +```julia +using DecisionRulesExa +using ExaModels, Flux, MadNLP, Random + +Random.seed!(1) + +T = 8 # horizon +nx = 1 # state dimension + +# Build a parametric deterministic-equivalent NLP on CPU +prob = build_linear_tracking_problem( + horizon = T, + nx = nx, + backend = nothing, # CPU + slack_penalty = 10.0, + u_bounds = (-2.0, 2.0), +) + +# LSTM policy: maps [w_t ; x_{t-1}] → target x̂_t at each stage +policy = StateConditionedPolicy(nx, nx, nx, [64, 64]) + +# Uncertainty sampler: returns a flat vector of length T * nw +sampler() = Float32.(0.1 .* randn(T * nx)) + +# Train with TS-DDR policy gradient (envelope theorem) +train_tsddr( + policy, + Float32.([1.0]), # initial state + prob, + prob.p_x0, + prob.p_target, + prob.p_w, + sampler; + num_batches = 100, + num_train_per_batch = 4, + optimizer = Flux.Adam(1f-3), + madnlp_kwargs = (print_level = MadNLP.ERROR, tol = 1e-6), +) +``` + +## Moving to GPU + +To run the same problem on GPU, change the backend and add a GPU-native +linear solver: + +```julia +using CUDA, MadNLPGPU + +prob_gpu = build_linear_tracking_problem( + horizon = T, + nx = nx, + backend = CUDABackend(), + slack_penalty = 10.0, + u_bounds = (-2.0, 2.0), +) + +train_tsddr( + policy, + Float32.([1.0]), + prob_gpu, + prob_gpu.p_x0, + prob_gpu.p_target, + prob_gpu.p_w, + sampler; + num_batches = 100, + num_train_per_batch = 4, + optimizer = Flux.Adam(1f-3), + madnlp_kwargs = ( + print_level = MadNLP.ERROR, + tol = 1e-6, + linear_solver = CUDSSSolver, + ), +) +``` + +The policy (Flux model) stays on CPU; only the NLP solve runs on GPU. 
+Parameter updates (`ExaModels.set_parameter!`) and multiplier extraction +handle CPU↔GPU transfers automatically. + +## Custom problems + +For domain-specific models (power systems, robotics, etc.), build the +ExaModels NLP directly instead of using `build_linear_tracking_problem`. +The key requirements are: + +1. **Add target constraints last** so their multipliers form a contiguous + slice of `result.multipliers`. +2. **Parameterize** the initial state (`p_x0`), uncertainty trajectory + (`p_w`), and target trajectory (`p_target`) as ExaModels parameters. +3. **Return** a struct with fields `.core`, `.model`, `.horizon`, and + `.target_con_range`. + +The `HydroPowerModels` example in DecisionRulesExa.jl demonstrates this +pattern for a full AC-OPF problem with reservoir dynamics: + +```julia +# In examples/HydroPowerModels/hydro_power_exa.jl +prob = build_hydro_de( + data; + num_stages = 96, + backend = CUDABackend(), + formulation = :ac_polar, + deficit_cost = 1e5, + target_penalty = :auto, +) +``` + +## Parallel GPU solves + +When training samples are independent, multiple NLP instances can be +solved concurrently on the same GPU. Build a pool of independent problem +copies and pass it to `train_tsddr`: + +```julia +pool = [(prob, prob.p_x0, prob.p_target, prob.p_w)] +for _ in 2:num_workers + p = build_my_problem(backend = CUDABackend()) + push!(pool, (p, p.p_x0, p.p_target, p.p_w)) +end + +train_tsddr(policy, x0, prob, prob.p_x0, prob.p_target, prob.p_w, sampler; + problem_pool = pool, + num_train_per_batch = num_workers, +) +``` + +Each pool entry gets its own MadNLP solver instance. Samples are +distributed round-robin across the pool and solved via `Threads.@spawn`. + +## Penalty annealing + +DecisionRulesExa.jl supports penalty annealing through the +`adjust_hyperparameters` callback. 
The target penalty coefficient +``\rho`` is stored as an ExaModels parameter and can be updated at +runtime: + +```julia +adjust_hyperparameters = function(iter, opt_state, num_train) + phase = iter < 100 ? 0.1 : + iter < 200 ? 1.0 : + iter < 300 ? 10.0 : 30.0 + ρ = base_penalty * phase + penalty_vals = fill(ρ / 2, T * nx) + ExaModels.set_parameter!(prob.core, prob.p_penalty_half, penalty_vals) + return num_train +end +``` + +This mirrors the `penalty_schedule` keyword in DecisionRules.jl's +[`train_multistage`](@ref). + +## Rollout evaluation + +[`RolloutEvaluation`](@ref) in DecisionRules.jl evaluates policies +stage-by-stage under deployment semantics. DecisionRulesExa.jl provides +an analogous `RolloutEvaluation` that solves stage subproblems +sequentially: + +```julia +eval = RolloutEvaluation( + stage_problem, x0, eval_scenarios; + horizon = T, + n_uncertainty = nw, + set_stage_parameters! = my_stage_setter!, + realized_state = my_realized_state, + stride = 25, + policy_state = :realized, +) +``` + +Both packages report the same metrics: operational cost excluding +target-deficit penalty, and target-violation share. 
+ +## Mapping between packages + +| DecisionRules.jl | DecisionRulesExa.jl | Notes | +|:---|:---|:---| +| `train_multistage` | `train_tsddr` | Main training loop | +| `state_conditioned_policy` | `StateConditionedPolicy` | LSTM policy | +| `dense_multilayer_nn` | `MLPPolicy` | MLP policy | +| `state_params_in` | `p_x0` | Initial state parameter | +| `state_params_out` | `p_target` | Target parameter | +| `uncertainty_samples` | `p_w` + sampler | Uncertainty parameter | +| `SampleLog` / `record` | `record_loss` | Per-iteration callback | +| `RolloutEvaluation` | `RolloutEvaluation` | Stage-wise eval | +| `penalty_schedule` | `adjust_hyperparameters` | Penalty annealing | +| `ScoreFunctionConfig` | — | Not yet ported to ExaModels | +| Stage-wise decomposition | — | JuMP only | +| Multiple shooting | — | JuMP only | + +## Full example: HydroPowerModels + +The `examples/HydroPowerModels/` directory in DecisionRulesExa.jl contains +a complete AC-OPF hydrothermal scheduling example for the Bolivia test case +— the same problem solved by DecisionRules.jl in the +[Hydropower Scheduling](@ref) tutorial. It demonstrates: + +- Parsing PowerModels.jl network data and hydro reservoir parameters +- Building a multi-stage deterministic-equivalent NLP in ExaModels + (DC or AC polar OPF formulations) +- L1 + L2 penalty on target slack (δ⁺/δ⁻ splitting for smooth NLP) +- GPU training with parallel MadNLP solves +- Warm-start caching to prevent cascade solver failures +- Penalty and sample-count annealing schedules +- W&B metric logging diff --git a/docs/src/gradient_fallback.md b/docs/src/gradient_fallback.md new file mode 100644 index 0000000..3baf895 --- /dev/null +++ b/docs/src/gradient_fallback.md @@ -0,0 +1,145 @@ +# Gradient Fallback + +```@meta +CurrentModule = DecisionRules +``` + +## Motivation + +TS-DDR training relies on solving an optimization subproblem at every stage and +differentiating through it (via Lagrange duals or DiffOpt). 
In practice, some +solves may fail — the solver hits numerical trouble, DiffOpt encounters +degenerate duals, or the subproblem is infeasible for a particular sample. A +single uncaught error kills the entire training run. + +The **gradient fallback** system provides a principled, extensible way to handle +these errors at three levels: + +| Level | Where it fires | What it controls | +|-------|----------------|------------------| +| **rrule pullback** | Inside the ChainRules `rrule` for `get_next_state` | Whether a bad-solver-status pullback returns zeros or throws | +| **Training loop** | Around `Flux.gradient(...)` in `train_multistage` / `train_multiple_shooting` | Whether a DiffOpt error skips the iteration or crashes | +| **Rollout evaluation** | Inside [`RolloutEvaluation`](@ref) per scenario | Whether a failed scenario is excluded from the metric or crashes | + +## Built-in fallback types + +- [`AbstractGradientFallback`](@ref) — abstract supertype for all fallback strategies +- [`ZeroGradientFallback`](@ref) — log a warning, return zero gradients, continue training (default) +- [`ErrorGradientFallback`](@ref) — re-throw the error (useful in tests) + +See the [API Reference](api.md) for full docstrings. + +## Usage + +### Default behavior (zero gradients) + +By default, all training functions use [`ZeroGradientFallback`](@ref). Failed +iterations log a warning and skip the parameter update: + +```julia +train_multistage( + model, x0, subproblems, spi, spo, uncertainty; + num_batches=500, + # gradient_fallback=ZeroGradientFallback() # this is the default +) +``` + +At training start you will see: + +``` +[ Info: Training with ZeroGradientFallback: solver/differentiation errors +will be caught and the iteration skipped (zero gradient). Pass +`gradient_fallback=ErrorGradientFallback()` to throw instead, or implement +a custom `AbstractGradientFallback` subtype. 
+``` + +### Strict mode (for tests) + +Use [`ErrorGradientFallback`](@ref) when you want errors to surface +immediately — typically in unit tests where every solve should succeed: + +```julia +train_multistage( + model, x0, subproblems, spi, spo, uncertainty; + num_batches=10, + gradient_fallback=ErrorGradientFallback(), +) +``` + +The same keyword works for [`train_multiple_shooting`](@ref) and +[`RolloutEvaluation`](@ref): + +```julia +rollout = RolloutEvaluation( + subproblems, spi, spo, x0, scenarios; + gradient_fallback=ErrorGradientFallback(), +) +``` + +## Custom fallbacks (extending the type system) + +Subtype [`AbstractGradientFallback`](@ref) and implement three methods: + +```julia +struct LoggingFallback <: DecisionRules.AbstractGradientFallback + log::Vector{Any} +end + +# Called when the rrule pullback (DiffOpt / dual extraction) fails. +# Return a tuple of cotangents matching the rrule signature, or rethrow. +function DecisionRules.handle_gradient_error(fb::LoggingFallback, e, n_in, n_out) + push!(fb.log, (:gradient, e)) + return DecisionRules._zero_cotangents(n_in, n_out) +end + +# Called when Flux.gradient(...) throws in the training loop. +# Return `true` to skip this iteration, or rethrow. +function DecisionRules.handle_training_error(fb::LoggingFallback, e, iter) + push!(fb.log, (:training, iter, e)) + return true +end + +# Called when a rollout scenario fails during evaluation. +# Return `true` to exclude this scenario from the metric, or rethrow. 
+function DecisionRules.handle_rollout_error(fb::LoggingFallback, e, iter) + push!(fb.log, (:rollout, iter, e)) + return true +end +``` + +Then pass it to any training function: + +```julia +fb = LoggingFallback(Any[]) +train_multistage(model, x0, subs, spi, spo, unc; + gradient_fallback=fb, +) +println("Caught $(length(fb.log)) errors during training") +``` + +This is useful for: +- **Monitoring**: count how often solves fail and on which iterations +- **Adaptive recovery**: adjust solver tolerances, restart from a checkpoint, etc. +- **Selective rethrowing**: catch known benign errors but let unexpected ones through + +## Relationship to `STRICT_GRADIENTS` + +The global [`STRICT_GRADIENTS`](@ref DecisionRules.STRICT_GRADIENTS) flag controls a separate, lower-level mechanism: +inside the `rrule` pullback, when the forward solver terminates with a +non-optimal status (e.g., `ITERATION_LIMIT`), the pullback returns zero +gradients (if `STRICT_GRADIENTS[] == false`, the default) or throws (if `true`). + +The `gradient_fallback` keyword operates at a higher level — it catches errors +from DiffOpt's `reverse_differentiate!` (assertion errors, degenerate duals, +etc.) and from the training loop itself. Both mechanisms are independent and +complementary: + +``` +Forward solve + └─ bad termination status → STRICT_GRADIENTS controls behavior + └─ good status → DiffOpt reverse_differentiate! 
+ └─ error (assertion, numerical) → gradient_fallback catches it + └─ in rrule pullback: handle_gradient_error + └─ in training loop: handle_training_error + └─ in rollout eval: handle_rollout_error +``` diff --git a/docs/src/index.md b/docs/src/index.md index 070f3c1..7051de0 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -30,7 +30,7 @@ Three training formulations are supported: ```julia using Pkg -Pkg.add(url="https://github.com/LearningToOptimize/DecisionRules.jl.git") +Pkg.add("DecisionRules") ``` ## Quick start @@ -55,8 +55,10 @@ train_multistage( ) ``` -See the [Algorithm](@ref) page for the mathematical formulation and the -examples for complete worked problems. +See the [Algorithm](@ref) page for the mathematical formulation, the +[Uncertainty Sampling](@ref) guide for how to prepare your scenario data, the +[GPU Acceleration with DecisionRulesExa.jl](@ref) page for GPU-accelerated training, +and the examples for complete worked problems. ## Citation diff --git a/docs/src/sampling.md b/docs/src/sampling.md new file mode 100644 index 0000000..bacdfad --- /dev/null +++ b/docs/src/sampling.md @@ -0,0 +1,314 @@ +# Uncertainty Sampling + +```@meta +CurrentModule = DecisionRules +``` + +## Why sampling matters in TS-DDR + +In the TS-DDR training loop, each SGD step approximates the stochastic objective + +```math +\min_\theta \; \mathbb{E}_{w_{1:T}} \left[ + \sum_{t=1}^{T} q_t\bigl(x_{t-1}, w_t;\, \hat{x}_t(\theta)\bigr) +\right] +``` + +by drawing **sample trajectories** ``w_{1:T}^{(s)},\; s = 1,\ldots,S`` and +differentiating through the subproblem solves. The `uncertainty_sampler` +argument in [`train_multistage`](@ref) and [`train_multiple_shooting`](@ref) +controls how these trajectories are generated. 
+ +Once a trajectory ``w_{1:T}`` is realized — a concrete numeric value per +uncertain parameter per stage — the rest of the training pipeline +(policy rollout, subproblem solve, gradient computation) is **identical** +regardless of how the trajectory was sampled. The sampler is therefore a +pluggable component that lets you match the correlation structure of your +problem. + +## Three sampling formats + +DecisionRules.jl supports three ways to specify uncertainty, offering +increasing levels of correlation control. + +### 1. Independent (per-unit) pools + +Each uncertain parameter has its own finite support and is sampled +**independently** at each stage. + +``` + ┌─ param₁: draw from {v₁₁, v₁₂, v₁₃} ←── independent +Stage t ────────────┤ + └─ param₂: draw from {v₂₁, v₂₂} ←── independent +``` + +**Julia type**: `Vector{Vector{Tuple{VariableRef, Vector{T}}}}` + +```julia +# uncertainty_pool[t][i] = (param_ref, [possible_values...]) +independent_pool = [ + # stage 1 + [(demand_param_1, [10.0, 15.0, 12.0]), + (demand_param_2, [8.0, 12.0, 9.0])], + # stage 2 + [(demand_param_1, [11.0, 14.0, 13.0]), + (demand_param_2, [7.0, 11.0, 10.0])], +] +``` + +Each call to `sample(independent_pool)` draws one value per parameter per +stage, independently. With ``n`` parameters each having ``k`` scenarios, this +samples from ``k^n`` possible combinations per stage — most of which may +never have occurred in reality. + +**Use when**: parameters are genuinely independent, or you have a single +uncertain parameter per stage. + +### 2. Joint-scenario pools (spatial correlation) + +Pre-defined joint realizations across **all** parameters. Sampling picks one +complete scenario per stage, preserving cross-parameter (spatial) correlations. 
+ +``` + ω=1: (param₁=v₁₁, param₂=v₂₁) + ╱ +Stage t ── draw one ω ─── ω=2: (param₁=v₁₂, param₂=v₂₂) + ╲ + ω=3: (param₁=v₁₃, param₂=v₂₃) +``` + +**Julia type**: `Vector{Vector{Vector{Tuple{VariableRef, T}}}}` + +```julia +# uncertainty_pool[t][ω] = [(param₁, val), (param₂, val), ...] +joint_pool = [ + # stage 1: 3 scenarios + [[(inflow_1, 10.0), (inflow_2, 80.0)], # ω=1 + [(inflow_1, 20.0), (inflow_2, 120.0)], # ω=2 + [(inflow_1, 15.0), (inflow_2, 90.0)]], # ω=3 + # stage 2: 3 scenarios + [[(inflow_1, 11.0), (inflow_2, 70.0)], + [(inflow_1, 14.0), (inflow_2, 110.0)], + [(inflow_1, 13.0), (inflow_2, 100.0)]], +] +``` + +Each call to `sample(joint_pool)` picks one scenario index ``\omega`` +per stage and returns all parameters from that scenario. Only historically +observed combinations appear. + +!!! warning "Matching SDDP semantics" + SDDP.jl's `SDDP.parameterize` draws one ``\omega`` for all random + variables in a stage. If you compare TS-DDR against SDDP, you **must** + use joint-scenario pools to avoid a distributional mismatch. + +**Use when**: parameters are correlated (e.g., inflows across a river basin), +or your benchmark uses joint scenarios (SDDP, scenario trees). + +### 3. Trajectory sampler (spatial + temporal correlation) + +A callable that generates each stage's realization **conditioned on previous +stages**. This enables autoregressive (AR), Markovian, or any custom temporal +dependence — something the data-pool formats above cannot express because +they sample stages independently. 
+ +``` +Stage 1 ── sampler(1, []) ──────────────────── w₁ + │ +Stage 2 ── sampler(2, [w₁]) ───────────────── w₂ + │ +Stage 3 ── sampler(3, [w₁, w₂]) ──────────── w₃ +``` + +**Julia type**: `Function` with signature `(t::Int, past::Vector{...}) -> Vector{Tuple{VariableRef, T}}` + +```julia +# AR(1) inflow sampler with spatial correlation +function ar1_sampler(t, past) + if isempty(past) + # Stage 1: draw from marginal distribution + ω = rand(1:nScenarios) + return [(params[t][r], data[r][t, ω]) for r in 1:nHyd] + else + # Stage t > 1: AR(1) conditioned on previous stage + prev = [pair[2] for pair in past[end]] + noise = randn(nHyd) .* σ + vals = ρ .* prev .+ (1 .- ρ) .* μ .+ noise + return [(params[t][r], vals[r]) for r in 1:nHyd] + end +end + +# Generate one trajectory +trajectory = sample(ar1_sampler, T) + +# Use in training — wrap as zero-arg callable +train_multistage( + policy, x0, subproblems, + state_in, state_out, + () -> sample(ar1_sampler, T); # pass as callable + num_batches=500, +) +``` + +**Use when**: your uncertainty process has temporal dependence (e.g., +autoregressive inflows, mean-reverting prices, regime-switching demands). 
+ +## Comparison table + +| Feature | Independent | Joint-scenario | Trajectory sampler | +|:--------|:-----------|:--------------|:-------------------| +| Spatial correlation | ✗ | ✓ | ✓ | +| Temporal correlation | ✗ | ✗ | ✓ | +| Data format | Finite supports | Pre-built scenarios | Callable | +| Combinations per stage | ``k^n`` | ``k`` | unlimited | +| SDDP-compatible | only if ``n=1`` | ✓ | depends on model | +| Ease of use | simplest | moderate | most flexible | + +## Building uncertainty pools + +### From historical data (joint scenarios — recommended) + +When your data comes as a matrix where columns are scenarios: + +```julia +# data[r] is a (T × nScenarios) matrix for reservoir r +nHyd = length(data) +nCen = size(data[1], 2) + +uncertainty_pool = Vector{Any}(undef, T) +for t in 1:T + uncertainty_pool[t] = [ + [(inflow_params[t][r], data[r][t, ω] + 0.0) for r in 1:nHyd] + for ω in 1:nCen # ω is the OUTER loop — all units share it + ] +end +``` + +### From independent distributions + +```julia +uncertainty_pool = [ + [(demand_param[t], [low, mid, high])] + for t in 1:T +] +``` + +### From an AR(1) process + +```julia +# Estimate AR(1) parameters from data +μ = mean.(data) # long-run mean per unit (over all stages and scenarios of data[r]) +σ = std.(data) # per-unit std, used as the noise scale +ρ = 0.7 # autocorrelation coefficient + +function ar1_sampler(t, past) + if isempty(past) + vals = μ .+ σ .* randn(nHyd) + else + prev = [p[2] for p in past[end]] + vals = ρ .* prev .+ (1 .- ρ) .* μ .+ σ .* randn(nHyd) + end + return [(inflow_params[t][r], max(0.0, vals[r])) for r in 1:nHyd] +end +``` + +## Sampling in practice + +All three formats produce the same **realized trajectory** type: +`Vector{Vector{Tuple{VariableRef, Float64}}}`. This is what gets passed to +`simulate_multistage`, `simulate_stage`, and all internal training code. + +```julia +using DecisionRules + +# 1. From a data pool (independent or joint): +trajectory = sample(uncertainty_pool) + +# 2.
From a trajectory sampler: +trajectory = sample(ar1_sampler, T) + +# Both produce the same type — downstream code is identical: +objective = simulate_multistage( + subproblems, state_params_in, state_params_out, + initial_state, trajectory, policy, +) +``` + +### Passing to training functions + +```julia +# Data pools are passed directly: +train_multistage(policy, x0, subs, s_in, s_out, uncertainty_pool; ...) + +# Trajectory samplers are wrapped as zero-arg callables: +train_multistage(policy, x0, subs, s_in, s_out, + () -> sample(ar1_sampler, T); ...) +``` + +This works because `train_multistage` calls `sample(uncertainty_sampler)` to +draw each trajectory. For data pools, `sample` dispatches on the pool type. +For callables, `sample(f::Function)` simply calls `f()`. + +## Demonstrating the difference + +Consider 3 hydro reservoirs with 4 historical inflow scenarios: + +``` +Historical inflow data (columns = scenarios): + + ω=1 ω=2 ω=3 ω=4 +Res 1: 10 20 15 25 +Res 2: 80 120 90 110 +Res 3: 5 8 6 9 +``` + +**Independent sampling** draws one value per row independently. A sample +might be `(10, 120, 9)` — reservoir 1 from ω=1, reservoir 2 from ω=2, +reservoir 3 from ω=4. This combination never occurred historically and +may violate the drought-affects-all-basins correlation. + +**Joint sampling** picks one column: `(10, 80, 5)` or `(25, 110, 9)` — +always a historically observed combination. + +**Trajectory sampling** can additionally model temporal persistence: +if ω=1 (dry year) was drawn at stage 1, the AR(1) sampler will likely +produce below-average inflows at stage 2 as well. 
+ +``` +Joint sampling (k=4 possible outcomes per stage): + + Res 1 ──┐ + Res 2 ──┼── same ω ──→ one of 4 historical vectors + Res 3 ──┘ + +Independent sampling (k³=64 possible outcomes per stage): + + Res 1 ── ω₁ ──┐ + Res 2 ── ω₂ ──┼──→ one of 64 combinations (most never observed) + Res 3 ── ω₃ ──┘ + +Trajectory sampling (conditioned on past): + + Stage 1: same as joint ──→ w₁ + │ + Stage 2: AR(1)(w₁) ──────→ w₂ (temporal correlation preserved) +``` + +## Internal functions + +The following internal helpers process uncertainty pools in different training +formulations. They are not part of the public API but documented here for +maintainability. + +- [`_remap_uncertainties`](@ref): Remap JuMP `VariableRef` keys when + copying uncertainty pools into a deterministic-equivalent model. + Two methods dispatch on per-unit vs. joint-scenario pool types. +- [`extract_uncertainty_params`](@ref): Extract just the `VariableRef` + parameters from an uncertainty pool, discarding the scenario values. + Used by `setup_shooting_windows` for multiple-shooting training. + +## API Reference + +```@docs +sample +``` diff --git a/examples/HydroPowerModels/README.md b/examples/HydroPowerModels/README.md index e367d6f..7a6d62b 100644 --- a/examples/HydroPowerModels/README.md +++ b/examples/HydroPowerModels/README.md @@ -15,15 +15,30 @@ generation to meet those targets at minimum cost. ## Training scripts +### TS-DDR (Deep Decision Rules — LSTM policy) + | Script | Decomposition | Reference | |--------|--------------|-----------| | `train_dr_hydropowermodels.jl` | Deterministic equivalent (GPU-enabled) | Extension §1 | | `train_dr_hydropowermodels_subproblems.jl` | Stage-wise (single shooting) | Extension §2 | | `train_dr_hydropowermodels_multipleshooting.jl` | Windowed (multiple shooting) | Extension §3 | -All three share the same data loader (`load_hydropowermodels.jl`) and -policy architecture (LSTM encoder + state-conditioned dense layers). 
-Training logs to Weights & Biases and saves the best model to JLD2. +These use a `StateConditionedPolicy` (LSTM encoder + state-conditioned dense +layers, `[128, 128]`, sigmoid activation). + +### TS-LDR (Linear Decision Rules — linear policy) + +| Script | Decomposition | Reference | +|--------|--------------|-----------| +| `train_ldr_hydropowermodels.jl` | Deterministic equivalent (GPU-enabled) | §3 | + +TS-LDR uses `dense_multilayer_nn` with identity activation — a composition +of linear layers that is equivalent to a single linear map from +(uncertainties, state) to targets. Same training pipeline as TS-DDR; the +only difference is the policy architecture. + +All training scripts share the data loader (`load_hydropowermodels.jl`), +log to Weights & Biases, and save the best model to JLD2. ### GPU training @@ -64,20 +79,30 @@ policy relying on slack. | Script | Purpose | |--------|---------| +| `evaluate_hydro_policies.jl` | Load all trained TS-DDR and TS-LDR models and evaluate on a common out-of-sample scenario set using stage-wise ACP rollout; writes `eval_costs.csv` | | `eval_jump_de.jl` | Solve the DE with a constant policy and save a reference solution (JLD2) for cross-validation with ExaModels | -| `test_dr_hydropowermodels.jl` | Load a trained model and produce volume/generation/cost comparison plots and CSVs | | `check_consistent_state_paths.jl` | Verify that stage-wise, deterministic equivalent, and multiple-shooting decompositions produce identical state trajectories under the same policy and inflows | ## SDDP baselines -These scripts require [HydroPowerModels.jl](https://github.com/andrewrosemberg/HydroPowerModels.jl), -Gurobi, and Mosek licenses. +These scripts use a dedicated Julia environment in `sddp/`. The inconsistent +SOC-backward/AC-forward baseline uses +[HydroPowerModels.jl](https://github.com/LAMPSPUC/HydroPowerModels.jl), SDDP.jl, +Clarabel for the SOC backward pass, and MadNLP for the AC forward pass. 
Training +runs log iteration and final simulation metrics to Weights & Biases using the +same keys as the DR runs: `metrics/loss` is the SDDP bound, and +`metrics/rollout_realized_objective_no_deficit` is the SDDP forward-pass +objective. SDDP iterations are logged as `batch` so W&B plots can share the same +x-axis as the DR training runs. Because SDDP solves the forward policy +stage-wise, that forward-pass objective is already the no-target-penalty +objective. | Script | Description | |--------|-------------| -| `run_sddp.jl` | Train SDDP with a consistent convex (SOCWRConic) formulation | -| `run_sddp_inconsistent.jl` | Train SDDP with SOCWRConic backward pass and ACP forward pass | -| `simulate_sddp_policy.jl` | Simulate a pre-trained SDDP policy under ACP and produce comparison plots | +| `sddp/run_sddp.jl` | Train SDDP with a consistent convex (SOCWRConic) formulation | +| `sddp/run_sddp_inconsistent.jl` | Train SDDP with SOCWRConic backward pass and ACP forward pass | +| `sddp/run_sddp_inconsistent.sbatch` | Submit the SOC-backward/AC-forward run with a 12-hour wall time | +| `sddp/simulate_sddp_policy.jl` | Simulate a pre-trained SDDP policy under ACP and produce comparison plots | ## Learning-to-Optimize (L2O) pipeline diff --git a/examples/HydroPowerModels/bolivia/ACPPowerModel/MeanGeneration.csv b/examples/HydroPowerModels/bolivia/ACPPowerModel/MeanGeneration.csv index 38c1229..d0b560d 100644 --- a/examples/HydroPowerModels/bolivia/ACPPowerModel/MeanGeneration.csv +++ b/examples/HydroPowerModels/bolivia/ACPPowerModel/MeanGeneration.csv @@ -1,97 +1,97 @@ TS-DDR,TS-LDR,SDDP-DCLL,SDDP-SOC -147.0247133419527,299.7450624492452,100.8484585807442,210.85805129912742 -157.58814081813028,225.6675464897668,104.86242775104952,211.20242951110967 -185.43371546547627,180.7943849292062,107.48502989328503,211.34903175695075 -199.3134646164604,199.83572444735833,107.55475020324891,211.50499902777818 
-206.61858480532382,231.35910431382658,106.18650990754062,213.03900650304772 -214.3697084291256,189.64814901428522,113.67317158934263,213.3433026957727 -187.9473304345247,241.6103392896273,116.58714528380276,213.53001862630535 -171.46565827999535,187.4542766440176,129.25366035880148,213.42630840733494 -170.2002528391322,233.150743363177,114.76063596215229,213.53273980518378 -182.98142893312487,185.52122552406792,109.93768423864336,213.45398275330976 -202.19901785621693,215.06714163113537,116.98303295186679,213.00893342509022 -193.96716199312775,199.9910790500363,121.32762963754153,213.34946528253167 -193.52811950138334,205.21889018439646,118.81044678071419,211.36762915185463 -195.13814426290378,184.97486998527484,140.68919880246779,210.47242267421387 -226.57039256458856,167.1372529242544,157.98870219622455,208.66880566556748 -248.08925112087996,177.07636222131532,182.13344732161957,208.8072773221114 -233.5593017377245,206.24124265365077,213.25680127887568,208.85548225347776 -212.98635880030474,203.58562602361448,219.65332808686793,209.21183174817384 -208.1182840666282,225.62114837627547,215.53856096654266,209.00239858283362 -205.87501153760303,282.5815380137001,218.69325537706618,208.35800693518277 -218.77529830243512,261.2652681819222,229.70401987822862,207.84410265509277 -219.5621182240177,228.86267450841828,233.80086503295337,208.84146637556242 -215.15355785846822,278.8379771241615,219.9540629779893,208.46432000930798 -225.44578342326264,208.6334543644992,227.25750181666496,208.5398276805071 -216.46699729157837,233.62842636523723,228.98664748977347,208.87782967565062 -210.99855030225916,236.77260170176334,245.24308942520355,208.618115177171 -200.82120574954234,257.832718296366,266.63858091994916,210.3109443150775 -217.00901852303974,206.80316405135653,287.580525250383,210.15033697956687 -205.60116177559695,247.22715853816413,288.43119570669523,209.7157833146924 -204.48717949362427,242.7626603477798,295.40865231198757,210.07161367481694 
-215.01700896149435,264.8798596812393,316.4539931784796,209.14538186180965 -194.39594990932568,253.05048627557306,329.7940889425171,209.0149875668554 -212.44246770637898,281.106369677497,327.59228806131625,208.94739091431111 -218.88119237987075,264.11335838123404,330.87683426769047,208.47934249184019 -209.54103152313093,265.672452188459,320.54817802711284,209.16634211748254 -232.50511220115487,242.47928625172216,336.16459151820277,210.2650760789873 -246.55559517109822,250.15790720527644,328.4530145309637,210.8958189450547 -259.1804770347488,241.4580174607301,306.6623006700101,210.21284850792782 -243.65212407401336,216.2392032573588,311.05102021329515,210.84944676111476 -268.3327047244221,244.87238185642838,313.43043807340126,216.04183457929315 -259.25135590920064,212.7607093726518,289.6446657672623,217.73032842094054 -239.7596346992519,211.60973323507133,236.34688350288008,212.03827129950005 -225.1627151901455,224.31226414469103,227.41520899884554,214.05479553340882 -226.300220128706,190.17002437882283,237.60749837482643,214.2688665376018 -246.5897657016228,197.34726823968018,242.14935260524663,214.4866449988023 -200.45447214778832,159.42600322266816,217.0164054870767,212.007886833974 -177.65036023699594,199.6529727342857,200.46699991530508,209.9183339789378 -152.7449270008933,184.7732720253377,215.7682105980341,210.72466969624674 -116.54920043933797,185.46879418205066,111.6333307935832,210.26544418415878 -118.11835868140619,166.0896692911,101.80408752144571,210.23698737179038 -135.40331893488192,159.49003675324482,106.15347696391866,210.35402839872972 -165.39822644830028,182.5511233035246,108.92328402254266,209.30901089378818 -164.63498605950215,157.20350355656583,103.94596690664551,208.01878145654337 -166.92019582010437,188.70842110946302,103.82258613108722,208.47415960986146 -153.73417427187314,161.11055875216002,112.13281054476954,209.23591063543003 -164.778321612267,181.93462654464733,161.52068981849445,208.22372981215094 
-160.9396855926067,146.59967315111336,169.88300890301332,206.5932949414058 -161.6813291137583,157.86943128110318,177.59131665764,206.6902407990834 -161.9520731591163,129.58260488127797,172.45550740547597,206.73951637050718 -161.5166767553574,192.37516226972488,177.4638905803286,206.95619327342342 -171.23315428685606,169.81265452067095,176.73862367347002,207.51071253650153 -203.84334411204006,189.01908062873738,182.19685255887867,209.81694080008674 -243.88748126609443,185.3643782473636,192.64656185901407,208.53610364457444 -255.36863069633904,189.10691135198536,210.14038242475175,208.4617557397368 -275.72231730419776,177.47926018459844,214.11405852350882,207.85209901607163 -231.14746381351682,203.55835450764843,216.8881876993481,207.6260731591064 -239.70981393223852,185.15015653078225,206.20963274723462,208.4778921323274 -236.55408488066604,209.9345245251102,228.06281826263384,208.23115750760718 -232.50309001204022,247.09182945796942,232.53663833367236,208.3506533209856 -211.09249114427467,224.13559766576765,229.0267023271387,207.95548864145377 -222.7816613997703,251.3620122443193,232.90850460804563,206.62135419445332 -219.16351862667358,241.65598188514326,228.66399277324248,205.7351933331613 -207.22133488700123,260.0058643392084,266.0049119099325,204.86776664864328 -217.06224817213152,249.65113999794553,228.7078025295844,204.38830132480373 -210.07761177326952,201.43928958603075,257.88491102881073,205.79912219619206 -216.6438147791045,209.70322997573356,256.3000900779569,205.84504565445877 -201.41487156994177,254.6147418912871,260.0864123226794,206.04459937322108 -207.97135419475285,233.44163279290632,245.1593291381967,203.60562199452835 -211.26689971022438,206.86923643700797,266.2262958784643,202.65541198826796 -240.1582296197083,249.82887487767167,286.04078934122384,204.0215906015123 -233.02195021621122,218.9756433172769,302.4507102096619,204.25738636756918 -237.29598744217932,254.3292021981702,310.8026658382449,201.8306318980746 
-262.7454036444481,254.22526995036333,312.90905574443457,200.48697941077813 -277.54946543623424,228.08210258025755,326.2879316740818,200.8089340858362 -259.14118496809317,237.00317892462166,325.13527205939647,203.97622797873652 -258.0603903695542,268.0249997632043,305.61306417662183,207.41992227810343 -263.1354853907414,238.83057850845063,298.7323501222888,207.68175937544407 -280.35492909981747,216.14279418347928,318.1451091369037,208.5046041321376 -271.56374567280704,229.39964558658272,286.6845872174376,209.98920231592425 -229.35820796650296,212.93348980868046,245.77044877320677,207.67020973138582 -230.49394051322878,203.67236378550288,224.55512987384654,202.38584149268576 -225.7601668320821,219.35714063162678,238.83881638414672,204.348082102984 -230.75407630714457,198.06757965648347,238.36899164573157,204.80416882648063 -221.31628509636886,149.20055233004982,218.55183620363255,200.6825798131114 -200.12300075458802,167.6799930782707,209.6087072026431,196.29537601929695 -138.65216945726675,143.81558253281227,219.38832428287833,206.63438898989997 +147.0247133419527,299.7450624492452,100.8484585807442,209.86380882088514 +157.58814081813028,225.6675464897668,104.86242775104952,209.53839385906838 +185.43371546547627,180.7943849292062,107.48502989328503,209.52017504171553 +199.3134646164604,199.83572444735833,107.55475020324891,209.4783394659705 +206.61858480532382,231.35910431382658,106.18650990754062,209.52518234738324 +214.3697084291256,189.64814901428522,113.67317158934263,209.6034477390574 +187.9473304345247,241.6103392896273,116.58714528380276,209.65384702850434 +171.46565827999535,187.4542766440176,129.25366035880148,209.51238800566185 +170.2002528391322,233.150743363177,114.76063596215229,209.45259355985954 +182.98142893312487,185.52122552406792,109.93768423864336,209.34525985187597 +202.19901785621693,215.06714163113537,116.98303295186679,209.44039837517906 +193.96716199312775,199.9910790500363,121.32762963754153,209.3672289663951 
+193.52811950138334,205.21889018439646,118.81044678071419,209.32163376554064 +195.13814426290378,184.97486998527484,140.68919880246779,209.6614054735699 +226.57039256458856,167.1372529242544,157.98870219622455,210.19631956661485 +248.08925112087996,177.07636222131532,182.13344732161957,210.70698165532528 +233.5593017377245,206.24124265365077,213.25680127887568,211.4401169112003 +212.98635880030474,203.58562602361448,219.65332808686793,211.48105832602775 +208.1182840666282,225.62114837627547,215.53856096654266,211.94334659983778 +205.87501153760303,282.5815380137001,218.69325537706618,211.8784711569352 +218.77529830243512,261.2652681819222,229.70401987822862,211.85037808628837 +219.5621182240177,228.86267450841828,233.80086503295337,212.00653608776338 +215.15355785846822,278.8379771241615,219.9540629779893,211.71653085812682 +225.44578342326264,208.6334543644992,227.25750181666496,211.72866526590903 +216.46699729157837,233.62842636523723,228.98664748977347,211.79095480066385 +210.99855030225916,236.77260170176334,245.24308942520355,211.9396216916039 +200.82120574954234,257.832718296366,266.63858091994916,212.2412014072736 +217.00901852303974,206.80316405135653,287.580525250383,211.96614587275292 +205.60116177559695,247.22715853816413,288.43119570669523,212.0593948656155 +204.48717949362427,242.7626603477798,295.40865231198757,212.13189528353013 +215.01700896149435,264.8798596812393,316.4539931784796,212.48730840572054 +194.39594990932568,253.05048627557306,329.7940889425171,212.170191104127 +212.44246770637898,281.106369677497,327.59228806131625,212.68099082676446 +218.88119237987075,264.11335838123404,330.87683426769047,212.9854607332948 +209.54103152313093,265.672452188459,320.54817802711284,212.90364434114954 +232.50511220115487,242.47928625172216,336.16459151820277,212.98400917882333 +246.55559517109822,250.15790720527644,328.4530145309637,212.90706046302626 +259.1804770347488,241.4580174607301,306.6623006700101,212.78491180246007 
+243.65212407401336,216.2392032573588,311.05102021329515,213.50440427222605 +268.3327047244221,244.87238185642838,313.43043807340126,214.71903130733853 +259.25135590920064,212.7607093726518,289.6446657672623,215.71309550713656 +239.7596346992519,211.60973323507133,236.34688350288008,214.21514981807115 +225.1627151901455,224.31226414469103,227.41520899884554,214.14602835221518 +226.300220128706,190.17002437882283,237.60749837482643,213.05570166141047 +246.5897657016228,197.34726823968018,242.14935260524663,214.3397929974062 +200.45447214778832,159.42600322266816,217.0164054870767,212.87589299860522 +177.65036023699594,199.6529727342857,200.46699991530508,210.512610777988 +152.7449270008933,184.7732720253377,215.7682105980341,208.87860648289214 +116.54920043933797,185.46879418205066,111.6333307935832,205.95870346385465 +118.11835868140619,166.0896692911,101.80408752144571,205.70698904522408 +135.40331893488192,159.49003675324482,106.15347696391866,205.4160873909239 +165.39822644830028,182.5511233035246,108.92328402254266,205.59700933991252 +164.63498605950215,157.20350355656583,103.94596690664551,205.58479725277505 +166.92019582010437,188.70842110946302,103.82258613108722,205.77526373955115 +153.73417427187314,161.11055875216002,112.13281054476954,205.88135777815637 +164.778321612267,181.93462654464733,161.52068981849445,206.02649485451556 +160.9396855926067,146.59967315111336,169.88300890301332,205.8871588917278 +161.6813291137583,157.86943128110318,177.59131665764,206.70433459244137 +161.9520731591163,129.58260488127797,172.45550740547597,206.366999884319 +161.5166767553574,192.37516226972488,177.4638905803286,206.38870455127955 +171.23315428685606,169.81265452067095,176.73862367347002,206.249716219982 +203.84334411204006,189.01908062873738,182.19685255887867,206.63455688813036 +243.88748126609443,185.3643782473636,192.64656185901407,205.9797537356252 +255.36863069633904,189.10691135198536,210.14038242475175,206.03555271847185 
+275.72231730419776,177.47926018459844,214.11405852350882,206.51953984726723 +231.14746381351682,203.55835450764843,216.8881876993481,206.12018860291096 +239.70981393223852,185.15015653078225,206.20963274723462,206.250132372938 +236.55408488066604,209.9345245251102,228.06281826263384,206.1138429495153 +232.50309001204022,247.09182945796942,232.53663833367236,206.4146828063853 +211.09249114427467,224.13559766576765,229.0267023271387,206.5605843384091 +222.7816613997703,251.3620122443193,232.90850460804563,206.40037604289813 +219.16351862667358,241.65598188514326,228.66399277324248,206.5295113131882 +207.22133488700123,260.0058643392084,266.0049119099325,206.8315496621826 +217.06224817213152,249.65113999794553,228.7078025295844,207.13270789037426 +210.07761177326952,201.43928958603075,257.88491102881073,207.77068168915991 +216.6438147791045,209.70322997573356,256.3000900779569,208.16087070287398 +201.41487156994177,254.6147418912871,260.0864123226794,208.40830707759466 +207.97135419475285,233.44163279290632,245.1593291381967,207.38456438434162 +211.26689971022438,206.86923643700797,266.2262958784643,206.9039133561306 +240.1582296197083,249.82887487767167,286.04078934122384,207.77917011114187 +233.02195021621122,218.9756433172769,302.4507102096619,207.558738022174 +237.29598744217932,254.3292021981702,310.8026658382449,208.03806284333123 +262.7454036444481,254.22526995036333,312.90905574443457,209.270249411708 +277.54946543623424,228.08210258025755,326.2879316740818,208.91105191830513 +259.14118496809317,237.00317892462166,325.13527205939647,208.38620445160527 +258.0603903695542,268.0249997632043,305.61306417662183,207.98642330663995 +263.1354853907414,238.83057850845063,298.7323501222888,209.43087634497854 +280.35492909981747,216.14279418347928,318.1451091369037,211.27160719784078 +271.56374567280704,229.39964558658272,286.6845872174376,210.0236414480106 +229.35820796650296,212.93348980868046,245.77044877320677,208.8917459728048 
+230.49394051322878,203.67236378550288,224.55512987384654,205.8564923994707 +225.7601668320821,219.35714063162678,238.83881638414672,208.55931411570532 +230.75407630714457,198.06757965648347,238.36899164573157,208.37478598750155 +221.31628509636886,149.20055233004982,218.55183620363255,205.669374135596 +200.12300075458802,167.6799930782707,209.6087072026431,200.7031326575932 +138.65216945726675,143.81558253281227,219.38832428287833,201.49222415721096 diff --git a/examples/HydroPowerModels/bolivia/ACPPowerModel/MeanVolume.csv b/examples/HydroPowerModels/bolivia/ACPPowerModel/MeanVolume.csv index eaae969..6b0f78f 100644 --- a/examples/HydroPowerModels/bolivia/ACPPowerModel/MeanVolume.csv +++ b/examples/HydroPowerModels/bolivia/ACPPowerModel/MeanVolume.csv @@ -1,97 +1,97 @@ TS-DDR,TS-LDR,SDDP-DCLL,SDDP-SOC -22.51732496848532,37.919375441643616,15.010226053947687,29.33194347862757 -47.20258811999907,71.04409366252557,27.799866146567854,56.51380736510445 -72.83176865479658,87.01935425652064,41.809771632008996,84.20898277540019 -91.6036862853376,110.9546703800459,54.779619691636846,110.27630381823403 -111.75802365829193,124.62427072370988,67.41758956472614,134.27275994415598 -132.06225985386516,140.7114941122255,76.61311783876522,155.39600670884147 -151.88292311509423,163.09183590390472,85.64543943322955,176.79982271494404 -167.62574263315506,176.9949496837815,90.09866940534309,200.7038069282425 -184.1432711637443,192.12358945548075,97.21306040484191,224.62760324276306 -199.10712684414443,211.72514898332275,100.754649277593,245.07104162682876 -213.03560361830984,234.5026497996314,107.46682144224346,272.4317856234269 -224.76897734579637,261.6236901648443,111.30922724569007,296.4571689351554 -238.39447203378649,282.426645017808,114.62593628727466,314.971343907225 -245.37549219043413,285.94648083840264,113.43065917802855,323.59617431541096 -252.4494932363904,296.90814187996295,110.20680607797237,326.08839288669947 
-250.45907641085287,296.9831633048792,106.20779976718435,318.4972488768354 -248.8524629192563,300.7632599043702,101.76735381105871,306.22180979408193 -245.3285907233545,290.30183231928174,95.88402955956306,292.9779998795557 -237.90416834064467,290.15844458648814,89.15470564095466,280.274491334906 -230.9413064277374,281.77833258453296,82.17436482998376,268.26270575999536 -221.1118971810357,281.1300748339776,74.6379921116223,254.79818728066502 -209.82043242344366,273.15550833073513,66.46062403663097,241.0049501227325 -197.17838472323572,255.4079970540009,58.1801555890566,229.11021885785559 -184.5082417295772,243.31108770222295,49.643407346969596,217.6998371921953 -171.364986731152,236.8863043514778,40.88630847779404,206.66414135128852 -158.67507679180278,228.75659130962944,32.42172641872411,194.91447234762936 -145.70893099014296,228.60197560222855,25.915677587474477,186.04793001489267 -132.80165423375078,219.4870486506905,17.949546658738377,173.36981451590145 -118.76446158949864,212.842623503242,10.91236331395401,163.46784473894132 -105.47673733968136,197.82735475771423,6.342665426831365,154.54992926931362 -92.28262527930282,183.60880119824964,2.9637113646432924,145.05997371744357 -80.08178289428561,178.21417462426308,1.4904346490510063,135.37427475280447 -68.05075167883604,174.38024725627528,0.6542767511259123,125.33618787791389 -57.33892828225943,163.75151223638323,-2.1629451753485584e-7,114.38326959746601 -47.91122996837452,158.30124789510745,0.01699339492037168,104.57111137056225 -39.71203072777365,146.6433762840449,0.013683141695575645,92.26121331847665 -32.08903884626882,138.74956149885196,-2.776369796429933e-5,77.95984718351467 -25.567879137873327,117.64643250247984,0.02029723087689403,68.14542674466739 -20.226109971544595,105.87824541767543,0.02942785780510852,57.80039246187566 -15.822029319315941,95.26193888634329,0.0015619407570932227,44.98308797532776 -11.86482403506727,82.88471848733366,0.0139202251526318,36.4345842628461 
-11.41444238123061,89.84998297693535,0.0666095021126761,33.672607545128585 -11.760709387675428,80.56903002626132,0.02911519239806684,31.779550259167188 -15.608653812842075,84.4812008303581,0.1451610159592241,29.167467682825105 -18.46652374296125,75.10873866189522,0.257529864394238,25.52786987723517 -19.154852416425946,79.66424881896181,0.34310578392406726,24.862484554227695 -17.76382346268219,82.33214023398799,0.04020249114668608,27.440814552437722 -11.770417835270539,77.16028138611318,0.1601258590007935,28.178903593901776 -28.575902113160843,100.30120799881679,8.290616244150943,51.18755046563618 -44.6500656371219,133.7662232684492,18.826473984628194,79.46990149570847 -58.93359681990834,144.39896871097727,26.967830366959436,107.64334396878635 -78.81921579909906,157.05027238202555,36.00161441777166,132.62153676565572 -96.31823829542331,172.46207271242636,47.67488678462672,163.03607490960206 -113.68465589999252,199.6980903165839,56.815618070614086,185.5902142428461 -131.44733746695687,218.31345641963154,65.0298903498588,205.6040788843083 -143.57022598996326,248.0669589387606,78.12103160217615,225.18336274841536 -160.2545160514829,276.4730547448535,95.53937029048053,246.97356190959027 -173.71546658765067,294.32632069405474,107.5932705161822,262.4752046573908 -188.87717865623495,289.0248763377732,118.23444331568548,278.0369939094454 -200.72440725109396,310.47111738812987,125.58297208702875,291.21933275895447 -211.54578497851003,321.62467412938156,135.04096202155375,311.8472483005798 -219.67787470209927,322.9377982306577,138.23259862143857,326.9447866786039 -225.5911712053431,313.0818205198798,137.52220454328514,331.6321292047405 -225.39958719867292,303.0418832885243,134.56161843454288,334.41533503174685 -223.0003148921021,295.41511330313085,130.39480512697176,330.21097313357666 -219.3358500411617,292.2241103571665,124.0594015578791,324.70715783895025 -215.7865012983455,282.52479046838965,117.20362201093069,318.9013992484797 
-208.9688074571684,270.8475139202375,109.91183630093335,309.0068767261927 -200.25820568279497,268.14577165979705,102.18413675040794,299.37788585534616 -189.3589474950667,274.8028403967962,94.29202902742942,291.0950904790007 -179.11347981019935,269.33461930444184,86.51972305194262,277.7102369291978 -166.46173064664583,261.0404132262943,77.8672879103141,263.4527539589003 -154.36450314445943,258.2200123763158,69.45071513510558,248.33011814559686 -142.61948735542381,250.57594722312155,61.04749888339048,235.64188913357484 -130.39403275688142,237.56983972063503,53.57905190860542,220.48242803133235 -118.09172135231981,232.9065992805976,45.31949038232733,204.10586308604817 -107.2702477985387,215.22999411819978,37.242727577570335,186.07281143864037 -97.0065973471623,205.80014155651824,28.961340379753675,172.53116562776583 -84.6333338632984,192.9275305544331,20.748060187265494,156.70296586552814 -74.44115642907373,175.65153529061962,13.951363389962466,138.83089629990818 -65.89676047851572,159.63588608030236,9.971531743391353,125.07480741352246 -59.101026640990746,157.6032563807769,5.649949808269948,109.34879298512287 -50.4628042074048,151.14613237451394,2.8422167357923183,93.4456297857743 -43.380434756627224,142.1297446607122,0.8089371675179539,77.96036680911685 -37.2402039626919,131.15093810046022,-2.7757075579902497e-5,64.8649411952869 -32.6377410503266,127.61272326799181,0.014576637404480192,58.483461680238136 -29.407339371204802,122.7377666202245,0.020931438505678426,49.55232088062062 -24.06393930942189,85.84155793018172,0.0062571966267834424,37.76748815084717 -23.127901917898463,65.85303059430143,0.03619878202007641,29.09502728354922 -22.924839043559988,62.40507538369134,0.23070127422069103,25.66711762242007 -22.007801580404273,59.154416378847564,0.49709652305249996,23.17058109737539 -21.136620549198433,54.197233720503064,0.12869349623927082,17.637496022430913 -19.031142363960612,38.14249299045677,0.2798932995958727,12.974210990765622 
-17.40204173868524,30.7482859887398,0.6275685251939801,10.510870387587385 -12.246953654446704,19.1380483634613,0.7553403609073082,9.280752131127022 -0.40074148030873114,1.9740202514084515,-2.7761152226093412e-5,6.682329562456955 +22.51732496848532,37.919375441643616,15.010226053947687,29.88884403814745 +47.20258811999907,71.04409366252557,27.799866146567854,58.85398607133367 +72.83176865479658,87.01935425652064,41.809771632008996,82.61277155868845 +91.6036862853376,110.9546703800459,54.779619691636846,105.02407149811434 +111.75802365829193,124.62427072370988,67.41758956472614,127.58313808354393 +132.06225985386516,140.7114941122255,76.61311783876522,147.63328200911607 +151.88292311509423,163.09183590390472,85.64543943322955,168.65289410472099 +167.62574263315506,176.9949496837815,90.09866940534309,189.75454495424944 +184.1432711637443,192.12358945548075,97.21306040484191,218.11088982453344 +199.10712684414443,211.72514898332275,100.754649277593,246.93492472290026 +213.03560361830984,234.5026497996314,107.46682144224346,272.21531353974717 +224.76897734579637,261.6236901648443,111.30922724569007,292.90711864275784 +238.39447203378649,282.426645017808,114.62593628727466,311.83209342410777 +245.37549219043413,285.94648083840264,113.43065917802855,322.5848167767246 +252.4494932363904,296.90814187996295,110.20680607797237,322.2425933406325 +250.45907641085287,296.9831633048792,106.20779976718435,320.00444979375743 +248.8524629192563,300.7632599043702,101.76735381105871,312.8087106999844 +245.3285907233545,290.30183231928174,95.88402955956306,303.1966369847269 +237.90416834064467,290.15844458648814,89.15470564095466,296.46398240261857 +230.9413064277374,281.77833258453296,82.17436482998376,286.69479612549327 +221.1118971810357,281.1300748339776,74.6379921116223,275.7387873489527 +209.82043242344366,273.15550833073513,66.46062403663097,264.90582250183024 +197.17838472323572,255.4079970540009,58.1801555890566,252.08676360007652 
+184.5082417295772,243.31108770222295,49.643407346969596,237.85231993230704 +171.364986731152,236.8863043514778,40.88630847779404,224.51757587451243 +158.67507679180278,228.75659130962944,32.42172641872411,211.62777515165263 +145.70893099014296,228.60197560222855,25.915677587474477,197.57381524080466 +132.80165423375078,219.4870486506905,17.949546658738377,182.8952051783611 +118.76446158949864,212.842623503242,10.91236331395401,169.0595060088052 +105.47673733968136,197.82735475771423,6.342665426831365,156.70187722582688 +92.28262527930282,183.60880119824964,2.9637113646432924,143.70426904824114 +80.08178289428561,178.21417462426308,1.4904346490510063,131.34345908273133 +68.05075167883604,174.38024725627528,0.6542767511259123,119.6560155151396 +57.33892828225943,163.75151223638323,-2.1629451753485584e-7,107.89499970347059 +47.91122996837452,158.30124789510745,0.01699339492037168,95.99205089892341 +39.71203072777365,146.6433762840449,0.013683141695575645,81.98936536386094 +32.08903884626882,138.74956149885196,-2.776369796429933e-5,70.22278074065235 +25.567879137873327,117.64643250247984,0.02029723087689403,57.957500191547524 +20.226109971544595,105.87824541767543,0.02942785780510852,48.218945817433884 +15.822029319315941,95.26193888634329,0.0015619407570932227,36.867704663165426 +11.86482403506727,82.88471848733366,0.0139202251526318,29.26424780181313 +11.41444238123061,89.84998297693535,0.0666095021126761,25.92639157504742 +11.760709387675428,80.56903002626132,0.02911519239806684,26.098422131593825 +15.608653812842075,84.4812008303581,0.1451610159592241,22.394312574066713 +18.46652374296125,75.10873866189522,0.257529864394238,19.330147292287013 +19.154852416425946,79.66424881896181,0.34310578392406726,19.064203280951464 +17.76382346268219,82.33214023398799,0.04020249114668608,20.09172473445721 +11.770417835270539,77.16028138611318,0.1601258590007935,19.35396878018543 +28.575902113160843,100.30120799881679,8.290616244150943,46.258638018871736 
+44.6500656371219,133.7662232684492,18.826473984628194,73.04283772380003 +58.93359681990834,144.39896871097727,26.967830366959436,95.94410009405843 +78.81921579909906,157.05027238202555,36.00161441777166,117.71853421648952 +96.31823829542331,172.46207271242636,47.67488678462672,140.18906733662052 +113.68465589999252,199.6980903165839,56.815618070614086,161.20146693953137 +131.44733746695687,218.31345641963154,65.0298903498588,181.69307761993895 +143.57022598996326,248.0669589387606,78.12103160217615,199.8999392076057 +160.2545160514829,276.4730547448535,95.53937029048053,220.31192988484966 +173.71546658765067,294.32632069405474,107.5932705161822,239.09635346381702 +188.87717865623495,289.0248763377732,118.23444331568548,255.85211397472423 +200.72440725109396,310.47111738812987,125.58297208702875,272.7401007551301 +211.54578497851003,321.62467412938156,135.04096202155375,292.5534588326327 +219.67787470209927,322.9377982306577,138.23259862143857,305.2552176250861 +225.5911712053431,313.0818205198798,137.52220454328514,306.9497613201085 +225.39958719867292,303.0418832885243,134.56161843454288,303.58072451918565 +223.0003148921021,295.41511330313085,130.39480512697176,295.56194440948934 +219.3358500411617,292.2241103571665,124.0594015578791,285.94793100031006 +215.7865012983455,282.52479046838965,117.20362201093069,278.35828561341475 +208.9688074571684,270.8475139202375,109.91183630093335,268.7469595331369 +200.25820568279497,268.14577165979705,102.18413675040794,256.9288932038997 +189.3589474950667,274.8028403967962,94.29202902742942,246.23379738459545 +179.11347981019935,269.33461930444184,86.51972305194262,235.28235566184617 +166.46173064664583,261.0404132262943,77.8672879103141,222.9405685439762 +154.36450314445943,258.2200123763158,69.45071513510558,209.73529860734234 +142.61948735542381,250.57594722312155,61.04749888339048,197.4097594424145 +130.39403275688142,237.56983972063503,53.57905190860542,183.16067786345798 
+118.09172135231981,232.9065992805976,45.31949038232733,167.08432725260056 +107.2702477985387,215.22999411819978,37.242727577570335,153.20481806438926 +97.0065973471623,205.80014155651824,28.961340379753675,140.27220558777037 +84.6333338632984,192.9275305544331,20.748060187265494,128.88089859059698 +74.44115642907373,175.65153529061962,13.951363389962466,115.32978413235291 +65.89676047851572,159.63588608030236,9.971531743391353,102.90650396427401 +59.101026640990746,157.6032563807769,5.649949808269948,91.77112364358631 +50.4628042074048,151.14613237451394,2.8422167357923183,81.38224893770428 +43.380434756627224,142.1297446607122,0.8089371675179539,69.40743842003496 +37.2402039626919,131.15093810046022,-2.7757075579902497e-5,58.51697840372556 +32.6377410503266,127.61272326799181,0.014576637404480192,49.72583740775145 +29.407339371204802,122.7377666202245,0.020931438505678426,41.99241630483234 +24.06393930942189,85.84155793018172,0.0062571966267834424,29.914239653894793 +23.127901917898463,65.85303059430143,0.03619878202007641,21.935606087997716 +22.924839043559988,62.40507538369134,0.23070127422069103,18.650205536148956 +22.007801580404273,59.154416378847564,0.49709652305249996,16.31324545284173 +21.136620549198433,54.197233720503064,0.12869349623927082,12.457190721911122 +19.031142363960612,38.14249299045677,0.2798932995958727,8.11780690376344 +17.40204173868524,30.7482859887398,0.6275685251939801,6.793617876065368 +12.246953654446704,19.1380483634613,0.7553403609073082,6.373971458330337 +0.40074148030873114,1.9740202514084515,-2.7761152226093412e-5,4.45582104449006 diff --git a/examples/HydroPowerModels/bolivia/ACPPowerModel/SDDP-bolivia-SOCWRConicPowerModel-ACPPowerModel-Volume.png b/examples/HydroPowerModels/bolivia/ACPPowerModel/SDDP-bolivia-SOCWRConicPowerModel-ACPPowerModel-Volume.png new file mode 100644 index 0000000..b8d3b29 Binary files /dev/null and 
b/examples/HydroPowerModels/bolivia/ACPPowerModel/SDDP-bolivia-SOCWRConicPowerModel-ACPPowerModel-Volume.png differ diff --git a/examples/HydroPowerModels/bolivia/ACPPowerModel/SDDP-bolivia-SOCWRConicPowerModel-ACPPowerModel-thermal.png b/examples/HydroPowerModels/bolivia/ACPPowerModel/SDDP-bolivia-SOCWRConicPowerModel-ACPPowerModel-thermal.png new file mode 100644 index 0000000..715ae9f Binary files /dev/null and b/examples/HydroPowerModels/bolivia/ACPPowerModel/SDDP-bolivia-SOCWRConicPowerModel-ACPPowerModel-thermal.png differ diff --git a/examples/HydroPowerModels/evaluate_hydro_policies.jl b/examples/HydroPowerModels/evaluate_hydro_policies.jl new file mode 100644 index 0000000..cf59032 --- /dev/null +++ b/examples/HydroPowerModels/evaluate_hydro_policies.jl @@ -0,0 +1,234 @@ +# Evaluate pre-trained TS-DDR and TS-LDR policies on the Bolivia LTHD problem +# using stage-wise rollout under the ACP formulation with a fixed scenario set. +# +# This produces an apples-to-apples comparison across all methods using the +# same evaluation protocol: +# - stage-wise AC-OPF subproblems (Ipopt) +# - realized-state feedback (closed-loop / deployment semantics) +# - same seed and number of out-of-sample scenarios +# - operational cost excluding target-deficit penalty +# +# The script auto-discovers saved .jld2 checkpoints and reconstructs the +# correct policy architecture (LDR vs DDR) from the filename. +# Results are written to eval_costs.csv. +# +# Usage: +# julia --project=. 
evaluate_hydro_policies.jl [NUM_SIMULATIONS] +# +# Environment overrides: +# DR_EVAL_SIMULATIONS=100 number of out-of-sample scenarios +# DR_EVAL_SEED=1221 random seed for scenario generation + +using DecisionRules +using Statistics +using Random +using Flux +using Ipopt +using DiffOpt +using JLD2 +using JuMP +using CSV +using DataFrames + +const HYDRO_DIR = dirname(@__FILE__) +include(joinpath(HYDRO_DIR, "load_hydropowermodels.jl")) + +const CASE_NAME = "bolivia" +const FORMULATION = "ACPPowerModel" +const FORMULATION_FILE = FORMULATION * ".mof.json" +const NUM_STAGES = 96 +const NUM_SIMULATIONS = parse(Int, get(ENV, "DR_EVAL_SIMULATIONS", + length(ARGS) >= 1 ? ARGS[1] : "100")) +const SEED = parse(Int, get(ENV, "DR_EVAL_SEED", "1221")) + +const CASE_DIR = joinpath(HYDRO_DIR, CASE_NAME) +const OUT_DIR = joinpath(CASE_DIR, FORMULATION) +const MODEL_DIR = joinpath(OUT_DIR, "models") + +println("="^60) +println("Policy Evaluation (TS-DDR + TS-LDR)") +println("="^60) +println("Case: ", CASE_NAME) +println("Formulation: ", FORMULATION) +println("Stages: ", NUM_STAGES) +println("Simulations: ", NUM_SIMULATIONS) +println("Seed: ", SEED) +println("="^60) + +# ── Build stage-wise subproblems ───────────────────────────────────────────── + +diff_optimizer = () -> DiffOpt.diff_optimizer( + optimizer_with_attributes( + Ipopt.Optimizer, "print_level" => 0, "linear_solver" => "mumps", + ), +) + +subproblems, state_params_in, state_params_out, uncertainty_samples, initial_state, max_volume = + build_hydropowermodels( + CASE_DIR, FORMULATION_FILE; + num_stages=NUM_STAGES, + optimizer=diff_optimizer, + penalty_l1=:auto, penalty_l2=:auto, + ) + +num_hydro = length(initial_state) +num_uncertainties = length(uncertainty_samples[1][1]) +num_inputs = DecisionRules.policy_input_dim(num_uncertainties, num_hydro) + +# ── Generate fixed scenario set ────────────────────────────────────────────── + +Random.seed!(SEED) +eval_scenarios = [DecisionRules.sample(uncertainty_samples) for _ in 
1:NUM_SIMULATIONS] + +# ── Discover saved models ──────────────────────────────────────────────────── +# +# Model files encode the training method and policy type in their filename: +# *-deteq-* → DDR trained with deterministic equivalent +# *-subproblems-* → DDR trained with stage-wise decomposition +# *-shooting-* → DDR trained with multiple shooting +# *-ldr-* → LDR (linear decision rule) +# +# DDR models use state_conditioned_policy (LSTM [128,128], sigmoid). +# LDR models use dense_multilayer_nn (identity activation, [64,64]). +# The most recent file (by lexicographic sort on timestamps) is selected +# for each method. + +struct PolicySpec + label::String + model_file::String + is_ldr::Bool +end + +function _method_variant(base) + method = if contains(base, "ldr") + "ldr" + elseif contains(base, "shooting") + "shooting" + elseif contains(base, "subproblems") + "subproblems" + elseif contains(base, "deteq") + "deteq" + else + return nothing + end + clip_tag = contains(base, "clip") ? "-clip" : "" + sched_tag = contains(base, "anneal") ? "-anneal" : + contains(base, "const") ? 
"-const" : "" + return method * clip_tag * sched_tag +end + +function _variant_label(variant) + labels = Dict( + "subproblems-anneal" => "Subproblems (anneal)", + "subproblems-clip-anneal" => "Subproblems (clip, anneal)", + "subproblems-const" => "Subproblems (const)", + "subproblems-clip-const" => "Subproblems (clip, const)", + "subproblems" => "Subproblems", + "shooting-anneal" => "Shooting w=12 (anneal)", + "shooting-clip-anneal" => "Shooting w=12 (clip, anneal)", + "shooting" => "Shooting w=12", + "deteq-anneal" => "DE (anneal)", + "deteq-clip-anneal" => "DE (clip, anneal)", + "deteq" => "DE", + "ldr" => "TS-LDR", + ) + return get(labels, variant, variant) +end + +function discover_policies(model_dir) + files = sort(filter(f -> endswith(f, ".jld2"), readdir(model_dir; join=true))) + best = Dict{String,Tuple{String,Bool}}() + for f in files + base = basename(f) + variant = _method_variant(base) + isnothing(variant) && continue + is_ldr = contains(base, "ldr") + best[variant] = (f, is_ldr) + end + specs = PolicySpec[] + for (variant, (path, is_ldr)) in sort(collect(best); by=first) + push!(specs, PolicySpec(_variant_label(variant), path, is_ldr)) + end + return specs +end + +function build_policy(spec::PolicySpec, num_inputs, num_hydro, num_uncertainties) + if spec.is_ldr + return dense_multilayer_nn(num_inputs, num_hydro, Int64[64, 64]; activation=identity) + else + return state_conditioned_policy( + num_uncertainties, num_hydro, num_hydro, Int64[128, 128]; + activation=sigmoid, encoder_type=Flux.LSTM, + ) + end +end + +policies = discover_policies(MODEL_DIR) +println("\nDiscovered policies:") +for p in policies + tag = p.is_ldr ? 
" (LDR)" : " (DDR)" + println(" ", p.label, tag, " → ", basename(p.model_file)) +end + +# ── Evaluate each policy ───────────────────────────────────────────────────── + +results = DataFrame() + +for spec in policies + println("\nEvaluating: ", spec.label) + + models = build_policy(spec, num_inputs, num_hydro, num_uncertainties) + model_state = JLD2.load(spec.model_file, "model_state") + Flux.loadmodel!(models, model_state) + + objectives_no_deficit = Vector{Float64}(undef, NUM_SIMULATIONS) + objectives_total = Vector{Float64}(undef, NUM_SIMULATIONS) + + for i in 1:NUM_SIMULATIONS + Flux.reset!(models) + + objectives_total[i] = simulate_multistage( + subproblems, + state_params_in, + state_params_out, + initial_state, + eval_scenarios[i], + models; + ) + + objectives_no_deficit[i] = DecisionRules.get_objective_no_target_deficit(subproblems) + end + + violation_share = 1.0 - mean(objectives_no_deficit) / mean(objectives_total) + + println(" Mean cost (no deficit): ", round(mean(objectives_no_deficit); digits=1)) + println(" Std: ", round(std(objectives_no_deficit); digits=1)) + println(" Violation share: ", round(violation_share * 100; digits=2), "%") + + results[!, spec.label] = objectives_no_deficit +end + +# ── Write results ──────────────────────────────────────────────────────────── + +costs_file = joinpath(OUT_DIR, "eval_costs.csv") +CSV.write(costs_file, results) +println("\nSaved: ", costs_file) + +# ── Summary table ──────────────────────────────────────────────────────────── + +println("\n", "="^70) +println(rpad("Method", 35), rpad("Mean", 12), rpad("Std", 12), "N") +println("-"^70) +for col in names(results) + vals = results[!, col] + println( + rpad(col, 35), + rpad(string(round(mean(vals); digits=1)), 12), + rpad(string(round(std(vals); digits=1)), 12), + length(vals), + ) +end +println("="^70) +println("\nNote: SDDP results are from sddp/simulate_sddp_policy.jl") +println("SDDP uses 126 stages (96 + 30 margin) to avoid end-of-horizon effects,") 
+println("while TS-DDR/TS-LDR use 96 stages. This gives SDDP a structural advantage.") diff --git a/examples/HydroPowerModels/load_hydropowermodels.jl b/examples/HydroPowerModels/load_hydropowermodels.jl index a86e287..613b425 100644 --- a/examples/HydroPowerModels/load_hydropowermodels.jl +++ b/examples/HydroPowerModels/load_hydropowermodels.jl @@ -47,7 +47,7 @@ function build_hydropowermodels( subproblems = Vector{JuMP.Model}(undef, num_stages) state_params_in = Vector{Vector{Any}}(undef, num_stages) state_params_out = Vector{Vector{Tuple{Any,VariableRef}}}(undef, num_stages) - uncertainty_samples = Vector{Vector{Tuple{VariableRef,Vector{Float64}}}}( + uncertainty_samples = Vector{Vector{Vector{Tuple{VariableRef,Float64}}}}( undef, num_stages ) @@ -78,13 +78,14 @@ function build_hydropowermodels( variable_to_parameter(subproblems[t], state_param_out[i]; deficit=_deficit[i]) for i in 1:nHyd ] - inflow = [ - ( - variable_to_parameter(subproblems[t], inflow[i]), - vector_inflows[i][t, :] .+ 0.0, - ) for i in 1:nHyd + # Joint scenarios: all hydro units share the same scenario index ω, + # preserving the spatial correlation in the historical inflow data. + inflow_params = [variable_to_parameter(subproblems[t], inflow[i]) for i in 1:nHyd] + joint_scenarios = [ + [(inflow_params[i], vector_inflows[i][t, ω] + 0.0) for i in 1:nHyd] + for ω in 1:nCen ] - uncertainty_samples[t] = inflow + uncertainty_samples[t] = joint_scenarios end return subproblems, diff --git a/examples/HydroPowerModels/run_sddp.jl b/examples/HydroPowerModels/run_sddp.jl deleted file mode 100644 index 6a806e2..0000000 --- a/examples/HydroPowerModels/run_sddp.jl +++ /dev/null @@ -1,120 +0,0 @@ -# SDDP baseline: train and simulate SDDP policy on the Bolivia LTHD problem -# using a consistent (convex) formulation for both forward and backward passes. -# Requires HydroPowerModels.jl and a Mosek license. 
-using Gurobi -using MosekTools -using HydroPowerModels -using JuMP -using Statistics -import SDDP: stopping_rule_status, convergence_test, PolicyGraph, AbstractStoppingRule, Log -using Wandb, Dates, Logging - -using Random -seed = 1221 - -# Load case -case = "bolivia" -case_dir = joinpath(dirname(@__FILE__), case) -alldata = HydroPowerModels.parse_folder(case_dir); -for load in values(alldata[1]["powersystem"]["load"]) - load["qd"] = load["qd"] * 0.6 - load["pd"] = load["pd"] * 0.6 -end -rm_stages = 30 -num_stages = 96 + rm_stages -formulation = SOCWRConicPowerModel - -params = create_param(; - stages=num_stages, - model_constructor_grid=formulation, - post_method=PowerModels.build_opf, - optimizer=Mosek.Optimizer, -); - -m = hydro_thermal_operation(alldata, params); - -# Wandb stopping rule: logs every iteration but never triggers termination -mutable struct WandBLog <: SDDP.AbstractStoppingRule - lg -end - -SDDP.stopping_rule_status(::WandBLog) = :not_solved - -save_file = "SDDP-$(case)-$(formulation)-$(formulation)-h$(num_stages)-$(now())" - -cuts_file = joinpath( - case_dir, string(formulation), string(formulation)*"-"*string(formulation)*".cuts.json" -) - -function SDDP.convergence_test( - policy::SDDP.PolicyGraph, log::Vector{SDDP.Log}, rule::WandBLog -) - SDDP.write_cuts_to_file( - policy, - joinpath( - case_dir, - string(formulation), - string(formulation)*"-"*string(formulation)*".cuts.json", - ), - ) - - Wandb.log( - rule.lg, - Dict( - "iteration" => length(log), - "bound" => log[end].bound, - "metrics/loss" => log[end].simulation_value, - ), - ) - return false -end - -lg = WandbLogger(; project="HydroPowerModels", name=save_file, save_code=false) - -# ## Train -Random.seed!(seed) -start_time = time() -HydroPowerModels.train( - m; - iteration_limit=200, - stopping_rules=[ - WandBLog(lg); SDDP.Statistical(; num_replications=300, iteration_period=50) - ], -); -end_time = time() - start_time - -# Termination Status and solve time (s) 
-(SDDP.termination_status(m.forward_graph), end_time) - -# save cuts -SDDP.write_cuts_to_file( - m.forward_graph, - joinpath( - case_dir, - string(formulation), - string(formulation)*"-"*string(formulation)*".cuts.json", - ), -) - -# ## Simulation -using Random: Random -Random.seed!(seed) -results = HydroPowerModels.simulate(m, 300); - -# ## Objective -objective_values = [ - sum(results[:simulations][i][t][:stage_objective] for t in 1:(num_stages - rm_stages)) - for i in 1:length(results[:simulations]) -] -println("Mean Sim: ", mean(objective_values)) - -Wandb.log( - lg, - Dict( - "bound" => SDDP.calculate_bound(m.forward_graph), - "metrics/final_loss" => mean(objective_values), - ), -) - -# Finish the run -close(lg) diff --git a/examples/HydroPowerModels/run_sddp_inconsistent.jl b/examples/HydroPowerModels/run_sddp_inconsistent.jl deleted file mode 100644 index de67550..0000000 --- a/examples/HydroPowerModels/run_sddp_inconsistent.jl +++ /dev/null @@ -1,114 +0,0 @@ -# SDDP baseline with inconsistent formulations: train SDDP with a convex -# backward-pass formulation (SOCWRConic) and an AC forward-pass formulation, -# then simulate the resulting policy under the AC model. -# Requires HydroPowerModels.jl, Gurobi, Mosek, and MadNLP. 
-using Gurobi -using MosekTools -using MadNLP -using HydroPowerModels -using JuMP -using Statistics -import SDDP: stopping_rule_status, convergence_test, PolicyGraph, AbstractStoppingRule, Log -using Wandb, Dates, Logging - -using Random -seed = 1221 - -# Load case -case = "bolivia" -case_dir = joinpath(dirname(@__FILE__), case) -alldata = HydroPowerModels.parse_folder(case_dir); -for load in values(alldata[1]["powersystem"]["load"]) - load["qd"] = load["qd"] * 0.6 - load["pd"] = load["pd"] * 0.6 -end -rm_stages = 30 -num_stages = 96 + rm_stages -formulation_b = SOCWRConicPowerModel -formulation = ACPPowerModel - -# Wandb stopping rule -mutable struct WandBLog <: SDDP.AbstractStoppingRule - lg -end - -SDDP.stopping_rule_status(::WandBLog) = :not_solved - -save_file = "SDDP-$(case)-$(formulation)-$(formulation_b)-h$(num_stages)-$(now())" - -cuts_file = joinpath( - case_dir, - string(formulation), - string(formulation_b)*"-"*string(formulation)*".cuts.json", -) - -function SDDP.convergence_test( - policy::SDDP.PolicyGraph, log::Vector{SDDP.Log}, rule::WandBLog -) - SDDP.write_cuts_to_file(policy, cuts_file) - Wandb.log( - rule.lg, - Dict( - "iteration" => length(log), - "bound" => log[end].bound, - "metrics/loss" => log[end].simulation_value, - ), - ) - return false -end - -lg = WandbLogger(; project="HydroPowerModels", name=save_file, save_code=false) - -# Train: SOCWRConic backward pass, ACP forward pass via MadNLP -Random.seed!(seed) - -params = create_param(; - stages=num_stages, - model_constructor_grid=formulation_b, - model_constructor_grid_forward=formulation, - post_method=PowerModels.build_opf, - optimizer=Mosek.Optimizer, - optimizer_forward=() -> MadNLP.Optimizer(; print_level=MadNLP.INFO), -); - -m = hydro_thermal_operation(alldata, params); - -if isfile(cuts_file) - SDDP.read_cuts_from_file(m.forward_graph, cuts_file) -end - -start_time = time() -HydroPowerModels.train( - m; - iteration_limit=2000, - stopping_rules=[ - WandBLog(lg); SDDP.Statistical(; 
num_replications=300, iteration_period=200) - ], -); -end_time = time() - start_time - -(SDDP.termination_status(m.forward_graph), end_time) - -SDDP.write_cuts_to_file(m.forward_graph, cuts_file) - -# Simulation -using Random: Random -Random.seed!(seed) -results = HydroPowerModels.simulate(m, 300); - -# Objective -objective_values = [ - sum(results[:simulations][i][t][:stage_objective] for t in 1:(num_stages - rm_stages)) - for i in 1:length(results[:simulations]) -] -println("Mean Sim: ", mean(objective_values)) - -Wandb.log( - lg, - Dict( - "bound" => SDDP.calculate_bound(m.forward_graph), - "metrics/final_loss" => mean(objective_values), - ), -) - -close(lg) diff --git a/examples/HydroPowerModels/sddp/Project.toml b/examples/HydroPowerModels/sddp/Project.toml new file mode 100644 index 0000000..2aac97c --- /dev/null +++ b/examples/HydroPowerModels/sddp/Project.toml @@ -0,0 +1,11 @@ +[deps] +CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" +Clarabel = "61c947e1-3e6d-4ee4-985a-eec8c727bd6e" +DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" +HydroPowerModels = "1bf2e10f-7293-4f36-bafb-f7584ca75eae" +JuMP = "4076af6c-e467-56ae-b986-b466b2749572" +MadNLP = "2621e9c9-9eb4-46b1-8089-e8c72242dfb6" +Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" +PowerModels = "c36e90e8-916a-50a6-bd94-075b64ef4655" +SDDP = "f4570300-c277-11e8-125c-4912f86ce65d" +Wandb = "ad70616a-06c9-5745-b1f1-6a5f42545108" diff --git a/examples/HydroPowerModels/sddp/run_sddp.jl b/examples/HydroPowerModels/sddp/run_sddp.jl new file mode 100644 index 0000000..6643640 --- /dev/null +++ b/examples/HydroPowerModels/sddp/run_sddp.jl @@ -0,0 +1,168 @@ +# SDDP baseline: train and simulate SDDP policy on the Bolivia LTHD problem +# using a consistent convex SOCWRConic formulation. 
+
+using Clarabel
+using HydroPowerModels
+using JuMP
+using Logging
+using PowerModels
+using Random
+using SDDP
+using Statistics
+using Wandb, Dates
+
+# Run configuration. Each DR_SDDP_* environment variable is optional; the literal
+# passed to `get` is the default used when the variable is unset.
+const SEED = parse(Int, get(ENV, "DR_SDDP_SEED", "1221"))
+const CASE = get(ENV, "DR_SDDP_CASE", "bolivia")
+const HYDRO_DIR = dirname(@__DIR__)
+const CASE_DIR = joinpath(HYDRO_DIR, CASE)
+# Trailing stages that are solved but left out of the reported objective.
+const RM_STAGES = parse(Int, get(ENV, "DR_SDDP_RM_STAGES", "30"))
+const NUM_STAGES = parse(Int, get(ENV, "DR_SDDP_NUM_STAGES", string(96 + RM_STAGES)))
+const ITERATION_LIMIT = parse(Int, get(ENV, "DR_SDDP_ITERATION_LIMIT", "200"))
+const NUM_SIMULATIONS = parse(Int, get(ENV, "DR_SDDP_SIMULATIONS", "300"))
+const STAT_REPLICATIONS = parse(Int, get(ENV, "DR_SDDP_STAT_REPLICATIONS", "300"))
+const STAT_PERIOD = parse(Int, get(ENV, "DR_SDDP_STAT_PERIOD", "50"))
+const FORMULATION = SOCWRConicPowerModel
+const save_file = "SDDP-$(CASE)-$(FORMULATION)-$(FORMULATION)-h$(NUM_STAGES)-$(Dates.now())"
+const CUTS_FILE = joinpath(
+    CASE_DIR,
+    string(FORMULATION),
+    string(FORMULATION) * "-" * string(FORMULATION) * ".cuts.json",
+)
+
+# Build a silent Clarabel optimizer. DR_SDDP_CLARABEL_MAX_ITER overrides the
+# iteration cap; DR_SDDP_CLARABEL_TOL sets the absolute-gap, relative-gap and
+# feasibility tolerances together.
+function clarabel_optimizer()
+    # One tolerance drives all three criteria, so read and parse it once.
+    tol = parse(Float64, get(ENV, "DR_SDDP_CLARABEL_TOL", "1e-7"))
+    return Clarabel.Optimizer(;
+        verbose=false,
+        max_iter=parse(Int, get(ENV, "DR_SDDP_CLARABEL_MAX_ITER", "1000")),
+        tol_gap_abs=tol,
+        tol_gap_rel=tol,
+        tol_feas=tol,
+    )
+end
+
+# Pseudo stopping rule: it never stops training, it only checkpoints the cuts
+# and forwards per-iteration metrics to the W&B logger stored in `lg`.
+mutable struct WandBLog <: SDDP.AbstractStoppingRule
+    cuts_file::String
+    lg
+end
+
+SDDP.stopping_rule_status(::WandBLog) = :not_solved
+
+# Called with the training log; writes the current cuts to `rule.cuts_file`,
+# logs the latest bound and simulation value, and always returns `false` so that
+# this rule never terminates training.
+function SDDP.convergence_test(
+    policy::SDDP.PolicyGraph,
+    log::Vector{SDDP.Log},
+    rule::WandBLog,
+)
+    mkpath(dirname(rule.cuts_file))
+    SDDP.write_cuts_to_file(policy, rule.cuts_file)
+    latest = log[end]
+    Wandb.log(
+        rule.lg,
+        Dict(
+            "batch" => length(log),
+            "metrics/loss" => latest.bound,
+            "metrics/rollout_realized_objective_no_deficit" => latest.simulation_value,
+        ),
+    )
+    println(
+        "iteration=$(length(log)) bound=$(latest.bound) simulation_value=$(latest.simulation_value)",
+    )
+    flush(stdout)
+    return false
+end
+
+# Parse the case folder and scale the `qd` and `pd` entries of every load by 0.6.
+function load_case_data()
+    alldata = HydroPowerModels.parse_folder(CASE_DIR)
+    for load in values(alldata[1]["powersystem"]["load"])
+        load["qd"] *= 0.6
+        load["pd"] *= 0.6
+    end
+    return alldata
+end
+
+# Build the model, train it (resuming from CUTS_FILE when present), save the
+# cuts, simulate the policy and log the summary metrics to `lg`.
+function _train_and_simulate(alldata, lg)
+    params = create_param(;
+        stages=NUM_STAGES,
+        model_constructor_grid=FORMULATION,
+        post_method=PowerModels.build_opf,
+        optimizer=clarabel_optimizer,
+    )
+    model = hydro_thermal_operation(alldata, params)
+
+    if isfile(CUTS_FILE)
+        println("Loading existing cuts: ", CUTS_FILE)
+        SDDP.read_cuts_from_file(model.forward_graph, CUTS_FILE)
+    end
+
+    stopping_rules = SDDP.AbstractStoppingRule[WandBLog(CUTS_FILE, lg)]
+    if STAT_REPLICATIONS > 0
+        push!(
+            stopping_rules,
+            SDDP.Statistical(;
+                num_replications=STAT_REPLICATIONS,
+                iteration_period=STAT_PERIOD,
+            ),
+        )
+    end
+
+    start_time = time()
+    HydroPowerModels.train(
+        model;
+        iteration_limit=ITERATION_LIMIT,
+        stopping_rules=stopping_rules,
+    )
+    elapsed = time() - start_time
+    bound = SDDP.calculate_bound(model.forward_graph)
+    println("Termination status: ", SDDP.termination_status(model.forward_graph))
+    println("Elapsed seconds: ", elapsed)
+    println("Bound: ", bound)
+
+    SDDP.write_cuts_to_file(model.forward_graph, CUTS_FILE)
+    println("Saved cuts: ", CUTS_FILE)
+
+    # Re-seed so the evaluation scenarios do not depend on how long training ran.
+    Random.seed!(SEED)
+    results = HydroPowerModels.simulate(model, NUM_SIMULATIONS)
+    objective_values = [
+        sum(results[:simulations][i][t][:stage_objective] for t in 1:(NUM_STAGES - RM_STAGES))
+        for i in 1:length(results[:simulations])
+    ]
+    final_loss = mean(objective_values)
+    println("Mean Sim: ", final_loss)
+    # NOTE(review): if a stopping rule ends training early, fewer than
+    # ITERATION_LIMIT iterations ran; confirm this "batch" value is intended.
+    Wandb.log(
+        lg,
+        Dict(
+            "batch" => ITERATION_LIMIT,
+            "metrics/loss" => bound,
+            "metrics/final_loss" => final_loss,
+            "metrics/rollout_realized_objective_no_deficit" => final_loss,
+            "metrics/final_rollout_realized_objective_no_deficit" => final_loss,
+            "metrics/elapsed_seconds" => elapsed,
+        ),
+    )
+    return nothing
+end
+
+# Script entry point: validate the configuration, open the W&B run and make sure
+# it is closed again whatever happens during training or simulation.
+function main()
+    # Fail before training instead of at the final `sum` over an empty stage range.
+    NUM_STAGES > RM_STAGES || throw(
+        ArgumentError(
+            "DR_SDDP_NUM_STAGES ($NUM_STAGES) must exceed DR_SDDP_RM_STAGES ($RM_STAGES)",
+        ),
+    )
+    println("Run: ", save_file)
+    println("Case directory: ", CASE_DIR)
+    println("Formulation: ", FORMULATION, " with Clarabel")
+
+    Random.seed!(SEED)
+    mkpath(dirname(CUTS_FILE))
+    alldata = load_case_data()
+    lg = WandbLogger(;
+        project="RL",
+        name=save_file,
+        save_code=false,
+        config=Dict(
+            "case_name" => CASE,
+            "training_method" => "sddp_consistent",
+            "formulation" => string(FORMULATION),
+            "solver" => "Clarabel",
+            "num_stages" => NUM_STAGES,
+            "rm_stages" => RM_STAGES,
+            "iteration_limit" => ITERATION_LIMIT,
+            "num_simulations" => NUM_SIMULATIONS,
+            "stat_replications" => STAT_REPLICATIONS,
+            "stat_period" => STAT_PERIOD,
+            "seed" => SEED,
+        ),
+    )
+    try
+        _train_and_simulate(alldata, lg)
+    finally
+        # Close the W&B run even when training or simulation throws.
+        close(lg)
+    end
+    return nothing
+end
+
+main()
diff --git a/examples/HydroPowerModels/sddp/run_sddp_inconsistent.jl b/examples/HydroPowerModels/sddp/run_sddp_inconsistent.jl
new file mode 100644
index 0000000..0035912
--- /dev/null
+++ b/examples/HydroPowerModels/sddp/run_sddp_inconsistent.jl
@@ -0,0 +1,196 @@
+# SDDP baseline with inconsistent formulations: train SDDP with a convex
+# SOCWRConic backward-pass formulation and an AC forward-pass formulation,
+# then simulate the resulting policy under the AC model.
+#
+# Environment overrides for local smoke tests:
+#   DR_SDDP_ITERATION_LIMIT=2
+#   DR_SDDP_SIMULATIONS=2
+#   DR_SDDP_STAT_REPLICATIONS=2
+#   DR_SDDP_STAT_PERIOD=1
+
+using Clarabel
+using HydroPowerModels
+using JuMP
+using Logging
+using MadNLP
+using PowerModels
+using Random
+using SDDP
+using Statistics
+using Wandb, Dates
+
+# Run configuration. Each DR_SDDP_* environment variable is optional; the literal
+# passed to `get` is the default used when the variable is unset.
+const SEED = parse(Int, get(ENV, "DR_SDDP_SEED", "1221"))
+const CASE = get(ENV, "DR_SDDP_CASE", "bolivia")
+const HYDRO_DIR = dirname(@__DIR__)
+const CASE_DIR = joinpath(HYDRO_DIR, CASE)
+# Trailing stages that are solved but left out of the reported objective.
+const RM_STAGES = parse(Int, get(ENV, "DR_SDDP_RM_STAGES", "30"))
+const NUM_STAGES = parse(Int, get(ENV, "DR_SDDP_NUM_STAGES", string(96 + RM_STAGES)))
+const ITERATION_LIMIT = parse(Int, get(ENV, "DR_SDDP_ITERATION_LIMIT", "2000"))
+const NUM_SIMULATIONS = parse(Int, get(ENV, "DR_SDDP_SIMULATIONS", "300"))
+const STAT_REPLICATIONS = parse(Int, get(ENV, "DR_SDDP_STAT_REPLICATIONS", "300"))
+const STAT_PERIOD = parse(Int, get(ENV, "DR_SDDP_STAT_PERIOD", "200"))
+
+const FORMULATION_BACKWARD = SOCWRConicPowerModel
+const FORMULATION_FORWARD = ACPPowerModel
+const save_file = "SDDP-$(CASE)-$(FORMULATION_FORWARD)-$(FORMULATION_BACKWARD)-h$(NUM_STAGES)-$(Dates.now())"
+const CUTS_DIR = joinpath(CASE_DIR, string(FORMULATION_FORWARD))
+const CUTS_FILE = joinpath(
+    CUTS_DIR,
+    string(FORMULATION_BACKWARD) * "-" * string(FORMULATION_FORWARD) * ".cuts.json",
+)
+
+# Build a silent Clarabel optimizer (backward pass). DR_SDDP_CLARABEL_MAX_ITER
+# overrides the iteration cap; DR_SDDP_CLARABEL_TOL sets the absolute-gap,
+# relative-gap and feasibility tolerances together.
+function clarabel_optimizer()
+    # One tolerance drives all three criteria, so read and parse it once.
+    tol = parse(Float64, get(ENV, "DR_SDDP_CLARABEL_TOL", "1e-7"))
+    return Clarabel.Optimizer(;
+        verbose=false,
+        max_iter=parse(Int, get(ENV, "DR_SDDP_CLARABEL_MAX_ITER", "1000")),
+        tol_gap_abs=tol,
+        tol_gap_rel=tol,
+        tol_feas=tol,
+    )
+end
+
+# Build the MadNLP optimizer used for the AC forward pass.
+# NOTE(review): MadNLP documents `print_level` as a `MadNLP.LogLevels` value
+# (e.g. `MadNLP.ERROR`); confirm that the raw integer parsed here is accepted.
+function madnlp_optimizer()
+    return MadNLP.Optimizer(;
+        print_level=parse(Int, get(ENV, "DR_SDDP_MADNLP_PRINT_LEVEL", "0")),
+    )
+end
+
+# Pseudo stopping rule: it never stops training, it only checkpoints the cuts
+# and forwards per-iteration metrics to the W&B logger stored in `lg`.
+mutable struct WandBLog <: SDDP.AbstractStoppingRule
+    cuts_file::String
+    lg
+end
+
+SDDP.stopping_rule_status(::WandBLog) = :not_solved
+
+# Called with the training log; writes the current cuts to `rule.cuts_file`,
+# logs the latest bound and simulation value, and always returns `false` so that
+# this rule never terminates training.
+function SDDP.convergence_test(
+    policy::SDDP.PolicyGraph,
+    log::Vector{SDDP.Log},
+    rule::WandBLog,
+)
+    mkpath(dirname(rule.cuts_file))
+    SDDP.write_cuts_to_file(policy, rule.cuts_file)
+    latest = log[end]
+    Wandb.log(
+        rule.lg,
+        Dict(
+            "batch" => length(log),
+            "metrics/loss" => latest.bound,
+            "metrics/rollout_realized_objective_no_deficit" => latest.simulation_value,
+        ),
+    )
+    println(
+        "iteration=$(length(log)) bound=$(latest.bound) simulation_value=$(latest.simulation_value)",
+    )
+    flush(stdout)
+    return false
+end
+
+# Parse the case folder and scale the `qd` and `pd` entries of every load by 0.6.
+function load_case_data()
+    alldata = HydroPowerModels.parse_folder(CASE_DIR)
+    for load in values(alldata[1]["powersystem"]["load"])
+        load["qd"] *= 0.6
+        load["pd"] *= 0.6
+    end
+    return alldata
+end
+
+# Build the two-formulation model, train it (resuming from CUTS_FILE when
+# present), save the cuts, simulate the policy and log the summary metrics.
+function _train_and_simulate(alldata, lg)
+    params = create_param(;
+        stages=NUM_STAGES,
+        model_constructor_grid=FORMULATION_BACKWARD,
+        model_constructor_grid_forward=FORMULATION_FORWARD,
+        post_method=PowerModels.build_opf,
+        optimizer=clarabel_optimizer,
+        optimizer_forward=madnlp_optimizer,
+    )
+
+    model = hydro_thermal_operation(alldata, params)
+
+    if isfile(CUTS_FILE)
+        println("Loading existing cuts: ", CUTS_FILE)
+        SDDP.read_cuts_from_file(model.forward_graph, CUTS_FILE)
+    end
+
+    stopping_rules = SDDP.AbstractStoppingRule[WandBLog(CUTS_FILE, lg)]
+    if STAT_REPLICATIONS > 0
+        push!(
+            stopping_rules,
+            SDDP.Statistical(;
+                num_replications=STAT_REPLICATIONS,
+                iteration_period=STAT_PERIOD,
+            ),
+        )
+    end
+
+    start_time = time()
+    HydroPowerModels.train(
+        model;
+        iteration_limit=ITERATION_LIMIT,
+        stopping_rules=stopping_rules,
+    )
+    elapsed = time() - start_time
+
+    status = SDDP.termination_status(model.forward_graph)
+    bound = SDDP.calculate_bound(model.forward_graph)
+    println("Termination status: ", status)
+    println("Elapsed seconds: ", elapsed)
+    println("Bound: ", bound)
+
+    SDDP.write_cuts_to_file(model.forward_graph, CUTS_FILE)
+    println("Saved cuts: ", CUTS_FILE)
+
+    # Re-seed so the evaluation scenarios do not depend on how long training ran.
+    Random.seed!(SEED)
+    results = HydroPowerModels.simulate(model, NUM_SIMULATIONS)
+    objective_values = [
+        sum(results[:simulations][i][t][:stage_objective] for t in 1:(NUM_STAGES - RM_STAGES))
+        for i in 1:length(results[:simulations])
+    ]
+    final_loss = mean(objective_values)
+    println("Mean Sim: ", final_loss)
+    # NOTE(review): if a stopping rule ends training early, fewer than
+    # ITERATION_LIMIT iterations ran; confirm this "batch" value is intended.
+    Wandb.log(
+        lg,
+        Dict(
+            "batch" => ITERATION_LIMIT,
+            "metrics/loss" => bound,
+            "metrics/final_loss" => final_loss,
+            "metrics/rollout_realized_objective_no_deficit" => final_loss,
+            "metrics/final_rollout_realized_objective_no_deficit" => final_loss,
+            "metrics/elapsed_seconds" => elapsed,
+        ),
+    )
+    return nothing
+end
+
+# Script entry point: validate the configuration, open the W&B run and make sure
+# it is closed again whatever happens during training or simulation.
+function main()
+    # Fail before training instead of at the final `sum` over an empty stage range.
+    NUM_STAGES > RM_STAGES || throw(
+        ArgumentError(
+            "DR_SDDP_NUM_STAGES ($NUM_STAGES) must exceed DR_SDDP_RM_STAGES ($RM_STAGES)",
+        ),
+    )
+    println("Run: ", save_file)
+    println("Case directory: ", CASE_DIR)
+    println("Stages: ", NUM_STAGES, " (reporting first ", NUM_STAGES - RM_STAGES, ")")
+    println("Backward formulation: ", FORMULATION_BACKWARD, " with Clarabel")
+    println("Forward formulation: ", FORMULATION_FORWARD, " with MadNLP")
+    println("Iteration limit: ", ITERATION_LIMIT)
+    println("Simulations: ", NUM_SIMULATIONS)
+
+    Random.seed!(SEED)
+    mkpath(CUTS_DIR)
+    alldata = load_case_data()
+    lg = WandbLogger(;
+        project="RL",
+        name=save_file,
+        save_code=false,
+        config=Dict(
+            "case_name" => CASE,
+            "training_method" => "sddp_inconsistent",
+            "backward_formulation" => string(FORMULATION_BACKWARD),
+            "forward_formulation" => string(FORMULATION_FORWARD),
+            "backward_solver" => "Clarabel",
+            "forward_solver" => "MadNLP",
+            "num_stages" => NUM_STAGES,
+            "rm_stages" => RM_STAGES,
+            "iteration_limit" => ITERATION_LIMIT,
+            "num_simulations" => NUM_SIMULATIONS,
+            "stat_replications" => STAT_REPLICATIONS,
+            "stat_period" => STAT_PERIOD,
+            "seed" => SEED,
+        ),
+    )
+    try
+        _train_and_simulate(alldata, lg)
+    finally
+        # Close the W&B run even when training or simulation throws.
+        close(lg)
+    end
+    return nothing
+end
+
+main()
diff --git a/examples/HydroPowerModels/simulate_sddp_policy.jl b/examples/HydroPowerModels/sddp/simulate_sddp_policy.jl
similarity index 66%
rename from examples/HydroPowerModels/simulate_sddp_policy.jl
rename to examples/HydroPowerModels/sddp/simulate_sddp_policy.jl
index 
76f8aca..a392de6 100644 --- a/examples/HydroPowerModels/simulate_sddp_policy.jl +++ b/examples/HydroPowerModels/sddp/simulate_sddp_policy.jl @@ -1,20 +1,18 @@ # Simulate a pre-trained SDDP policy (cuts from run_sddp_inconsistent.jl) under # the ACP formulation and produce comparison plots/CSVs against TS-DDR baselines. -# Requires HydroPowerModels.jl, MadNLP, Gurobi, Mosek. -using MosekTools using MadNLP using HydroPowerModels using JuMP +using PowerModels using Statistics using SDDP: SDDP -using Gurobi using Random seed = 1221 # Load case case = "bolivia" -case_dir = joinpath(dirname(@__FILE__), case) +case_dir = joinpath(dirname(@__DIR__), case) alldata = HydroPowerModels.parse_folder(case_dir); for load in values(alldata[1]["powersystem"]["load"]) load["qd"] = load["qd"] * 0.6 @@ -29,7 +27,7 @@ params = create_param(; stages=num_stages, model_constructor_grid=formulation, post_method=PowerModels.build_opf, - optimizer=() -> MadNLP.Optimizer(; print_level=MadNLP.INFO), + optimizer=() -> MadNLP.Optimizer(; print_level=0), ); m = hydro_thermal_operation(alldata, params); @@ -56,9 +54,14 @@ using CSV using DataFrames volume_to_mw(volume, stage_hours; k=0.0036) = volume / (k * stage_hours) -labels = ["TS-DDR"; "TS-LDR"; "SDDP-DCLL"] -colors = [:black :purple :red] -markers = [:hline :+ :pixel] +const SDDP_COL = "SDDP-SOC" +labels = ["TS-DDR"; "TS-LDR"; "SDDP-DCLL"; SDDP_COL] +colors = [:black :purple :red :orange] +markers = [:hline :+ :pixel :diamond] + +const DOCS_ASSETS = joinpath(dirname(@__DIR__), "..", "..", "docs", "src", "assets") +mkpath(DOCS_ASSETS) +out_dir = joinpath(case_dir, string(formulation)) # Volume trajectory hydro_gen = [ @@ -78,30 +81,24 @@ savefig( ylabel="Volume (Hm3)", title="$(case)-$(formulation_b)-$(formulation)", ), - joinpath( - case_dir, - string(formulation), - "SDDP-$(case)-$(formulation_b)-$(formulation)-Volume.png", - ), -) - -df = CSV.read( - joinpath(case_dir, string(formulation), "MeanVolume.csv"), DataFrame; header=true + 
joinpath(out_dir, "SDDP-$(case)-$(formulation_b)-$(formulation)-Volume.png"), ) -df[!, "$(string(formulation_b))"] = hydro_gen -CSV.write(joinpath(case_dir, string(formulation), "MeanVolume.csv"), df) +df = CSV.read(joinpath(out_dir, "MeanVolume.csv"), DataFrame; header=true) +df[!, SDDP_COL] = hydro_gen +CSV.write(joinpath(out_dir, "MeanVolume.csv"), df) savefig( plot( Matrix(df[!, labels]); - labels=permutedims(names(df[!, labels])), + labels=permutedims(labels), xlabel="Stage", ylabel="Expected Volume (MWh)", color=colors, shape=markers, + title="Reservoir Volume Comparison", ), - joinpath(case_dir, string(formulation), "DCLL-Comparison-$(case)-Volume.png"), + joinpath(DOCS_ASSETS, "hydro_volume_comparison.png"), ) # Thermal generation @@ -125,28 +122,24 @@ savefig( ylabel="Mwh", title="Thermal-Generation $(case)-$(formulation_b)-$(formulation)", ), - joinpath( - case_dir, - string(formulation), - "SDDP-$(case)-$(formulation_b)-$(formulation)-thermal.png", - ), + joinpath(out_dir, "SDDP-$(case)-$(formulation_b)-$(formulation)-thermal.png"), ) -df = CSV.read(joinpath(case_dir, string(formulation), "MeanGeneration.csv"), DataFrame) -df[!, "SOC"] = thermal_gen - -CSV.write(joinpath(case_dir, string(formulation), "MeanGeneration.csv"), df) +df = CSV.read(joinpath(out_dir, "MeanGeneration.csv"), DataFrame) +df[!, SDDP_COL] = thermal_gen +CSV.write(joinpath(out_dir, "MeanGeneration.csv"), df) savefig( plot( Matrix(df[!, labels]); - labels=permutedims(names(df[!, labels])), + labels=permutedims(labels), xlabel="Stage", ylabel="Expected Thermal Generation (MWh)", color=colors, shape=markers, + title="Thermal Generation Comparison", ), - joinpath(case_dir, string(formulation), "DCLL-Comparison-$(case)-thermal.png"), + joinpath(DOCS_ASSETS, "hydro_generation_comparison.png"), ) # Objective costs @@ -155,9 +148,14 @@ objective_values = [ for i in 1:length(results[:simulations]) ] -df = CSV.read(joinpath(case_dir, string(formulation), "costs.csv"), DataFrame) -df[!, 
"SDDP_SOC"] = objective_values - -CSV.write(joinpath(case_dir, string(formulation), "costs.csv"), df) +costs_file = joinpath(out_dir, "costs.csv") +if isfile(costs_file) + df = CSV.read(costs_file, DataFrame) + df[!, SDDP_COL] = objective_values +else + df = DataFrame(Symbol(SDDP_COL) => objective_values) +end +CSV.write(costs_file, df) println("Mean Sim: ", mean(objective_values)) +println("Std Sim: ", std(objective_values)) diff --git a/examples/HydroPowerModels/test_dr_hydropowermodels.jl b/examples/HydroPowerModels/test_dr_hydropowermodels.jl deleted file mode 100644 index 79aebf8..0000000 --- a/examples/HydroPowerModels/test_dr_hydropowermodels.jl +++ /dev/null @@ -1,264 +0,0 @@ -# Evaluate a pre-trained TS-DDR policy on the Bolivia LTHD problem and produce -# comparison plots (volume, generation, cost) against SDDP baselines. -# Requires a trained model .jld2 in the case/formulation/models/ directory. -using Statistics -using Random -using Flux -using DecisionRules -using Gurobi -using MosekTools -using Ipopt -using MathOptSymbolicAD: MathOptSymbolicAD -using JLD2 -using HydroPowerModels -using DiffOpt - -HydroPowerModels_dir = dirname(@__FILE__) -include(joinpath(HydroPowerModels_dir, "load_hydropowermodels.jl")) - -function non_ensurance(x_out, x_in, uncertainty, max_volume) - return x_out -end - -# Parameters -case_name = "bolivia" -formulation = "DCPPowerModel" -num_stages = 96 -model_dir = joinpath(HydroPowerModels_dir, case_name, formulation, "models") -model_file = readdir(model_dir; join=true)[end] -save_name = split(split(model_file, "/")[end], ".")[1] -formulation_file = formulation * ".mof.json" -dense = Dense -activation = DecisionRules.identity -layers = Int64[32, 32] -ensure_feasibility = non_ensurance -optimizer = Flux.Adam(0.01) - -data = HydroPowerModels.parse_folder(joinpath(HydroPowerModels_dir, case_name))[1]; -HydroPowerModels.gather_useful_info!(data) -# Build MSP - -subproblems, state_params_in, state_params_out, uncertainty_samples, 
initial_state, max_volume = build_hydropowermodels( - joinpath(HydroPowerModels_dir, case_name), formulation_file; num_stages=num_stages -) - -det_equivalent = DiffOpt.nonlinear_diff_model( - optimizer_with_attributes( - Ipopt.Optimizer, - "print_level" => 0, - "linear_solver" => "mumps", - ), -) - -det_equivalent, uncertainty_samples = DecisionRules.deterministic_equivalent!( - det_equivalent, - subproblems, - state_params_in, - state_params_out, - initial_state, - uncertainty_samples, -) - -num_hydro = length(initial_state) - -# Build Model -models = dense_multilayer_nn( - num_hydro, num_hydro, layers; activation=activation, dense=dense -) -model = models -opt_state = Flux.setup(optimizer, model) -x = randn(num_hydro, 1) -y = rand(num_hydro, 1) -train_set = [(x, y)] -Flux.train!(model, train_set, opt_state) do m, x, y - return Flux.mse(m(x), y) -end -models = model -model_state = JLD2.load(model_file, "model_state") -Flux.loadmodel!(model, model_state) - -Random.seed!(1221) -num_samples = 100 -objective_values = Vector{Float64}(undef, num_samples) -states = Vector{Any}(undef, num_samples) -inflows = Array{Float64,3}(undef, num_samples, num_hydro, num_stages) -record_variables_names = ["0_pg", "norm_deficit"] -record_variables = Dict{String,Any}() -record = Dict{String,Array{Float64,3}}() -for _var in record_variables_names - num_vars = length(find_variables(det_equivalent, [_var; r"#1$"])) - record[_var] = Array{Float64,3}(undef, num_samples, num_vars, num_stages) - record_variables[_var] = [ - find_variables(det_equivalent, [_var; Regex("#$i\$")]) for i in 1:num_stages - ] -end -for i in 1:num_samples - Flux.reset!(models) - uncertainty_s = sample(uncertainty_samples) - for j in 1:num_hydro, t in 1:num_stages - inflow_var = collect(keys(uncertainty_s[t])) - inflow_var = inflow_var[findfirst( - x -> occursin("_inflow[$j]", JuMP.name(x)), inflow_var - )] - inflows[i, j, t] = uncertainty_s[t][inflow_var] - end - simulate_multistage( - det_equivalent, - 
state_params_in, - state_params_out, - initial_state, - uncertainty_s, - models; - ensure_feasibility=(x_out, x_in, _sa) -> - ensure_feasibility(x_out, x_in, _sa, max_volume), - ) - objective_values[i] = DecisionRules.get_objective_no_target_deficit(det_equivalent) - for _var in record_variables_names - num_vars = length(find_variables(det_equivalent, [_var; r"#1$"])) - for j in 1:num_vars, t in 1:num_stages - record[_var][i, j, t] = value(record_variables[_var][t][j]) - end - end - states[i] = Vector{Vector{Float64}}(undef, num_hydro) - for j in 1:num_hydro - states[i][j] = Vector{Float64}(undef, num_stages+1) - states[i][j][1] = initial_state[j] - for t in 1:num_stages - states[i][j][t + 1] = value(state_params_out[t][j][2]) - end - end -end - -# Plot Volumes - -using Plots -using Statistics -using DataFrames -using CSV - -plt = plot( - 1:(num_stages + 1), - [sum([states[1][j][t] for j in 1:num_hydro]) for t in 1:(num_stages + 1)]; - legend=false, - xlabel="Stage", - ylabel="Volume (Hm3)", - title="$(case_name)-$(formulation)", -); -for i in 2:num_samples - plot!( - plt, - 1:(num_stages + 1), - [sum([states[i][j][t] for j in 1:num_hydro]) for t in 1:(num_stages + 1)], - ); -end -savefig( - plt, joinpath(HydroPowerModels_dir, case_name, formulation, save_name * "Volume.png") -) - -# Plot Mean Volume -volume_to_mw(volume, stage_hours; k=0.0036) = volume / (k * stage_hours) - -plt = plot( - 1:(num_stages + 1), - [ - mean(sum([states[i][j][t] for j in 1:num_hydro]) for i in 1:num_samples) for - t in 1:(num_stages + 1) - ]; - xlabel="Stage", - ylabel="Volume (Hm3)", - label="Mean Volume", - title="$(case_name)-$(formulation)", -); -savefig( - plt, - joinpath(HydroPowerModels_dir, case_name, formulation, save_name * "MeanVolume.png"), -) - -df = DataFrame(; - ML_Rule=[ - mean( - sum([volume_to_mw(states[i][j][t], 1) for j in 1:num_hydro]) for - i in 1:num_samples - ) for t in 2:(num_stages + 1) - ], -) - -df = CSV.read( - joinpath(HydroPowerModels_dir, case_name, 
formulation, "MeanVolume.csv"), - DataFrame; - header=true, -) -df[!, "TS-LDR"] = [ - mean( - sum([volume_to_mw(states[i][j][t], 1) for j in 1:num_hydro]) for i in 1:num_samples - ) for t in 2:(num_stages + 1) -] - -CSV.write(joinpath(HydroPowerModels_dir, case_name, formulation, "MeanVolume.csv"), df) - -# Plot Mean Inflows - -plt = plot( - 1:num_stages, - [ - mean(sum(inflows[i, j, t] for j in 1:num_hydro) for i in 1:num_samples) for - t in 1:num_stages - ]; - xlabel="Stage", - ylabel="Inflow (Hm3)", - label="Mean Inflow", - title="$(case_name)-$(formulation)", -); -savefig( - plt, - joinpath(HydroPowerModels_dir, case_name, formulation, save_name * "MeanInflow.png"), -) - -# Plot Generation -num_gen = size(record["0_pg"], 2) -hydro_idx = HydroPowerModels.idx_hydro(data) - -thermal_gen = [ - mean( - sum(record["0_pg"][i, j, t] * 100 for j in 1:num_gen if !(j in hydro_idx)) for - i in 1:num_samples - ) for t in 1:num_stages -] - -plt = plot( - 1:num_stages, - thermal_gen; - xlabel="Stage", - ylabel="Generation (MW)", - label="Mean Generation", - title="$(case_name)-$(formulation)", -); -savefig( - plt, - joinpath( - HydroPowerModels_dir, case_name, formulation, save_name * "MeanGeneration.png" - ), -) - -df = DataFrame(; ML_Rule=thermal_gen) - -df = CSV.read( - joinpath(HydroPowerModels_dir, case_name, formulation, "MeanGeneration.csv"), - DataFrame; - header=true, -) -df[!, "TS-LDR"] = thermal_gen - -CSV.write(joinpath(HydroPowerModels_dir, case_name, formulation, "MeanGeneration.csv"), df) - -# objective -df = CSV.read( - joinpath(HydroPowerModels_dir, case_name, formulation, "costs.csv"), - DataFrame; - header=true, -) -df[!, "TS-LDR"] = objective_values - -mean((df[!, "SDDP_SOC"] .- df[!, "TS-DDR"]) * 100 ./ df[!, "TS-DDR"]) - -CSV.write(joinpath(HydroPowerModels_dir, case_name, formulation, "costs.csv"), df) diff --git a/examples/HydroPowerModels/test_sampling_consistency.jl b/examples/HydroPowerModels/test_sampling_consistency.jl new file mode 100644 index 
0000000..0bbe997
--- /dev/null
+++ b/examples/HydroPowerModels/test_sampling_consistency.jl
@@ -0,0 +1,166 @@
+# test_sampling_consistency.jl
+#
+# Verifies that DecisionRules.jl and the ExaModels companion package
+# (DecisionRulesExa.jl) sample from the exact same inflow distribution
+# as SDDP.jl for the Bolivia HydroPowerModels case.
+#
+# All three systems read the same inflows.csv and hydro.json files.
+# The sampling contract is:
+#
+#   At each stage t, draw one scenario index ω ∈ {1, …, nScenarios}
+#   uniformly at random. All hydro reservoirs receive the inflow from
+#   column ω of the historical data for their respective row t.
+#   Stages are sampled independently (no temporal correlation).
+#
+# This is SDDP.jl's `SDDP.parameterize` semantics: one ω per node,
+# applied to all random variables in that node.
+#
+# What this script checks:
+#   1. Both loaders parse inflows.csv into identical per-reservoir matrices.
+#   2. Both samplers produce draws from the same support (only historically
+#      observed joint vectors, never cross-scenario combinations).
+#   3. With the same RNG seed, both produce identical trajectories.
+#
+# Usage:
+#   julia --project=. test_sampling_consistency.jl (from this dir)
+#   julia --project=. test_sampling_consistency.jl /path/to/DecisionRulesExa.jl/examples/HydroPowerModels
+
+using Test
+using Random
+using CSV, Tables, JSON
+
+# ── Paths ────────────────────────────────────────────────────────────────────
+
+const SCRIPT_DIR = dirname(@__FILE__)
+const CASE_DIR = joinpath(SCRIPT_DIR, "bolivia")
+const INFLOW_FILE = joinpath(CASE_DIR, "inflows.csv")
+const HYDRO_FILE = joinpath(CASE_DIR, "hydro.json")
+
+const EXA_DIR = length(ARGS) >= 1 ? ARGS[1] :
+    joinpath(dirname(dirname(dirname(SCRIPT_DIR))),
+        "..", "DecisionRulesExa.jl", "examples", "HydroPowerModels")
+
+# ── 1. Parse inflows with both loaders ───────────────────────────────────────
+
+# DecisionRules.jl loader (load_hydropowermodels.jl::read_inflow)
+#
+# Reads the header-less CSV as a matrix whose columns are grouped per reservoir
+# (`ncol ÷ nHyd` scenario columns each). Returns the per-reservoir matrices cut
+# to `num_stages` rows, the scenario count, and the number of stages used.
+function dr_read_inflow(file, nHyd; num_stages=nothing)
+    raw = CSV.read(file, Tables.matrix; header=false)
+    nlin, ncol = size(raw)
+    if isnothing(num_stages)
+        num_stages = nlin
+    elseif num_stages > nlin
+        # Stack the history on top of itself until it covers the horizon.
+        raw = repeat(raw, div(num_stages, nlin) + 1)
+    end
+    nCen = ncol ÷ nHyd
+    vector_inflows = map(1:nHyd) do i
+        cols = ((i - 1) * nCen + 1):(i * nCen)
+        raw[1:num_stages, cols]
+    end
+    return vector_inflows, nCen, num_stages
+end
+
+# ExaModels loader (hydro_power_data.jl::load_hydro_data, inflow portion only)
+#
+# Same column layout as above; the history is tiled when the horizon is longer
+# than the file, truncated to the horizon, and each reservoir block is converted
+# to Float64.
+function exa_read_inflow(file, nHyd; num_stages=nothing)
+    table = CSV.read(file, Tables.matrix; header=false)
+    nrows, ncols = size(table)
+    nScenarios = ncols ÷ nHyd
+    nStagesSample = something(num_stages, nrows)
+    if nStagesSample > nrows
+        table = repeat(table, nStagesSample ÷ nrows + 1)
+    end
+    table = table[1:nStagesSample, :]
+    scenario_inflows = map(1:nHyd) do r
+        first_col = (r - 1) * nScenarios + 1
+        Float64.(table[:, first_col:(r * nScenarios)])
+    end
+    return scenario_inflows, nScenarios, nStagesSample
+end
+
+# ── 2. 
Sampler implementations (extracted, no package dependencies) ────────── + +function dr_sample_joint(vector_inflows, nCen, T) + nHyd = length(vector_inflows) + trajectory = Vector{Vector{Float64}}(undef, T) + for t in 1:T + ω = rand(1:nCen) + trajectory[t] = [vector_inflows[r][t, ω] for r in 1:nHyd] + end + return trajectory +end + +function exa_sample_scenario(scenario_inflows, nScenarios, T) + nHyd = length(scenario_inflows) + nStagesSample = size(scenario_inflows[1], 1) + w = Vector{Float64}(undef, T * nHyd) + for t in 1:T + t_row = mod1(t, nStagesSample) + j = rand(1:nScenarios) + for r in 1:nHyd + w[(t-1)*nHyd + r] = scenario_inflows[r][t_row, j] + end + end + return w +end + +# ── Tests ──────────────────────────────────────────────────────────────────── + +hydro_json = JSON.parsefile(HYDRO_FILE)["Hydrogenerators"] +nHyd = length(hydro_json) +T = 96 + +@testset "Sampling consistency: DecisionRules vs Exa vs SDDP" begin + dr_inflows, dr_nCen, dr_T = dr_read_inflow(INFLOW_FILE, nHyd; num_stages=T) + exa_inflows, exa_nScen, exa_T = exa_read_inflow(INFLOW_FILE, nHyd; num_stages=T) + + @testset "identical inflow matrices" begin + @test dr_nCen == exa_nScen + @test dr_T == exa_T + for r in 1:nHyd + @test dr_inflows[r] == exa_inflows[r] + end + end + + @testset "same seed → identical trajectories" begin + for seed in [42, 123, 9999] + Random.seed!(seed) + dr_traj = dr_sample_joint(dr_inflows, dr_nCen, T) + + Random.seed!(seed) + exa_flat = exa_sample_scenario(exa_inflows, exa_nScen, T) + + for t in 1:T + for r in 1:nHyd + @test dr_traj[t][r] == exa_flat[(t-1)*nHyd + r] + end + end + end + end + + @testset "samples are always from historical scenarios (joint)" begin + valid_vectors = Set{Vector{Float64}}() + for t in 1:T, ω in 1:dr_nCen + push!(valid_vectors, [dr_inflows[r][t, ω] for r in 1:nHyd]) + end + + Random.seed!(42) + for _ in 1:500 + traj = dr_sample_joint(dr_inflows, dr_nCen, T) + for stage_vec in traj + @test stage_vec in valid_vectors + end + end + end + + 
@testset "uniform coverage of all scenarios" begin + Random.seed!(42) + N = 10_000 + counts = zeros(Int, dr_nCen) + for _ in 1:N + ω = rand(1:dr_nCen) + counts[ω] += 1 + end + for ω in 1:dr_nCen + freq = counts[ω] / N + expected = 1.0 / dr_nCen + @test abs(freq - expected) < 0.03 + end + end +end + +println("\nAll sampling consistency tests passed.") diff --git a/examples/HydroPowerModels/train_dr_hydropowermodels.jl b/examples/HydroPowerModels/train_dr_hydropowermodels.jl index 7b3f2e3..ad6876c 100644 --- a/examples/HydroPowerModels/train_dr_hydropowermodels.jl +++ b/examples/HydroPowerModels/train_dr_hydropowermodels.jl @@ -32,30 +32,36 @@ end # Parameters case_name = "bolivia" # bolivia, case3 formulation = "ACPPowerModel" # SOCWRConicPowerModel, DCPPowerModel, ACPPowerModel -num_stages = 96 # 96, 48 +num_stages = parse(Int, get(ENV, "DR_NUM_STAGES", "126")) model_dir = joinpath(HydroPowerModels_dir, case_name, formulation, "models") mkpath(model_dir) solver_tag = USE_GPU ? "gpu" : "cpu" -save_file = "$(case_name)-$(formulation)-h$(num_stages)-deteq-$(solver_tag)-$(now())" formulation_file = formulation * ".mof.json" # Training parameters -num_epochs = 40 +num_epochs = parse(Int, get(ENV, "DR_NUM_EPOCHS", "80")) num_batches = 100 _num_train_per_batch = 1 activation = sigmoid # tanh, identity, relu, sigmoid layers = Int64[128, 128] ensure_feasibility = non_ensurance -optimizers = [Flux.Adam()] +grad_clip = parse(Float32, get(ENV, "DR_GRAD_CLIP", "0")) +optimizers = if grad_clip > 0 + [Flux.Optimisers.OptimiserChain(Flux.Optimisers.ClipGrad(grad_clip), Flux.Adam())] +else + [Flux.Adam()] +end pre_trained_model = nothing penalty_l2 = :auto penalty_l1 = :auto -penalty_schedule = [ - (1, 100, 0.1), - (101, 210, 1.0), - (211, 300, 10.0), - (301, num_epochs * num_batches, 30.0), -] +penalty_schedule = if get(ENV, "DR_PENALTY_SCHEDULE", "annealed") == "annealed" + :default_annealed +else + nothing +end +clip_tag = grad_clip > 0 ? 
"-clip$(isinteger(grad_clip) ? Int(grad_clip) : grad_clip)" : "" # Int() alone throws InexactError for fractional clips such as 0.5 +sched_tag = isnothing(penalty_schedule) ? "-const" : "-anneal" +save_file = "$(case_name)-$(formulation)-h$(num_stages)-deteq-$(solver_tag)$(clip_tag)$(sched_tag)-$(now())" num_eval_scenarios = 4 eval_every = 25 @@ -121,6 +127,7 @@ lg = WandbLogger(; "encoder_type" => "LSTM", "ensure_feasibility" => string(ensure_feasibility), "optimizer" => string(optimizers), + "grad_clip" => grad_clip, "training_method" => "deterministic_equivalent", "solver" => USE_GPU ? "MadNLP+CUDSS (GPU)" : "MadNLP (CPU)", "penalty_l1" => string(penalty_l1), @@ -136,7 +143,7 @@ lg = WandbLogger(; ) # Define Model -num_uncertainties = length(uncertainty_samples[1]) +num_uncertainties = length(uncertainty_samples[1][1]) models = state_conditioned_policy( num_uncertainties, num_hydro, @@ -169,8 +176,8 @@ best_obj = mean(objective_values) model_path = joinpath(model_dir, save_file * ".jld2") save_control = SaveBest(best_obj, model_path) -stall_train = StallingCriterium(100, best_obj, 0) -stall_rollout = StallingCriterium(5, best_obj, 0) +stall_train = StallingCriterium(num_epochs * num_batches, best_obj, 0) +stall_rollout = StallingCriterium(num_epochs * num_batches, best_obj, 0) # Rollout evaluation (stage-wise subproblems, CPU) diff --git a/examples/HydroPowerModels/train_dr_hydropowermodels_multipleshooting.jl b/examples/HydroPowerModels/train_dr_hydropowermodels_multipleshooting.jl index 0f41213..906f8e4 100644 --- a/examples/HydroPowerModels/train_dr_hydropowermodels_multipleshooting.jl +++ b/examples/HydroPowerModels/train_dr_hydropowermodels_multipleshooting.jl @@ -21,27 +21,32 @@ end # Parameters case_name = "bolivia" # bolivia, case3 formulation = "ACPPowerModel" # SOCWRConicPowerModel, DCPPowerModel, ACPPowerModel -num_stages = 96 # 96, 48 +num_stages = parse(Int, get(ENV, "DR_NUM_STAGES", "126")) window_size = 12 # 12, 6 model_dir = joinpath(HydroPowerModels_dir, 
case_name, formulation, "models") mkpath(model_dir) -save_file = 
"$(case_name)-$(formulation)-h$(num_stages)-shooting-w$(window_size)-$(now())" formulation_file = formulation * ".mof.json" # Training parameters -num_epochs = 30 +num_epochs = parse(Int, get(ENV, "DR_NUM_EPOCHS", "80")) num_batches = 100 _num_train_per_batch = 1 activation = sigmoid # tanh, identity, relu, sigmoid layers = Int64[128, 128] ensure_feasibility = non_ensurance -optimizers = [Flux.Adam()] +grad_clip = parse(Float32, get(ENV, "DR_GRAD_CLIP", "0")) +optimizers = if grad_clip > 0 + [Flux.Optimisers.OptimiserChain(Flux.Optimisers.ClipGrad(grad_clip), Flux.Adam())] +else + [Flux.Adam()] +end pre_trained_model = nothing penalty_l2 = :auto -penalty_l1 = nothing -# Annealed target-penalty multipliers (relative to the :auto base above); set to `nothing` -# to train with the constant penalties the models were built with. -penalty_schedule = :default_annealed +penalty_l1 = :auto +penalty_schedule = get(ENV, "DR_PENALTY_SCHEDULE", "annealed") == "annealed" ? :default_annealed : nothing +clip_tag = grad_clip > 0 ? "-clip$(isinteger(grad_clip) ? Int(grad_clip) : grad_clip)" : "" # Int() alone throws InexactError for fractional clips such as 0.5 +sched_tag = isnothing(penalty_schedule) ? 
"-const" : "-anneal" +save_file = "$(case_name)-$(formulation)-h$(num_stages)-shooting-w$(window_size)$(clip_tag)$(sched_tag)-$(now())" num_eval_scenarios = 4 eval_every = 25 @@ -87,6 +92,7 @@ lg = WandbLogger(; "activation" => string(activation), "ensure_feasibility" => string(ensure_feasibility), "optimizer" => string(optimizers), + "grad_clip" => grad_clip, "training_method" => "multiple_shooting", "window_size" => string(window_size), "penalty_l1" => string(penalty_l1), @@ -100,7 +106,7 @@ lg = WandbLogger(; # Define Model # Policy architecture: LSTM processes uncertainty, Dense combines with previous state -num_uncertainties = length(uncertainty_samples[1]) +num_uncertainties = length(uncertainty_samples[1][1]) models = state_conditioned_policy( num_uncertainties, num_hydro, @@ -145,7 +151,7 @@ best_obj = mean(objective_values) model_path = joinpath(model_dir, save_file * ".jld2") save_control = SaveBest(best_obj, model_path) -convergence_criterium = StallingCriterium(200, best_obj, 0) +convergence_criterium = StallingCriterium(num_epochs * num_batches, best_obj, 0) Random.seed!(8789) eval_scenarios = [DecisionRules.sample(uncertainty_samples) for _ in 1:num_eval_scenarios] @@ -170,7 +176,7 @@ train_multiple_shooting( models, initial_state, windows, - () -> uncertainty_samples; + uncertainty_samples; num_batches=num_epochs * num_batches, num_train_per_batch=_num_train_per_batch, optimizer=first(optimizers), diff --git a/examples/HydroPowerModels/train_dr_hydropowermodels_subproblems.jl b/examples/HydroPowerModels/train_dr_hydropowermodels_subproblems.jl index d98c575..7428293 100644 --- a/examples/HydroPowerModels/train_dr_hydropowermodels_subproblems.jl +++ b/examples/HydroPowerModels/train_dr_hydropowermodels_subproblems.jl @@ -21,24 +21,29 @@ end # Parameters case_name = "bolivia" formulation = "ACPPowerModel" -num_stages = 96 +num_stages = parse(Int, get(ENV, "DR_NUM_STAGES", "126")) model_dir = joinpath(HydroPowerModels_dir, case_name, formulation, 
"models") mkpath(model_dir) -save_file = "$(case_name)-$(formulation)-h$(num_stages)-subproblems-$(now())" formulation_file = formulation * ".mof.json" -num_epochs = 30 +num_epochs = parse(Int, get(ENV, "DR_NUM_EPOCHS", "80")) num_batches = 100 _num_train_per_batch = 1 activation = sigmoid layers = Int64[128, 128] ensure_feasibility = non_ensurance -optimizers = [Flux.Adam()] +grad_clip = parse(Float32, get(ENV, "DR_GRAD_CLIP", "0")) +optimizers = if grad_clip > 0 + [Flux.Optimisers.OptimiserChain(Flux.Optimisers.ClipGrad(grad_clip), Flux.Adam())] +else + [Flux.Adam()] +end pre_trained_model = nothing penalty_l2 = :auto penalty_l1 = :auto -# Annealed target-penalty multipliers (relative to the :auto base above); set to `nothing` -# to train with the constant penalties the models were built with. -penalty_schedule = :default_annealed +penalty_schedule = get(ENV, "DR_PENALTY_SCHEDULE", "constant") == "annealed" ? :default_annealed : nothing +clip_tag = grad_clip > 0 ? "-clip$(isinteger(grad_clip) ? Int(grad_clip) : grad_clip)" : "" # Int() alone throws InexactError for fractional clips such as 0.5 +sched_tag = isnothing(penalty_schedule) ? 
"-const" : "-anneal" +save_file = "$(case_name)-$(formulation)-h$(num_stages)-subproblems$(clip_tag)$(sched_tag)-$(now())" num_eval_scenarios = 4 # fixed held-out scenarios for the rollout evaluation eval_every = 25 # rollout-evaluate every eval_every batches @@ -76,6 +81,7 @@ lg = WandbLogger(; "activation" => string(activation), "ensure_feasibility" => string(ensure_feasibility), "optimizer" => string(optimizers), + "grad_clip" => grad_clip, "training_method" => "subproblems", "penalty_l1" => string(penalty_l1), "penalty_l2" => string(penalty_l2), @@ -88,7 +94,7 @@ lg = WandbLogger(; # Define Model # Policy architecture: LSTM processes uncertainty, Dense combines with previous state -num_uncertainties = length(uncertainty_samples[1]) +num_uncertainties = length(uncertainty_samples[1][1]) models = state_conditioned_policy( num_uncertainties, num_hydro, @@ -120,7 +126,7 @@ best_obj = mean(objective_values) model_path = joinpath(model_dir, save_file * ".jld2") save_control = SaveBest(best_obj, model_path) -convergence_criterium = StallingCriterium(100, best_obj, 0) +convergence_criterium = StallingCriterium(num_epochs * num_batches, best_obj, 0) # Fixed held-out scenarios, materialized once so every evaluation uses the same set. # The rollout evaluation executes the policy stage by stage (deployment semantics) and diff --git a/examples/HydroPowerModels/train_ldr_hydropowermodels.jl b/examples/HydroPowerModels/train_ldr_hydropowermodels.jl new file mode 100644 index 0000000..c296ce2 --- /dev/null +++ b/examples/HydroPowerModels/train_ldr_hydropowermodels.jl @@ -0,0 +1,264 @@ +# Train a TS-LDR (Linear Decision Rule) policy on the Bolivia LTHD problem. +# +# TS-LDR uses the same target-setting framework as TS-DDR but replaces the +# deep neural network with a linear map: +# +# x̂_t = W [w_{1:t}; x_{t-1}] + b +# +# where W, b are the trainable parameters. 
This is a `dense_multilayer_nn` +# with identity activation — a composition of linear layers is still linear, +# so the result is a standard linear decision rule. +# +# Training uses the Deterministic Equivalent pipeline (all stages coupled in +# one NLP), as in train_dr_hydropowermodels.jl, but with a linear policy and fixed settings +# (no DR_* environment overrides). The saved model is evaluated by evaluate_hydro_policies.jl. +# +# Usage: +# julia --project=. train_ldr_hydropowermodels.jl + +using DecisionRules +using Statistics +using Random +using Flux + +using Ipopt +using Wandb, Dates, Logging +using JLD2 +using DiffOpt +using JuMP +using MadNLP + +USE_GPU = try + using CUDA, CUDSS, MadNLPGPU + CUDA.functional() +catch + @warn "GPU packages not available — running on CPU" + false +end +@info "GPU status" USE_GPU + +HydroPowerModels_dir = dirname(@__FILE__) +include(joinpath(HydroPowerModels_dir, "load_hydropowermodels.jl")) + +function non_ensurance(x_out, x_in, uncertainty, max_volume) + return x_out +end + +# ── Parameters ─────────────────────────────────────────────────────────────── + +case_name = "bolivia" +formulation = "ACPPowerModel" +num_stages = 96 +model_dir = joinpath(HydroPowerModels_dir, case_name, formulation, "models") +mkpath(model_dir) +solver_tag = USE_GPU ? 
"gpu" : "cpu" +save_file = "$(case_name)-$(formulation)-h$(num_stages)-ldr-$(solver_tag)-$(now())" +formulation_file = formulation * ".mof.json" + +num_epochs = 40 +num_batches = 100 +_num_train_per_batch = 1 +activation = identity +layers = Int64[64, 64] +ensure_feasibility = non_ensurance +optimizers = [Flux.Adam()] +pre_trained_model = nothing +penalty_l2 = :auto +penalty_l1 = :auto +penalty_schedule = [ + (1, 100, 0.1), + (101, 210, 1.0), + (211, 300, 10.0), + (301, num_epochs * num_batches, 30.0), +] +num_eval_scenarios = 4 +eval_every = 25 + +# ── Build MSP: subproblems for rollout evaluation ──────────────────────────── + +diff_optimizer = + () -> DiffOpt.diff_optimizer( + optimizer_with_attributes( + Ipopt.Optimizer, + "print_level" => 0, + "linear_solver" => "mumps", + ), + ) +subproblems, state_params_in_sub, state_params_out_sub, uncertainty_samples_sub, initial_state, max_volume = build_hydropowermodels( + joinpath(HydroPowerModels_dir, case_name), + formulation_file; + num_stages=num_stages, + optimizer=diff_optimizer, + penalty_l1=penalty_l1, + penalty_l2=penalty_l2, +) + +# ── Build det-eq for training ──────────────────────────────────────────────── + +subproblems_de, state_params_in, state_params_out, uncertainty_samples, _, _ = build_hydropowermodels( + joinpath(HydroPowerModels_dir, case_name), + formulation_file; + num_stages=num_stages, + penalty_l1=penalty_l1, + penalty_l2=penalty_l2, +) + +det_equivalent = Model(MadNLP.Optimizer) + +if USE_GPU + set_optimizer_attribute(det_equivalent, "array_type", CUDA.CuArray) + set_optimizer_attribute(det_equivalent, "linear_solver", MadNLPGPU.CUDSSSolver) + set_optimizer_attribute(det_equivalent, "print_level", MadNLP.ERROR) + set_optimizer_attribute(det_equivalent, "barrier", MadNLP.LOQOUpdate()) +else + set_optimizer_attribute(det_equivalent, "print_level", MadNLP.ERROR) + set_optimizer_attribute(det_equivalent, "barrier", MadNLP.LOQOUpdate()) +end + +det_equivalent, uncertainty_samples = 
DecisionRules.deterministic_equivalent!( + det_equivalent, + subproblems_de, + state_params_in, + state_params_out, + initial_state, + uncertainty_samples, +) + +num_hydro = length(initial_state) + +# ── Logging ────────────────────────────────────────────────────────────────── + +lg = WandbLogger(; + project="RL", + name=save_file, + save_code=false, + config=Dict( + "layers" => layers, + "activation" => "identity (LDR)", + "policy_type" => "dense_multilayer_nn", + "ensure_feasibility" => string(ensure_feasibility), + "optimizer" => string(optimizers), + "training_method" => "deterministic_equivalent", + "solver" => USE_GPU ? "MadNLP+CUDSS (GPU)" : "MadNLP (CPU)", + "penalty_l1" => string(penalty_l1), + "penalty_l2" => string(penalty_l2), + "penalty_schedule" => string(penalty_schedule), + "num_epochs" => string(num_epochs), + "num_batches" => string(num_batches), + "num_train_per_batch" => string(_num_train_per_batch), + "num_eval_scenarios" => num_eval_scenarios, + "eval_every" => eval_every, + "use_gpu" => USE_GPU, + ), +) + +# ── Define linear policy ───────────────────────────────────────────────────── + +num_uncertainties = length(uncertainty_samples[1][1]) +num_inputs = DecisionRules.policy_input_dim(num_uncertainties, num_hydro) +models = dense_multilayer_nn( + num_inputs, num_hydro, layers; + activation=activation, +) + +if !isnothing(pre_trained_model) + model_save = JLD2.load(pre_trained_model) + model_state = model_save["model_state"] + Flux.loadmodel!(models, model_state) +end + +# ── Initial evaluation ─────────────────────────────────────────────────────── + +Random.seed!(8788) +@time objective_values = [ + simulate_multistage( + det_equivalent, + state_params_in, + state_params_out, + initial_state, + DecisionRules.sample(uncertainty_samples), + models; + ) for _ in 1:2 +] +best_obj = mean(objective_values) + +model_path = joinpath(model_dir, save_file * ".jld2") +save_control = SaveBest(best_obj, model_path) +stall_train = StallingCriterium(100, 
best_obj, 0) +stall_rollout = StallingCriterium(5, best_obj, 0) + +# ── Rollout evaluation (stage-wise subproblems, CPU) ───────────────────────── + +Random.seed!(8789) +eval_scenarios = [ + DecisionRules.sample(uncertainty_samples_sub) for _ in 1:num_eval_scenarios +] +rollout_evaluation = RolloutEvaluation( + subproblems, + state_params_in_sub, + state_params_out_sub, + initial_state, + eval_scenarios; + stride=eval_every, + policy_state=:target, +) +realized_rollout_evaluation = RolloutEvaluation( + subproblems, + state_params_in_sub, + state_params_out_sub, + initial_state, + eval_scenarios; + stride=eval_every, + policy_state=:realized, +) +resolved_penalty_schedule = isnothing(penalty_schedule) ? nothing : + DecisionRules._resolve_penalty_schedule(penalty_schedule, num_epochs * num_batches) + +# ── Train ──────────────────────────────────────────────────────────────────── + +train_multistage( + models, + initial_state, + det_equivalent, + state_params_in, + state_params_out, + uncertainty_samples; + num_batches=num_epochs * num_batches, + num_train_per_batch=_num_train_per_batch, + optimizer=first(optimizers), + record=(sample_log, iter, model) -> begin + training_loss = mean(sample_log.objectives) + loss_no_deficit = mean(sample_log.objectives_no_deficit) + metrics = Dict( + "metrics/loss" => loss_no_deficit, + "metrics/training_loss" => training_loss, + ) + rollout_evaluation(iter, model) + realized_rollout_evaluation(iter, model) + converged_training = stall_train(iter, model, training_loss) + converged_rollout = false + if iter % eval_every == 0 + converged_rollout = stall_rollout( + iter, model, rollout_evaluation.last_objective_no_deficit + ) + metrics["metrics/rollout_objective_no_deficit"] = + rollout_evaluation.last_objective_no_deficit + metrics["metrics/rollout_target_violation_share"] = + rollout_evaluation.last_violation_share + metrics["metrics/rollout_realized_objective_no_deficit"] = + realized_rollout_evaluation.last_objective_no_deficit + 
metrics["metrics/rollout_realized_target_violation_share"] = + realized_rollout_evaluation.last_violation_share + end + if !isnothing(resolved_penalty_schedule) + metrics["metrics/target_penalty_multiplier"] = + DecisionRules._penalty_multiplier_for(resolved_penalty_schedule, iter) + end + Wandb.log(lg, metrics) + save_control(iter, model, training_loss) + return converged_training && converged_rollout && isapprox(training_loss, rollout_evaluation.last_objective_no_deficit; rtol=0.01) + end, + penalty_schedule=penalty_schedule, +) + +close(lg) diff --git a/examples/README.md b/examples/README.md index 6bf37dc..23e4d16 100644 --- a/examples/README.md +++ b/examples/README.md @@ -8,7 +8,8 @@ and additional experiments. | Directory | Application | Paper section | |-----------|------------|---------------| -| [`HydroPowerModels/`](HydroPowerModels/) | Bolivia Long-Term Hydrothermal Dispatching (10 hydro units, AC/SOC/DC OPF, 96 stages) | §4, Extension §1–§4 | +| [`HydroPowerModels/`](HydroPowerModels/) | Bolivia Long-Term Hydrothermal Dispatching (10 hydro units, AC/SOC/DC OPF, 96 stages). Trains TS-DDR (LSTM) and TS-LDR (linear) policies. | §4, Extension §1–§4 | +| [`inventory_control/`](inventory_control/) | Stochastic lot-sizing with fixed ordering costs (relaxed LP and integer MIP). Demonstrates score-function (REINFORCE) gradient mixing for integer variables. | §3 | | [`rocket_control/`](rocket_control/) | Goddard rocket altitude maximization with stochastic wind | §3 | | [`RL/`](RL/) | Reinforcement learning baselines (REINFORCE, PPO, DDPG, TD3, SAC) on Bolivia LTHD | Beyond paper | | `Experimental/` | Work-in-progress experiments (not documented) | — | @@ -44,6 +45,7 @@ same problem: 2. **Stage-wise / Single Shooting** — solve one subproblem per stage, backpropagate through the chain (Extension §2) 3. 
**Windowed / Multiple Shooting** — partition stages into windows, parallelize window solves (Extension §3) -The HydroPowerModels directory contains a training script for each strategy -and a consistency check (`check_consistent_state_paths.jl`) verifying they -produce identical trajectories. +The HydroPowerModels directory contains a training script for each strategy, +a TS-LDR training script (linear policy baseline), and an evaluation script +(`evaluate_hydro_policies.jl`) that runs all trained policies on a common +out-of-sample scenario set. diff --git a/examples/inventory_control/README.md b/examples/inventory_control/README.md index c6e3132..a4ae4be 100644 --- a/examples/inventory_control/README.md +++ b/examples/inventory_control/README.md @@ -68,6 +68,26 @@ subproblems. Two strategies are available: For the relaxed formulation (no integer variables), `NoIntegerStrategy` is used — subproblems are solved and duals read as-is. +## Score-Function Gradient Mixing + +`ScoreFunctionConfig` adds a REINFORCE-style correction to the dual +gradient, enabling TS-DDR to capture discrete transitions that LP duals +miss. Stage-wise rollouts with Gaussian-perturbed targets estimate the +gradient of the true integer cost, and the two signals are mixed: + + g = α · g_dual + (1-α) · g_score_function + +There are two solves in the mixed-gradient training loop: + +- `train_multistage(...; integer_strategy=...)` controls the + deterministic-equivalent solve used to read the local dual-gradient term. +- `ScoreFunctionConfig(subproblems, ...)` uses its rollout subproblems exactly + as built. If those subproblems contain binary setup variables, the + score-function term measures true MIP rollout costs. + +So the integer strategy is not duplicated: it belongs to the dual path. The +score-function path measures costs from the rollout models you pass in. 
+ ## Scripts Run from the repository root: @@ -105,11 +125,14 @@ Figures are written to `docs/src/assets/`: - **TS-DDR** learns an ex-ante order target from inventory and demand history, using the same time-invariant neural policy at every period. - **SDDP** uses a PAR(1) demand approximation in a 24-stage order/demand graph. - For the integer case, it uses LP relaxation with integer rounding at rollout. + For the integer case, it uses `AlternativeForwardPass`: the forward pass solves + true MIP subproblems (`z ∈ {0,1}`), while the backward pass uses LP relaxation + (`z ∈ [0,1]`) to compute cuts with valid duals. - **Base-stock** is a tuned constant order-up-to policy. - **Random** is an untrained ex-ante neural policy. The expected qualitative result is: - **Relaxed**: SDDP dominates (near-optimal for convex problems with Markov noise). - **Integer**: TS-DDR dominates (handles MIP subproblems natively via integer - postprocessing strategies, while SDDP's LP relaxation underestimates fixed costs). + postprocessing strategies, while SDDP with `AlternativeForwardPass` generates + cuts at MIP-realistic trial points but still relies on LP duals for cuts). diff --git a/examples/inventory_control/build_inventory_problem.jl b/examples/inventory_control/build_inventory_problem.jl index 4c5ebdf..b6137b8 100644 --- a/examples/inventory_control/build_inventory_problem.jl +++ b/examples/inventory_control/build_inventory_problem.jl @@ -39,40 +39,123 @@ const D_HI = Float64[44, 55, 81, 112, 153, 185, 164, 122, 88, 65, 49, 40] # --------------------------------------------------------------------------- # Latent demand process (stronger structure than original) # --------------------------------------------------------------------------- + +""" + sample_inventory_demand_path(rng = Random.default_rng()) -> Vector{Float64} + +Draw one realization of the latent demand process over `T` periods. 
+ +The process has three hidden components: +- a seasonal phase shift (uniform in 0:T-1); +- a persistent regime ∈ {-1, 0, 1}, redrawn uniformly with 4% probability each period; +- an AR(1) shock with coefficient 0.92. + +None of these are observed by the policy — only realized demand values. + +# Arguments +- `rng::AbstractRNG`: random number generator. + +# Examples +```julia +path = sample_inventory_demand_path() +``` +""" function sample_inventory_demand_path(rng::AbstractRNG=Random.default_rng()) + # Draw a random seasonal phase offset so demand peaks at different months. phase_shift = rand(rng, 0:(INVENTORY_T - 1)) + + # Draw the initial demand regime (low / neutral / high). regime = rand(rng, (-1.0, 0.0, 1.0)) + + # Initialize the AR(1) shock ε₀ = 0. shock = 0.0 + + # Pre-allocate the demand path vector. path = Vector{Float64}(undef, INVENTORY_T) + for t in 1:INVENTORY_T + # Map period t to the shifted seasonal index κ_t = 1 + ((t+φ-1) mod T). seasonal_t = mod1(t + phase_shift, INVENTORY_T) + + # With 4% probability, redraw the demand regime (the draw may repeat the current one). if rand(rng) < 0.04 regime = rand(rng, (-1.0, 0.0, 1.0)) end + + # Update the AR(1) shock: ε_t = 0.92 ε_{t-1} + 0.35 η_t. shock = 0.92 * shock + 0.35 * randn(rng) + + # Compute the seasonal center μ_κ = (D_LO[κ] + D_HI[κ]) / 2. center = (D_LO[seasonal_t] + D_HI[seasonal_t]) / 2 + + # Compute the seasonal half-width w_κ = (D_HI[κ] - D_LO[κ]) / 2. half_width = (D_HI[seasonal_t] - D_LO[seasonal_t]) / 2 + + # Demand: d_t = μ_κ + w_κ · (0.85·regime + 0.42·shock + 0.12·noise). demand = center + half_width * (0.85 * regime + 0.42 * shock + 0.12 * randn(rng)) + + # Clip demand to [5, D_HI[κ] + 0.55·w_κ] to prevent negative or extreme values. 
path[t] = clamp(demand, 5.0, D_HI[seasonal_t] + 0.55 * half_width) end + return path end # --------------------------------------------------------------------------- # DecisionRules sampler # --------------------------------------------------------------------------- + +""" + InventoryProcessSampler + +Wraps JuMP demand parameters so `DecisionRules.sample` returns uncertainty +realizations in the format expected by `train_multistage` and +`simulate_multistage`. + +# Fields +- `params::Vector{VariableRef}`: one demand parameter per stage. +""" struct InventoryProcessSampler params::Vector{VariableRef} end +""" + DecisionRules.sample(sampler::InventoryProcessSampler) + +Draw one demand path and return it as a vector of `[(param, value)]` pairs. +""" function DecisionRules.sample(sampler::InventoryProcessSampler) + # Draw a fresh demand path from the latent process. demand_path = sample_inventory_demand_path() + + # Pair each stage's JuMP parameter with the sampled demand value. return [[(sampler.params[t], demand_path[t])] for t in 1:INVENTORY_T] end # --------------------------------------------------------------------------- # Stage-wise subproblems # --------------------------------------------------------------------------- + +""" + build_inventory_subproblems(; kwargs...) -> (subproblems, state_in, state_out, sampler, x0) + +Build `T` independent JuMP stage models for stage-wise rollout evaluation. + +Each model has demand as a parameter, input state `(inventory, d_{t-1}, +d_{t-2})`, and a target constraint on mid-period inventory `s_mid` using +`create_deficit!`. + +Returns the five-tuple expected by `simulate_multistage` and +`train_multistage`. + +# Keyword Arguments +- `T`, `K`, `c`, `h`, `p`, `Q_max`: problem parameters. +- `I_0`: initial inventory. +- `num_scenarios`: number of uncertainty samples per SGD batch. +- `penalty`: target-deficit penalty λ. +- `seed`: RNG seed for demand sampling. +- `integer`: whether to include binary setup variable z. 
+""" function build_inventory_subproblems(; T = INVENTORY_T, K = INVENTORY_K, @@ -86,49 +169,84 @@ function build_inventory_subproblems(; seed = 42, integer = true, ) + # Fix the random seed so demand samples are reproducible. Random.seed!(seed) + + # Pre-allocate one JuMP model per stage. subproblems = Vector{JuMP.Model}(undef, T) + + # Each stage has 3 input-state parameters: (inventory, d_{t-1}, d_{t-2}). state_params_in = Vector{Vector{Any}}(undef, T) + + # Each stage has 3 output pairs: (target_param, realized_variable). state_params_out = Vector{Vector{Tuple{Any,VariableRef}}}(undef, T) + + # One demand parameter per stage. uncertainty_params = Vector{VariableRef}(undef, T) for t in 1:T + # Create a HiGHS LP/MIP model for this stage. m = Model(optimizer_with_attributes(HiGHS.Optimizer, "output_flag" => false)) set_silent(m) + # --- Decision variables --- + # q: order quantity, bounded by capacity Q_max. @variable(m, 0 <= q <= Q_max) + # s_mid: mid-period inventory after order arrives but before demand. @variable(m, s_mid) + # s_out: end-of-period inventory after demand realizes. @variable(m, s_out) + # inv_hold: positive part of s_out (holding cost component). @variable(m, inv_hold >= 0) + # back: negative part of s_out (backlog cost component). @variable(m, back >= 0) + # Demand pass-through to the next stage state. @variable(m, last_demand_out) @variable(m, prev_demand_out) + # --- Parametric inputs (set before each solve) --- + # s_in: incoming inventory from the previous stage. @variable(m, s_in in MOI.Parameter(I_0)) + # last_demand_in: demand observed one period ago (part of state). @variable(m, last_demand_in in MOI.Parameter(INVENTORY_LAST_DEMAND0)) + # prev_demand_in: demand observed two periods ago (part of state). @variable(m, prev_demand_in in MOI.Parameter(INVENTORY_PREV_DEMAND0)) + # demand: current-period demand realization (uncertainty). 
@variable(m, demand in MOI.Parameter((D_LO[t] + D_HI[t]) / 2)) + # s_target: target mid-period inventory from the policy. @variable(m, s_target in MOI.Parameter(I_0)) + # Target pass-throughs for demand state entries. @variable(m, last_demand_target in MOI.Parameter(INVENTORY_LAST_DEMAND0)) @variable(m, prev_demand_target in MOI.Parameter(INVENTORY_PREV_DEMAND0)) if integer + # z ∈ {0,1}: binary setup decision. @variable(m, z, Bin) + # If z = 0, no order is allowed: q ≤ Q_max · z. @constraint(m, q <= Q_max * z) + # Objective: K·z + c·q + h·hold + p·backlog. @objective(m, Min, K * z + c * q + h * inv_hold + p * back) else + # Relaxed objective: no setup cost or binary variable. @objective(m, Min, c * q + h * inv_hold + p * back) end + # s_mid = s_in + q: order arrives before demand. @constraint(m, s_mid == s_in + q) + # s_out = s_mid - demand: demand subtracts from inventory. @constraint(m, s_out == s_mid - demand) + # Pass current demand to next stage as "last demand". @constraint(m, last_demand_out == demand) + # Pass previous "last demand" to next stage as "prev demand". @constraint(m, prev_demand_out == last_demand_in) + # Split end-of-period inventory into holding and backlog parts. @constraint(m, inv_hold - back == s_out) + # L1 target-deficit penalty: λ · |s_mid - ŝ_target|. _, deficit = create_deficit!(m, 1; penalty_l1=penalty) @constraint(m, deficit[1] == s_mid - s_target) + # Store the model and parameter mappings. subproblems[t] = m state_params_in[t] = Any[s_in, last_demand_in, prev_demand_in] state_params_out[t] = [ @@ -139,6 +257,7 @@ function build_inventory_subproblems(; uncertainty_params[t] = demand end + # Return the five-tuple: (models, state_in, state_out, sampler, x0). 
return subproblems, state_params_in, state_params_out, InventoryProcessSampler(uncertainty_params), [I_0, INVENTORY_LAST_DEMAND0, INVENTORY_PREV_DEMAND0] @@ -147,6 +266,39 @@ end # --------------------------------------------------------------------------- # Deterministic equivalent (full-horizon) # --------------------------------------------------------------------------- + +""" + build_inventory_det_equivalent(; kwargs...) -> (model, state_in, state_out, sampler, x0) + +Build a single JuMP model coupling all `T` stages for direct transcription +training. + +The deterministic equivalent jointly optimizes over the full horizon. Target +constraints appear as NormOneCone (L1) penalties so the training gradient +captures inter-stage cost coupling that stage-wise rollouts miss. + +The penalty term is + +```math +\\lambda \\sum_{t=1}^{T} |s_t^{mid} - \\hat{s}_t| . +``` + +# Keyword Arguments +- `T`, `K`, `c`, `h`, `p`, `Q_max`: problem parameters. +- `I_0`: initial inventory. +- `num_scenarios`: number of uncertainty samples per SGD batch. +- `penalty`: target-deficit penalty ``\\lambda``. +- `seed`: RNG seed for demand sampling. +- `integer`: whether to include binary setup variable z. + +# Examples +```julia +model, spi, spo, sampler, x0 = build_inventory_det_equivalent(; + num_scenarios = 50, + integer = true, +) +``` +""" function build_inventory_det_equivalent(; T = INVENTORY_T, K = INVENTORY_K, @@ -160,44 +312,68 @@ function build_inventory_det_equivalent(; seed = 42, integer = true, ) + # Fix the random seed so demand samples are reproducible. Random.seed!(seed) + + # One monolithic model for the entire T-period horizon. m = Model(optimizer_with_attributes(HiGHS.Optimizer, "output_flag" => false)) set_silent(m) + # --- Decision variables (one per stage, indexed 1:T) --- + # q[t]: order quantity in period t. @variable(m, 0 <= q[1:T] <= Q_max) + # s_mid[t]: mid-period inventory after order arrives. 
@variable(m, s_mid[1:T]) + # s_out[t]: end-of-period inventory after demand. @variable(m, s_out[1:T]) + # Demand pass-through state variables. @variable(m, last_demand_out[1:T]) @variable(m, prev_demand_out[1:T]) + # Holding and backlog split of s_out. @variable(m, inv_hold[1:T] >= 0) @variable(m, back[1:T] >= 0) + # --- Parametric inputs (set before each DE solve) --- + # Initial state at t = 0. @variable(m, s_init in MOI.Parameter(I_0)) @variable(m, last_demand_init in MOI.Parameter(INVENTORY_LAST_DEMAND0)) @variable(m, prev_demand_init in MOI.Parameter(INVENTORY_PREV_DEMAND0)) + # Demand realizations for each period (set per scenario). @variable(m, demand[t=1:T] in MOI.Parameter((D_LO[t] + D_HI[t]) / 2)) + # Target state from the policy (set per scenario). @variable(m, s_target[t=1:T] in MOI.Parameter(I_0)) @variable(m, last_demand_target[t=1:T] in MOI.Parameter(INVENTORY_LAST_DEMAND0)) @variable(m, prev_demand_target[t=1:T] in MOI.Parameter(INVENTORY_PREV_DEMAND0)) if integer + # z[t] ∈ {0,1}: binary setup decision. @variable(m, z[1:T], Bin) + # q[t] ≤ Q_max · z[t]: no order if setup is off. @constraint(m, [t=1:T], q[t] <= Q_max * z[t]) end + # --- Dynamics --- + # First stage links to the initial inventory parameter. @constraint(m, s_mid[1] == s_init + q[1]) + # Subsequent stages chain from the previous end-of-period inventory. @constraint(m, [t=2:T], s_mid[t] == s_out[t-1] + q[t]) + # Demand subtracts from mid-period inventory. @constraint(m, [t=1:T], s_out[t] == s_mid[t] - demand[t]) + # Pass demand through to state for the next stage. @constraint(m, [t=1:T], last_demand_out[t] == demand[t]) @constraint(m, prev_demand_out[1] == prev_demand_init) @constraint(m, [t=2:T], prev_demand_out[t] == last_demand_out[t-1]) + # Split end-of-period inventory into holding and backlog. @constraint(m, [t=1:T], inv_hold[t] - back[t] == s_out[t]) + # --- Target-deficit penalty via NormOneCone --- + # norm_deficit_arr[t] ≥ |s_mid[t] - s_target[t]| (L1 norm). 
@variable(m, norm_deficit_arr[1:T] >= 0.0) @variable(m, deficit_arr[1:T]) @constraint(m, [t=1:T], deficit_arr[t] == s_mid[t] - s_target[t]) @constraint(m, [t=1:T], [norm_deficit_arr[t]; deficit_arr[t:t]] in MOI.NormOneCone(2)) + # --- Objective: operational cost + target penalty --- if integer @objective(m, Min, sum(K * z[t] + c * q[t] + h * inv_hold[t] + p * back[t] for t in 1:T) + @@ -208,14 +384,20 @@ function build_inventory_det_equivalent(; penalty * sum(norm_deficit_arr)) end + # --- Build parameter mappings for DecisionRules interface --- state_params_in = Vector{Vector{Any}}(undef, T) state_params_out = Vector{Vector{Tuple{Any,VariableRef}}}(undef, T) uncertainty_params = Vector{VariableRef}(undef, T) + # Stage 1 reads from the initial-state parameters. state_params_in[1] = Any[s_init, last_demand_init, prev_demand_init] + + # Stages 2..T read from the previous stage's realized output variables. for t in 2:T state_params_in[t] = Any[s_out[t-1], last_demand_out[t-1], prev_demand_out[t-1]] end + + # Each stage maps (target_parameter → realized_variable) for gradient reading. for t in 1:T state_params_out[t] = [ (s_target[t], s_out[t]), @@ -225,6 +407,7 @@ function build_inventory_det_equivalent(; uncertainty_params[t] = demand[t] end + # Return the five-tuple: (model, state_in, state_out, sampler, x0). return m, state_params_in, state_params_out, InventoryProcessSampler(uncertainty_params), [I_0, INVENTORY_LAST_DEMAND0, INVENTORY_PREV_DEMAND0] @@ -233,28 +416,89 @@ end # --------------------------------------------------------------------------- # Policies # --------------------------------------------------------------------------- + +""" + base_stock_policy(S_star) -> Function + +Return a constant-target base-stock policy. + +The target is the mid-period inventory level ``s^{mid} = S^*``; pass-through +state entries are current and lagged demand. + +# Arguments +- `S_star::Float64`: order-up-to level. 
"""
    base_stock_policy(S_star) -> Function

Return a constant-target base-stock policy.

The returned closure maps a state vector `x` to `Float32[S_star, x[1], x[2]]`:
the first entry is always the order-up-to level, and the first two entries of
`x` are copied through unchanged.

# Arguments
- `S_star::Real`: order-up-to level. Any real number is accepted and converted
  to `Float32` (previously only `Float64` was accepted).

# Examples
```julia
policy = base_stock_policy(160)
policy(Float32[95, 40, 90, 85])   # Float32[160, 95, 40]
```
"""
function base_stock_policy(S_star::Real)
    # Convert once so the closure captures a concretely typed value.
    target = Float32(S_star)

    # NOTE(review): `ExAnteInventoryPolicy` below unpacks its input as
    # [d_t, inventory, d_{t-1}, d_{t-2}] and passes through x[1] and x[3].
    # If this policy is called with the same layout, x[2] is the inventory,
    # not d_{t-1}. Behavior is kept as-is; confirm the intended index.
    return x -> Float32[target, x[1], x[2]]
end

"""
    ExAnteInventoryPolicy{N}

Feedforward inventory policy that does not feed the current demand to its
network.

Calling the policy with `x = [d_t, inventory, d_{t-1}, d_{t-2}]` evaluates
`net` on `[inventory / 100, d_{t-1} / 100, d_{t-2} / 100]`, squashes the first
output through `500 * sigmoid(...)`, and returns
`Float32[target, d_t, d_{t-1}]`. `d_t` is only copied to the output.

# Fields
- `net::N`: callable applied to the 3-element `Float32` feature vector; only
  the first entry of its output is used.
"""
struct ExAnteInventoryPolicy{N}
    net::N
end

# Only `net` carries trainable parameters.
Functors.@functor ExAnteInventoryPolicy (net,)

# The feedforward policy keeps no state between calls, so reset is a no-op.
Flux.reset!(::ExAnteInventoryPolicy) = nothing

function (policy::ExAnteInventoryPolicy)(x)
    # Unpack by position: [d_t, inventory, d_{t-1}, d_{t-2}].
    current_demand = Float32(x[1])
    inventory = Float32(x[2])
    last_demand = Float32(x[3])
    prev_demand = Float32(x[4])

    # The network sees inventory and lagged demands only, each divided by 100.
    order_features = Float32[inventory / 100, last_demand / 100, prev_demand / 100]

    # Sigmoid bounds the target to the open interval (0, 500).
    target = 500f0 .* Flux.sigmoid.(policy.net(order_features))

    return Float32[target[1], current_demand, last_demand]
end
"""
    LSTMExAntePolicy{E,C,S}

Recurrent inventory policy whose encoder is driven by lagged demand.

On each call with `x = [d_t, inventory, d_{t-1}, d_{t-2}]` the policy:

1. feeds `d_{t-1} / 100` and the stored `state` to `encoder`;
2. stores the state returned by the encoder in `policy.state`;
3. applies `combiner` to the encoder output concatenated with
   `[inventory / 100, d_{t-2} / 100]`;
4. returns `Float32[raw[1] * 200 + 150, d_t, d_{t-1}]`.

`d_t` is only copied to the output; it is never passed to `encoder` or
`combiner`.

# Fields
- `encoder::E`: recurrent cell called as `encoder(input, state)` and returning
  `(output, new_state)`.
- `combiner::C`: callable mapping the concatenated features to the raw target.
- `state::S`: current recurrent state; mutated on every call.

# Examples
```julia
policy = build_lstm_exante_policy(; seed = 2024, hidden = 16)
Flux.reset!(policy)
target = policy(Float32[95, 40, 90, 85])
```
"""
mutable struct LSTMExAntePolicy{E,C,S}
    encoder::E
    combiner::C
    state::S
end

# `state` is deliberately excluded so it is not treated as a parameter.
Functors.@functor LSTMExAntePolicy (encoder, combiner)

function (policy::LSTMExAntePolicy)(x)
    # Positional layout: [d_t, inventory, d_{t-1}, d_{t-2}].
    inventory = Float32(x[2])
    last_demand = Float32(x[3])
    prev_demand = Float32(x[4])

    # Build inputs with the element type of the first state component
    # (presumably Float32 for a default Flux cell — confirm).
    F = eltype(first(policy.state))

    # Advance the recurrent cell by one stage using the lagged demand only.
    encoder_input = F[last_demand / 100]
    encoded, next_state = policy.encoder(encoder_input, policy.state)

    # Keep the new state so the next call in this scenario continues from it.
    policy.state = next_state

    # Append the non-recurrent features to the encoder output.
    extra_features = F[inventory / 100, prev_demand / 100]
    raw = policy.combiner(vcat(encoded, extra_features))

    # Affine output map; unlike a sigmoid it does not bound the target.
    target_s_mid = raw[1] * 200f0 + 150f0

    return Float32[target_s_mid, x[1], last_demand]
end
"""
    Flux.reset!(policy::LSTMExAntePolicy) -> Nothing

Replace `policy.state` with `Flux.initialstates(policy.encoder)`.

Call this before each scenario rollout; otherwise the state accumulated by
earlier calls carries over into the next scenario.
"""
function Flux.reset!(policy::LSTMExAntePolicy)
    # Ask the encoder for a fresh initial state rather than reusing the old one.
    fresh_state = Flux.initialstates(policy.encoder)
    policy.state = fresh_state
    return nothing
end
"""
    build_lstm_exante_policy(; seed = 2024, hidden = 16) -> LSTMExAntePolicy

Construct an `LSTMExAntePolicy` with freshly initialized weights.

The encoder is `Flux.LSTMCell(1 => hidden)` and the combiner is
`Dense(hidden + 2, 1)`; the two extra combiner inputs are the scaled inventory
and `d_{t-2}` appended by the policy call.

# Keyword Arguments
- `seed`: passed to `Random.seed!` before the layers are created. This reseeds
  the global RNG.
- `hidden`: LSTM hidden dimension.

# Examples
```julia
policy = build_lstm_exante_policy(; seed = 2024, hidden = 16)
```
"""
function build_lstm_exante_policy(; seed=2024, hidden=16)
    # Seed first so both layers draw their initial weights reproducibly.
    Random.seed!(seed)

    # Creation order (encoder, then combiner) fixes the RNG stream per layer.
    cell = Flux.LSTMCell(1 => hidden)
    head = Dense(hidden + 2, 1)

    return LSTMExAntePolicy(cell, head, Flux.initialstates(cell))
end
"""
    resolve_result_dir(args; base = joinpath(@__DIR__, "results")) -> String

Pick the results directory from CLI arguments, or default to the latest run.

Resolution order:

1. If `args` is non-empty, `joinpath(base, args[1])` must be an existing
   directory; otherwise an error is raised.
2. If `base` does not exist or has no subdirectories, `base` itself is
   returned (flat layout used by older runs).
3. Otherwise the subdirectory whose name sorts last is returned. For
   timestamp-named runs such as `20260619_231417` this is the most recent one.

# Arguments
- `args`: `ARGS` from the script entry point.

# Keyword Arguments
- `base::AbstractString`: directory that holds the run subdirectories.

# Examples
```julia
result_dir = resolve_result_dir(ARGS)
```
"""
function resolve_result_dir(args; base::AbstractString = joinpath(@__DIR__, "results"))
    # An explicit run ID must exist; never fall back silently.
    if !isempty(args)
        dir = joinpath(base, first(args))
        isdir(dir) || error("Run directory not found: $dir")
        return dir
    end

    # `readdir` throws on a missing directory. Returning `base` lets the later
    # file lookup report exactly which result file is missing.
    isdir(base) || return base

    subdirs = filter(name -> isdir(joinpath(base, name)), readdir(base))
    isempty(subdirs) && return base

    # Lexicographic maximum; equals the newest run for timestamp names.
    return joinpath(base, maximum(subdirs))
end
"""
    resolve_file_optional(filename::AbstractString) -> Union{String, Nothing}

Look for `filename` in `RESULT_DIR`, then in `RESULT_BASE`.

Return the first path that is an existing file, or `nothing` when neither
directory contains it.

# Arguments
- `filename::AbstractString`: file name (not a full path).

# Examples
```julia
path = resolve_file_optional("integer_sf_training_curve.csv")
```
"""
function resolve_file_optional(filename::AbstractString)
    # The run-specific directory takes precedence over the shared base.
    for directory in (RESULT_DIR, RESULT_BASE)
        candidate = joinpath(directory, filename)
        isfile(candidate) && return candidate
    end

    return nothing
end

"""
    resolve_file(filename::AbstractString) -> String

Strict variant of `resolve_file_optional`: raise an error when `filename` is
found in neither `RESULT_DIR` nor `RESULT_BASE`.

# Arguments
- `filename::AbstractString`: file name (not a full path).

# Examples
```julia
path = resolve_file("relaxed_sddp_costs.csv")
```
"""
function resolve_file(filename::AbstractString)
    path = resolve_file_optional(filename)

    if isnothing(path)
        error("Result file \"$filename\" not found in $RESULT_DIR or $RESULT_BASE")
    end

    return path
end

"""
    MethodResult

Display name and cost sample for one benchmark method.

# Fields
- `name::String`: label printed in tables.
- `costs::Vector{Float64}`: one cost value per evaluation scenario.

# Examples
```julia
result = MethodResult("TS-DDR", [1.0, 2.0, 3.0])
```
"""
struct MethodResult
    name::String
    costs::Vector{Float64}
end
"""
    TimingRecord

Fit and evaluation timing for one benchmark method.

# Fields
- `fit_seconds::Float64`: value of the `fit_seconds` timing column.
- `eval_seconds::Float64`: value of the `eval_seconds` timing column.

# Examples
```julia
timing = TimingRecord(10.0, 0.01)
```
"""
struct TimingRecord
    fit_seconds::Float64
    eval_seconds::Float64
end

"""
    ci95(costs::AbstractVector{<:Real}) -> Float64

Return the normal-approximation half-width of a 95% confidence interval for
the mean: `1.96 * s / sqrt(n)`, with `s` the sample standard deviation and `n`
the number of costs.

The result is `NaN` when `std(costs)` is `NaN` (fewer than two samples).

# Arguments
- `costs::AbstractVector{<:Real}`: sampled costs.

# Examples
```julia
half_width = ci95([10.0, 12.0, 11.0])
```
"""
function ci95(costs::AbstractVector{<:Real})
    # Scale the spread first, then shrink by sqrt(n): this is the uncertainty
    # of the sample mean, not of a single trajectory.
    scaled_spread = 1.96 * std(costs)
    return scaled_spread / sqrt(length(costs))
end

"""
    percent_gap(costs, reference_costs) -> Float64

Return the percent gap between mean costs:
`100 * (mean(costs) - mean(reference_costs)) / mean(reference_costs)`.

Positive values mean the candidate is more expensive than the reference.

# Arguments
- `costs`: candidate method costs.
- `reference_costs`: reference method costs.

# Examples
```julia
gap = percent_gap([110.0, 110.0], [100.0, 100.0])   # 10.0
```
"""
function percent_gap(costs, reference_costs)
    reference_mean = mean(reference_costs)
    excess = mean(costs) - reference_mean
    return 100.0 * excess / reference_mean
end
"""
    load_costs(tag::AbstractString, method::AbstractString) -> Vector{Float64}

Load the `operational_cost` column of `"<tag>_<method>_costs.csv"`.

The file is located with `resolve_file`, so it is searched in `RESULT_DIR`
first and then in `RESULT_BASE`; an error is raised if it is in neither.

# Arguments
- `tag::AbstractString`: problem prefix, such as `"relaxed"` or `"integer"`.
- `method::AbstractString`: method suffix, such as `"dr"` or `"sddp"`.

# Examples
```julia
costs = load_costs("integer", "dr")
```
"""
function load_costs(tag::AbstractString, method::AbstractString)
    path = resolve_file("$(tag)_$(method)_costs.csv")
    return Float64.(CSV.read(path, DataFrame).operational_cost)
end

"""
    optional_costs(tag::AbstractString, method::AbstractString)

Like `load_costs`, but return `nothing` when the cost file exists in neither
`RESULT_DIR` nor `RESULT_BASE`.

# Arguments
- `tag::AbstractString`: problem prefix.
- `method::AbstractString`: method suffix.

# Examples
```julia
costs = optional_costs("integer_sf", "dr")
```
"""
function optional_costs(tag::AbstractString, method::AbstractString)
    # Same two-directory lookup as every other optional result file.
    path = resolve_file_optional("$(tag)_$(method)_costs.csv")
    isnothing(path) && return nothing

    return Float64.(CSV.read(path, DataFrame).operational_cost)
end

"""
    read_scalar(path::AbstractString) -> Float64

Read a text file and parse its whitespace-stripped contents as a `Float64`.

# Arguments
- `path::AbstractString`: text file containing one numeric value.

# Examples
```julia
bound = read_scalar(resolve_file("integer_sddp_bound.txt"))
```
"""
function read_scalar(path::AbstractString)
    contents = read(path, String)
    # Strip the trailing newline (and any padding) before parsing.
    return parse(Float64, strip(contents))
end

"""
    timing_key(method_name::AbstractString) -> String

Return the key used to look up the timing record of a table label.

Every label starting with `"Base-stock"` maps to `"Base-stock"`; any other
label maps to itself.

# Arguments
- `method_name::AbstractString`: table label.

# Examples
```julia
key = timing_key("Base-stock (S*=160)")   # "Base-stock"
```
"""
function timing_key(method_name::AbstractString)
    # Base-stock labels embed S*, so they are collapsed to one stable key.
    return startswith(method_name, "Base-stock") ? "Base-stock" : String(method_name)
end
"""
    load_timing(tags) -> Dict{String,TimingRecord}

Load timing rows for a set of result prefixes.

For every tag, the files `"<tag>_dr_timing.csv"`, `"<tag>_sddp_timing.csv"`,
`"<tag>_sddp_lp_timing.csv"` and `"<tag>_baseline_timing.csv"` are read when
they exist. Their rows are stacked and keyed by the `method` column. When the
same method appears more than once, the last row read wins.

An error is raised when none of the timing files exists.

# Arguments
- `tags`: iterable of prefixes, such as `["integer", "integer_cr"]`.

# Examples
```julia
timing = load_timing(["integer", "integer_cr"])
```
"""
function load_timing(tags)
    # Suffixes written by the TS-DDR, SDDP, LP-relaxed SDDP and baseline scripts.
    timing_suffixes = ("dr_timing", "sddp_timing", "sddp_lp_timing", "baseline_timing")

    # Each file is optional on its own; a method without a timing row shows up
    # later as a missing dictionary key.
    tables = DataFrame[]
    for tag in tags, suffix in timing_suffixes
        path = resolve_file_optional("$(tag)_$(suffix).csv")
        isnothing(path) || push!(tables, CSV.read(path, DataFrame))
    end

    # Fail with a readable message instead of an obscure `vcat` method error.
    isempty(tables) && error(
        "No timing files found for tags [$(join(tags, ", "))] " *
        "in $RESULT_DIR or $RESULT_BASE",
    )

    # `reduce(vcat, ...)` avoids splatting an arbitrary number of tables.
    combined = reduce(vcat, tables; cols = :union)

    # The explicit key/value types keep the result typed even with zero rows.
    return Dict{String,TimingRecord}(
        String(row.method) => TimingRecord(row.fit_seconds, row.eval_seconds)
        for row in eachrow(combined)
    )
end
"""
    print_table(results, timing, bound; reference_index = 1)

Print a Markdown comparison table to standard output.

One row is printed per entry of `results`: sample size, mean, standard
deviation, `ci95` half-width, percent gap to the reference method, and the fit
and evaluation times looked up through `timing_key`. A `KeyError` is raised
when a method has no timing record.

# Arguments
- `results::Vector{MethodResult}`: cost vectors and display names.
- `timing::Dict{String,TimingRecord}`: timing records by method key.
- `bound::Real`: value printed above the table as the SDDP LP bound.
- `reference_index::Integer`: index into `results` of the method used for the
  percent-gap column.

# Examples
```julia
print_table(results, timing, bound)
```
"""
function print_table(
    results::Vector{MethodResult},
    timing::Dict{String,TimingRecord},
    bound::Real;
    reference_index::Integer = 1,
)
    # All gaps are measured against this entry.
    baseline = results[reference_index]

    @printf("SDDP LP bound: %.1f\n", bound)
    println()
    println(
        "| Method | N | Mean cost | Std | 95% CI | vs $(baseline.name) " *
        "| Fit (s) | Eval (s) |",
    )
    println("|:-------------------------|----:|----------:|------:|-------:|----------:|--------:|---------:|")

    for entry in results
        # Look up timing first so a missing record fails before any output.
        record = timing[timing_key(entry.name)]

        @printf(
            "| %-24s | %3d | %9.1f | %5.1f | %6.1f | %+9.1f%% | %7.1f | %8.4f |\n",
            entry.name,
            length(entry.costs),
            mean(entry.costs),
            std(entry.costs),
            ci95(entry.costs),
            percent_gap(entry.costs, baseline.costs),
            record.fit_seconds,
            record.eval_seconds,
        )
    end

    println()

    return nothing
end
"""
    short_method_label(name::AbstractString, base_stock_level::Real) -> String

Return a compact, possibly two-line label for plot axes.

Labels starting with `"Base-stock"` become `"Base-stock\\n(S*=<level>)"` with
the level rounded to an integer. Known method names are mapped through a fixed
table; any other name is returned unchanged.

# Arguments
- `name::AbstractString`: full table label.
- `base_stock_level::Real`: base-stock order-up-to level.

# Examples
```julia
label = short_method_label("TS-DDR (FixedDiscrete)", 160.0)
```
"""
function short_method_label(name::AbstractString, base_stock_level::Real)
    if startswith(name, "Base-stock")
        return "Base-stock\n(S*=$(round(Int, base_stock_level)))"
    end

    # NOTE(review): the relaxed and integer LSTM/HighPenalty variants share a
    # short label; confirm they never appear in the same plot.
    short_names = Dict(
        "TS-DDR (FixedDiscrete)" => "TS-DDR\n(FixedDisc)",
        "TS-DDR (ContRelax)" => "TS-DDR\n(ContRelax)",
        "TS-DDR (MixedGrad)" => "TS-DDR\n(MixedGrad)",
        "TS-DDR (HighPenalty)" => "TS-DDR\n(HighPen)",
        "TS-DDR (LSTM)" => "TS-DDR\n(LSTM)",
        "TS-DDR (LSTM+SF)" => "TS-DDR\n(LSTM+SF)",
        "TS-DDR (trained)" => "TS-DDR",
        "TS-DDR Relaxed (LSTM)" => "TS-DDR\n(LSTM)",
        "TS-DDR Relaxed (HighPenalty)" => "TS-DDR\n(HighPen)",
        "TS-DDR Relaxed (LSTM+HP)" => "TS-DDR\n(LSTM+HP)",
        "SDDP (PAR)" => "SDDP",
        "SDDP (MIP fwd)" => "SDDP\n(MIP fwd)",
        "SDDP (LP relax)" => "SDDP\n(LP relax)",
        "Random (untrained)" => "Random",
    )

    # Unknown names fall through unchanged.
    full_name = String(name)
    return get(short_names, full_name, full_name)
end

"""
    method_colors(num_methods::Integer)

Return one plot color per compared method.

Up to ten methods take the leading entries of a fixed color list, so a given
position always gets the same color. For more methods, `palette(:auto,
num_methods)` is returned instead.

# Arguments
- `num_methods::Integer`: number of methods in the comparison.

# Examples
```julia
colors = method_colors(3)   # [:steelblue, :royalblue, :darkorange]
```
"""
function method_colors(num_methods::Integer)
    # Colors are assigned by position in the comparison, not by method name.
    fixed_colors = [
        :steelblue, :royalblue, :darkorange, :mediumpurple,
        :coral, :teal, :darkgreen, :seagreen, :gold, :gray,
    ]

    if num_methods <= length(fixed_colors)
        return fixed_colors[1:num_methods]
    else
        return palette(:auto, num_methods)
    end
end
"""
    plot_sddp_learning_curve(tag::AbstractString; start_fraction = 0.5)

Create the SDDP training-bound subplot on a log-scale cost axis.

Reads `"<tag>_sddp_training_log.csv"`, keeps rows whose `bound` is present,
finite and strictly positive, and plots only iterations at or after
`start_fraction` times the largest remaining iteration. When the log has a
`simulation_value` column, its positive finite values are drawn as a second
series.

An error is raised when no row has a plottable bound.

# Arguments
- `tag::AbstractString`: result prefix used by SDDP output files.

# Keyword Arguments
- `start_fraction::Real`: fraction of the training run to skip from the start.

# Examples
```julia
plot_sddp_learning_curve("integer")
plot_sddp_learning_curve("integer"; start_fraction = 0.25)
```
"""
function plot_sddp_learning_curve(tag::AbstractString; start_fraction::Real = 0.5)
    training_log = CSV.read(resolve_file("$(tag)_sddp_training_log.csv"), DataFrame)

    # A log10 axis cannot display missing, non-finite or non-positive values.
    valid_bound_rows = filter(
        row -> !ismissing(row.bound) && isfinite(row.bound) && row.bound > 0,
        training_log,
    )

    # `maximum` below would otherwise fail with an unhelpful empty-collection error.
    nrow(valid_bound_rows) > 0 || error(
        "SDDP training log for \"$tag\" has no positive finite bound to plot",
    )

    # Drop the early iterations, where the bound is still far from converged.
    start_iter = round(Int, start_fraction * maximum(valid_bound_rows.iteration))
    converged = filter(row -> row.iteration >= start_iter, valid_bound_rows)

    plot_handle = plot(
        converged.iteration,
        converged.bound;
        xlabel = "Iteration",
        ylabel = "Cost (log scale)",
        title = "SDDP learning curve (converged)",
        label = "LP bound",
        linewidth = 2,
        color = :darkgreen,
        legend = :right,
        yscale = :log10,
    )

    # The simulation column is optional and may hold missing entries.
    if "simulation_value" in names(converged)
        valid_sim_rows = filter(
            row -> !ismissing(row.simulation_value) &&
                isfinite(row.simulation_value) &&
                row.simulation_value > 0,
            converged,
        )

        if nrow(valid_sim_rows) > 0
            plot!(
                plot_handle,
                valid_sim_rows.iteration,
                valid_sim_rows.simulation_value;
                label = "Simulation",
                linewidth = 2,
                color = :darkorange,
            )
        end
    end

    return plot_handle
end
"""
    plot_training_curves(curve_specs)

Create the TS-DDR training-curve subplot.

For each `(tag, label, color)` spec, `"<tag>_training_curve.csv"` is read when
it exists; specs without a file are skipped. The `rollout_cost` column is
plotted against `batch` when present (dropping missing and non-finite values);
otherwise the `loss` column is plotted.

# Arguments
- `curve_specs`: iterable of `(tag, label, color)` tuples.

# Examples
```julia
plot_training_curves([("integer", "FixedDiscrete", :steelblue)])
```
"""
function plot_training_curves(curve_specs)
    # Start from an empty canvas so every spec is optional.
    plot_handle = plot(;
        xlabel = "Batch",
        ylabel = "Out-of-sample rollout cost",
        title = "TS-DDR training curves",
        legend = :topright,
    )

    for (tag, label, color) in curve_specs
        path = resolve_file_optional("$(tag)_training_curve.csv")
        isnothing(path) && continue

        curve = CSV.read(path, DataFrame)

        # Prefer the rollout cost; files without that column only have `loss`.
        if "rollout_cost" in names(curve)
            usable = dropmissing(curve, :rollout_cost)
            usable = filter(row -> isfinite(row.rollout_cost), usable)
            batches, values = usable.batch, usable.rollout_cost
        else
            batches, values = curve.batch, curve.loss
        end

        plot!(plot_handle, batches, values; label = label, linewidth = 2, color = color)
    end

    return plot_handle
end

"""
    plot_inventory_trajectories(dr_tag, baseline_tag)

Create the inventory-trajectory subplot.

Reads `"<dr_tag>_dr_trajectories.csv"` and
`"<baseline_tag>_basestock_trajectories.csv"`, and overlays up to 20 rows of
each (columns `t0 … tT` with `T = INVENTORY_T`), plus a dashed zero line.

# Arguments
- `dr_tag`: TS-DDR trajectory prefix.
- `baseline_tag`: baseline trajectory prefix.

# Examples
```julia
plot_inventory_trajectories("integer", "integer")
```
"""
function plot_inventory_trajectories(dr_tag, baseline_tag)
    periods = 0:INVENTORY_T
    time_columns = [Symbol("t$(period)") for period in periods]

    dr_paths = CSV.read(resolve_file("$(dr_tag)_dr_trajectories.csv"), DataFrame)
    base_stock_paths = CSV.read(
        resolve_file("$(baseline_tag)_basestock_trajectories.csv"),
        DataFrame,
    )

    # Show the same number of paths for both methods, capped for readability.
    num_paths = min(20, nrow(dr_paths), nrow(base_stock_paths))

    plot_handle = plot(;
        xlabel = "Period",
        ylabel = "Net inventory",
        title = "Inventory trajectories",
        legend = :topright,
    )

    # All TS-DDR paths are drawn first, then all base-stock paths.
    series = (
        (dr_paths, :steelblue, "TS-DDR"),
        (base_stock_paths, :darkorange, "Base-stock"),
    )

    for (paths, color, name) in series
        for row in 1:num_paths
            plot!(
                plot_handle,
                periods,
                Vector(paths[row, time_columns]);
                color = color,
                alpha = 0.35,
                # Only the first path of each method gets a legend entry.
                label = row == 1 ? name : false,
            )
        end
    end

    # Zero inventory separates holding from backlog.
    hline!(plot_handle, [0.0]; linestyle = :dash, color = :black, label = "Zero")

    return plot_handle
end
"""
    plot_cost_distribution(results, base_stock_level)

Create the cost-distribution subplot: one violin per method.

Axis labels come from `short_method_label` and fill colors from
`method_colors`, both assigned by position in `results`.

# Arguments
- `results::Vector{MethodResult}`: methods to compare.
- `base_stock_level::Real`: base-stock order-up-to level used in the label.

# Examples
```julia
plot_cost_distribution(results, 160.0)
```
"""
function plot_cost_distribution(results::Vector{MethodResult}, base_stock_level::Real)
    colors = method_colors(length(results))

    plot_handle = plot(;
        xlabel = "Method",
        ylabel = "Operational cost",
        title = "Cost comparison",
        legend = false,
        xrotation = 30,
        bottom_margin = 8Plots.mm,
        xtickfontsize = 7,
    )

    for (index, result) in enumerate(results)
        label = short_method_label(result.name, base_stock_level)

        # Repeat the label once per sample so all costs share one category.
        violin!(
            plot_handle,
            fill(label, length(result.costs)),
            result.costs;
            fillcolor = colors[index],
            linecolor = :black,
            fillalpha = 0.7,
        )
    end

    return plot_handle
end
+ +# Examples +```julia +plot = make_summary_plot( + "Integer problem", + results = integer_results, + base_stock_level = 160.0, + sddp_tag = "integer", + dr_tag = "integer", + curve_specs = [("integer", "FixedDiscrete", :steelblue)], +) +``` +""" +function make_summary_plot( + problem::AbstractString; + results::Vector{MethodResult}, + base_stock_level::Real, + sddp_tag::AbstractString, + dr_tag::AbstractString, + curve_specs, +) + # Build each panel with a single responsibility. + sddp_panel = plot_sddp_learning_curve(sddp_tag) + training_panel = plot_training_curves(curve_specs) + trajectory_panel = plot_inventory_trajectories(dr_tag, sddp_tag) + distribution_panel = plot_cost_distribution(results, base_stock_level) + + # Use a fixed layout so generated docs are stable. layout = @layout [a b; c d] - combined = plot(p1, p2, p3, p4; layout=layout, size=(1100, 800), - plot_title=title_suffix, plot_titlefontsize=12, margin=5Plots.mm) - return combined -end - -# ═══════════════════════════════════════════════════════════════════════════════ -# Demand process plot (shared) -# ═══════════════════════════════════════════════════════════════════════════════ -periods = 1:INVENTORY_T -demand_mid = (D_LO .+ D_HI) ./ 2 -plt_demand = plot(periods, demand_mid; - xlabel="Period", ylabel="Demand", - title="Latent demand process (random phase + regime + AR)", - label="Nominal seasonal center", linewidth=2, linestyle=:dash, color=:purple) -rng_plot = MersenneTwister(1234) -for k in 1:24 - path = sample_inventory_demand_path(rng_plot) - plot!(plt_demand, periods, path; color=:gray, alpha=0.28, label=false) -end -savefig(plt_demand, joinpath(docs_dir, "inventory_demand_process.png")) -println("Saved inventory_demand_process.png") - -# ═══════════════════════════════════════════════════════════════════════════════ -# Section 1: Relaxed -# ═══════════════════════════════════════════════════════════════════════════════ -println("\n" * "=" ^ 60) -println("SECTION 1: Relaxed (continuous) 
comparison") -println("=" ^ 60) - -r_dr = load_costs("relaxed", "dr") -r_sddp = load_costs("relaxed", "sddp") -r_bs = load_costs("relaxed", "basestock") -r_rand = load_costs("relaxed", "random") -r_timing = load_timing(["relaxed"]) -r_S = parse(Float64, strip(read(joinpath(result_dir, "relaxed_basestock_S_star.txt"), String))) -r_bound = parse(Float64, strip(read(joinpath(result_dir, "relaxed_sddp_bound.txt"), String))) - -r_entries = [ - ("TS-DDR (trained)", r_dr), - ("SDDP (PAR)", r_sddp), - ("Base-stock (S*=$(round(Int, r_S)))", r_bs), - ("Random (untrained)", r_rand), -] -print_table(r_entries, r_timing, r_bound) - -plt_relaxed = make_plots("relaxed", r_entries, r_S, "Relaxed (continuous) problem") -savefig(plt_relaxed, joinpath(docs_dir, "inventory_relaxed_results.png")) -println("Saved inventory_relaxed_results.png") - -# ═══════════════════════════════════════════════════════════════════════════════ -# Section 2: Integer -# ═══════════════════════════════════════════════════════════════════════════════ -println("\n" * "=" ^ 60) -println("SECTION 2: Integer (MIP) comparison") -println("=" ^ 60) - -i_dr = load_costs("integer", "dr") -i_dr_cr = load_costs("integer_cr", "dr") -i_sddp = load_costs("integer", "sddp") -i_bs = load_costs("integer", "basestock") -i_rand = load_costs("integer", "random") -i_timing = load_timing(["integer", "integer_cr"]) -i_S = parse(Float64, strip(read(joinpath(result_dir, "integer_basestock_S_star.txt"), String))) -i_bound = parse(Float64, strip(read(joinpath(result_dir, "integer_sddp_bound.txt"), String))) - -i_entries = [ - ("TS-DDR (FixedDiscrete)", i_dr), - ("TS-DDR (ContRelax)", i_dr_cr), - ("SDDP.jl integer rollout", i_sddp), - ("Base-stock (S*=$(round(Int, i_S)))", i_bs), - ("Random (untrained)", i_rand), -] -print_table(i_entries, i_timing, i_bound) - -plt_integer = make_plots("integer", i_entries, i_S, "Integer (MIP) problem"; - sddp_tag="integer", dr_tag="integer") -savefig(plt_integer, joinpath(docs_dir, 
"inventory_integer_results.png")) -println("Saved inventory_integer_results.png") - -println("\nAll assets saved to: $(relpath(docs_dir, example_dir))") + + return plot( + sddp_panel, + training_panel, + trajectory_panel, + distribution_panel; + layout = layout, + size = (1200, 900), + plot_title = problem, + plot_titlefontsize = 12, + margin = 6Plots.mm, + ) +end + +""" + plot_demand_process() -> Nothing + +Regenerate the demand-process documentation figure. + +# Examples +```julia +plot_demand_process() +``` +""" +function plot_demand_process() + # Period numbers run from 1 to T. + periods = 1:INVENTORY_T + + # The nominal seasonal center is the midpoint of the demand band. + demand_midpoint = (D_LO .+ D_HI) ./ 2 + + plot_handle = plot( + periods, + demand_midpoint; + xlabel = "Period", + ylabel = "Demand", + title = "Latent demand process (random phase + regime + AR)", + label = "Nominal seasonal center", + linewidth = 2, + linestyle = :dash, + color = :purple, + ) + + # Use a fixed RNG so the figure is reproducible. + rng = MersenneTwister(1234) + + for _ in 1:24 + # Each path has its own hidden phase, regime, and shock sequence. + path = sample_inventory_demand_path(rng) + + plot!( + plot_handle, + periods, + path; + color = :gray, + alpha = 0.28, + label = false, + ) + end + + savefig(plot_handle, joinpath(DOCS_ASSET_DIR, "inventory_demand_process.png")) + println("Saved inventory_demand_process.png") + + return nothing +end + +""" + relaxed_results() -> (results, timing, base_stock_level, bound) + +Load all relaxed-problem comparison data. + +# Examples +```julia +results, timing, base_stock_level, bound = relaxed_results() +``` +""" +function relaxed_results() + # Load all relaxed operational-cost samples. + dr_costs = load_costs("relaxed", "dr") + sddp_costs = load_costs("relaxed", "sddp") + base_stock_costs = load_costs("relaxed", "basestock") + random_costs = load_costs("relaxed", "random") + + # Load optional tuned-variant costs. 
+ lstm_costs = optional_costs("relaxed_lstm", "dr") + hp_costs = optional_costs("relaxed_hp", "dr") + lstm_hp_costs = optional_costs("relaxed_lstm_hp", "dr") + + # Load scalar baseline metadata. + base_stock_level = read_scalar(resolve_file("relaxed_basestock_S_star.txt")) + sddp_bound = read_scalar(resolve_file("relaxed_sddp_bound.txt")) + + # Build display records in table order. + results = [ + MethodResult("TS-DDR (trained)", dr_costs), + ] + + # Insert tuned variants after the baseline feedforward. + !isnothing(hp_costs) && + push!(results, MethodResult("TS-DDR Relaxed (HighPenalty)", hp_costs)) + !isnothing(lstm_costs) && + push!(results, MethodResult("TS-DDR Relaxed (LSTM)", lstm_costs)) + !isnothing(lstm_hp_costs) && + push!(results, MethodResult("TS-DDR Relaxed (LSTM+HP)", lstm_hp_costs)) + + # Append non-TS-DDR baselines. + push!(results, MethodResult("SDDP (PAR)", sddp_costs)) + push!(results, MethodResult("Base-stock (S*=$(round(Int, base_stock_level)))", base_stock_costs)) + push!(results, MethodResult("Random (untrained)", random_costs)) + + # Collect timing tags for all present variants. + timing_tags = ["relaxed"] + !isnothing(resolve_file_optional("relaxed_lstm_dr_timing.csv")) && + push!(timing_tags, "relaxed_lstm") + !isnothing(resolve_file_optional("relaxed_hp_dr_timing.csv")) && + push!(timing_tags, "relaxed_hp") + !isnothing(resolve_file_optional("relaxed_lstm_hp_dr_timing.csv")) && + push!(timing_tags, "relaxed_lstm_hp") + + return results, load_timing(timing_tags), base_stock_level, sddp_bound +end + +""" + integer_results() -> (results, timing, base_stock_level, bound) + +Load all integer-problem comparison data. + +# Examples +```julia +results, timing, base_stock_level, bound = integer_results() +``` +""" +function integer_results() + # Load required integer operational-cost samples. 
+ fixed_discrete_costs = load_costs("integer", "dr") + continuous_relaxation_costs = load_costs("integer_cr", "dr") + sddp_mip_forward_costs = load_costs("integer", "sddp") + sddp_lp_relaxation_costs = load_costs("integer", "sddp_lp") + base_stock_costs = load_costs("integer", "basestock") + random_costs = load_costs("integer", "random") + + # Optional variants — load only if the result file exists. + mixed_gradient_costs = optional_costs("integer_sf", "dr") + hp_costs = optional_costs("integer_hp", "dr") + lstm_costs = optional_costs("integer_lstm", "dr") + lstm_sf_costs = optional_costs("integer_lstm_sf", "dr") + + # Load scalar baseline metadata. + base_stock_level = read_scalar(resolve_file("integer_basestock_S_star.txt")) + sddp_bound = read_scalar(resolve_file("integer_sddp_bound.txt")) + + # Build the method list: original TS-DDR variants first. + results = [ + MethodResult("TS-DDR (FixedDiscrete)", fixed_discrete_costs), + MethodResult("TS-DDR (ContRelax)", continuous_relaxation_costs), + ] + + # Insert optional TS-DDR variants in logical order. + !isnothing(mixed_gradient_costs) && + push!(results, MethodResult("TS-DDR (MixedGrad)", mixed_gradient_costs)) + !isnothing(hp_costs) && + push!(results, MethodResult("TS-DDR (HighPenalty)", hp_costs)) + !isnothing(lstm_costs) && + push!(results, MethodResult("TS-DDR (LSTM)", lstm_costs)) + !isnothing(lstm_sf_costs) && + push!(results, MethodResult("TS-DDR (LSTM+SF)", lstm_sf_costs)) + + # Append non-TS-DDR baselines. + push!(results, MethodResult("SDDP (MIP fwd)", sddp_mip_forward_costs)) + push!(results, MethodResult("SDDP (LP relax)", sddp_lp_relaxation_costs)) + push!(results, MethodResult("Base-stock (S*=$(round(Int, base_stock_level)))", base_stock_costs)) + push!(results, MethodResult("Random (untrained)", random_costs)) + + # Collect timing tags for all present variants. 
+ timing_tags = ["integer", "integer_cr"] + for tag in ["integer_sf", "integer_hp", "integer_lstm", "integer_lstm_sf"] + !isnothing(resolve_file_optional("$(tag)_dr_timing.csv")) && + push!(timing_tags, tag) + end + + return results, load_timing(timing_tags), base_stock_level, sddp_bound +end + +""" + integer_curve_specs() + +Return training-curve plot specs for the integer comparison. + +# Examples +```julia +curves = integer_curve_specs() +``` +""" +function integer_curve_specs() + # FixedDiscrete and ContRelax are always part of the integer benchmark. + specs = [ + ("integer", "FixedDiscrete", :steelblue), + ("integer_cr", "ContRelax", :royalblue), + ] + + # Optional variants appear only when their training curve exists. + optional = [ + ("integer_sf", "MixedGrad", :darkorange), + ("integer_hp", "HighPenalty", :mediumpurple), + ("integer_lstm", "LSTM", :coral), + ("integer_lstm_sf", "LSTM+SF", :teal), + ] + + for spec in optional + !isnothing(resolve_file_optional("$(spec[1])_training_curve.csv")) && + push!(specs, spec) + end + + return specs +end + +""" + relaxed_curve_specs() + +Return training-curve plot specs for the relaxed comparison. + +# Examples +```julia +curves = relaxed_curve_specs() +``` +""" +function relaxed_curve_specs() + # Baseline feedforward is always present. + specs = [("relaxed", "Feedforward", :steelblue)] + + # Optional tuned variants. + optional = [ + ("relaxed_hp", "HighPenalty", :mediumpurple), + ("relaxed_lstm", "LSTM", :coral), + ("relaxed_lstm_hp", "LSTM+HP", :teal), + ] + + for spec in optional + !isnothing(resolve_file_optional("$(spec[1])_training_curve.csv")) && + push!(specs, spec) + end + + return specs +end + +""" + run_relaxed_comparison() -> Nothing + +Print and plot the relaxed continuous comparison. 
+ +# Examples +```julia +run_relaxed_comparison() +``` +""" +function run_relaxed_comparison() + println("\n" * "=" ^ 60) + println("SECTION 1: Relaxed (continuous) comparison") + println("=" ^ 60) + + # Load data, print table, and save the documentation figure. + results, timing, base_stock_level, bound = relaxed_results() + print_table(results, timing, bound) + + figure = make_summary_plot( + "Relaxed (continuous) problem"; + results = results, + base_stock_level = base_stock_level, + sddp_tag = "relaxed", + dr_tag = "relaxed", + curve_specs = relaxed_curve_specs(), + ) + + savefig(figure, joinpath(DOCS_ASSET_DIR, "inventory_relaxed_results.png")) + println("Saved inventory_relaxed_results.png") + + return nothing +end + +""" + run_integer_comparison() -> Nothing + +Print and plot the integer MIP comparison. + +# Examples +```julia +run_integer_comparison() +``` +""" +function run_integer_comparison() + println("\n" * "=" ^ 60) + println("SECTION 2: Integer (MIP) comparison") + println("=" ^ 60) + + # Load data, print table, and save the documentation figure. + results, timing, base_stock_level, bound = integer_results() + print_table(results, timing, bound) + + figure = make_summary_plot( + "Integer (MIP) problem"; + results = results, + base_stock_level = base_stock_level, + sddp_tag = "integer", + dr_tag = "integer", + curve_specs = integer_curve_specs(), + ) + + savefig(figure, joinpath(DOCS_ASSET_DIR, "inventory_integer_results.png")) + println("Saved inventory_integer_results.png") + + return nothing +end + +""" + main() -> Nothing + +Run every inventory-result comparison. + +# Examples +```julia +main() +``` +""" +function main() + # Regenerate the demand-process figure before method comparisons. + plot_demand_process() + + # Print and plot the relaxed benchmark. + run_relaxed_comparison() + + # Print and plot the integer benchmark. 
+ run_integer_comparison() + + println("\nAll assets saved to: $(relpath(DOCS_ASSET_DIR, @__DIR__))") + + return nothing +end + +# Run only when invoked as a script. +if abspath(PROGRAM_FILE) == @__FILE__ + main() +end diff --git a/examples/inventory_control/evaluate_inventory.jl b/examples/inventory_control/evaluate_inventory.jl index 9fd1f6a..a0e42f2 100644 --- a/examples/inventory_control/evaluate_inventory.jl +++ b/examples/inventory_control/evaluate_inventory.jl @@ -2,8 +2,16 @@ Evaluate non-neural baselines for the inventory control problem. Evaluates base-stock and random policies for both relaxed and integer cases. + +Pass a run ID as the first CLI argument, or omit to generate one from the +current timestamp (`yyyymmdd_HHMMSS`): + +```bash +julia --project=. evaluate_inventory.jl 20260619_231417 +``` """ +using Dates using DecisionRules using JuMP using Flux @@ -12,6 +20,9 @@ using Random, Statistics include(joinpath(@__DIR__, "build_inventory_problem.jl")) +const RUN_ID = isempty(ARGS) ? + Dates.format(Dates.now(), "yyyymmdd_HHMMSS") : ARGS[1] + const N_EVAL = 300 # ═══════════════════════════════════════════════════════════════════════════════ @@ -156,7 +167,7 @@ function evaluate_baselines(; tag::String, integer::Bool) println(" Mean: $(round(mean(rand_costs), digits=1)) ± $(round(std(rand_costs), digits=1))") # --- Save results --- - result_dir = joinpath(@__DIR__, "results") + result_dir = joinpath(@__DIR__, "results", RUN_ID) mkpath(result_dir) df_bs_traj = DataFrame(bs_traj, [Symbol("t$i") for i in 0:INVENTORY_T]) @@ -184,4 +195,4 @@ end evaluate_baselines(tag="relaxed", integer=false) println() evaluate_baselines(tag="integer", integer=true) -println("\nAll baseline results saved to results/") +println("\nAll baseline results saved to results/$RUN_ID/") diff --git a/examples/inventory_control/solve_sddp.jl b/examples/inventory_control/solve_sddp.jl index 3760612..c3058bf 100644 --- a/examples/inventory_control/solve_sddp.jl +++ 
b/examples/inventory_control/solve_sddp.jl @@ -7,9 +7,19 @@ where μ_t, α, and Ω are fitted from simulated demand paths. Two cases: 1. Relaxed: no binary z, SDDP is near-optimal for convex problems -2. Integer: z ∈ [0,1] relaxation + integer rounding at rollout +2. Integer: AlternativeForwardPass — forward pass solves true MIP (z ∈ {0,1}), + backward pass uses LP relaxation (z ∈ [0,1]) to compute cuts with valid duals. + Both models share the same PAR(1) demand structure. + +Pass a run ID as the first CLI argument, or omit to generate one from the +current timestamp (`yyyymmdd_HHMMSS`): + +```bash +julia --project=. solve_sddp.jl 20260619_231417 +``` """ +using Dates using SDDP using JuMP using HiGHS @@ -19,6 +29,9 @@ using Random include(joinpath(@__DIR__, "build_inventory_problem.jl")) +const RUN_ID = isempty(ARGS) ? + Dates.format(Dates.now(), "yyyymmdd_HHMMSS") : ARGS[1] + const N_SIM = 300 const ITERATION_LIMIT = 500 @@ -66,7 +79,8 @@ println(" Ω (innovations, $(length(par_omega)) points): $(round.(par_omega, di # ═══════════════════════════════════════════════════════════════════════════════ # Build SDDP model with PAR(1) demand approximation # ═══════════════════════════════════════════════════════════════════════════════ -function build_sddp_model(; integer::Bool=false, mu=par_mu, alpha=par_alpha, omega=par_omega) +function build_sddp_model(; integer::Bool=false, binary::Bool=false, + mu=par_mu, alpha=par_alpha, omega=par_omega) d_lag_init = mu[1] SDDP.LinearPolicyGraph( stages=2 * INVENTORY_T, @@ -81,7 +95,11 @@ function build_sddp_model(; integer::Bool=false, mu=par_mu, alpha=par_alpha, ome if isodd(stage) @variable(sp, 0 <= q <= INVENTORY_Q_MAX) if integer - @variable(sp, 0 <= z <= 1) + if binary + @variable(sp, z, Bin) + else + @variable(sp, 0 <= z <= 1) + end @constraint(sp, q <= INVENTORY_Q_MAX * z) @stageobjective(sp, INVENTORY_K * z + INVENTORY_C * q) else @@ -171,7 +189,7 @@ function rollout_sddp(model, n_sim; integer_round::Bool=false, mu=par_mu) return costs, 
traj_inv end -result_dir = joinpath(@__DIR__, "results") +result_dir = joinpath(@__DIR__, "results", RUN_ID) mkpath(result_dir) # ═══════════════════════════════════════════════════════════════════════════════ @@ -219,44 +237,97 @@ open(joinpath(result_dir, "relaxed_sddp_bound.txt"), "w") do io end # ═══════════════════════════════════════════════════════════════════════════════ -# Section 2: Integer SDDP (LP relaxation + integer rounding rollout) +# Section 2: Integer SDDP (MIP forward pass + LP cuts via AlternativeForwardPass) +# +# Two-phase training: +# Phase 1 — LP forward + LP backward until convergence (warm-start cuts) +# Phase 2 — MIP forward + LP backward (refine at MIP-realistic trial points) +# Rollout on true MIP model (z ∈ {0,1}) with all accumulated cuts. # ═══════════════════════════════════════════════════════════════════════════════ println("\n" * "=" ^ 60) -println("SECTION 2: SDDP — Integer (LP relax + integer rollout)") +println("SECTION 2: SDDP — Integer (MIP forward + LP cuts)") println("=" ^ 60) -model_integer = build_sddp_model(; integer=true) -println("Training integer-relaxed SDDP ($(2*INVENTORY_T) stages)...") +model_lp = build_sddp_model(; integer=true, binary=false) +model_mip = build_sddp_model(; integer=true, binary=true) + +# --- Phase 1: LP warm-start --- +println("Phase 1: LP warm-start ($(2*INVENTORY_T) stages)...") sddp_int_start = time() SDDP.train( - model_integer; + model_lp; duality_handler=SDDP.ContinuousConicDuality(), iteration_limit=ITERATION_LIMIT, stopping_rules=[SDDP.BoundStalling(100, 1e-3)], print_level=1, ) +lp_bound = SDDP.calculate_bound(model_lp) +println(" LP warm-start bound: $(round(lp_bound, digits=1))") + +phase1_log = training_log_dataframe(model_lp) +sddp_lp_seconds = time() - sddp_int_start + +# --- LP rollout (default SDDP baseline: LP decisions + integer rounding) --- +println("LP rollout (default SDDP) on $N_SIM fresh scenarios...") +Random.seed!(555) +lp_eval_start = time() +lp_costs, lp_traj = 
rollout_sddp(model_lp, N_SIM; integer_round=true) +lp_eval_seconds = time() - lp_eval_start +μ_lp = mean(lp_costs) +println(" Default SDDP (LP rollout) — mean: $(round(μ_lp, digits=1)) ± $(round(std(lp_costs), digits=1))") + +CSV.write(joinpath(result_dir, "integer_sddp_lp_costs.csv"), DataFrame(operational_cost=lp_costs)) +CSV.write(joinpath(result_dir, "integer_sddp_lp_trajectories.csv"), + DataFrame(lp_traj, [Symbol("t$i") for i in 0:INVENTORY_T])) +CSV.write(joinpath(result_dir, "integer_sddp_lp_training_log.csv"), phase1_log) +CSV.write(joinpath(result_dir, "integer_sddp_lp_timing.csv"), + DataFrame(method=["SDDP (LP relax)"], fit_seconds=[0.0], + eval_seconds=[sddp_lp_seconds], n_eval=[N_SIM])) + +cuts_file = joinpath(result_dir, "integer_lp_cuts.json") +SDDP.write_cuts_to_file(model_lp, cuts_file) +SDDP.read_cuts_from_file(model_mip, cuts_file) +println(" Exported LP cuts to MIP model") + +# --- Phase 2: MIP forward + LP backward --- +println("\nPhase 2: AlternativeForwardPass — MIP forward + LP cuts...") +println(" Forward pass: true MIP (z ∈ {0,1})") +println(" Backward pass: LP relaxation (z ∈ [0,1]) for cuts") +SDDP.train( + model_lp; + forward_pass=SDDP.AlternativeForwardPass(model_mip), + post_iteration_callback=SDDP.AlternativePostIterationCallback(model_mip), + duality_handler=SDDP.ContinuousConicDuality(), + iteration_limit=ITERATION_LIMIT, + add_to_existing_cuts=true, + print_level=1, +) sddp_int_seconds = time() - sddp_int_start -int_bound = SDDP.calculate_bound(model_integer) -println("\nInteger-relaxed SDDP bound: $(round(int_bound, digits=1))") +phase2_log = training_log_dataframe(model_lp) +phase2_log.iteration .+= maximum(phase1_log.iteration) +combined_log = vcat(phase1_log, phase2_log) + +int_bound = SDDP.calculate_bound(model_lp) +println("\nLP relaxation bound (after both phases): $(round(int_bound, digits=1))") -println("Integer rollout on $N_SIM fresh scenarios (TRUE demand)...") +println("MIP rollout on $N_SIM fresh scenarios (TRUE 
demand)...") Random.seed!(555) int_eval_start = time() -int_costs, int_traj = rollout_sddp(model_integer, N_SIM; integer_round=true) +int_costs, int_traj = rollout_sddp(model_mip, N_SIM; integer_round=true) int_eval_seconds = time() - int_eval_start μ_i = mean(int_costs) σ_i = std(int_costs) -println("Integer SDDP — mean: $(round(μ_i, digits=1)) ± $(round(σ_i, digits=1))") +println("Integer SDDP (MIP fwd) — mean: $(round(μ_i, digits=1)) ± $(round(σ_i, digits=1))") println("Gap to LP bound: $(round(100 * (μ_i - int_bound) / μ_i, digits=1))%") CSV.write(joinpath(result_dir, "integer_sddp_costs.csv"), DataFrame(operational_cost=int_costs)) CSV.write(joinpath(result_dir, "integer_sddp_trajectories.csv"), DataFrame(int_traj, [Symbol("t$i") for i in 0:INVENTORY_T])) -CSV.write(joinpath(result_dir, "integer_sddp_training_log.csv"), training_log_dataframe(model_integer)) +CSV.write(joinpath(result_dir, "integer_sddp_training_log.csv"), combined_log) CSV.write(joinpath(result_dir, "integer_sddp_timing.csv"), - DataFrame(method=["SDDP.jl integer rollout"], fit_seconds=[0.0], + DataFrame(method=["SDDP (MIP fwd)"], fit_seconds=[0.0], eval_seconds=[sddp_int_seconds], n_eval=[N_SIM])) open(joinpath(result_dir, "integer_sddp_bound.txt"), "w") do io println(io, int_bound) diff --git a/examples/inventory_control/train_dr_inventory.jl b/examples/inventory_control/train_dr_inventory.jl index eb2c387..7777e1b 100644 --- a/examples/inventory_control/train_dr_inventory.jl +++ b/examples/inventory_control/train_dr_inventory.jl @@ -1,207 +1,1149 @@ """ -Train TS-DDR policies for the inventory control problem. +Train TS-DDR policies for the inventory-control benchmark. -Trains two policies: -1. Relaxed (continuous LP subproblems, standard LP duals) -2. Integer (MIP subproblems, FixedDiscreteIntegerStrategy) — uses more - batches and lower learning rate for stable convergence. 
+The benchmark compares target-state decision-rule variants across two axes: + +**Gradient estimator** — how ∇_θ Q(w; θ) is computed for integer models: +1. fixed-discrete local duals (solve MIP → fix z → re-solve LP → read duals); +2. continuous-relaxation duals (relax z ∈ {0,1} → LP → read duals); +3. mixed gradient (α · dual + (1-α) · score-function REINFORCE correction). + +**Policy architecture** — which function class maps observations to targets: +- `ExAnteInventoryPolicy`: feedforward MLP, sigmoid output; +- `LSTMExAntePolicy`: recurrent encoder on lagged demand, affine output. + +Each variant is independent and can be run via: + + julia --project=. train_dr_inventory.jl <variant> + +where `<variant>` is one of `relaxed`, `integer`, `integer_cr`, `integer_sf`, +`integer_hp`, `integer_lstm`, `integer_lstm_sf`. """ +using CSV +using DataFrames +using Dates using DecisionRules -using JuMP using Flux using JLD2 -using CSV, DataFrames -using Random, Statistics +using JuMP +using Random +using Statistics include(joinpath(@__DIR__, "build_inventory_problem.jl")) +# The script keeps generated models and CSV files out of the source directory. +const EXAMPLE_DIR = @__DIR__ + +# Each run writes to results/<RUN_ID>/ so concurrent or successive runs never +# clobber each other. RUN_ID is set by launch_all.sh for batch submissions or +# generated from the current timestamp (`yyyymmdd_HHMMSS`) for standalone runs. +const RUN_ID = get(ENV, "RUN_ID", Dates.format(Dates.now(), "yyyymmdd_HHMMSS")) +const RESULT_DIR = joinpath(EXAMPLE_DIR, "results", RUN_ID) +const MODEL_DIR = joinpath(EXAMPLE_DIR, "models", RUN_ID) + +# Create output directories before any training run tries to write into them. +mkpath(MODEL_DIR) +mkpath(RESULT_DIR) + +println("Run ID: $RUN_ID") +println("Results → $RESULT_DIR") + +# Use one fixed training sample size for every TS-DDR variant. 
const N_TRAIN_SCENARIOS = 50 -const N_TEST = 300 - -example_dir = @__DIR__ -model_dir = joinpath(example_dir, "models") -result_dir = joinpath(example_dir, "results") -mkpath(model_dir) -mkpath(result_dir) - -# ═══════════════════════════════════════════════════════════════════════════════ -# Rollout helper -# ═══════════════════════════════════════════════════════════════════════════════ -function rollout_policy(policy, subproblems, spi, spo, unc_eval, init_state; - n_test=N_TEST, seed=555, integer=true) + +# Use one held-out evaluation size for every reported cost distribution. +const N_TEST_SCENARIOS = 300 + +""" + InventoryTrainingVariant + +Configuration for one TS-DDR inventory training run. + +Mathematically, every variant trains a policy ``\\pi_\\theta`` by stochastic +gradient descent on sampled deterministic-equivalent objectives +``Q(w; \\theta)``. The fields choose the model family and the gradient estimator: + +- `integer`: whether the JuMP model contains binary setup variables ``z_t``; +- `training_integer_strategy`: how local dual information is read for + ``\\nabla_\\theta Q(w; \\theta)`` when the model is mixed-integer; +- `score_function`: optional Monte Carlo correction using perturbed target + rollouts. + +# Fields +- `tag::String`: prefix used for saved models and CSV files. +- `integer::Bool`: whether to build the fixed-cost MIP formulation. +- `num_batches::Int`: number of SGD updates. +- `train_per_batch::Int`: sampled trajectories per SGD update. +- `learning_rate::Float64`: Adam learning rate. +- `warmup_batches::Int`: last batch of the low target-penalty phase. +- `training_integer_strategy::AbstractIntegerStrategy`: dual-path strategy. +- `score_function::Union{Nothing,ScoreFunctionConfig,ScoreFunctionSchedule}`: + optional score-function estimator. +- `penalty::Float64`: target-deficit penalty λ. +- `policy_builder::Function`: zero-argument callable returning a fresh policy. 
+- `penalty_schedule_fn::Function`: `(variant) -> schedule` for target-penalty + multiplier ramp. + +The 8-argument constructor defaults `penalty = INVENTORY_PENALTY`, +`policy_builder = () -> build_exante_policy(; seed = 2024)`, and +`penalty_schedule_fn = penalty_schedule_for`. + +# Examples +```julia +variant = InventoryTrainingVariant( + "integer", + true, + 800, + 10, + 8.0e-4, + 120, + FixedDiscreteIntegerStrategy(), + nothing, +) +``` +""" +struct InventoryTrainingVariant + tag::String + integer::Bool + num_batches::Int + train_per_batch::Int + learning_rate::Float64 + warmup_batches::Int + training_integer_strategy::AbstractIntegerStrategy + score_function::Union{Nothing,ScoreFunctionConfig,ScoreFunctionSchedule} + penalty::Float64 + policy_builder::Function + penalty_schedule_fn::Function +end + +function InventoryTrainingVariant( + tag, integer, num_batches, train_per_batch, learning_rate, warmup_batches, + training_integer_strategy, score_function, +) + return InventoryTrainingVariant( + tag, integer, num_batches, train_per_batch, learning_rate, warmup_batches, + training_integer_strategy, score_function, + INVENTORY_PENALTY, + () -> build_exante_policy(; seed = 2024), + penalty_schedule_for, + ) +end + +""" + penalty_schedule_for(variant::InventoryTrainingVariant) + +Return the two-phase target-penalty schedule used by one inventory variant. + +The target penalty is multiplied by `0.4` during warmup and by `1.0` +afterward: + +```math +m_k = +\\begin{cases} +0.4, & 1 \\le k \\le k_{warm}, \\\\ +1.0, & k_{warm} < k \\le K. +\\end{cases} +``` + +# Arguments +- `variant::InventoryTrainingVariant`: training configuration. + +# Examples +```julia +schedule = penalty_schedule_for(variant) +``` +""" +function penalty_schedule_for(variant::InventoryTrainingVariant) + # The first tuple covers the gentler warmup phase. + warmup_phase = (1, variant.warmup_batches, 0.4) + + # The second tuple restores the nominal target penalty. 
+ full_penalty_phase = ( + variant.warmup_batches + 1, + variant.num_batches, + 1.0, + ) + + return [warmup_phase, full_penalty_phase] +end + +""" + method_label(variant::InventoryTrainingVariant) -> String + +Return the table label for one TS-DDR variant. + +# Arguments +- `variant::InventoryTrainingVariant`: training configuration. + +# Examples +```julia +label = method_label(variant) +``` +""" +function method_label(variant::InventoryTrainingVariant) + tag = variant.tag + + # --- Relaxed tuned variants --- + tag == "relaxed_lstm" && return "TS-DDR Relaxed (LSTM)" + tag == "relaxed_hp" && return "TS-DDR Relaxed (HighPenalty)" + tag == "relaxed_lstm_hp" && return "TS-DDR Relaxed (LSTM+HP)" + + # --- Integer tuned variants --- + tag == "integer_lstm" && return "TS-DDR (LSTM)" + tag == "integer_lstm_sf" && return "TS-DDR (LSTM+SF)" + tag == "integer_hp" && return "TS-DDR (HighPenalty)" + + # --- Original variants --- + !isnothing(variant.score_function) && return "TS-DDR (MixedGrad)" + + variant.training_integer_strategy isa ContinuousRelaxationIntegerStrategy && + return "TS-DDR (ContRelax)" + + variant.training_integer_strategy isa FixedDiscreteIntegerStrategy && + return "TS-DDR (FixedDiscrete)" + + return "TS-DDR (trained)" +end + +""" + operational_stage_cost(model::JuMP.Model, integer::Bool) -> Float64 + +Return the realized inventory cost of one solved stage model. + +For the integer formulation, the operational cost is + +```math +K z_t + c q_t + h \\max(s_t,0) + p \\max(-s_t,0). +``` + +For the relaxed formulation, the setup term ``K z_t`` is absent. + +# Arguments +- `model::JuMP.Model`: solved inventory stage model. +- `integer::Bool`: whether `model` contains the binary setup variable `z`. + +# Examples +```julia +cost = operational_stage_cost(stage_model, true) +``` +""" +function operational_stage_cost(model::JuMP.Model, integer::Bool) + # The order quantity is common to both formulations. 
+ order_quantity = value(model[:q]) + + # Net inventory after demand determines holding or backlog cost. + next_inventory = value(model[:s_out]) + + # Positive inventory pays holding cost. + holding_cost = INVENTORY_H * max(next_inventory, 0.0) + + # Negative inventory pays backlog cost. + backlog_cost = INVENTORY_P * max(-next_inventory, 0.0) + + # Continuous formulations pay only variable ordering, holding, and backlog. + variable_cost = INVENTORY_C * order_quantity + holding_cost + backlog_cost + + if integer + # MIP solves return an integral setup value; round removes solver noise. + setup_value = round(value(model[:z])) + + return INVENTORY_K * setup_value + variable_cost + end + + return variable_cost +end + +""" + rollout_policy(policy, subproblems, state_params_in, state_params_out, + uncertainty_sampler, initial_state; kwargs...) + +Evaluate a trained policy by stage-wise rollout on held-out trajectories. + +At each stage the policy proposes a target state. The stage model then solves + +```math +\\min f_t(x_t,y_t) + \\lambda |x_t^{mid} - \\hat{x}_t| +``` + +subject to the inventory transition and capacity constraints. Only the +operational term ``f_t`` is reported as cost, because target slack is a training +device rather than a deployed cost. + +# Arguments +- `policy`: Flux-compatible target-state policy. +- `subproblems`: one solved-and-reused JuMP model per stage. +- `state_params_in`: input-state parameters for each stage. +- `state_params_out`: `(target_parameter, realized_state_variable)` pairs. +- `uncertainty_sampler`: sampler for held-out demand trajectories. +- `initial_state`: inventory state entering stage 1. + +# Keywords +- `num_scenarios::Int`: number of held-out rollouts. +- `seed::Int`: random seed for evaluation trajectories. +- `integer::Bool`: whether to use the MIP operational cost formula. 
+ +# Examples +```julia +costs, inventory, setup, order = rollout_policy( + policy, + subproblems, + state_params_in, + state_params_out, + uncertainty_sampler, + initial_state; + integer = true, +) +``` +""" +function rollout_policy( + policy, + subproblems, + state_params_in, + state_params_out, + uncertainty_sampler, + initial_state; + num_scenarios::Int = N_TEST_SCENARIOS, + seed::Int = 555, + integer::Bool = true, +) + # Fix the evaluation sample so variants see the same demand distribution. Random.seed!(seed) - traj_inv = Matrix{Float64}(undef, n_test, INVENTORY_T + 1) - traj_z = Matrix{Float64}(undef, n_test, INVENTORY_T) - traj_q = Matrix{Float64}(undef, n_test, INVENTORY_T) - op_costs = Vector{Float64}(undef, n_test) - - for s in 1:n_test - unc_sample = sample(unc_eval) - state = Float64.(init_state) - traj_inv[s, 1] = state[1] - op_costs[s] = 0.0 - - for t in 1:INVENTORY_T - d_val = unc_sample[t][1][2] - target = Float64.(policy(Float32[d_val, state...])) - - for i in eachindex(spi[t]) - set_parameter_value(spi[t][i], state[i]) - end - for (param, value) in unc_sample[t] - set_parameter_value(param, value) + + # Store net-inventory trajectories, including the initial inventory at t=0. + inventory_paths = Matrix{Float64}(undef, num_scenarios, INVENTORY_T + 1) + + # Store setup indicators for integer runs and order indicators for relaxed runs. + setup_paths = Matrix{Float64}(undef, num_scenarios, INVENTORY_T) + + # Store order quantities for diagnostics. + order_paths = Matrix{Float64}(undef, num_scenarios, INVENTORY_T) + + # Store operational cost for each scenario. + operational_costs = Vector{Float64}(undef, num_scenarios) + + for scenario in 1:num_scenarios + # Reset recurrent state for LSTM policies. + Flux.reset!(policy) + + # Draw one demand path for this rollout. + uncertainty_sample = sample(uncertainty_sampler) + + # Start from the benchmark initial state. + state = Float64.(initial_state) + + # Record inventory at t=0. 
+ inventory_paths[scenario, 1] = state[1] + + # Reset the scenario cost accumulator. + operational_costs[scenario] = 0.0 + + for stage in 1:INVENTORY_T + # Demand is the single uncertainty value in this inventory model. + demand_value = uncertainty_sample[stage][1][2] + + # The policy maps observed demand plus current state to a target state. + target = Float64.(policy(Float32[demand_value, state...])) + + # Input-state parameters receive the realized state entering this stage. + for index in eachindex(state_params_in[stage]) + set_parameter_value(state_params_in[stage][index], state[index]) end - for i in eachindex(spo[t]) - set_parameter_value(spo[t][i][1], target[i]) + + # Uncertainty parameters receive this stage's realized demand. + for (parameter, value) in uncertainty_sample[stage] + set_parameter_value(parameter, value) end - optimize!(subproblems[t]) - - q_val = value(subproblems[t][:q]) - s_val = value(subproblems[t][:s_out]) - - if integer - z_val = round(value(subproblems[t][:z])) - op_costs[s] += INVENTORY_K * z_val + INVENTORY_C * q_val + - INVENTORY_H * max(s_val, 0.0) + - INVENTORY_P * max(-s_val, 0.0) - traj_z[s, t] = z_val - else - op_costs[s] += INVENTORY_C * q_val + - INVENTORY_H * max(s_val, 0.0) + - INVENTORY_P * max(-s_val, 0.0) - traj_z[s, t] = q_val > 1e-7 ? 1.0 : 0.0 + + # Target parameters receive the policy output. + for index in eachindex(state_params_out[stage]) + set_parameter_value(state_params_out[stage][index][1], target[index]) end - traj_q[s, t] = q_val - traj_inv[s, t+1] = s_val - state = [s_val, d_val, state[2]] + + # Solve the deployment stage exactly as modeled. + optimize!(subproblems[stage]) + + # Read decisions and realized inventory from the solved stage. + order_quantity = value(subproblems[stage][:q]) + next_inventory = value(subproblems[stage][:s_out]) + + # Add the operational cost, excluding target-deficit penalty. 
+ operational_costs[scenario] += + operational_stage_cost(subproblems[stage], integer) + + # Store setup or order activity for later diagnostics. + setup_paths[scenario, stage] = integer ? + round(value(subproblems[stage][:z])) : + Float64(order_quantity > 1.0e-7) + + # Store order quantity and realized inventory trajectory. + order_paths[scenario, stage] = order_quantity + inventory_paths[scenario, stage + 1] = next_inventory + + # The next state carries current inventory and demand history. + state = [next_inventory, demand_value, state[2]] end end - return op_costs, traj_inv, traj_z, traj_q -end - -# ═══════════════════════════════════════════════════════════════════════════════ -# Train + evaluate helper -# ═══════════════════════════════════════════════════════════════════════════════ -function train_and_evaluate(; - tag::String, - integer::Bool, - num_batches::Int=400, - train_per_batch::Int=5, - lr::Float64=0.0015, - warmup_batches::Int=80, - int_strategy_override::Union{Nothing, AbstractIntegerStrategy}=nothing, -) - println("=" ^ 60) - println("Training TS-DDR — $(tag) (integer=$integer)") - println("=" ^ 60) - model_path = joinpath(model_dir, "$(tag)_policy.jld2") - curve_path = joinpath(result_dir, "$(tag)_training_curve.csv") + return operational_costs, inventory_paths, setup_paths, order_paths +end - println("Building deterministic equivalent...") - det_eq, spi_train, spo_train, unc_train, init_state = build_inventory_det_equivalent(; - num_scenarios=N_TRAIN_SCENARIOS, penalty=INVENTORY_PENALTY, seed=42, integer=integer) +""" + build_training_problem(variant::InventoryTrainingVariant) - println("Building stage-wise subproblems...") - eval_subproblems, spi_eval, spo_eval, unc_eval, _ = build_inventory_subproblems(; - num_scenarios=N_TEST, penalty=INVENTORY_PENALTY, seed=99, integer=integer) +Build the deterministic-equivalent model used for training a variant. 
- policy = build_exante_policy(; seed=2024) +# Arguments +- `variant::InventoryTrainingVariant`: variant configuration. - int_strategy = if int_strategy_override !== nothing - int_strategy_override - elseif integer - FixedDiscreteIntegerStrategy() - else - NoIntegerStrategy() - end +# Examples +```julia +det_eq, state_in, state_out, sampler, initial_state = + build_training_problem(variant) +``` +""" +function build_training_problem(variant::InventoryTrainingVariant) + # Training uses a deterministic equivalent so target-dual gradients are coupled. + return build_inventory_det_equivalent(; + num_scenarios = N_TRAIN_SCENARIOS, + penalty = variant.penalty, + seed = 42, + integer = variant.integer, + ) +end + +""" + build_evaluation_problem(variant::InventoryTrainingVariant) + +Build the stage-wise models used for held-out rollout evaluation. +# Arguments +- `variant::InventoryTrainingVariant`: variant configuration. + +# Examples +```julia +subproblems, state_in, state_out, sampler, initial_state = + build_evaluation_problem(variant) +``` +""" +function build_evaluation_problem(variant::InventoryTrainingVariant) + # Evaluation uses stage-wise deployment semantics, not the training DE solve. + return build_inventory_subproblems(; + num_scenarios = N_TEST_SCENARIOS, + penalty = variant.penalty, + seed = 99, + integer = variant.integer, + ) +end + +""" + estimate_initial_loss(policy, det_eq, state_params_in, state_params_out, + uncertainty_sampler, initial_state, variant) + +Estimate pre-training deterministic-equivalent cost for checkpoint initialization. + +# Arguments +- `policy`: policy evaluated before training. +- `det_eq::JuMP.Model`: deterministic-equivalent training model. +- `state_params_in`: input-state parameters. +- `state_params_out`: target-state parameters. +- `uncertainty_sampler`: training sampler. +- `initial_state`: state entering stage 1. +- `variant::InventoryTrainingVariant`: training configuration. 
+ +# Examples +```julia +loss = estimate_initial_loss(policy, det_eq, spi, spo, sampler, x0, variant) +``` +""" +function estimate_initial_loss( + policy, + det_eq::JuMP.Model, + state_params_in, + state_params_out, + uncertainty_sampler, + initial_state, + variant::InventoryTrainingVariant, +) + # Use a small fixed sample only to seed SaveBest with a finite baseline. Random.seed!(111) - pre_costs = [ - let unc = sample(unc_train) + + return mean( + let uncertainty_sample = sample(uncertainty_sampler) + # Deterministic-equivalent simulation needs the full target trajectory. + target_states = simulate_states(initial_state, uncertainty_sample, policy) + simulate_multistage( - det_eq, spi_train, spo_train, unc, - simulate_states(init_state, unc, policy); - integer_strategy=int_strategy, + det_eq, + state_params_in, + state_params_out, + uncertainty_sample, + target_states; + integer_strategy = variant.training_integer_strategy, ) end for _ in 1:12 - ] - pre_mean = mean(pre_costs) - println("Pre-training mean cost: $(round(pre_mean, digits=2))") + ) +end - save_best = SaveBest(pre_mean, model_path) - training_log = DataFrame(batch=Int[], loss=Float64[]) +""" + train_variant!(policy, variant, det_eq, state_params_in, state_params_out, + uncertainty_sampler, initial_state, model_path, curve_path; + eval_subproblems, eval_state_in, eval_state_out, eval_sampler, + eval_every, eval_scenarios) + +Train one policy and write its training curve. + +The training curve records the true out-of-sample stage-wise rollout cost +(not the deterministic-equivalent training objective) every `eval_every` +batches. This is the deployment-relevant metric and allows fair comparison +across integer strategies and with SDDP. + +# Arguments +- `policy`: mutable Flux policy updated in place. +- `variant::InventoryTrainingVariant`: training configuration. +- `det_eq::JuMP.Model`: deterministic-equivalent training model. +- `state_params_in`: input-state parameters. 
+- `state_params_out`: target-state parameters. +- `uncertainty_sampler`: training sampler. +- `initial_state`: state entering stage 1. +- `model_path::String`: path for the best model checkpoint. +- `curve_path::String`: path for the training-curve CSV. + +# Keywords +- `eval_subproblems`: stage-wise models for out-of-sample rollout evaluation. +- `eval_state_in`: input-state parameters for evaluation models. +- `eval_state_out`: output-state parameters for evaluation models. +- `eval_sampler`: uncertainty sampler for evaluation scenarios. +- `eval_every::Int = 25`: evaluate rollout cost every this many batches. +- `eval_scenarios::Int = 30`: number of scenarios per periodic evaluation. + +# Examples +```julia +train_variant!(policy, variant, det_eq, spi, spo, sampler, x0, + model_path, curve_path; + eval_subproblems = esub, eval_state_in = esi, + eval_state_out = eso, eval_sampler = esamp) +``` +""" +function train_variant!( + policy, + variant::InventoryTrainingVariant, + det_eq::JuMP.Model, + state_params_in, + state_params_out, + uncertainty_sampler, + initial_state, + model_path::String, + curve_path::String; + eval_subproblems, + eval_state_in, + eval_state_out, + eval_sampler, + eval_every::Int = 25, + eval_scenarios::Int = 30, +) + # Estimate a baseline loss before any optimizer step. + initial_loss = estimate_initial_loss( + policy, + det_eq, + state_params_in, + state_params_out, + uncertainty_sampler, + initial_state, + variant, + ) - println("Training ($num_batches batches × $train_per_batch scenarios, lr=$lr)...") + # SaveBest stores the best policy according to the recorded operational loss. + save_best = SaveBest(initial_loss, model_path) + + # Keep a small CSV trace for plots and sanity checks. 
+ training_log = DataFrame( + batch = Int[], loss = Float64[], rollout_cost = Float64[], + ) + + println("=" ^ 60) + println("Training TS-DDR [$(variant.tag)] integer=$(variant.integer)") + println(" $(variant.num_batches) batches x $(variant.train_per_batch) scenarios") + println(" learning rate: $(variant.learning_rate)") + println(" penalty: $(variant.penalty)") + println(" policy: $(typeof(policy))") + println(" training integer strategy: $(typeof(variant.training_integer_strategy))") + !isnothing(variant.score_function) && + println(" score function: $(typeof(variant.score_function))") + println(" pre-training cost: $(round(initial_loss, digits = 1))") + println(" rollout eval every $(eval_every) batches on $(eval_scenarios) scenarios") + println("=" ^ 60) + + # Fix optimizer randomness for repeatability. Random.seed!(2024) - train_start = time() - train_multistage( - policy, init_state, det_eq, spi_train, spo_train, unc_train; - num_batches=num_batches, - num_train_per_batch=train_per_batch, - optimizer=Flux.Adam(lr), - integer_strategy=int_strategy, - penalty_schedule=[(1, warmup_batches, 0.4), (warmup_batches+1, num_batches, 1.0)], - record=(sample_log, iter, model) -> begin - loss = isempty(sample_log.objectives_no_deficit) ? 
NaN : mean(sample_log.objectives_no_deficit) - push!(training_log, (batch=iter, loss=loss)) - if mod(iter, 20) == 0 || iter == 1 - println(" Batch $(lpad(iter, 3)) / $num_batches loss = $(round(loss, digits=2))") + + elapsed_seconds = @elapsed train_multistage( + policy, + initial_state, + det_eq, + state_params_in, + state_params_out, + uncertainty_sampler; + num_batches = variant.num_batches, + num_train_per_batch = variant.train_per_batch, + optimizer = Flux.Adam(variant.learning_rate), + integer_strategy = variant.training_integer_strategy, + penalty_schedule = variant.penalty_schedule_fn(variant), + score_function = variant.score_function, + record = (sample_log, iteration, current_policy) -> begin + loss = isempty(sample_log.objectives_no_deficit) ? + NaN : + mean(sample_log.objectives_no_deficit) + + # Periodically evaluate the true out-of-sample stage-wise + # rollout cost — the metric that matters at deployment. + rollout_cost = NaN + if iteration == 1 || mod(iteration, eval_every) == 0 + saved_state = hasproperty(current_policy, :state) ? + deepcopy(current_policy.state) : nothing + + eval_costs, _, _, _ = rollout_policy( + current_policy, + eval_subproblems, + eval_state_in, + eval_state_out, + eval_sampler, + initial_state; + num_scenarios = eval_scenarios, + seed = 777, + integer = variant.integer, + ) + rollout_cost = mean(eval_costs) + + if !isnothing(saved_state) + current_policy.state = saved_state + end + end + + push!(training_log, ( + batch = iteration, loss = loss, rollout_cost = rollout_cost, + )) + + if iteration == 1 || mod(iteration, 50) == 0 + cost_str = isnan(rollout_cost) ? "" : + " rollout=$(round(rollout_cost, digits = 1))" + println( + " batch $(lpad(iteration, 4))/$(variant.num_batches) " * + "loss=$(round(loss, digits = 1))$(cost_str)", + ) end - save_best(iter, model, loss) + + # Save best policy based on rollout cost when available, + # falling back to DE loss otherwise. + save_metric = isnan(rollout_cost) ? 
loss : rollout_cost + save_best(iteration, current_policy, save_metric) + return false end, ) - train_seconds = time() - train_start + + # Persist the training curve after training finishes. CSV.write(curve_path, training_log) - println("Training time: $(round(train_seconds, digits=1))s") - - model_state = JLD2.load(model_path, "model_state") - Flux.loadmodel!(policy, model_state) - - println("Evaluating on $N_TEST test scenarios...") - eval_start = time() - op_costs, traj_inv, traj_z, traj_q = rollout_policy( - policy, eval_subproblems, spi_eval, spo_eval, unc_eval, init_state; - n_test=N_TEST, seed=555, integer=integer) - eval_seconds = time() - eval_start - - df_inv = DataFrame(traj_inv, [Symbol("t$i") for i in 0:INVENTORY_T]) - df_inv[!, :scenario] = 1:N_TEST - CSV.write(joinpath(result_dir, "$(tag)_dr_trajectories.csv"), df_inv) - CSV.write(joinpath(result_dir, "$(tag)_dr_costs.csv"), - DataFrame(scenario=1:N_TEST, operational_cost=op_costs)) - method_name = if int_strategy isa ContinuousRelaxationIntegerStrategy - "TS-DDR (ContRelax)" - elseif int_strategy isa FixedDiscreteIntegerStrategy - "TS-DDR (FixedDiscrete)" + + println("Training time: $(round(elapsed_seconds, digits = 1))s") + + return elapsed_seconds +end + +""" + save_evaluation_outputs(variant, costs, inventory_paths, train_seconds, eval_seconds) + +Write rollout costs, inventory trajectories, and timing rows for one variant. + +# Arguments +- `variant::InventoryTrainingVariant`: variant configuration. +- `costs::AbstractVector{<:Real}`: held-out operational costs. +- `inventory_paths::AbstractMatrix{<:Real}`: inventory trajectory matrix. +- `train_seconds::Real`: total training time. +- `eval_seconds::Real`: total rollout evaluation time. 
+ +# Examples +```julia +save_evaluation_outputs(variant, costs, inventory_paths, train_time, eval_time) +``` +""" +function save_evaluation_outputs( + variant::InventoryTrainingVariant, + costs::AbstractVector{<:Real}, + inventory_paths::AbstractMatrix{<:Real}, + train_seconds::Real, + eval_seconds::Real, +) + # Name trajectory columns by period t=0,...,T. + time_columns = [Symbol("t$(period)") for period in 0:INVENTORY_T] + + # Write inventory paths for plotting. + CSV.write( + joinpath(RESULT_DIR, "$(variant.tag)_dr_trajectories.csv"), + DataFrame(inventory_paths, time_columns), + ) + + # Write one operational-cost row per held-out scenario. + CSV.write( + joinpath(RESULT_DIR, "$(variant.tag)_dr_costs.csv"), + DataFrame( + scenario = 1:length(costs), + operational_cost = costs, + ), + ) + + # Write timing in the shared schema consumed by compare_results.jl. + CSV.write( + joinpath(RESULT_DIR, "$(variant.tag)_dr_timing.csv"), + DataFrame( + method = [method_label(variant)], + fit_seconds = [train_seconds], + eval_seconds = [eval_seconds / (N_TEST_SCENARIOS * INVENTORY_T)], + n_eval = [N_TEST_SCENARIOS], + ), + ) + + return nothing +end + +""" + train_and_evaluate(variant::InventoryTrainingVariant) + +Train one TS-DDR variant and evaluate it by stage-wise rollout. + +# Arguments +- `variant::InventoryTrainingVariant`: variant configuration. + +# Examples +```julia +costs = train_and_evaluate(variant) +``` +""" +function train_and_evaluate(variant::InventoryTrainingVariant) + # Keep model and curve paths tied to the variant tag. + model_path = joinpath(MODEL_DIR, "$(variant.tag)_policy.jld2") + curve_path = joinpath(RESULT_DIR, "$(variant.tag)_training_curve.csv") + + # Build the training deterministic equivalent. + det_eq, train_state_in, train_state_out, train_sampler, initial_state = + build_training_problem(variant) + + # Build separate stage-wise models for deployment evaluation. 
+ eval_subproblems, eval_state_in, eval_state_out, eval_sampler, _ = + build_evaluation_problem(variant) + + # Start from the variant's chosen policy architecture. + policy = variant.policy_builder() + + # Train the policy and save the best checkpoint. The eval subproblems + # are shared with the post-training evaluation — rollout_policy resets + # parameters each call, so reuse is safe. + train_seconds = train_variant!( + policy, + variant, + det_eq, + train_state_in, + train_state_out, + train_sampler, + initial_state, + model_path, + curve_path; + eval_subproblems = eval_subproblems, + eval_state_in = eval_state_in, + eval_state_out = eval_state_out, + eval_sampler = eval_sampler, + ) + + # Reload the best checkpoint before evaluation. + Flux.loadmodel!(policy, JLD2.load(model_path, "model_state")) + + # Allocate outer variables so the timed block can assign them. + costs = Float64[] + inventory_paths = Matrix{Float64}(undef, 0, 0) + + # Evaluate under deployment semantics and time only the rollout solve work. + eval_seconds = @elapsed begin + rollout_costs, rollout_inventory_paths, _setup_paths, _order_paths = + rollout_policy( + policy, + eval_subproblems, + eval_state_in, + eval_state_out, + eval_sampler, + initial_state; + integer = variant.integer, + ) + + # Copy the rollout results into the outer scope. + costs = rollout_costs + inventory_paths = rollout_inventory_paths + end + + # Write all result files after the elapsed time is known. + save_evaluation_outputs( + variant, + costs, + inventory_paths, + train_seconds, + eval_seconds, + ) + + # Print the headline cost distribution for this variant. 
+ mean_cost = mean(costs) + std_cost = std(costs) + seconds_per_stage = eval_seconds / (N_TEST_SCENARIOS * INVENTORY_T) + println( + "Result: $(round(mean_cost, digits = 1)) +- " * + "$(round(std_cost, digits = 1)) " * + "(eval/stage: $(round(seconds_per_stage, digits = 4))s)", + ) + + return costs +end + +""" + score_function_variant() -> InventoryTrainingVariant + +Build the mixed-gradient integer variant. + +The dual path uses `FixedDiscreteIntegerStrategy`. The score-function path uses +separate integer rollout subproblems, so the Monte Carlo costs are true MIP +rollout costs. + +# Examples +```julia +variant = score_function_variant() +``` +""" +function score_function_variant() + # Score-function rollouts use separate models so training solves do not + # mutate the deterministic-equivalent model. + rollout_subproblems, rollout_state_in, rollout_state_out, _sampler, _ = + build_inventory_subproblems(; + num_scenarios = N_TRAIN_SCENARIOS, + penalty = INVENTORY_PENALTY, + seed = 77, + integer = true, + ) + + # The score-function config describes the final estimator settings. + score_config = ScoreFunctionConfig( + rollout_subproblems, + rollout_state_in, + rollout_state_out; + dual_weight = 0.5, + perturbation_std = 1.0, + num_rollouts = 8, + ) + + # The schedule phases the Monte Carlo correction in after dual-only warmup. + score_schedule = ScoreFunctionSchedule( + score_config; + sf_start = 200, + ramp_batches = 300, + perturbation_std_initial = 0.1, + num_rollouts_initial = 2, + ) + + return InventoryTrainingVariant( + "integer_sf", + true, + 800, + 10, + 8.0e-4, + 120, + FixedDiscreteIntegerStrategy(), + score_schedule, + ) +end + +""" + three_phase_schedule(variant::InventoryTrainingVariant) + +Return a three-phase target-penalty multiplier schedule. 
+ +The ramp starts gentle (0.2) so the optimizer sees smooth cost landscapes +before the high penalty dominates: + +```math +m_k = +\\begin{cases} +0.2, & 1 \\le k \\le K/6, \\\\ +0.6, & K/6 < k \\le K/2, \\\\ +1.0, & K/2 < k \\le K. +\\end{cases} +``` + +# Arguments +- `variant::InventoryTrainingVariant`: training configuration (uses + `num_batches` to compute phase boundaries). + +# Examples +```julia +schedule = three_phase_schedule(variant) +``` +""" +function three_phase_schedule(variant::InventoryTrainingVariant) + # Total number of SGD batches for this variant. + n = variant.num_batches + + # Phase 1 (batches 1..n/6): multiplier 0.2 — gentle start. + # Phase 2 (batches n/6..n/2): multiplier 0.6 — ramp up. + # Phase 3 (batches n/2..n): multiplier 1.0 — full penalty. + return [ + (1, div(n, 6), 0.2), + (div(n, 6) + 1, div(n, 2), 0.6), + (div(n, 2) + 1, n, 1.0), + ] +end + +""" + lstm_score_function_variant() -> InventoryTrainingVariant + +Build the LSTM mixed-gradient variant with tuned score function. + +Compared to `score_function_variant()`, this variant: +- uses `LSTMExAntePolicy` instead of `ExAnteInventoryPolicy`; +- raises the target penalty to 250 (vs 75); +- widens perturbation std to 15.0 (vs 1.0) so score-function rollouts are + large enough to flip the binary setup variable; +- increases rollout count to 12 for lower REINFORCE variance. + +# Examples +```julia +variant = lstm_score_function_variant() +``` +""" +function lstm_score_function_variant() + # Higher penalty (250 vs 75) gives stronger dual signal to the optimizer. + penalty = 250.0 + + # Build separate stage-wise MIP models for score-function rollouts. + # These models are solved with full integrality — not relaxed. 
+ rollout_subproblems, rollout_state_in, rollout_state_out, _sampler, _ = + build_inventory_subproblems(; + num_scenarios = N_TRAIN_SCENARIOS, + penalty = penalty, + seed = 77, + integer = true, + ) + + # Score-function config: α=0.7 dual weight, σ=15 perturbation, M=12 rollouts. + # σ=15 is ≈10% of typical target values (~150), enough to flip z decisions. + score_config = ScoreFunctionConfig( + rollout_subproblems, + rollout_state_in, + rollout_state_out; + dual_weight = 0.7, + perturbation_std = 15.0, + num_rollouts = 12, + ) + + # Schedule: no score function for first 400 batches (dual-only warmup), + # then linear ramp over 400 batches to full score-function parameters. + score_schedule = ScoreFunctionSchedule( + score_config; + sf_start = 400, + ramp_batches = 400, + perturbation_std_initial = 3.0, + num_rollouts_initial = 4, + ) + + return InventoryTrainingVariant( + "integer_lstm_sf", + true, + 1200, + 16, + 5.0e-4, + 200, + FixedDiscreteIntegerStrategy(), + score_schedule, + penalty, + () -> build_lstm_exante_policy(; seed = 2024), + three_phase_schedule, + ) +end + +""" + inventory_training_variants() -> Vector{InventoryTrainingVariant} + +Return all TS-DDR variants used in the benchmark. + +# Examples +```julia +for variant in inventory_training_variants() + train_and_evaluate(variant) +end +``` +""" +function inventory_training_variants() + return [ + InventoryTrainingVariant( + "relaxed", + false, + 400, + 5, + 1.5e-3, + 80, + NoIntegerStrategy(), + nothing, + ), + InventoryTrainingVariant( + "integer", + true, + 800, + 10, + 8.0e-4, + 120, + FixedDiscreteIntegerStrategy(), + nothing, + ), + InventoryTrainingVariant( + "integer_cr", + true, + 800, + 10, + 8.0e-4, + 120, + ContinuousRelaxationIntegerStrategy(), + nothing, + ), + score_function_variant(), + # --- Tuned variants (relaxed) --- + # LSTM on the relaxed problem: isolates temporal encoding benefit + # without integer complexity. 
+ InventoryTrainingVariant( + "relaxed_lstm", + false, + 800, + 10, + 1.0e-3, + 120, + NoIntegerStrategy(), + nothing, + INVENTORY_PENALTY, + () -> build_lstm_exante_policy(; seed = 2024), + penalty_schedule_for, + ), + # Higher penalty feedforward on the relaxed problem. + InventoryTrainingVariant( + "relaxed_hp", + false, + 800, + 10, + 1.0e-3, + 120, + NoIntegerStrategy(), + nothing, + 250.0, + () -> build_exante_policy(; seed = 2024), + penalty_schedule_for, + ), + # LSTM + high penalty on the relaxed problem. + InventoryTrainingVariant( + "relaxed_lstm_hp", + false, + 800, + 10, + 1.0e-3, + 120, + NoIntegerStrategy(), + nothing, + 250.0, + () -> build_lstm_exante_policy(; seed = 2024), + penalty_schedule_for, + ), + # --- Tuned variants (integer) --- + # Improved feedforward with higher penalty. + InventoryTrainingVariant( + "integer_hp", + true, + 1200, + 16, + 5.0e-4, + 200, + FixedDiscreteIntegerStrategy(), + nothing, + 250.0, + () -> build_exante_policy(; seed = 2024), + three_phase_schedule, + ), + # Variant A: LSTM with high penalty + InventoryTrainingVariant( + "integer_lstm", + true, + 1200, + 16, + 5.0e-4, + 200, + FixedDiscreteIntegerStrategy(), + nothing, + 250.0, + () -> build_lstm_exante_policy(; seed = 2024), + three_phase_schedule, + ), + # Variant B: LSTM with tuned score function + lstm_score_function_variant(), + ] +end + +""" + run_variant(tag::AbstractString) -> Nothing + +Train and evaluate a single variant by tag name. + +This is the entry point used by SLURM jobs to run one variant at a time: + +```bash +julia --project=. train_dr_inventory.jl integer_lstm +``` + +# Arguments +- `tag::AbstractString`: one of the tags returned by + `inventory_training_variants()`. + +# Examples +```julia +run_variant("integer_lstm") +``` +""" +function run_variant(tag::AbstractString) + all_variants = inventory_training_variants() + idx = findfirst(v -> v.tag == tag, all_variants) + isnothing(idx) && error( + "Unknown variant tag \"$tag\". 
" * + "Available: $(join([v.tag for v in all_variants], ", "))" + ) + train_and_evaluate(all_variants[idx]) + return nothing +end + +""" + main() -> Nothing + +Run the full inventory TS-DDR training benchmark. + +# Examples +```julia +main() +``` +""" +function main() + for variant in inventory_training_variants() + train_and_evaluate(variant) + println() + end + + println("All TS-DDR results saved to $(relpath(RESULT_DIR, EXAMPLE_DIR))") + + return nothing +end + +# Run the script only when invoked directly, not when included by tests. +if abspath(PROGRAM_FILE) == @__FILE__ + if isempty(ARGS) + main() else - "TS-DDR (trained)" + run_variant(ARGS[1]) end - CSV.write(joinpath(result_dir, "$(tag)_dr_timing.csv"), - DataFrame(method=[method_name], - fit_seconds=[train_seconds], - eval_seconds=[eval_seconds / (N_TEST * INVENTORY_T)], - n_eval=[N_TEST])) - - μ = mean(op_costs) - σ = std(op_costs) - println("$(tag) TS-DDR — mean: $(round(μ, digits=1)) ± $(round(σ, digits=1))") - println(" Eval/stage: $(round(eval_seconds/(N_TEST*INVENTORY_T), digits=4))s") - return op_costs -end - -# ═══════════════════════════════════════════════════════════════════════════════ -# Run both -# ═══════════════════════════════════════════════════════════════════════════════ -train_and_evaluate(tag="relaxed", integer=false, - num_batches=400, train_per_batch=5, lr=0.0015, warmup_batches=80) -println() -train_and_evaluate(tag="integer", integer=true, - num_batches=800, train_per_batch=10, lr=0.0008, warmup_batches=120) -println() -train_and_evaluate(tag="integer_cr", integer=true, - num_batches=800, train_per_batch=10, lr=0.0008, warmup_batches=120, - int_strategy_override=ContinuousRelaxationIntegerStrategy()) -println("\nAll TS-DDR results saved to $(relpath(result_dir, example_dir))") +end diff --git a/src/DecisionRules.jl b/src/DecisionRules.jl index a4e11ad..5f90a6a 100644 --- a/src/DecisionRules.jl +++ b/src/DecisionRules.jl @@ -9,6 +9,7 @@ using ChainRulesCore import 
ChainRulesCore.rrule using DiffOpt using Logging +using Statistics: mean export simulate_multistage, sample, @@ -35,13 +36,114 @@ export simulate_multistage, StateConditionedPolicy, state_conditioned_policy, materialize_tangent, + # Score-function gradient mixing + ScoreFunctionConfig, + ScoreFunctionSchedule, + sf_params, # Multiple shooting exports train_multiple_shooting, setup_shooting_windows, solve_window, predict_window_targets, simulate_multiple_shooting, - WindowData + WindowData, + # Gradient fallback + AbstractGradientFallback, + ZeroGradientFallback, + ErrorGradientFallback + +@doc raw""" + AbstractGradientFallback + +Abstract type governing what happens when a solver or differentiation error +occurs during training. + +DecisionRules ships two concrete subtypes: + +| Type | Behavior | +|------|----------| +| [`ZeroGradientFallback`](@ref) | Log a warning, return zero gradients, continue training | +| [`ErrorGradientFallback`](@ref) | Re-throw the error (useful in tests) | + +## Extending + +Implement your own subtype to customize recovery: + +```julia +struct MyFallback <: DecisionRules.AbstractGradientFallback end + +function DecisionRules.handle_gradient_error(::MyFallback, e, n_state_in, n_state_out) + # e is the caught exception + # Return a tuple of cotangents (same shape as the rrule pullback) or rethrow + @error "Custom handler" exception=e + return DecisionRules._zero_cotangents(n_state_in, n_state_out) +end + +function DecisionRules.handle_training_error(::MyFallback, e, iter) + # Return true to skip this iteration, false to rethrow + @error "Custom training handler" exception=e + return true # skip +end + +function DecisionRules.handle_rollout_error(::MyFallback, e, iter) + # Return true to skip this scenario, false to rethrow + return true +end +``` + +Then pass `gradient_fallback=MyFallback()` to [`train_multistage`](@ref) or +[`train_multiple_shooting`](@ref). 
+""" +abstract type AbstractGradientFallback end + +""" + ZeroGradientFallback() + +Default fallback: log a warning and return zero gradients when the solver or +DiffOpt differentiation fails. Training continues with a skipped update for +that iteration. +""" +struct ZeroGradientFallback <: AbstractGradientFallback end + +""" + ErrorGradientFallback() + +Strict fallback: re-throw any solver or differentiation error. Use this in +tests to ensure that controlled problems never silently produce zero gradients. +""" +struct ErrorGradientFallback <: AbstractGradientFallback end + +_zero_cotangents(n_in, n_out) = ( + NoTangent(), NoTangent(), NoTangent(), NoTangent(), + zeros(n_in), zeros(n_out), NoTangent(), +) + +function handle_gradient_error(::ZeroGradientFallback, e, n_state_in, n_state_out) + @warn "get_next_state pullback failed — returning zero gradients" exception=(e, catch_backtrace()) + return _zero_cotangents(n_state_in, n_state_out) +end + +function handle_gradient_error(::ErrorGradientFallback, e, n_state_in, n_state_out) + rethrow(e) +end + +function handle_training_error(::ZeroGradientFallback, e, iter) + @warn "Gradient computation failed at iter $iter — skipping update" exception=(e, catch_backtrace()) + return true +end + +function handle_training_error(::ErrorGradientFallback, e, iter) + rethrow(e) +end + +function handle_rollout_error(::ZeroGradientFallback, e, iter) + @warn "Rollout scenario failed at iter $iter — skipping" exception=(e, catch_backtrace()) + return true +end + +function handle_rollout_error(::ErrorGradientFallback, e, iter) + rethrow(e) +end """ STRICT_GRADIENTS @@ -57,13 +159,22 @@ to verify that controlled test cases never silently fall through to zero gradients: DecisionRules.STRICT_GRADIENTS[] = true + +!!! note + This flag controls the **rrule-level** fallback for bad solver status. 
+ For the **training-loop-level** fallback (DiffOpt assertion errors, etc.), + use the `gradient_fallback` keyword in [`train_multistage`](@ref) and + [`train_multiple_shooting`](@ref). """ const STRICT_GRADIENTS = Ref(false) +const _DEFAULT_GRADIENT_FALLBACK = ZeroGradientFallback() + const _SUCCESSFUL_TERM_STATUSES = (MOI.OPTIMAL, MOI.ALMOST_OPTIMAL, MOI.LOCALLY_SOLVED) include("integer_strategies.jl") include("parameter_duals.jl") +include("score_function.jl") include("simulate_multistage.jl") include("dense_multilayer_nn.jl") include("utils.jl") diff --git a/src/integer_strategies.jl b/src/integer_strategies.jl index 33f001e..046576a 100644 --- a/src/integer_strategies.jl +++ b/src/integer_strategies.jl @@ -1,46 +1,127 @@ """ AbstractIntegerStrategy -Extension point for preparing models with discrete variables before reading duals -or solver sensitivities. +Abstract supertype for strategies that prepare a JuMP model before reading +duals or solver sensitivities. + +# Arguments +This abstract type has no fields. Concrete subtypes are passed as the +`integer_strategy::AbstractIntegerStrategy` keyword to simulation and training +functions. + +# Examples +```julia +simulate_multistage( + subproblems, + state_params_in, + state_params_out, + initial_state, + uncertainties, + policy; + integer_strategy = FixedDiscreteIntegerStrategy(), +) +``` """ abstract type AbstractIntegerStrategy end """ NoIntegerStrategy() -Default integer strategy. Solves the model exactly as-is and preserves the -historical continuous-model behavior. +Solve the model exactly as written before reading duals or sensitivities. + +Use this for continuous LP, conic, or nonlinear models whose derivative +information is available directly from the solved model. + +# Arguments +This type has no fields. 
+ +# Examples +```julia +strategy = NoIntegerStrategy() +``` """ struct NoIntegerStrategy <: AbstractIntegerStrategy end """ FixedDiscreteIntegerStrategy() -Solve the original model, fix binary/integer variables to their incumbent values, -relax integrality, re-solve the fixed continuous model, and read duals or -sensitivities in that fixed-incumbent continuous state. +Solve a mixed-integer model, fix discrete variables to their incumbent values, +relax integrality, re-solve, and read duals or sensitivities from the fixed +continuous model. -The returned derivative-like information is local to the incumbent integer -assignment and should be interpreted as a postprocessing surrogate, not as a full +If ``z^*`` is the incumbent binary/integer solution, this strategy reads +derivative-like information from the continuous problem + +```math +\\min_x f(x, z^*) \\quad \\text{subject to} \\quad g(x, z^*) \\le 0. +``` + +The result is local to the incumbent integer assignment. It is not a differentiable MIP method. + +# Arguments +This type has no fields. + +# Examples +```julia +strategy = FixedDiscreteIntegerStrategy() +``` """ struct FixedDiscreteIntegerStrategy <: AbstractIntegerStrategy end """ discrete_variables(model::JuMP.Model) -Return all binary or integer variables in `model`. +Return the binary or integer variables in `model`. + +# Arguments +- `model::JuMP.Model`: model to inspect. + +# Examples +```julia +vars = DecisionRules.discrete_variables(model) +``` """ function discrete_variables(model::JuMP.Model) + # JuMP tracks binary and integer status on variables, not in one shared list. return filter(JuMP.all_variables(model)) do variable JuMP.is_binary(variable) || JuMP.is_integer(variable) end end +""" + has_discrete_variables(model::JuMP.Model) -> Bool + +Return whether `model` contains at least one binary or integer variable. + +# Arguments +- `model::JuMP.Model`: model to inspect. 
+ +# Examples +```julia +if DecisionRules.has_discrete_variables(model) + @info "MIP model" +end +``` +""" has_discrete_variables(model::JuMP.Model) = !isempty(discrete_variables(model)) +""" + _assert_successful_solve(model::JuMP.Model; context::AbstractString = "solve") + +Throw an error unless `model` terminated with an accepted success status. + +# Arguments +- `model::JuMP.Model`: model whose termination status is checked. +- `context::AbstractString`: human-readable phrase included in the error. + +# Examples +```julia +DecisionRules._assert_successful_solve(model; context = "fixed LP solve") +``` +""" function _assert_successful_solve(model::JuMP.Model; context::AbstractString="solve") + # Keep the accepted statuses centralized in DecisionRules.jl. status = JuMP.termination_status(model) status in _SUCCESSFUL_TERM_STATUSES && return status throw( @@ -54,13 +135,26 @@ end """ with_sensitivity_solution(f, model, integer_strategy) -Run `f(model)` while `model` is in a state where duals or DiffOpt sensitivities -can be read. Integer strategies that temporarily mutate the model must restore it -before returning, including when `f` throws. +Run `f(model)` while `model` is in a state suitable for reading duals or +DiffOpt sensitivities. + +# Arguments +- `f::Function`: callback that reads values, duals, or sensitivities. +- `model::JuMP.Model`: model to solve and inspect. +- `integer_strategy::AbstractIntegerStrategy`: strategy used to prepare models + with binary or integer variables. + +# Examples +```julia +objective = with_sensitivity_solution(model, FixedDiscreteIntegerStrategy()) do m + JuMP.objective_value(m) +end +``` """ function with_sensitivity_solution( f::Function, model::JuMP.Model, ::NoIntegerStrategy ) + # Continuous models can be solved directly. 
optimize!(model) return f(model) end @@ -68,21 +162,46 @@ end function with_sensitivity_solution( f::Function, model::JuMP.Model, ::FixedDiscreteIntegerStrategy ) + # First solve the original MIP to obtain an incumbent integer assignment. optimize!(model) _assert_successful_solve(model; context="original integer solve") + # Models without discrete variables fall back to the direct solved state. has_discrete_variables(model) || return f(model) + # JuMP returns an undo callback that restores integrality and bounds. undo = JuMP.fix_discrete_variables(model) try + # Re-solve the fixed continuous problem before reading duals. optimize!(model) _assert_successful_solve(model; context="fixed-discrete sensitivity solve") return f(model) finally + # Always restore the original model, even when the callback fails. undo() end end +""" + _with_current_or_sensitivity_solution(f, model, integer_strategy) + +Run `f(model)` directly for continuous models and through +[`with_sensitivity_solution`](@ref) for integer strategies. + +# Arguments +- `f::Function`: callback that reads values, duals, or sensitivities. +- `model::JuMP.Model`: model to inspect. +- `integer_strategy::AbstractIntegerStrategy`: current integer strategy. + +# Examples +```julia +value = DecisionRules._with_current_or_sensitivity_solution( + m -> JuMP.objective_value(m), + model, + strategy, +) +``` +""" _with_current_or_sensitivity_solution( f::Function, model::JuMP.Model, ::NoIntegerStrategy ) = f(model) @@ -99,6 +218,10 @@ end Relax all binary/integer constraints to continuous bounds (binary → [0,1]), solve the resulting LP, and read duals in that relaxed state. +Mathematically, this replaces ``z \\in \\{0,1\\}`` or integer restrictions with +continuous bounds before solving. The derivative signal belongs to the relaxed +problem, not to the original MIP. + Compared to [`FixedDiscreteIntegerStrategy`](@ref): - **Faster**: one LP solve instead of MIP + LP. 
- **Smoother gradients**: no integer fixing means no zero-gradient dead zones. @@ -109,31 +232,59 @@ A practical pattern is to train with `ContinuousRelaxationIntegerStrategy` during warmup (smooth landscape for initial learning) and switch to `FixedDiscreteIntegerStrategy` later (integer-accurate gradients for fine-tuning). + +# Arguments +This type has no fields. + +# Examples +```julia +strategy = ContinuousRelaxationIntegerStrategy() +``` """ struct ContinuousRelaxationIntegerStrategy <: AbstractIntegerStrategy end function with_sensitivity_solution( f::Function, model::JuMP.Model, ::ContinuousRelaxationIntegerStrategy ) + # Continuous models need no relaxation step. has_discrete_variables(model) || begin optimize!(model) return f(model) end + + # JuMP returns an undo callback that restores integrality after the solve. undo = JuMP.relax_integrality(model) try + # Solve the continuous relaxation before reading duals. optimize!(model) _assert_successful_solve(model; context="continuous relaxation sensitivity solve") return f(model) finally + # Restore integer declarations before returning control to the caller. undo() end end +""" + _sensitivity_forward_status(model::JuMP.Model, strategy) -> MOI.TerminationStatusCode + +Return the termination status that an rrule should use for gradient fallback. + +# Arguments +- `model::JuMP.Model`: model inspected after the forward pass. +- `strategy::AbstractIntegerStrategy`: integer strategy used for the solve. + +# Examples +```julia +status = DecisionRules._sensitivity_forward_status(model, strategy) +``` +""" _sensitivity_forward_status(model::JuMP.Model, ::NoIntegerStrategy) = JuMP.termination_status(model) function _sensitivity_forward_status( ::JuMP.Model, ::AbstractIntegerStrategy ) + # Integer strategies do their own solve checks inside the sensitivity pass. 
return MOI.OPTIMAL end diff --git a/src/multiple_shooting.jl b/src/multiple_shooting.jl index afe93de..322d174 100644 --- a/src/multiple_shooting.jl +++ b/src/multiple_shooting.jl @@ -60,13 +60,16 @@ end =============================================================================# """ - extract_uncertainty_params(window_uncertainties_new) + extract_uncertainty_params(window_uncertainties) -> Vector{Vector{VariableRef}} -Normalize uncertainty data to a per-stage vector of parameter VariableRefs. +Extract the JuMP parameter `VariableRef`s from each stage of an uncertainty pool. -Accepts either: -- Vector{Vector{Tuple{VariableRef, Any}}} (common in this package), or -- Vector{Vector{VariableRef}}. +Handles three possible input shapes (automatically detected): +- Already-extracted `Vector{Vector{VariableRef}}` — returned as-is. +- Per-unit pool `Vector{Vector{Tuple{VariableRef, Vector}}}` — extracts the first + element of each tuple. +- Joint-scenario pool `Vector{Vector{Vector{Tuple{VariableRef, T}}}}` — extracts + params from the first scenario of each stage (all scenarios share the same params). """ function extract_uncertainty_params(window_uncertainties) if isempty(window_uncertainties) @@ -76,10 +79,15 @@ function extract_uncertainty_params(window_uncertainties) if isempty(first_stage) return [VariableRef[] for _ in 1:length(window_uncertainties)] end - if first(first_stage) isa VariableRef + elem = first(first_stage) + if elem isa VariableRef return window_uncertainties + elseif elem isa AbstractVector + # Joint-scenario format: each stage is [scenario₁, scenario₂, ...], + # each scenario is [(param, val), ...]. Extract params from the first scenario. 
+ return [[pair[1] for pair in first(stage)] for stage in window_uncertainties] else - # assume tuples (param, something) + # Per-unit format: (param, support_vector) return [[u[1] for u in stage_u] for stage_u in window_uncertainties] end end @@ -115,10 +123,20 @@ function _create_like_variable( end """ - windows_equivalent!(model, subproblems, state_params_in, state_params_out, initial_state, uncertainties) + windows_equivalent!(model, subproblems, state_params_in, state_params_out, + initial_state, uncertainties) -Create a window equivalent without mutating the original subproblems and without -adding extra variables/constraints beyond those already present in the subproblems. +Build a coupled JuMP model for a contiguous window of stages by copying all variables, +constraints, and objectives from `subproblems` into `model`. Stage coupling is enforced +by identifying each stage's realized state variable with the next stage's incoming state +parameter (same approach as [`deterministic_equivalent!`](@ref), but scoped to a +window). + +`uncertainties` accepts both per-unit and joint-scenario pool formats +(see [`sample`](@ref)). The returned `uncertainties_new` preserves the input format +with variable refs remapped to the window model. + +Returns `(model, state_params_in_new, state_params_out_new, uncertainties_new)`. 
""" function windows_equivalent!( model::JuMP.Model, @@ -134,7 +152,10 @@ function windows_equivalent!( var_src_to_dest = Dict{VariableRef,VariableRef}() state_in_new = Vector{Vector{Any}}(undef, num_stages) state_out_new = Vector{Vector{Tuple{Any,VariableRef}}}(undef, num_stages) - uncertainties_new = Vector{Vector{Tuple{Any,Vector{Float64}}}}(undef, num_stages) + # Detect format: joint-scenario (Vector{Vector{Tuple}}) vs per-unit (Vector{Tuple{...,Vector}}) + _is_joint = !isempty(uncertainties) && !isempty(uncertainties[1]) && + first(uncertainties[1]) isa AbstractVector + uncertainties_new = Any[nothing for _ in 1:num_stages] for t in 1:num_stages subproblem = subproblems[t] @@ -200,21 +221,42 @@ function windows_equivalent!( end end - # uncertainties - uncertainties_new[t] = Vector{Tuple{Any,Vector{Float64}}}( - undef, length(uncertainties[t]) - ) - for (i, tup) in enumerate(uncertainties[t]) - u_src, u_vals = tup - if u_src isa VariableRef - dest = get(var_src_to_dest, u_src, nothing) - if dest === nothing - dest = _create_like_variable(model, u_src, t; force_parameter=true) - var_src_to_dest[u_src] = dest + # uncertainties — remap VariableRefs through var_src_to_dest + if _is_joint + # Joint-scenario format: each element is a scenario (Vector of Tuples) + uncertainties_new[t] = [ + [begin + u_src, u_val = pair + if u_src isa VariableRef + dest = get(var_src_to_dest, u_src, nothing) + if dest === nothing + dest = _create_like_variable(model, u_src, t; force_parameter=true) + var_src_to_dest[u_src] = dest + end + (dest, Float64(u_val)) + else + (u_src, Float64(u_val)) + end + end for pair in scenario] + for scenario in uncertainties[t] + ] + else + # Per-unit format: each element is (param, support_vector) + uncertainties_new[t] = Vector{Tuple{Any,Vector{Float64}}}( + undef, length(uncertainties[t]) + ) + for (i, tup) in enumerate(uncertainties[t]) + u_src, u_vals = tup + if u_src isa VariableRef + dest = get(var_src_to_dest, u_src, nothing) + if dest === 
nothing + dest = _create_like_variable(model, u_src, t; force_parameter=true) + var_src_to_dest[u_src] = dest + end + uncertainties_new[t][i] = (dest, _as_float64_vec(u_vals)) + else + uncertainties_new[t][i] = (u_src, _as_float64_vec(u_vals)) end - uncertainties_new[t][i] = (dest, _as_float64_vec(u_vals)) - else - uncertainties_new[t][i] = (u_src, _as_float64_vec(u_vals)) end end @@ -266,10 +308,11 @@ end """ set_window_uncertainties!(window, uncertainty_sample) -Set sampled uncertainty values into the window model parameters. +Set sampled (realized) uncertainty values into the window model's JuMP parameters. -- `window.uncertainty_params[t][i]` is the parameter VariableRef in the window model -- `uncertainty_sample[global_t][i][2]` is the sampled numeric value (original structure) +`uncertainty_sample` is a **realized** trajectory (output of [`sample`](@ref)), so +each stage is `Vector{Tuple{VariableRef, Float64}}` regardless of whether the +original pool used independent or joint-scenario format. """ function set_window_uncertainties!( window, @@ -880,10 +923,19 @@ end """ train_multiple_shooting(model, initial_state, windows, uncertainty_sampler; ...) -This mirrors your other training loops: -- Reuse pre-built window models. -- For each SGD step, sample uncertainties, build uncertainties_vec for the policy, - evaluate simulate_multiple_shooting, and update parameters. +Train a target-state policy with multiple-shooting decomposition (windowed). + +`uncertainty_sampler` controls how trajectories are drawn at each SGD step. +Three formats are accepted (same API as [`train_multistage`](@ref)): +1. **Per-unit pool** (`Vector{Vector{Tuple{VariableRef, Vector{T}}}}`): + independent sampling per parameter per stage. +2. **Joint-scenario pool** (`Vector{Vector{Vector{Tuple{VariableRef, T}}}}`): + one scenario drawn per stage, preserving spatial correlation. +3. 
**Callable** (`() -> Vector{Vector{Tuple{VariableRef, T}}}`): a zero-arg + function returning a realized trajectory. Use this for temporal correlation; + see [`sample`](@ref). + +See the [Uncertainty Sampling](@ref) documentation for details. """ function train_multiple_shooting( model, @@ -901,6 +953,7 @@ function train_multiple_shooting( get_objective_no_target_deficit=get_objective_no_target_deficit, penalty_schedule=nothing, integer_strategy::AbstractIntegerStrategy=NoIntegerStrategy(), + gradient_fallback::AbstractGradientFallback=ZeroGradientFallback(), ) opt_state = Flux.setup(optimizer, model) @@ -913,18 +966,7 @@ function train_multiple_shooting( end current_multiplier = NaN - # We only need the uncertainty *structure* here. - base_uncertainty = uncertainty_sampler() - # If uncertainty values are vectors (sample sets), draw realized values per iteration. - has_sample_sets = - !isempty(base_uncertainty) && - !isempty(base_uncertainty[1]) && - (base_uncertainty[1][1][2] isa AbstractVector) - draw_uncertainty = if has_sample_sets - (() -> DecisionRules.sample(base_uncertainty)) - else - uncertainty_sampler - end + draw_uncertainty = () -> DecisionRules.sample(uncertainty_sampler) initial_state_f32 = Float32.(initial_state) @@ -940,7 +982,8 @@ function train_multiple_shooting( objective = 0.0 - grads = Flux.gradient(model) do m + grads = try + Flux.gradient(model) do m objective = 0.0 for _ in 1:num_train_per_batch @ignore_derivatives Flux.reset!(m) @@ -962,6 +1005,11 @@ function train_multiple_shooting( objective /= num_train_per_batch return objective end + catch e + if !handle_training_error(gradient_fallback, e, iter) + rethrow() # handler returned false: propagate the error instead of skipping + end + end eval_loss = @ignore_derivatives begin total = 0.0 @@ -1015,6 +1063,10 @@ function train_multiple_shooting( record_loss(iter, model, eval_loss, "metrics/loss") && break record_loss(iter, model, objective, "metrics/training_loss") && break + if isnothing(grads) + continue + end + grad = materialize_tangent(grads[1])
Flux.update!(opt_state, model, grad) end diff --git a/src/score_function.jl b/src/score_function.jl new file mode 100644 index 0000000..120d392 --- /dev/null +++ b/src/score_function.jl @@ -0,0 +1,638 @@ +""" + ScoreFunctionConfig( + subproblems::AbstractVector{<:JuMP.Model}, + state_params_in::AbstractVector, + state_params_out::AbstractVector; + dual_weight::Real = 0.5, + perturbation_std::Real = 1.0, + num_rollouts::Integer = 8, + baseline::Symbol = :mean, + ) + +Configure the score-function correction used by [`train_multistage`](@ref). + +The deterministic-equivalent training path differentiates the target policy +through dual information. For mixed-integer subproblems, those duals are local +to a fixed integer assignment. This configuration adds a REINFORCE-style +correction estimated from stage-wise rollouts with perturbed targets. + +The rollout models are solved exactly as they are built. If `subproblems` +contain binary variables, the score-function rollouts solve MIPs. If they +contain relaxed variables, the score-function rollouts solve the relaxation. +This is intentionally separate from the `integer_strategy` keyword of +[`train_multistage`](@ref), which controls only how the differentiable +dual-gradient path reads local sensitivity information from the deterministic +equivalent. + +If ``\\hat{x}_{t+1}(\\theta)`` is the target emitted by the policy and +``\\delta_t \\sim \\mathcal{N}(0, \\sigma^2 I)``, the perturbed rollout solves +with target ``\\hat{x}_{t+1}(\\theta) + \\delta_t``. The score-function +surrogate loss is + +```math +L_{sf}(\\theta) + = \\frac{1}{M} \\sum_{m=1}^{M} + (R_m - b) + \\sum_{t=1}^{T} + \\left\\langle \\frac{\\delta_{m,t}}{\\sigma^2}, + \\hat{x}_{t+1}(\\theta) \\right\\rangle , +``` + +and the mixed gradient is + +```math +\\nabla L + = \\alpha \\nabla L_{dual} + + (1 - \\alpha) \\nabla L_{sf}. 
+``` + +# Arguments +- `subproblems::AbstractVector{<:JuMP.Model}`: stage-wise rollout models used to + estimate realized costs under perturbed targets. +- `state_params_in::AbstractVector`: stage input-state parameters. +- `state_params_out::AbstractVector`: pairs `(target_parameter, state_variable)` + for every stage output state. + +# Keywords +- `dual_weight::Real`: mixing weight ``\\alpha`` on the dual-gradient term. +- `perturbation_std::Real`: Gaussian standard deviation ``\\sigma``. +- `num_rollouts::Integer`: number of perturbed rollouts ``M`` per sample. +- `baseline::Symbol`: either `:mean` for mean-centering costs or `:none`. + +# Examples +```julia +score_function = ScoreFunctionConfig( + rollout_subproblems, + state_params_in, + state_params_out; + dual_weight = 0.5, + perturbation_std = 1.0, + num_rollouts = 8, +) + +train_multistage( + policy, + initial_state, + det_equivalent, + state_params_in, + state_params_out, + uncertainty_sampler; + score_function, +) +``` +""" +struct ScoreFunctionConfig + subproblems::Vector{JuMP.Model} + state_params_in::AbstractVector + state_params_out::AbstractVector + dual_weight::Float64 + perturbation_std::Float64 + num_rollouts::Int + baseline::Symbol +end + +function ScoreFunctionConfig( + subproblems::AbstractVector{<:JuMP.Model}, + state_params_in::AbstractVector, + state_params_out::AbstractVector; + dual_weight::Real = 0.5, + perturbation_std::Real = 1.0, + num_rollouts::Integer = 8, + baseline::Symbol = :mean, +) + # Validate dimensions first so later rollout errors point at modeling issues. + length(subproblems) == length(state_params_in) || + throw(ArgumentError("state_params_in must have one entry per subproblem.")) + length(subproblems) == length(state_params_out) || + throw(ArgumentError("state_params_out must have one entry per subproblem.")) + + # Convert scalar configuration values once, at construction time. 
+ dual_weight_value = Float64(dual_weight) + perturbation_std_value = Float64(perturbation_std) + num_rollouts_value = Int(num_rollouts) + + # Keep validation messages tied to the public keyword names. + 0.0 <= dual_weight_value <= 1.0 || + throw(ArgumentError("dual_weight must lie in [0, 1].")) + perturbation_std_value > 0.0 || + throw(ArgumentError("perturbation_std must be positive.")) + num_rollouts_value >= 1 || + throw(ArgumentError("num_rollouts must be at least 1.")) + baseline in (:mean, :none) || + throw(ArgumentError("baseline must be either :mean or :none.")) + + # Store a plain Vector of models so iteration order is concrete and stable. + return ScoreFunctionConfig( + collect(subproblems), + state_params_in, + state_params_out, + dual_weight_value, + perturbation_std_value, + num_rollouts_value, + baseline, + ) +end + +""" + ScoreFunctionSchedule(config::ScoreFunctionConfig; sf_start=200, ramp_batches=200, perturbation_std_initial=0.1, num_rollouts_initial=2) + +Ramp a [`ScoreFunctionConfig`](@ref) into training after a pure-dual warmup. + +The schedule delays score-function rollouts until `sf_start`, then linearly +increases the score-function weight, perturbation scale, and rollout count until +the final values stored in `config` are reached. + +Let ``k`` be the current iteration, ``k_0`` be `sf_start`, ``r`` be `ramp_batches`, and +``\\rho_k = \\operatorname{clip}((k - k_0) / r, 0, 1)``. The effective +score-function weight is +``\\rho_k (1 - \\alpha)``, where ``\\alpha`` is `config.dual_weight`. The effective dual weight is one minus that value. + +# Arguments +- `config::ScoreFunctionConfig`: final score-function configuration. + +# Keywords +- `sf_start::Integer`: first iteration at which score-function rollouts are + considered. +- `ramp_batches::Integer`: number of iterations in the linear ramp. +- `perturbation_std_initial::Real`: initial ``\\sigma`` at ramp start. +- `num_rollouts_initial::Integer`: initial rollout count at ramp start. 
+ +# Examples +```julia +schedule = ScoreFunctionSchedule( + score_function; + sf_start = 200, + ramp_batches = 300, + perturbation_std_initial = 0.1, + num_rollouts_initial = 2, +) +``` +""" +struct ScoreFunctionSchedule + config::ScoreFunctionConfig + sf_start::Int + ramp_batches::Int + final_dual_weight::Float64 + initial_perturbation_std::Float64 + final_perturbation_std::Float64 + initial_num_rollouts::Int + final_num_rollouts::Int +end + +function ScoreFunctionSchedule( + config::ScoreFunctionConfig; + sf_start::Integer = 200, + ramp_batches::Integer = 200, + perturbation_std_initial::Real = 0.1, + num_rollouts_initial::Integer = 2, +) + # Convert public numeric inputs before validation. + sf_start_value = Int(sf_start) + ramp_batches_value = Int(ramp_batches) + initial_std_value = Float64(perturbation_std_initial) + initial_rollouts_value = Int(num_rollouts_initial) + + # Reject invalid schedules with keyword-specific messages. + sf_start_value >= 1 || + throw(ArgumentError("sf_start must be at least 1.")) + ramp_batches_value >= 1 || + throw(ArgumentError("ramp_batches must be at least 1.")) + initial_std_value > 0.0 || + throw(ArgumentError("perturbation_std_initial must be positive.")) + initial_rollouts_value >= 1 || + throw(ArgumentError("num_rollouts_initial must be at least 1.")) + + return ScoreFunctionSchedule( + config, + sf_start_value, + ramp_batches_value, + config.dual_weight, + initial_std_value, + config.perturbation_std, + initial_rollouts_value, + config.num_rollouts, + ) +end + +const _ScoreFunctionParameters = @NamedTuple{ + alpha::Float64, + score_weight::Float64, + perturbation_std::Float64, + num_rollouts::Int, + active::Bool, +} + +""" + sf_params(config::ScoreFunctionConfig, iteration::Integer) + sf_params(schedule::ScoreFunctionSchedule, iteration::Integer) + +Return the effective score-function parameters for `iteration`. + +# Arguments +- `config::ScoreFunctionConfig`: unscheduled score-function configuration. 
+- `schedule::ScoreFunctionSchedule`: scheduled score-function configuration. +- `iteration::Integer`: one-based training iteration. + +# Returns +A named tuple with fields: +- `alpha::Float64`: weight on the dual-gradient term. +- `score_weight::Float64`: weight on the score-function term. +- `perturbation_std::Float64`: Gaussian standard deviation ``\\sigma``. +- `num_rollouts::Int`: number of perturbed rollouts. +- `active::Bool`: whether rollout estimation should run. + +# Examples +```julia +params = sf_params(schedule, 250) +params.active && @show params.score_weight +``` +""" +function sf_params( + config::ScoreFunctionConfig, + ::Integer, +)::_ScoreFunctionParameters + # Static configurations are active at every training iteration. + return ( + alpha = config.dual_weight, + score_weight = 1.0 - config.dual_weight, + perturbation_std = config.perturbation_std, + num_rollouts = config.num_rollouts, + active = true, + ) +end + +function sf_params( + schedule::ScoreFunctionSchedule, + iteration::Integer, +)::_ScoreFunctionParameters + # Before warmup ends, keep the original deterministic-equivalent gradient. + if iteration < schedule.sf_start + return ( + alpha = 1.0, + score_weight = 0.0, + perturbation_std = 0.0, + num_rollouts = 0, + active = false, + ) + end + + # A clipped ramp fraction keeps all interpolated values inside bounds. + ramp_fraction = clamp( + (iteration - schedule.sf_start) / schedule.ramp_batches, + 0.0, + 1.0, + ) + + # The score-function weight follows the linear ramp. + uncapped_score_weight = ramp_fraction * (1.0 - schedule.final_dual_weight) + + # Interpolate the perturbation scale and rollout count over the same ramp. 
+ perturbation_std = schedule.initial_perturbation_std + + ramp_fraction * + (schedule.final_perturbation_std - schedule.initial_perturbation_std) + num_rollouts = round( + Int, + schedule.initial_num_rollouts + + ramp_fraction * + (schedule.final_num_rollouts - schedule.initial_num_rollouts), + ) + + return ( + alpha = 1.0 - uncapped_score_weight, + score_weight = uncapped_score_weight, + perturbation_std = perturbation_std, + num_rollouts = max(1, num_rollouts), + active = true, + ) +end + +""" + _sf_config(score_function) -> Union{Nothing,ScoreFunctionConfig} + +Extract the underlying [`ScoreFunctionConfig`](@ref), if one exists. + +# Arguments +- `score_function::Nothing`: score-function correction is disabled. +- `score_function::ScoreFunctionConfig`: returned as-is. +- `score_function::ScoreFunctionSchedule`: unwraps `score_function.config`. + +# Examples +```julia +config = DecisionRules._sf_config(score_function) +``` +""" +_sf_config(::Nothing) = nothing +_sf_config(config::ScoreFunctionConfig) = config +_sf_config(schedule::ScoreFunctionSchedule) = schedule.config + +""" + _set_score_function_stage_parameters!( + state_params_in, + state_params_out, + uncertainties, + state, + target, + ) -> Nothing + +Set the JuMP parameters needed for one perturbed rollout stage. + +# Arguments +- `state_params_in::AbstractVector`: parameters receiving the current state. +- `state_params_out::AbstractVector`: `(target_parameter, state_variable)` + pairs receiving the target state. +- `uncertainties::AbstractVector`: `(parameter, value)` pairs for stage + uncertainty. +- `state::AbstractVector{<:Real}`: realized state entering this stage. +- `target::AbstractVector{<:Real}`: perturbed target for the output state. 
+ +# Examples +```julia +DecisionRules._set_score_function_stage_parameters!( + spi[t], + spo[t], + uncertainty_sample[t], + state, + target, +) +``` +""" +function _set_score_function_stage_parameters!( + state_params_in, + state_params_out, + uncertainties, + state::AbstractVector{<:Real}, + target::AbstractVector{<:Real}, +) + # Input-state parameters receive the realized state from the prior stage. + for index in eachindex(state_params_in) + set_parameter_value(state_params_in[index], state[index]) + end + + # Uncertainty parameters receive the sampled exogenous values. + for (parameter, value) in uncertainties + set_parameter_value(parameter, value) + end + + # Output target parameters receive the perturbed policy targets. + for index in eachindex(state_params_out) + set_parameter_value(state_params_out[index][1], target[index]) + end + + return nothing +end + +""" + rollout_with_perturbation( + config::ScoreFunctionConfig, + initial_state::AbstractVector, + uncertainties, + targets, + perturbations, + ) -> Float64 + +Run one stage-wise rollout with fixed target perturbations. + +The rollout target at stage `t` is `targets[t + 1] + perturbations[t]`. The +returned cost excludes the target-deficit penalty so the score-function signal +estimates operational cost rather than target-following slack. + +# Arguments +- `config::ScoreFunctionConfig`: rollout models and parameter mappings. +- `initial_state::AbstractVector`: state entering stage 1. +- `uncertainties`: sampled uncertainty trajectory. +- `targets`: target trajectory, including `targets[1] == initial_state`. +- `perturbations`: one perturbation vector for each stage target. 
+ +# Examples +```julia +cost = DecisionRules.rollout_with_perturbation( + score_function, + initial_state, + uncertainty_sample, + targets, + perturbations, +) +``` +""" +function rollout_with_perturbation( + config::ScoreFunctionConfig, + initial_state::AbstractVector, + uncertainties, + targets, + perturbations, +)::Float64 + # Rollouts always start from the true initial state. + state = Float64.(initial_state) + + # Accumulate operational cost over the horizon. + total_cost = 0.0 + + for stage in eachindex(config.subproblems) + # The deterministic target sequence includes the initial state at index 1. + target = Float64.(targets[stage + 1]) .+ perturbations[stage] + + # Set all model parameters before solving this stage. + _set_score_function_stage_parameters!( + config.state_params_in[stage], + config.state_params_out[stage], + uncertainties[stage], + state, + target, + ) + + # Score-function rollouts need realized costs, not duals, so solve the + # model exactly as it was built. + optimize!(config.subproblems[stage]) + + # Fail loudly when a sampled rollout is not solved to a usable status. + _assert_successful_solve( + config.subproblems[stage]; + context = "score-function rollout solve", + ) + + # Read the operational cost after the successful solve. + stage_cost = get_objective_no_target_deficit(config.subproblems[stage]) + + # Read the realized output state that becomes the next input state. + next_state = Float64.([ + JuMP.value(config.state_params_out[stage][index][2]) + for index in eachindex(config.state_params_out[stage]) + ]) + + # Feed the realized output state to the next stage. + total_cost += stage_cost + state = next_state + end + + return total_cost +end + +""" + _sample_target_perturbations(num_stages::Integer, state_dimension::Integer, sigma::Real) + +Draw Gaussian target perturbations for one score-function rollout. + +# Arguments +- `num_stages::Integer`: number of stage targets to perturb. 
- `state_dimension::Integer`: length of each target state vector.
- `sigma::Real`: Gaussian standard deviation ``\\sigma``.

# Examples
```julia
perturbations = DecisionRules._sample_target_perturbations(3, 2, 0.5)
```
"""
function _sample_target_perturbations(
    num_stages::Integer,
    state_dimension::Integer,
    sigma::Real,
)
    # Scaling standard-normal draws by sigma stores the actual perturbations.
    scale = Float64(sigma)
    dimension = Int(state_dimension)

    # The typed comprehension fixes the element type even when there are no
    # stages, so callers always get a `Vector{Vector{Float64}}`.
    return Vector{Float64}[scale .* randn(dimension) for _ in 1:Int(num_stages)]
end

"""
    _center_rollout_costs(costs::AbstractVector{<:Real}, baseline::Symbol)

Convert rollout costs into score-function advantages.

# Arguments
- `costs::AbstractVector{<:Real}`: operational costs from perturbed rollouts.
- `baseline::Symbol`: either `:mean` or `:none`.

# Throws
- `ArgumentError`: if `baseline` is neither `:mean` nor `:none`.

# Examples
```julia
advantages = DecisionRules._center_rollout_costs([10.0, 12.0], :mean)
```
"""
function _center_rollout_costs(
    costs::AbstractVector{<:Real},
    baseline::Symbol,
)
    # Reject unknown baselines instead of silently treating them as `:none`;
    # a typo such as `:Mean` would otherwise drop the baseline unnoticed.
    baseline in (:mean, :none) || throw(
        ArgumentError("baseline must be :mean or :none, got $(repr(baseline))"),
    )

    # A mean baseline reduces variance without changing the expected gradient.
    baseline_value = baseline === :mean ? mean(costs) : 0.0

    return Float64.(costs) .- baseline_value
end

"""
    _score_function_rollouts(
        config::ScoreFunctionConfig,
        initial_state::AbstractVector,
        uncertainties,
        targets;
        perturbation_std = config.perturbation_std,
        num_rollouts = config.num_rollouts,
    ) -> (advantages, perturbations)

Estimate rollout advantages for the score-function term.

# Arguments
- `config::ScoreFunctionConfig`: score-function rollout configuration.
- `initial_state::AbstractVector`: state entering stage 1.
- `uncertainties`: sampled uncertainty trajectory.
- `targets`: target trajectory, including the initial state.
- `perturbation_std::Real`: Gaussian standard deviation ``\\sigma``.
- `num_rollouts::Integer`: number of perturbed rollouts to sample.

# Examples
```julia
advantages, perturbations = DecisionRules._score_function_rollouts(
    score_function,
    initial_state,
    uncertainty_sample,
    targets;
    perturbation_std = 0.5,
    num_rollouts = 4,
)
```
"""
function _score_function_rollouts(
    config::ScoreFunctionConfig,
    initial_state::AbstractVector,
    uncertainties,
    targets;
    perturbation_std::Real = config.perturbation_std,
    num_rollouts::Integer = config.num_rollouts,
)
    # `targets[1]` is the initial state, so `targets[2]` is the first policy
    # target and fixes the length of every perturbation vector.
    target_length = length(targets[2])
    horizon = length(config.subproblems)
    rollout_count = Int(num_rollouts)

    # Preallocate one slot per rollout for its cost and its perturbations.
    costs = Vector{Float64}(undef, rollout_count)
    perturbations = Vector{Vector{Vector{Float64}}}(undef, rollout_count)

    for k in eachindex(costs)
        # Keep the draw: the caller reuses it in the surrogate gradient.
        perturbations[k] = _sample_target_perturbations(
            horizon,
            target_length,
            perturbation_std,
        )

        # Realized cost of the rollout under this perturbed target trajectory.
        costs[k] = rollout_with_perturbation(
            config,
            initial_state,
            uncertainties,
            targets,
            perturbations[k],
        )
    end

    # Center the costs with the configured baseline to obtain advantages.
    advantages = _center_rollout_costs(costs, config.baseline)
    return advantages, perturbations
end

"""
    _score_function_surrogate(
        advantage::Real,
        perturbations,
        targets,
        perturbation_std::Real,
    ) -> Real

Build the differentiable scalar whose gradient is the Gaussian score estimate.

For fixed rollout cost advantage ``A`` and perturbations ``\\delta_t``, the
surrogate is

```math
A \\sum_t \\left\\langle
    \\delta_t / \\sigma^2, \\hat{x}_{t+1}(\\theta)
\\right\\rangle .
```

# Arguments
- `advantage::Real`: centered rollout cost ``R - b``.
- `perturbations`: stage perturbations ``\\delta_t``.
- `targets`: differentiable target trajectory produced by the policy.
- `perturbation_std::Real`: Gaussian standard deviation ``\\sigma``.
+ +# Examples +```julia +loss = DecisionRules._score_function_surrogate( + 3.0, + perturbations, + targets, + 0.5, +) +``` +""" +function _score_function_surrogate( + advantage::Real, + perturbations, + targets, + perturbation_std::Real, +) + # The Gaussian location score divides actual perturbations by sigma squared. + inverse_variance = inv(Float32(perturbation_std)^2) + + # Targets include the initial state, so stage t uses targets[t + 1]. + score = sum(eachindex(perturbations)) do stage + sum(Float32.(perturbations[stage]) .* targets[stage + 1]) * + inverse_variance + end + + return Float32(advantage) * score +end diff --git a/src/simulate_multistage.jl b/src/simulate_multistage.jl index f7d1801..a5116e7 100644 --- a/src/simulate_multistage.jl +++ b/src/simulate_multistage.jl @@ -301,16 +301,8 @@ function ChainRulesCore.rrule( ) end catch e - msg = sprint(showerror, e) - throw( - ArgumentError( - "Differentiating get_next_state requires a DiffOpt-enabled model " * - "because the closed-loop rollout needs solution sensitivities of the " * - "realized state variables. Use an appropriate DiffOpt wrapper for the " * - "stage subproblems (for target-slack conic models, " * - "`DiffOpt.conic_diff_model(...)`), or use the deterministic-equivalent " * - "training path when only target duals are needed. 
Original error: $msg", - ), + return handle_gradient_error( + _DEFAULT_GRADIENT_FALLBACK, e, length(state_in), length(state_out_target) ) end end @@ -347,6 +339,9 @@ end function get_objective_no_target_deficit( subproblem::JuMP.Model; norm_deficit::AbstractString="norm_deficit" ) + if subproblem.is_model_dirty + return get(subproblem.ext, :_last_obj_no_deficit, 0.0) + end try obj = JuMP.objective_function(subproblem) objective_val = objective_value(subproblem) @@ -902,6 +897,99 @@ function ChainRulesCore.rrule( return y, public_pullback end +@doc raw""" + sample(uncertainty_pool) -> Vector{Vector{Tuple{VariableRef, T}}} + +Draw one full uncertainty trajectory from a DecisionRules uncertainty pool. + +The returned trajectory is a length-``T`` vector where each element is +`Vector{Tuple{VariableRef, Float64}}` — one realized value per uncertain +parameter for that stage. This is the format consumed by `simulate_multistage`, +`train_multistage`, and all other training/evaluation functions. + +Three pool formats are supported, offering increasing levels of correlation: + +## 1. Independent sampling (per-unit pools) + +Each uncertain parameter has its own finite support; sampling draws +independently from each support at each stage. + + sample(multistage_pool::Vector{Vector{Tuple{VariableRef, Vector{T}}}}) + +`multistage_pool[t]` is `[(param₁, [v₁₁, v₁₂, …]), (param₂, [v₂₁, v₂₂, …]), …]`. +Each parameter picks one value uniformly at random from its own support. +**No spatial or temporal correlation is preserved.** + +## 2. Joint-scenario sampling (spatial correlation) + +Scenarios are pre-defined joint realizations across all parameters at each +stage. Sampling picks one complete scenario per stage uniformly, preserving +cross-parameter correlations (e.g., spatially correlated inflows across +hydro reservoirs). Stages are still drawn independently. 
+ + sample(multistage_joint::Vector{Vector{Vector{Tuple{VariableRef, T}}}}) + +`multistage_joint[t]` is `[scenario₁, scenario₂, …]` where each scenario +is `[(param₁, val₁), (param₂, val₂), …]`. + +## 3. Trajectory sampler (spatial + temporal correlation) + +A callable `sampler(t, past) -> Vector{Tuple{VariableRef, T}}` that generates +stage `t`'s realization given the realized values from stages `1:t-1`. This +enables autoregressive, Markovian, or any custom temporal dependence. + + sample(sampler::Function, T::Int) + +The callable receives: +- `t::Int` — the current stage (1-indexed) +- `past::Vector{Vector{Tuple{VariableRef, T}}}` — realized samples from + stages `1:t-1` (empty vector for `t=1`) + +and must return `Vector{Tuple{VariableRef, T}}` — the realized sample for +stage `t`. + +## Output format + +All three methods return `Vector{Vector{Tuple{VariableRef, T}}}` — a length-``T`` +vector of per-stage realized samples. This is the universal input to +`simulate_multistage`, `train_multistage`, `simulate_multiple_shooting`, and all +evaluation functions. + +# Examples +```julia +# 1. Independent sampling (each unit draws independently): +independent_pool = [ + [(inflow_1, [10.0, 15.0, 12.0]), (inflow_2, [8.0, 12.0, 9.0])], + [(inflow_1, [11.0, 14.0, 13.0]), (inflow_2, [7.0, 11.0, 10.0])], +] +path = sample(independent_pool) + +# 2. Joint-scenario sampling (preserves spatial correlation): +joint_pool = [ + [[(inflow_1, 10.0), (inflow_2, 8.0)], # scenario 1 + [(inflow_1, 15.0), (inflow_2, 12.0)]], # scenario 2 — stage 1 + [[(inflow_1, 11.0), (inflow_2, 7.0)], + [(inflow_1, 14.0), (inflow_2, 11.0)]], # stage 2 +] +path = sample(joint_pool) + +# 3. 
Trajectory sampler (preserves temporal + spatial correlation): +function my_sampler(t, past) + if t == 1 + ω = rand(1:nScenarios) + return [(inflow_params[t][r], data[r][t, ω]) for r in 1:nHyd] + else + # AR(1): next inflow depends on previous realized inflow + prev_values = [pair[2] for pair in past[end]] + noise = randn(nHyd) .* σ + return [(inflow_params[t][r], ρ * prev_values[r] + noise[r]) for r in 1:nHyd] + end +end +path = sample(my_sampler, T) +``` + +See the [Uncertainty Sampling](@ref) documentation page for a complete guide. +""" function sample(uncertainty_samples::Vector{Tuple{VariableRef,Vector{T}}}) where {T<:Real} uncertainty_sample = Vector{Tuple{VariableRef,T}}(undef, length(uncertainty_samples)) for i in 1:length(uncertainty_samples) @@ -910,20 +998,168 @@ function sample(uncertainty_samples::Vector{Tuple{VariableRef,Vector{T}}}) where return uncertainty_sample end +function sample(joint_scenarios::Vector{Vector{Tuple{VariableRef,T}}}) where {T<:Real} + return rand(joint_scenarios) +end + function sample( uncertainty_samples::Vector{Vector{Tuple{VariableRef,Vector{T}}}} ) where {T<:Real} return [sample(uncertainty_samples[t]) for t in 1:length(uncertainty_samples)] end +function sample( + uncertainty_samples::Vector{Vector{Vector{Tuple{VariableRef,T}}}} +) where {T<:Real} + return [sample(uncertainty_samples[t]) for t in 1:length(uncertainty_samples)] +end + """ - train_multistage(model, initial_state, subproblems, state_params_in, - state_params_out, uncertainty_sampler; kwargs...) + sample(sampler::Function, T::Int) + +Draw a full trajectory using a callable trajectory sampler with temporal dependence. + +`sampler(t, past)` receives the current stage `t` and a vector of all previously +realized samples `past[1:t-1]`, and returns the realized sample for stage `t`. + +This enables autoregressive, Markovian, or any custom temporal correlation between +stages — something the data-based pool formats cannot express. 
-Train a policy with **stage-wise decomposition** (single shooting, Extension §2). -Each SGD step samples `num_train_per_batch` uncertainty trajectories, rolls out the -policy through `simulate_multistage` (stage-wise overload), and updates `model` via -the Flux optimizer. +See [`sample`](@ref) for the full API and examples. +""" +function sample(sampler::Function, T::Int) + trajectory = Vector{Vector{Tuple{VariableRef,Float64}}}(undef, T) + past = Vector{Vector{Tuple{VariableRef,Float64}}}() + for t in 1:T + trajectory[t] = sampler(t, past) + push!(past, trajectory[t]) + end + return trajectory +end + +""" + sample(sampler::Function) + +Call a zero-argument trajectory sampler that returns a complete trajectory. + +This is the dispatch used by `train_multistage` and `train_multiple_shooting` +when `uncertainty_sampler` is a callable. Wrap a trajectory sampler as: + +```julia +uncertainty_sampler = () -> sample(my_stage_sampler, T) +``` +""" +function sample(sampler::Function) + return sampler() +end + +@doc raw""" + train_multistage(model, initial_state, subproblems::Vector{JuMP.Model}, + state_params_in, state_params_out, uncertainty_sampler; + kwargs...) + +Train a target-state policy with stage-wise decomposition (single shooting). + +For one sampled uncertainty trajectory ``w_{1:T}``, this overload solves one +optimization problem per stage. At stage ``t``, given the realized incoming +state ``x_{t-1}``, the policy predicts a target +``\hat{x}_t = \pi_\theta(w_t, x_{t-1})`` and the stage problem is + +```math +\begin{aligned} +q_t(x_{t-1}, w_t; \hat{x}_t) + = \min_{x_t, y_t, \delta_t} + \quad & f_t(x_t, y_t) + C_\delta \|\delta_t\| \\ +\text{s.t.}\quad + & x_t = T_t(w_t, y_t, x_{t-1}) && : \mu_t, \\ + & x_t + \delta_t = \hat{x}_t && : \lambda_t, \\ + & h_t(x_t, y_t) \ge 0 . 
+\end{aligned} +``` + +The rollout objective is the sum of stage values, + +```math +Q(\theta; w) = + \sum_{t=1}^{T} q_t(x_{t-1}, w_t; \hat{x}_t), +``` + +where each realized ``x_t`` is read from the previous stage solve. The gradient +therefore contains both the target duals ``\lambda_t`` and the sensitivity of +later realized states with respect to earlier targets. In the notation of the +extension note, + +```math +\nabla_\theta Q(\theta; w) += +\sum_{t=1}^{T} +\left[ + \frac{\partial q_t}{\partial \hat{x}_t} + + + \sum_{k=t+1}^{T} + \frac{\partial q_k}{\partial x_{k-1}} + \prod_{j=t+1}^{k-1} + \frac{\partial x_j}{\partial x_{j-1}} + \frac{\partial x_t}{\partial \hat{x}_t} +\right] +\nabla_\theta \pi_\theta(w_t, x_{t-1}). +``` + +The dual terms come from target and transition constraints; the state +sensitivities are computed through DiffOpt in the rrules for +[`simulate_stage`](@ref) and [`get_next_state`](@ref). + +# Arguments +- `model`: differentiable Flux-compatible policy. It receives + `vcat(stage_uncertainty, realized_state)` and returns the next target state. +- `initial_state::AbstractVector{<:Real}`: state ``x_0`` entering stage 1. +- `subproblems::Vector{JuMP.Model}`: one JuMP model per stage. +- `state_params_in`: stage input-state parameters. +- `state_params_out`: `(target_parameter, realized_state_variable)` pairs for + each stage output state. +- `uncertainty_sampler`: source of uncertainty trajectories, passed to + [`sample`](@ref). Three formats are accepted: + 1. **Per-unit pool** (`Vector{Vector{Tuple{VariableRef, Vector{T}}}}`): + independent sampling per parameter per stage. + 2. **Joint-scenario pool** (`Vector{Vector{Vector{Tuple{VariableRef, T}}}}`): + one scenario drawn per stage, preserving spatial correlation. + 3. **Callable** (`() -> Vector{Vector{Tuple{VariableRef, T}}}`): a zero-arg + function returning a full trajectory. 
Use this for temporal correlation + by wrapping a trajectory sampler: + `() -> sample(my_stage_sampler, T)` where `my_stage_sampler(t, past)` + generates stage `t` conditioned on past realizations. + +# Keywords +- `num_batches::Integer`: number of SGD batches. +- `num_train_per_batch::Integer`: sampled trajectories per batch. +- `optimizer`: Flux optimizer used to update `model`. +- `adjust_hyperparameters::Function`: optional hook returning the batch size for + the current iteration. +- `record_loss`: legacy logging callback. +- `sample_log::SampleLog`: per-batch objective cache. +- `record::Function`: callback called as `record(sample_log, iter, model)`. +- `penalty_schedule`: optional multiplier schedule for target-penalty terms. +- `integer_strategy::AbstractIntegerStrategy`: strategy used when a stage model + has discrete variables and derivative information must be read. + +# Examples +```julia +# With data pool (independent or joint): +train_multistage( + policy, initial_state, subproblems, + state_params_in, state_params_out, uncertainty_pool; + num_batches=200, optimizer=Flux.Adam(1e-3), +) + +# With trajectory sampler (temporal correlation): +ar_sampler(t, past) = my_ar1_model(t, past, inflow_params) +train_multistage( + policy, initial_state, subproblems, + state_params_in, state_params_out, + () -> sample(ar_sampler, T); + num_batches=200, optimizer=Flux.Adam(1e-3), +) +``` """ function train_multistage( model, @@ -942,9 +1178,16 @@ function train_multistage( record=default_record, penalty_schedule=nothing, integer_strategy::AbstractIntegerStrategy=NoIntegerStrategy(), + gradient_fallback::AbstractGradientFallback=ZeroGradientFallback(), ) + if gradient_fallback isa ZeroGradientFallback + @info "Training with ZeroGradientFallback: solver/differentiation errors will be " * + "caught and the iteration skipped (zero gradient). 
Pass " * + "`gradient_fallback=ErrorGradientFallback()` to throw instead, or implement " * + "a custom `AbstractGradientFallback` subtype." + end + record = _resolve_record(record, record_loss) - # Initialise the optimiser for this model: opt_state = Flux.setup(optimizer, model) schedule = _resolve_penalty_schedule(penalty_schedule, num_batches) @@ -964,32 +1207,40 @@ function train_multistage( end end num_train_per_batch = adjust_hyperparameters(iter, opt_state, num_train_per_batch) - # Sample uncertainties + uncertainty_samples = [sample(uncertainty_sampler) for _ in 1:num_train_per_batch] + objective = 0.0 _reset_sample_log!(sample_log) - grads = Flux.gradient(model) do m - for s in 1:num_train_per_batch - Flux.reset!(m) - objective += simulate_multistage( - subproblems, - state_params_in, - state_params_out, - initial_state, - uncertainty_samples[s], - m; - integer_strategy=integer_strategy, - ) - @ignore_derivatives sample_log(s, subproblems) + grads = try + Flux.gradient(model) do m + for s in 1:num_train_per_batch + Flux.reset!(m) + objective += simulate_multistage( + subproblems, + state_params_in, + state_params_out, + initial_state, + uncertainty_samples[s], + m; + integer_strategy=integer_strategy, + ) + @ignore_derivatives sample_log(s, subproblems) + end + objective /= num_train_per_batch + return objective + end + catch e + if handle_training_error(gradient_fallback, e, iter) + nothing end - objective /= num_train_per_batch - return objective end record(sample_log, iter, model) && break - # Update the parameters so as to reduce the objective, - # according the chosen optimisation rule: - # Convert gradients from MutableTangent to plain NamedTuples for Flux.update! 
+ if isnothing(grads) + continue + end + grad = materialize_tangent(grads[1]) Flux.update!(opt_state, model, grad) end @@ -1010,14 +1261,121 @@ function sim_states(t, m, initial_state, uncertainty_sample_vec, prev_states) end end -""" +@doc raw""" train_multistage(model, initial_state, det_equivalent::JuMP.Model, - state_params_in, state_params_out, uncertainty_sampler; kwargs...) - -Train a policy with the **deterministic equivalent** (direct transcription, -Extension §1). Each SGD step samples uncertainty trajectories, rolls out target -states with `Base.accumulate`, solves the coupled `det_equivalent`, and updates -`model`. Gradient: Eq. 1.2, ``λ^s ⊙ ∇_θ π``. + state_params_in, state_params_out, uncertainty_sampler; + score_function=nothing, kwargs...) + +Train a target-state policy with a deterministic equivalent (direct transcription). + +For one sampled trajectory ``w_{1:T}``, the policy first produces the full target +trajectory + +```math +\hat{x}_{1:T}(\theta) = \pi_\theta(w_{1:T}, x_0). +``` + +The coupled implementation problem is + +```math +\begin{aligned} +Q(w; \theta) + = + \min_{\{x_t, y_t, \delta_t\}_{t=1}^{T}} + \quad & + \sum_{t=1}^{T} f_t(x_t, y_t) + + C_\delta \sum_{t=1}^{T} \|\delta_t\| \\ +\text{s.t.}\quad + & x_t = T_t(w_t, y_t, x_{t-1}) && t=1,\ldots,T, \\ + & x_t + \delta_t = \hat{x}_t(\theta) && : \lambda_t,\quad t=1,\ldots,T, \\ + & h_t(x_t, y_t) \ge 0 && t=1,\ldots,T . +\end{aligned} +``` + +The target trajectory appears as right-hand-side parameters. If +``\lambda_t`` is the dual multiplier of the target constraint, the envelope +gradient used by this overload is + +```math +\nabla_\theta \mathbb{E}[Q(w; \theta)] +\approx +\frac{1}{S} +\sum_{s=1}^{S} +\sum_{t=1}^{T} +\lambda_t^s \odot +\nabla_\theta \hat{x}_t^s(\theta), +``` + +where ``S`` is `num_train_per_batch` and ``\odot`` denotes componentwise +multiplication. 
+ +Pass a [`ScoreFunctionConfig`](@ref) or [`ScoreFunctionSchedule`](@ref) via +`score_function` to mix the dual gradient with a REINFORCE correction +estimated from rollouts under perturbed targets. + +When `score_function` is used, there are two separate solve paths: + +1. `integer_strategy` applies to `det_equivalent` and controls how local dual + information is read for the differentiable dual-gradient term. +2. `score_function` owns separate rollout subproblems. Those models are solved + exactly as they are built, and their realized costs define the Monte Carlo + score-function term. + +For a mixed-integer model, this usually means +`integer_strategy = FixedDiscreteIntegerStrategy()` for the dual path and +MIP rollout subproblems inside `ScoreFunctionConfig` for the score-function +path. + +# Arguments +- `model`: differentiable Flux-compatible policy. It is rolled forward over + uncertainty values to produce ``\hat{x}_{1:T}``. +- `initial_state::AbstractVector{<:Real}`: state ``x_0``. +- `det_equivalent::JuMP.Model`: full-horizon JuMP model for one sampled + trajectory. +- `state_params_in`: input-state parameters in the deterministic equivalent. +- `state_params_out`: `(target_parameter, realized_state_variable)` pairs for + each target state. +- `uncertainty_sampler`: source of uncertainty trajectories, passed to + [`sample`](@ref). Three formats are accepted: + 1. **Per-unit pool** (`Vector{Vector{Tuple{VariableRef, Vector{T}}}}`): + independent sampling per parameter per stage. + 2. **Joint-scenario pool** (`Vector{Vector{Vector{Tuple{VariableRef, T}}}}`): + one scenario drawn per stage, preserving spatial correlation. + 3. **Callable** (`() -> Vector{Vector{Tuple{VariableRef, T}}}`): a zero-arg + function returning a full trajectory. Use this for temporal correlation; + see [`sample`](@ref). + +# Keywords +- `num_batches::Integer`: number of SGD batches. +- `num_train_per_batch::Integer`: sampled trajectories per batch ``S``. 
+- `optimizer`: Flux optimizer used to update `model`. +- `adjust_hyperparameters::Function`: optional hook returning the batch size for + the current iteration. +- `record_loss`: legacy logging callback. +- `sample_log::SampleLog`: per-batch objective cache. +- `record::Function`: callback called as `record(sample_log, iter, model)`. +- `penalty_schedule`: optional multiplier schedule for target-penalty terms. +- `integer_strategy::AbstractIntegerStrategy`: strategy used to read local dual + information from `det_equivalent` when it has discrete variables. +- `score_function`: optional [`ScoreFunctionConfig`](@ref) or + [`ScoreFunctionSchedule`](@ref) for mixed dual/score-function gradients. + +# Examples +```julia +train_multistage( + policy, + initial_state, + det_equivalent, + state_params_in, + state_params_out, + uncertainty_sampler; + num_batches = 200, + num_train_per_batch = 16, + optimizer = Flux.Adam(1.0e-3), + integer_strategy = FixedDiscreteIntegerStrategy(), + score_function = nothing, +) +``` """ function train_multistage( model, @@ -1036,75 +1394,110 @@ function train_multistage( record=default_record, penalty_schedule=nothing, integer_strategy::AbstractIntegerStrategy=NoIntegerStrategy(), + score_function::Union{Nothing,ScoreFunctionConfig,ScoreFunctionSchedule}=nothing, + gradient_fallback::AbstractGradientFallback=ZeroGradientFallback(), ) record = _resolve_record(record, record_loss) - # Initialise the optimiser for this model: opt_state = Flux.setup(optimizer, model) num_stages = length(state_params_in) schedule = _resolve_penalty_schedule(penalty_schedule, num_batches) - penalty_bases = if isnothing(schedule) - nothing - else + penalty_bases = isnothing(schedule) ? 
nothing : _check_deficit_penalty_bases(_deficit_penalty_bases(det_equivalent)) - end current_multiplier = NaN + sf_cfg = _sf_config(score_function) + use_sf = !isnothing(sf_cfg) + for iter in 1:num_batches if !isnothing(schedule) multiplier = _penalty_multiplier_for(schedule, iter) if multiplier != current_multiplier _apply_deficit_penalty_multiplier!( - det_equivalent, penalty_bases, multiplier - ) + det_equivalent, penalty_bases, multiplier) current_multiplier = multiplier end end num_train_per_batch = adjust_hyperparameters(iter, opt_state, num_train_per_batch) - # Sample uncertainties + + score_params = use_sf ? sf_params(score_function, iter) : + ( + alpha = 1.0, + score_weight = 0.0, + perturbation_std = 0.0, + num_rollouts = 0, + active = false, + ) + uncertainty_samples = [sample(uncertainty_sampler) for _ in 1:num_train_per_batch] num_uncertainties = length(uncertainty_samples[1][1]) uncertainty_samples_vec = [ [ - [uncertainty_samples[s][stage][i][2] for i in 1:num_uncertainties] for - stage in 1:length(uncertainty_samples[1]) + [uncertainty_samples[s][stage][i][2] for i in 1:num_uncertainties] + for stage in 1:num_stages ] for s in 1:num_train_per_batch ] - # Calculate the gradient of the objective - # with respect to the parameters within the model: objective = 0.0 _reset_sample_log!(sample_log) - grads = Flux.gradient(model) do m - for s in 1:num_train_per_batch - Flux.reset!(m) - init_state = Float32.(initial_state) - predicted_states = accumulate( - uncertainty_samples_vec[s]; init=init_state - ) do prev_state, uncertainties_t - return m(vcat(uncertainties_t, prev_state)) + grads = try + Flux.gradient(model) do m + for s in 1:num_train_per_batch + Flux.reset!(m) + x0 = Float32.(initial_state) + states = vcat([x0], accumulate( + uncertainty_samples_vec[s]; init=x0 + ) do prev, ξ + m(vcat(ξ, prev)) + end) + + dual_obj = simulate_multistage( + det_equivalent, state_params_in, state_params_out, + uncertainty_samples[s], states; + 
integer_strategy=integer_strategy) + @ignore_derivatives sample_log(s, det_equivalent) + objective += score_params.alpha * dual_obj + + if score_params.active + advantages, perturbations = @ignore_derivatives( + _score_function_rollouts( + sf_cfg, + initial_state, + uncertainty_samples[s], + states; + perturbation_std = score_params.perturbation_std, + num_rollouts = score_params.num_rollouts, + ) + ) + for rollout in 1:score_params.num_rollouts + advantage = @ignore_derivatives advantages[rollout] + perturbation = @ignore_derivatives perturbations[rollout] + surrogate = _score_function_surrogate( + advantage, + perturbation, + states, + score_params.perturbation_std, + ) + objective += score_params.score_weight * + surrogate / Float32(score_params.num_rollouts) + end + end end - states = vcat([init_state], predicted_states) - objective += simulate_multistage( - det_equivalent, - state_params_in, - state_params_out, - uncertainty_samples[s], - states; - integer_strategy=integer_strategy, - ) - @ignore_derivatives sample_log(s, det_equivalent) + objective /= num_train_per_batch + return objective + end + catch e + if handle_training_error(gradient_fallback, e, iter) + nothing end - objective /= num_train_per_batch - return objective end record(sample_log, iter, model) && break - # Update the parameters so as to reduce the objective, - # according the chosen optimisation rule: - # Convert gradients from MutableTangent to plain NamedTuples for Flux.update! - grad = materialize_tangent(grads[1]) - Flux.update!(opt_state, model, grad) + if isnothing(grads) + continue + end + + Flux.update!(opt_state, model, materialize_tangent(grads[1])) end return model diff --git a/src/utils.jl b/src/utils.jl index f8306a3..6dc07e5 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -323,6 +323,30 @@ function _apply_deficit_penalty_multiplier!( return models end +""" + SaveBest(best_loss::Float64, model_path::String) + +Callback that saves the best policy state seen during training. 
+ +`SaveBest` is a small callable object used as a training callback. When called +as `callback(iter, model, loss)`, it compares `loss` with the best loss stored +so far. If the new loss is smaller, it copies `model` to CPU, normalizes any +recurrent layer state, and writes the Flux state to `model_path` with JLD2. +It returns `false`, so it records checkpoints without stopping training. + +# Arguments +- `best_loss::Float64`: incumbent loss. Use `Inf` to save the first observed + model. +- `model_path::String`: path of the JLD2 file that receives the best model + state. + +# Examples +```julia +callback = SaveBest(Inf, "best_policy.jld2") +train_multistage(policy, x0, subproblems, state_in, state_out, sampler; + record = (log, iter, model) -> callback(iter, model, mean(log.losses))) +``` +""" mutable struct SaveBest <: Function best_loss::Float64 model_path::String @@ -428,6 +452,9 @@ _reset_sample_log!(sample_log::SampleLog) = empty!(sample_log) _reset_sample_log!(sample_log) = sample_log function _total_objective_value(model::JuMP.Model) + if model.is_model_dirty + return get(model.ext, :_last_obj, 0.0) + end try return objective_value(model) catch @@ -575,6 +602,7 @@ mutable struct RolloutEvaluation <: Function stride::Int policy_state::Symbol integer_strategy::AbstractIntegerStrategy + gradient_fallback::AbstractGradientFallback last_objective_no_deficit::Float64 last_violation_share::Float64 end @@ -588,6 +616,7 @@ function RolloutEvaluation( stride=1, policy_state::Symbol=:realized, integer_strategy::AbstractIntegerStrategy=NoIntegerStrategy(), + gradient_fallback::AbstractGradientFallback=ZeroGradientFallback(), ) isempty(scenarios) && throw( ArgumentError( @@ -607,6 +636,7 @@ function RolloutEvaluation( stride, policy_state, integer_strategy, + gradient_fallback, NaN, NaN, ) @@ -658,32 +688,45 @@ function (evaluation::RolloutEvaluation)(iter, model) iter % evaluation.stride == 0 || return nothing total = 0.0 total_no_deficit = 0.0 + n_success = 0 for 
scenario in evaluation.scenarios - total += if evaluation.policy_state === :realized - simulate_multistage( - evaluation.subproblems, - evaluation.state_params_in, - evaluation.state_params_out, - evaluation.initial_state, - scenario, - model; - integer_strategy=evaluation.integer_strategy, - ) - else - _simulate_multistage_target_feedback( - evaluation.subproblems, - evaluation.state_params_in, - evaluation.state_params_out, - evaluation.initial_state, - scenario, - model, - evaluation.integer_strategy, - ) + obj = try + if evaluation.policy_state === :realized + simulate_multistage( + evaluation.subproblems, + evaluation.state_params_in, + evaluation.state_params_out, + evaluation.initial_state, + scenario, + model; + integer_strategy=evaluation.integer_strategy, + ) + else + _simulate_multistage_target_feedback( + evaluation.subproblems, + evaluation.state_params_in, + evaluation.state_params_out, + evaluation.initial_state, + scenario, + model, + evaluation.integer_strategy, + ) + end + catch e + handle_rollout_error(evaluation.gradient_fallback, e, iter) + nothing end + isnothing(obj) && continue + total += obj total_no_deficit += get_objective_no_target_deficit(evaluation.subproblems) + n_success += 1 end - objective = total / length(evaluation.scenarios) - evaluation.last_objective_no_deficit = total_no_deficit / length(evaluation.scenarios) + if n_success == 0 + @warn "All rollout scenarios failed at iter $iter" + return nothing + end + objective = total / n_success + evaluation.last_objective_no_deficit = total_no_deficit / n_success evaluation.last_violation_share = _target_violation_share( objective, evaluation.last_objective_no_deficit ) @@ -909,8 +952,15 @@ stage subproblems into `model`. Variables are renamed with a `#t` suffix to avo conflicts. Stage coupling is enforced by identifying the realized state variable of stage `t` with the incoming state parameter of stage `t+1`. 
-Returns `(model, uncertainties_new)` where `uncertainties_new` maps the original -uncertainty parameter refs to the new refs in the combined model. +`uncertainties` accepts both sampling formats (see [`sample`](@ref)): + +- **Per-unit pools**: `Vector{Vector{Tuple{VariableRef, Vector{T}}}}` — one pool per + parameter, drawing independently per parameter. +- **Joint-scenario pools**: `Vector{Vector{Vector{Tuple{VariableRef, T}}}}` — pre-built + joint scenarios preserving cross-parameter correlations. + +Returns `(model, uncertainties_new)` where `uncertainties_new` has the same format as +the input but with variable refs remapped to the deterministic-equivalent model. """ function deterministic_equivalent!( model::JuMP.Model, @@ -918,12 +968,9 @@ function deterministic_equivalent!( state_params_in::Vector{Vector{Any}}, state_params_out::Vector{Vector{Tuple{Any,VariableRef}}}, initial_state::Vector{Float64}, - uncertainties::Vector{Vector{Tuple{VariableRef,Vector{Float64}}}}, + uncertainties, ) set_objective_sense(model, objective_sense(subproblems[1])) - uncertainties_new = Vector{Vector{Tuple{VariableRef,Vector{Float64}}}}( - undef, length(uncertainties) - ) var_src_to_dest = Dict{VariableRef,VariableRef}() for t in 1:length(subproblems) DecisionRules.add_child_model_vars!( @@ -944,31 +991,52 @@ function deterministic_equivalent!( ) end - if uncertainties[1][1][1] isa VariableRef - # use var_src_to_dest - for t in 1:length(subproblems) - uncertainties_new[t] = Vector{Tuple{VariableRef,Vector{Float64}}}( - undef, length(uncertainties[t]) - ) - for (i, tup) in enumerate(uncertainties[t]) - ky, val = tup - uncertainties_new[t][i] = (var_src_to_dest[ky], val) - end - end + uncertainties_new = _remap_uncertainties(uncertainties, var_src_to_dest, cons_to_cons) + return model, uncertainties_new +end + +""" + _remap_uncertainties(uncertainties, var_src_to_dest, cons_to_cons) + +Replace source-model `VariableRef` keys in an uncertainty pool with their +destination-model 
counterparts (using the variable or constraint mapping built +by [`deterministic_equivalent!`](@ref)). + +Two methods dispatch on the pool format: + +- **Per-unit pools** (`Vector{Vector{Tuple{VariableRef, Vector{T}}}}`): + each stage maps `[(param₁, [v₁, …]), …]` independently. +- **Joint-scenario pools** (`Vector{Vector{Vector{Tuple{VariableRef, T}}}}`): + each stage maps `[[scenario₁…], [scenario₂…], …]` preserving the grouped + structure. + +This is an internal helper; users interact with it indirectly through +[`deterministic_equivalent!`](@ref). +""" +function _remap_uncertainties( + uncertainties::Vector{Vector{Tuple{VariableRef,Vector{T}}}}, + var_src_to_dest, cons_to_cons, +) where {T<:Real} + remap = if uncertainties[1][1][1] isa VariableRef + ky -> var_src_to_dest[ky] else - # use cons_to_cons - for t in 1:length(subproblems) - uncertainties_new[t] = Vector{Tuple{VariableRef,Vector{Float64}}}( - undef, length(uncertainties[t]) - ) - for (i, tup) in enumerate(uncertainties[t]) - ky, val = tup - uncertainties_new[t] = (cons_to_cons[t][ky], val) - end - end + ky -> cons_to_cons[1][ky] end + return [ + [(remap(ky), val) for (ky, val) in uncertainties[t]] + for t in eachindex(uncertainties) + ] +end - return model, uncertainties_new +function _remap_uncertainties( + uncertainties::Vector{Vector{Vector{Tuple{VariableRef,T}}}}, + var_src_to_dest, cons_to_cons, +) where {T<:Real} + remap = ky -> haskey(var_src_to_dest, ky) ? 
var_src_to_dest[ky] : cons_to_cons[1][ky] + return [ + [[(remap(ky), val) for (ky, val) in scenario] for scenario in uncertainties[t]] + for t in eachindex(uncertainties) + ] end function find_variables(model::JuMP.Model, variable_name_parts::Vector{S}) where {S} diff --git a/test/runtests.jl b/test/runtests.jl index 3919e78..84814a4 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -44,6 +44,8 @@ function build_subproblem( return subproblem, state_in, state_out, state_out_var, uncertainty end +include("test_score_function.jl") + @testset "DecisionRules.jl" begin @testset "pdual at infeasibility" begin subproblem1, state_in_1, state_out_1, state_out_var_1, uncertainty_1 = build_subproblem( @@ -756,7 +758,7 @@ end model, [5.0], windows, - () -> usamples; + usamples; num_batches=4, num_train_per_batch=1, optimizer=Flux.Descent(0.0), @@ -1732,7 +1734,7 @@ end model, [1.0], windows, - () -> uncertainty_samples; + uncertainty_samples; num_batches=1, num_train_per_batch=1, optimizer=Flux.Descent(0.0), @@ -1933,6 +1935,186 @@ end end end + @testset "sample (independent vs joint-scenario)" begin + # Build a two-stage problem with 2 uncertain parameters per stage + sp1 = quiet_conic_ipopt_model() + @variable(sp1, u1_1 in MOI.Parameter(0.0)) + @variable(sp1, u2_1 in MOI.Parameter(0.0)) + sp2 = quiet_conic_ipopt_model() + @variable(sp2, u1_2 in MOI.Parameter(0.0)) + @variable(sp2, u2_2 in MOI.Parameter(0.0)) + + # -- Independent (per-unit) format -- + indep_pool = [ + [(u1_1, [10.0, 20.0, 30.0]), (u2_1, [100.0, 200.0])], + [(u1_2, [40.0, 50.0]), (u2_2, [300.0, 400.0, 500.0])], + ] + Random.seed!(42) + indep_sample = sample(indep_pool) + @test length(indep_sample) == 2 + @test length(indep_sample[1]) == 2 + @test length(indep_sample[2]) == 2 + @test indep_sample[1][1][1] === u1_1 + @test indep_sample[1][1][2] in [10.0, 20.0, 30.0] + @test indep_sample[1][2][2] in [100.0, 200.0] + + # -- Joint-scenario format -- + # 3 scenarios per stage, 2 parameters each + joint_pool = 
[ + [ + [(u1_1, 10.0), (u2_1, 100.0)], + [(u1_1, 20.0), (u2_1, 200.0)], + [(u1_1, 30.0), (u2_1, 300.0)], + ], + [ + [(u1_2, 40.0), (u2_2, 400.0)], + [(u1_2, 50.0), (u2_2, 500.0)], + [(u1_2, 60.0), (u2_2, 600.0)], + ], + ] + Random.seed!(42) + joint_sample = sample(joint_pool) + @test length(joint_sample) == 2 + @test length(joint_sample[1]) == 2 # 2 params per stage + @test joint_sample[1][1][1] === u1_1 + # Verify the sample is one of the pre-defined scenarios (not mixed) + @test joint_sample[1] in joint_pool[1] + @test joint_sample[2] in joint_pool[2] + + # Key property: joint sampling preserves correlation within each stage + Random.seed!(999) + n_draws = 50 + for _ in 1:n_draws + s = sample(joint_pool) + @test s[1] in joint_pool[1] + @test s[2] in joint_pool[2] + end + end + + @testset "deterministic_equivalent with joint-scenario format" begin + sp1, si1, so1, sov1, u1 = build_subproblem(10) + sp2, si2, so2, sov2, u2 = build_subproblem( + 10; state_i_val=4.0, state_out_val=3.0, uncertainty_val=1.0 + ) + + sps = [sp1, sp2] + spi = Vector{Vector{Any}}(undef, 2) + spo = Vector{Vector{Tuple{Any,VariableRef}}}(undef, 2) + spi .= [[si1], [si2]] + spo .= [[(so1, sov1)], [(so2, sov2)]] + + # Build joint-scenario format: 1 scenario per stage (deterministic) + joint_pool = [ + [[(u1, 2.0)]], # stage 1: one scenario with inflow=2.0 + [[(u2, 1.0)]], # stage 2: one scenario with inflow=1.0 + ] + + det_equivalent, joint_new = DecisionRules.deterministic_equivalent!( + quiet_nonlinear_ipopt_model(), sps, spi, spo, [5.0], joint_pool + ) + # Remapped pool should have the same structure + @test length(joint_new) == 2 + @test length(joint_new[1]) == 1 # 1 scenario + @test length(joint_new[1][1]) == 1 # 1 param + + # Sample and simulate + s = sample(joint_new) + obj = DecisionRules.simulate_multistage( + det_equivalent, spi, spo, s, [[9.0], [7.0], [4.0]] + ) + @test obj ≈ 359 rtol=1.0e-1 + end + + @testset "simulate_multistage with joint-scenario sampling" begin + sp1, si1, so1, 
sov1, u1 = build_subproblem( + 10; subproblem=quiet_conic_ipopt_model() + ) + sp2, si2, so2, sov2, u2 = build_subproblem( + 10; + state_i_val=1.0, state_out_val=9.0, uncertainty_val=2.0, + subproblem=quiet_conic_ipopt_model(), + ) + + sps = [sp1, sp2] + spi = Vector{Vector{Any}}(undef, 2) + spo = Vector{Vector{Tuple{Any,VariableRef}}}(undef, 2) + spi .= [[si1], [si2]] + spo .= [[(so1, sov1)], [(so2, sov2)]] + + joint_pool = [ + [[(u1, 2.0)]], # stage 1 + [[(u2, 1.0)]], # stage 2 + ] + + Random.seed!(222) + m = Chain(Dense(2, 10), Dense(10, 1)) + obj_before = simulate_multistage( + sps, spi, spo, [5.0], sample(joint_pool), m + ) + + train_multistage( + m, [5.0], sps, spi, spo, joint_pool; + num_batches=100, num_train_per_batch=1, + ) + + obj_after = simulate_multistage( + sps, spi, spo, [5.0], sample(joint_pool), m + ) + @test obj_after < obj_before + end + + @testset "multiple_shooting with joint-scenario sampling" begin + num_stages = 2 + subproblems = Vector{JuMP.Model}(undef, num_stages) + state_params_in = Vector{Vector{Any}}(undef, num_stages) + state_params_out = Vector{Vector{Tuple{Any,VariableRef}}}(undef, num_stages) + + for t in 1:num_stages + subproblems[t] = quiet_diffopt_ipopt_model() + @variable(subproblems[t], x[1:4] >= 0) + @variable(subproblems[t], state_in in MOI.Parameter(1.0)) + @variable(subproblems[t], uncertainty in MOI.Parameter(0.5)) + @variable(subproblems[t], state_out in MOI.Parameter(1.0)) + @variable(subproblems[t], state_out_var) + @constraint(subproblems[t], sum(x) >= state_in + uncertainty) + @constraint(subproblems[t], state_out_var == sum(x[1:2])) + @constraint(subproblems[t], state_out_var >= state_out - 5.0) + @constraint(subproblems[t], state_out_var <= state_out + 5.0) + @objective(subproblems[t], Min, sum(x) + 10 * (state_out - state_out_var)^2) + + state_params_in[t] = [state_in] + state_params_out[t] = [(state_out, state_out_var)] + end + + # Joint-scenario pool: 3 scenarios per stage + joint_pool = [ + 
[[(subproblems[t][:uncertainty], v)] for v in [0.3, 0.5, 0.7]] + for t in 1:num_stages + ] + + windows = DecisionRules.setup_shooting_windows( + subproblems, + state_params_in, + state_params_out, + [1.5], + joint_pool; + window_size=2, + model_factory=() -> quiet_nonlinear_ipopt_model(), + ) + @test length(windows) == 1 + + decision_rule(x) = x[2:2] .+ 0.1f0 + uncertainty_sample = sample(joint_pool) + uncertainties_vec = [ + [Float32(u[2]) for u in stage_u] for stage_u in uncertainty_sample + ] + + obj = DecisionRules.simulate_multiple_shooting( + windows, decision_rule, Float32[1.5], uncertainty_sample, uncertainties_vec + ) + @test obj > 0 + end + @testset "dense_multilayer_nn" begin # Dense layers m = dense_multilayer_nn(3, 2, [8, 4]; activation=relu, dense=Dense) @@ -2668,4 +2850,227 @@ end @test_skip false end end + + @testset "GradientFallback" begin + @testset "type hierarchy and exports" begin + @test ZeroGradientFallback() isa AbstractGradientFallback + @test ErrorGradientFallback() isa AbstractGradientFallback + end + + @testset "ZeroGradientFallback returns zero cotangents" begin + fb = ZeroGradientFallback() + result = @test_logs (:warn,) DecisionRules.handle_gradient_error( + fb, ErrorException("test"), 3, 2 + ) + @test result[5] == zeros(3) + @test result[6] == zeros(2) + @test result[1] == ChainRulesCore.NoTangent() + end + + @testset "ErrorGradientFallback rethrows" begin + fb = ErrorGradientFallback() + @test_throws ErrorException DecisionRules.handle_gradient_error( + fb, ErrorException("test"), 3, 2 + ) + end + + @testset "handle_training_error" begin + @test_logs (:warn,) DecisionRules.handle_training_error( + ZeroGradientFallback(), ErrorException("test"), 1 + ) == true + @test_throws ErrorException DecisionRules.handle_training_error( + ErrorGradientFallback(), ErrorException("test"), 1 + ) + end + + @testset "handle_rollout_error" begin + @test_logs (:warn,) DecisionRules.handle_rollout_error( + ZeroGradientFallback(), 
ErrorException("test"), 1 + ) == true + @test_throws ErrorException DecisionRules.handle_rollout_error( + ErrorGradientFallback(), ErrorException("test"), 1 + ) + end + + @testset "custom fallback subtype" begin + struct CountingFallback <: AbstractGradientFallback + count::Ref{Int} + end + function DecisionRules.handle_gradient_error(fb::CountingFallback, e, n_in, n_out) + fb.count[] += 1 + return DecisionRules._zero_cotangents(n_in, n_out) + end + function DecisionRules.handle_training_error(fb::CountingFallback, e, iter) + fb.count[] += 1 + return true + end + function DecisionRules.handle_rollout_error(fb::CountingFallback, e, iter) + fb.count[] += 1 + return true + end + + fb = CountingFallback(Ref(0)) + DecisionRules.handle_gradient_error(fb, ErrorException("x"), 2, 2) + @test fb.count[] == 1 + DecisionRules.handle_training_error(fb, ErrorException("x"), 1) + @test fb.count[] == 2 + DecisionRules.handle_rollout_error(fb, ErrorException("x"), 1) + @test fb.count[] == 3 + end + + @testset "train_multistage accepts gradient_fallback kwarg" begin + subproblems, spi, spo, usamples, _ = let + subs = JuMP.Model[] + spi_vec = Vector{Any}[] + spo_vec = Vector{Tuple{Any,VariableRef}}[] + us_vec = Vector{Tuple{VariableRef,Vector{Float64}}}[] + for d in [4.0, 5.0] + s, si, so, sov, u = build_subproblem(d; subproblem=quiet_conic_ipopt_model()) + push!(subs, s) + push!(spi_vec, [si]) + push!(spo_vec, [(so, sov)]) + push!(us_vec, [(u, [1.0, 2.0, 3.0])]) + end + subs, spi_vec, spo_vec, us_vec, [5.0] + end + policy = dense_multilayer_nn(2, 1, [8]; activation=Flux.relu) + + model_out = train_multistage( + policy, [5.0], subproblems, spi, spo, usamples; + num_batches=2, + num_train_per_batch=1, + optimizer=Flux.Adam(0.01), + gradient_fallback=ErrorGradientFallback(), + ) + @test model_out isa Any + + model_out2 = train_multistage( + policy, [5.0], subproblems, spi, spo, usamples; + num_batches=2, + num_train_per_batch=1, + optimizer=Flux.Adam(0.01), + 
gradient_fallback=ZeroGradientFallback(), + ) + @test model_out2 isa Any + end + + @testset "RolloutEvaluation accepts gradient_fallback kwarg" begin + subproblems, spi, spo, usamples, _ = let + subs = JuMP.Model[] + spi_vec = Vector{Any}[] + spo_vec = Vector{Tuple{Any,VariableRef}}[] + us_vec = Vector{Tuple{VariableRef,Vector{Float64}}}[] + for d in [4.0, 5.0] + s, si, so, sov, u = build_subproblem(d; subproblem=quiet_conic_ipopt_model()) + push!(subs, s) + push!(spi_vec, [si]) + push!(spo_vec, [(so, sov)]) + push!(us_vec, [(u, [1.0, 2.0, 3.0])]) + end + subs, spi_vec, spo_vec, us_vec, [5.0] + end + policy = dense_multilayer_nn(2, 1, [8]; activation=Flux.relu) + + eval_scenarios = [sample(usamples) for _ in 1:2] + re = RolloutEvaluation( + subproblems, spi, spo, [5.0], eval_scenarios; + stride=1, policy_state=:realized, + gradient_fallback=ErrorGradientFallback(), + ) + re(1, policy) + @test isfinite(re.last_objective_no_deficit) + + re2 = RolloutEvaluation( + subproblems, spi, spo, [5.0], eval_scenarios; + stride=1, policy_state=:realized, + gradient_fallback=ZeroGradientFallback(), + ) + re2(1, policy) + @test isfinite(re2.last_objective_no_deficit) + end + end + + @testset "Uncertainty sampling" begin + m = quiet_highs_model() + @variable(m, p1 in MOI.Parameter(0.0)) + @variable(m, p2 in MOI.Parameter(0.0)) + T = 3 + + @testset "independent pool: each param drawn independently" begin + indep_pool = [ + [(p1, [1.0, 2.0, 3.0]), (p2, [10.0, 20.0, 30.0])] + for _ in 1:T + ] + Random.seed!(42) + N = 3000 + trajectories = [sample(indep_pool) for _ in 1:N] + + @test length(trajectories[1]) == T + @test length(trajectories[1][1]) == 2 + + # Values always come from the declared support + for traj in trajectories, stage in traj + @test stage[1][2] in [1.0, 2.0, 3.0] + @test stage[2][2] in [10.0, 20.0, 30.0] + end + + # k^n = 9 combinations possible; independent draws break correlation + combos = Set((s[1][2], s[2][2]) for traj in trajectories for s in traj) + @test 
length(combos) == 9 + end + + @testset "joint pool: all params from same scenario" begin + joint_pool = [ + [[(p1, 1.0), (p2, 10.0)], + [(p1, 2.0), (p2, 20.0)], + [(p1, 3.0), (p2, 30.0)]] + for _ in 1:T + ] + Random.seed!(42) + N = 3000 + trajectories = [sample(joint_pool) for _ in 1:N] + + @test length(trajectories[1]) == T + @test length(trajectories[1][1]) == 2 + + # Only k=3 combinations possible (never cross-scenario combos) + combos = Set((s[1][2], s[2][2]) for traj in trajectories for s in traj) + @test combos == Set([(1.0, 10.0), (2.0, 20.0), (3.0, 30.0)]) + end + + @testset "trajectory sampler: temporal conditioning" begin + calls = Tuple{Int, Vector}[] + function my_sampler(t, past) + push!(calls, (t, copy(past))) + prev = isempty(past) ? 0.0 : past[end][1][2] + return [(p1, prev + 1.0)] + end + + Random.seed!(42) + traj = sample(my_sampler, 3) + + @test length(traj) == 3 + @test traj[1][1][2] == 1.0 # 0 + 1 + @test traj[2][1][2] == 2.0 # 1 + 1 + @test traj[3][1][2] == 3.0 # 2 + 1 + + # Sampler received correct past at each stage + @test calls[1] == (1, []) + @test length(calls[2][2]) == 1 + @test length(calls[3][2]) == 2 + end + + @testset "all formats produce same trajectory type" begin + indep_pool = [[(p1, [1.0]), (p2, [10.0])] for _ in 1:T] + joint_pool = [[[(p1, 1.0), (p2, 10.0)]] for _ in 1:T] + sampler_fn = (t, past) -> [(p1, 1.0), (p2, 10.0)] + + t1 = sample(indep_pool) + t2 = sample(joint_pool) + t3 = sample(sampler_fn, T) + + @test typeof(t1) == typeof(t2) == typeof(t3) + @test t1[1][1][2] == t2[1][1][2] == t3[1][1][2] == 1.0 + end + end end diff --git a/test/test_score_function.jl b/test/test_score_function.jl new file mode 100644 index 0000000..e4ecf9d --- /dev/null +++ b/test/test_score_function.jl @@ -0,0 +1,382 @@ +using Statistics + +raw""" + _score_function_stage_model(; kwargs...) + +Build a one-dimensional linear stage model for score-function tests. 
+ +The continuous version is + +```math +\begin{aligned} +\min_{u,x,\delta}\quad + & 30u + 10^4 |\delta| \\ +\text{s.t.}\quad + & x = x^{in} + \xi - u, \\ + & u \ge d, \\ + & \delta = x - \hat{x}. +\end{aligned} +``` + +When `integer = true`, the model adds a binary setup variable `z`, + +```math +u \le 10 z,\qquad z \in \{0,1\}, +``` + +and the objective becomes ``5z + 30u + 10^4|\delta|``. + +# Keywords +- `state_value::Real`: initial value for the input-state parameter. +- `target_value::Real`: initial value for the target parameter. +- `uncertainty_value::Real`: initial value for the uncertainty parameter. +- `demand::Real`: lower bound that forces positive ordering cost. +- `integer::Bool`: whether to include a binary setup decision. + +# Examples +```julia +model, state_in, target, state_out, uncertainty = + _score_function_stage_model(; integer = true) +``` +""" +function _score_function_stage_model(; + state_value::Real = 5.0, + target_value::Real = 4.0, + uncertainty_value::Real = 2.0, + demand::Real = 1.0, + integer::Bool = false, +) + # HiGHS keeps these tests fast and supports both LP and small MIP cases. + model = quiet_highs_model() + + # The order quantity is the operational decision whose cost we measure. + @variable(model, order >= 0.0) + + # The output state is what the policy target tries to guide. + @variable(model, state_out >= 0.0) + + # Parameters are updated by rollout helpers before every stage solve. + @variable(model, state_in in MOI.Parameter(Float64(state_value))) + @variable(model, target in MOI.Parameter(Float64(target_value))) + @variable(model, uncertainty in MOI.Parameter(Float64(uncertainty_value))) + + # The transition is intentionally simple so expected states are easy to audit. + @constraint(model, state_out == state_in + uncertainty - order) + + # A positive lower bound prevents the zero-cost solution from hiding bugs. 
+ @constraint(model, order >= Float64(demand)) + + if integer + # The binary variable gives integer-strategy tests a real discrete object. + @variable(model, setup, Bin) + + # This links setup to order without changing the simple state equation. + @constraint(model, order <= 10.0 * setup) + + # The objective includes operational cost and the generated deficit cost. + _norm_deficit, deficit = create_deficit!(model, 1; penalty = 1.0e4) + @constraint(model, deficit[1] == state_out - target) + @objective(model, Min, 5.0 * setup + 30.0 * order + objective_function(model)) + else + # Continuous tests use the same deficit structure without a setup binary. + _norm_deficit, deficit = create_deficit!(model, 1; penalty = 1.0e4) + @constraint(model, deficit[1] == state_out - target) + @objective(model, Min, 30.0 * order + objective_function(model)) + end + + return model, state_in, target, state_out, uncertainty +end + +raw""" + _two_stage_score_function_fixture(; integer = false) + +Create a reusable two-stage score-function fixture. + +The fixture represents a two-stage rollout + +```math +Q(\hat{x}_{1:2}) + = + q_1(x_0,\xi_1;\hat{x}_1) + + + q_2(x_1,\xi_2;\hat{x}_2), +``` + +where each ``q_t`` is the one-dimensional stage model built by +[`_score_function_stage_model`](@ref). The two stages intentionally use +different parameter defaults so indexing mistakes change the solved model. + +# Keywords +- `integer::Bool`: whether stage models contain binary setup variables. + +# Examples +```julia +config, initial_state, uncertainties, targets = + _two_stage_score_function_fixture() +``` +""" +function _two_stage_score_function_fixture(; integer::Bool = false) + # Stage 1 starts from inventory 5 and observes uncertainty 2. + stage_1, state_in_1, target_1, state_out_1, uncertainty_1 = + _score_function_stage_model(; integer) + + # Stage 2 uses different parameter defaults to catch stage-index mistakes. 
+ stage_2, state_in_2, target_2, state_out_2, uncertainty_2 = + _score_function_stage_model(; + state_value = 4.0, + target_value = 3.0, + uncertainty_value = 1.0, + integer, + ) + + # The config mirrors the shape used by train_multistage. + state_params_in = [[state_in_1], [state_in_2]] + state_params_out = [[(target_1, state_out_1)], [(target_2, state_out_2)]] + config = ScoreFunctionConfig( + [stage_1, stage_2], + state_params_in, + state_params_out; + num_rollouts = 4, + perturbation_std = 0.5, + ) + + # Targets include the initial state at index 1. + initial_state = [5.0] + targets = [[5.0], [4.0], [3.0]] + uncertainties = [ + [(uncertainty_1, 2.0)], + [(uncertainty_2, 1.0)], + ] + + return config, initial_state, uncertainties, targets +end + +@testset "Score-function gradient mixing" begin + @testset "ScoreFunctionConfig validates public arguments" begin + config, _, _, _ = _two_stage_score_function_fixture() + + @test config.dual_weight == 0.5 + @test config.perturbation_std == 0.5 + @test config.num_rollouts == 4 + @test config.baseline == :mean + + @test_throws ArgumentError ScoreFunctionConfig( + config.subproblems, + config.state_params_in[1:1], + config.state_params_out, + ) + @test_throws ArgumentError ScoreFunctionConfig( + config.subproblems, + config.state_params_in, + config.state_params_out; + dual_weight = -0.1, + ) + @test_throws ArgumentError ScoreFunctionConfig( + config.subproblems, + config.state_params_in, + config.state_params_out; + perturbation_std = 0.0, + ) + @test_throws ArgumentError ScoreFunctionConfig( + config.subproblems, + config.state_params_in, + config.state_params_out; + num_rollouts = 0, + ) + @test_throws ArgumentError ScoreFunctionConfig( + config.subproblems, + config.state_params_in, + config.state_params_out; + baseline = :median, + ) + end + + @testset "rollout_with_perturbation returns operational cost" begin + config, initial_state, uncertainties, targets = + _two_stage_score_function_fixture() + + # Zero 
perturbation should still solve the staged rollout successfully. + zero_cost = DecisionRules.rollout_with_perturbation( + config, + initial_state, + uncertainties, + targets, + [[0.0], [0.0]], + ) + + # A nonzero perturbation exercises target parameter updates. + perturbed_cost = DecisionRules.rollout_with_perturbation( + config, + initial_state, + uncertainties, + targets, + [[0.1], [-0.2]], + ) + + @test isfinite(zero_cost) + @test isfinite(perturbed_cost) + @test zero_cost > 0.0 + @test perturbed_cost > 0.0 + end + + @testset "_score_function_rollouts samples centered advantages" begin + config, initial_state, uncertainties, targets = + _two_stage_score_function_fixture() + + Random.seed!(42) + advantages, perturbations = DecisionRules._score_function_rollouts( + config, + initial_state, + uncertainties, + targets; + perturbation_std = 0.5, + num_rollouts = 6, + ) + + @test length(advantages) == 6 + @test length(perturbations) == 6 + @test all(isfinite, advantages) + @test all(length(rollout) == 2 for rollout in perturbations) + @test all(length(stage) == 1 for rollout in perturbations for stage in rollout) + + # The default :mean baseline centers advantages by construction. + @test sum(advantages) ≈ 0.0 atol = 1.0e-8 + end + + @testset "_score_function_surrogate matches Gaussian location score" begin + # These targets are differentiable arrays in the training loop. + targets = [[1.0f0], [2.0f0], [4.0f0]] + + # Perturbations are actual target perturbations, not standard normals. + perturbations = [[0.5], [-0.25]] + + surrogate = DecisionRules._score_function_surrogate( + 3.0, + perturbations, + targets, + 0.5, + ) + + # 3 * ((0.5 / 0.25) * 2 + (-0.25 / 0.25) * 4) == 0. 
+ @test surrogate ≈ 0.0f0 + end + + @testset "sf_params reports scheduled ASCII-named fields" begin + config, _, _, _ = _two_stage_score_function_fixture() + schedule = ScoreFunctionSchedule( + config; + sf_start = 10, + ramp_batches = 20, + perturbation_std_initial = 0.1, + num_rollouts_initial = 2, + ) + + before_start = sf_params(schedule, 9) + @test before_start.active == false + @test before_start.alpha == 1.0 + @test before_start.score_weight == 0.0 + + at_start = sf_params(schedule, 10) + @test at_start.active == true + @test at_start.alpha == 1.0 + @test at_start.num_rollouts == 2 + + halfway = sf_params(schedule, 20) + @test halfway.active == true + @test 0.0 < halfway.score_weight < 0.5 + @test halfway.perturbation_std > 0.1 + + after_ramp = sf_params(schedule, 30) + @test after_ramp.alpha ≈ config.dual_weight + @test after_ramp.score_weight ≈ 1.0 - config.dual_weight + @test after_ramp.perturbation_std ≈ config.perturbation_std + @test after_ramp.num_rollouts == config.num_rollouts + + static_params = sf_params(config, 1) + @test static_params.active == true + @test static_params.alpha == config.dual_weight + end + + @testset "rollout solves integer models exactly as written" begin + config, initial_state, uncertainties, targets = + _two_stage_score_function_fixture(; integer = true) + + cost = DecisionRules.rollout_with_perturbation( + config, + initial_state, + uncertainties, + targets, + [[0.1], [0.0]], + ) + + @test isfinite(cost) + @test any(JuMP.is_binary, JuMP.all_variables(config.subproblems[1])) + end + + @testset "train_multistage accepts ScoreFunctionConfig on deterministic equivalent" begin + # Build the deterministic-equivalent problem used by the dual path. 
+ stage_1, state_in_1, target_1, state_out_1, uncertainty_1 = + build_subproblem(10; subproblem = quiet_highs_model()) + stage_2, state_in_2, target_2, state_out_2, uncertainty_2 = + build_subproblem( + 10; + state_i_val = 4.0, + state_out_val = 3.0, + uncertainty_val = 1.0, + subproblem = quiet_highs_model(), + ) + subproblems = [stage_1, stage_2] + state_params_in = Vector{Vector{Any}}(undef, 2) + state_params_in .= [[state_in_1], [state_in_2]] + state_params_out = Vector{Vector{Tuple{Any,VariableRef}}}(undef, 2) + state_params_out .= [[(target_1, state_out_1)], [(target_2, state_out_2)]] + uncertainty_samples = [[(uncertainty_1, [2.0])], [(uncertainty_2, [1.0])]] + + det_equivalent, deterministic_sampler = DecisionRules.deterministic_equivalent!( + quiet_highs_model(), + subproblems, + state_params_in, + state_params_out, + [5.0], + uncertainty_samples, + ) + + # Build separate rollout models so the score-function solves do not + # mutate the deterministic-equivalent model. + score_config, _, _, _ = _two_stage_score_function_fixture() + score_config = ScoreFunctionConfig( + score_config.subproblems, + score_config.state_params_in, + score_config.state_params_out; + dual_weight = 0.5, + perturbation_std = 0.3, + num_rollouts = 2, + ) + + Random.seed!(222) + policy = Chain(Dense(2, 8, relu), Dense(8, 1)) + + Random.seed!(42) + train_multistage( + policy, + [5.0], + det_equivalent, + state_params_in, + state_params_out, + deterministic_sampler; + num_batches = 4, + num_train_per_batch = 2, + score_function = score_config, + ) + + objective = simulate_multistage( + det_equivalent, + state_params_in, + state_params_out, + [5.0], + sample(deterministic_sampler), + policy, + ) + @test isfinite(objective) + end +end