package = ocean
env_name = puffer_boss_fight
policy_name = Policy
- # rnn_name = Recurrent # Uncomment if adding LSTM/GRU

[vec]
num_envs = 1024
@@ -12,59 +11,40 @@ zero_copy = True
seed = 42

[env]
- # Environment-specific params (passed to env constructor)
- # None needed - using defaults from README

[policy]
- # Policy constructor args (e.g., hidden_size)
- # hidden_size = 64 # Experiment: 32, 64, 128

[train]
- # Experiment tracking
name = boss_fight
project = boss_fight_experiments
data_dir = experiments
checkpoint_interval = 200
-
- # Reproducibility
seed = 42
- # TODO: disable for sweep or speed
torch_deterministic = True
device = cpu
-
- # Optimization
- # TODO: try muon with 0.015 lr
optimizer = adam
precision = float32
compile = False
-
- # Core PPO hyperparameters
total_timesteps = 5_000_000
- learning_rate = 0.0003
+ learning_rate = 0.000864
anneal_lr = True
- min_lr_ratio = 0.0
- gamma = 0.99
- gae_lambda = 0.95
+ min_lr_ratio = 0.437
+ gamma = 0.983
+ gae_lambda = 0.902
update_epochs = 4
- clip_coef = 0.2
- vf_coef = 0.5
- vf_clip_coef = 0.2
- max_grad_norm = 0.5
- ent_coef = 0.01
-
- # Batch sizes
- minibatch_size = 2048
+ clip_coef = 0.421
+ vf_coef = 4.38
+ vf_clip_coef = 0.303
+ max_grad_norm = 2.28
+ ent_coef = 0.00623
+ minibatch_size = 2048
max_minibatch_size = 32768
bptt_horizon = 32
-
- # Adam parameters (if optimizer = adam)
- adam_beta1 = 0.9
- adam_beta2 = 0.999
- adam_eps = 1e-8
-
- # V-trace (for off-policy correction)
- # vtrace_rho_clip = 1.0
- # vtrace_c_clip = 1.0
+ adam_beta1 = 0.991
+ adam_beta2 = 0.998
+ adam_eps = 1e-14
+ vtrace_rho_clip = 2.72
+ vtrace_c_clip = 2.13

[sweep]
goal = maximize
@@ -74,31 +54,26 @@ metric_distribution = linear
max_suggestion_cost = 3600
use_gpu = True

- # Learning rate sweep
[sweep.train.learning_rate]
distribution = log_normal
min = 0.0001
max = 0.003

- # Entropy coefficient sweep (exploration vs exploitation)
[sweep.train.ent_coef]
distribution = log_normal
min = 0.0001
max = 0.05

- # Discount factor sweep
[sweep.train.gamma]
distribution = logit_normal
min = 0.95
max = 0.999

- # GAE lambda sweep
[sweep.train.gae_lambda]
distribution = logit_normal
min = 0.9
max = 0.99

- # Minibatch size sweep
[sweep.train.minibatch_size]
distribution = uniform_pow2
min = 1024
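
Not part of the diff above, but one quick way to sanity-check the tuned values after applying this commit is to read the INI back with Python's standard configparser. This is a minimal sketch under assumptions: the path config/boss_fight.ini is illustrative, and it only verifies that the committed learning_rate falls inside the bounds declared in [sweep.train.learning_rate].

import configparser

# Assumed location of this config file; adjust to wherever it actually lives.
cfg = configparser.ConfigParser()
cfg.read("config/boss_fight.ini")

train = cfg["train"]
lr = float(train["learning_rate"])    # 0.000864 after this commit
gamma = float(train["gamma"])         # 0.983

# Sweep bounds declared further down in the same file.
lr_bounds = cfg["sweep.train.learning_rate"]
lr_min, lr_max = float(lr_bounds["min"]), float(lr_bounds["max"])

assert lr_min <= lr <= lr_max, f"learning_rate {lr} outside [{lr_min}, {lr_max}]"
print(f"learning_rate={lr}, gamma={gamma}, ent_coef={train['ent_coef']}")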