package = ocean
env_name = puffer_boss_fight
policy_name = Policy
- # rnn_name = Recurrent # Uncomment if adding LSTM/GRU

[vec]
num_envs = 1024
@@ -12,59 +11,40 @@ zero_copy = True
seed = 42

[env]
- # Environment-specific params (passed to env constructor)
- # None needed - using defaults from README

[policy]
- # Policy constructor args (e.g., hidden_size)
- # hidden_size = 64 # Experiment: 32, 64, 128

[train]
- # Experiment tracking
name = boss_fight
project = boss_fight_experiments
data_dir = experiments
checkpoint_interval = 200
-
- # Reproducibility
seed = 42
- # TODO: disable for sweep or speed
torch_deterministic = True
device = cpu
-
- # Optimization
- # TODO: try muon with 0.015 lr
optimizer = adam
precision = float32
compile = False
-
- # Core PPO hyperparameters
total_timesteps = 5_000_000
- learning_rate = 0.0003
+ learning_rate = 0.000864
anneal_lr = True
- min_lr_ratio = 0.0
- gamma = 0.99
- gae_lambda = 0.95
+ min_lr_ratio = 0.437
+ gamma = 0.983
+ gae_lambda = 0.902
update_epochs = 4
- clip_coef = 0.2
- vf_coef = 0.5
- vf_clip_coef = 0.2
- max_grad_norm = 0.5
- ent_coef = 0.01
-
- # Batch sizes
- minibatch_size = 2048
+ clip_coef = 0.421
+ vf_coef = 4.38
+ vf_clip_coef = 0.303
+ max_grad_norm = 2.28
+ ent_coef = 0.00623
+ minibatch_size = 2048
max_minibatch_size = 32768
bptt_horizon = 32
-
- # Adam parameters (if optimizer = adam)
- adam_beta1 = 0.9
- adam_beta2 = 0.999
- adam_eps = 1e-8
-
- # V-trace (for off-policy correction)
- # vtrace_rho_clip = 1.0
- # vtrace_c_clip = 1.0
+ adam_beta1 = 0.991
+ adam_beta2 = 0.998
+ adam_eps = 1e-14
+ vtrace_rho_clip = 2.72
+ vtrace_c_clip = 2.13

[sweep]
goal = maximize
@@ -74,31 +54,26 @@ metric_distribution = linear
max_suggestion_cost = 3600
use_gpu = True

- # Learning rate sweep
[sweep.train.learning_rate]
distribution = log_normal
min = 0.0001
max = 0.003

- # Entropy coefficient sweep (exploration vs exploitation)
[sweep.train.ent_coef]
distribution = log_normal
min = 0.0001
max = 0.05

- # Discount factor sweep
[sweep.train.gamma]
distribution = logit_normal
min = 0.95
max = 0.999

- # GAE lambda sweep
[sweep.train.gae_lambda]
distribution = logit_normal
min = 0.9
max = 0.99

- # Minibatch size sweep
[sweep.train.minibatch_size]
distribution = uniform_pow2
min = 1024
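
Not part of the diff above, but one quick way to sanity-check the tuned values after applying this commit is to read the INI back with Python's standard configparser. This is a minimal sketch under assumptions: the path config/boss_fight.ini is illustrative, and it only verifies that the committed learning_rate falls inside the bounds declared in [sweep.train.learning_rate].

import configparser

# Assumed location of this config file; adjust to wherever it actually lives.
cfg = configparser.ConfigParser()
cfg.read("config/boss_fight.ini")

train = cfg["train"]
lr = float(train["learning_rate"])    # 0.000864 after this commit
gamma = float(train["gamma"])         # 0.983

# Sweep bounds declared further down in the same file.
lr_bounds = cfg["sweep.train.learning_rate"]
lr_min, lr_max = float(lr_bounds["min"]), float(lr_bounds["max"])

assert lr_min <= lr <= lr_max, f"learning_rate {lr} outside [{lr_min}, {lr_max}]"
print(f"learning_rate={lr}, gamma={gamma}, ent_coef={train['ent_coef']}")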