@@ -60,6 +60,7 @@ typedef struct {
6060 int difficulty_id ; // 0=basic,1=easy,2=medium,3=hard,4=unfiltered
6161 Client * client ;
6262 int win ;
63+ float epiosde_return ;
6364} Boxoban ;
6465
6566void ensure_map_loaded (void );
@@ -126,9 +127,9 @@ void add_log(Boxoban* env) {
126127 float denom = (float )env -> n_boxes ;
127128 float num = (float )env -> on_target ;
128129 env -> log .perf += (env -> win == 1 ) ? 1.0 : num /denom ;
129- env -> log .score += env -> rewards [ 0 ] ;
130+ env -> log .score += env -> log . perf ;
130131 env -> log .episode_length += env -> tick ;
131- env -> log .episode_return += env -> rewards [ 0 ] ;
132+ env -> log .episode_return += env -> episode_return ;
132133 env -> log .on_targets += env -> on_target ;
133134 env -> log .n ++ ;
134135}
@@ -159,6 +160,7 @@ void c_reset(Boxoban* env) {
159160
160161 env -> tick = 0 ;
161162 env -> win = 0 ;
163+ env -> episode_return = 0 ;
162164
163165}
164166
@@ -250,18 +252,21 @@ void c_step(Boxoban* env) {
250252 env -> terminals [0 ] = 1 ;
251253 env -> rewards [0 ] += 1.0 ;
252254 env -> win = 1 ;
255+ env -> episode_return += env -> rewards [0 ];
253256 add_log (env );
254257 c_reset (env );
255258 return ;
256259 }
257260
258261 if (env -> tick >= env -> max_steps ) {
259262 env -> terminals [0 ] = 1 ;
260- env -> rewards [0 ] -= 1.0 ;
263+ env -> rewards [0 ] -= 1.0 ;
264+ env -> episode_return += env -> rewards [0 ];
261265 add_log (env );
262266 c_reset (env );
263267 return ;
264268 }
269+ env -> episode_return += env -> rewards [0 ];
265270
266271}
267272
0 commit comments