2222 kexec_machine , wait_for_results , fetch_results ,
2323 parse_results , process_crashes , set_log_file ,
2424 WaitResult , grab_hw_worker_journal , grab_sol_logs ,
25- reboot_machine , CRASH_SENTINEL ,
26- _journal_has_crash_sentinel )
25+ reboot_machine ,
26+ CRASH_SENTINEL , _journal_has_crash_sentinel )
2727
2828# Config:
2929#
@@ -207,7 +207,9 @@ def test(binfo, rinfo, cbarg): # pylint: disable=unused-argument
207207 wait_result = wait_for_results (config , mc , reservation_id ,
208208 machine_ids , machine_ips )
209209
210- # 8. Grab debug artifacts for this attempt
210+ # 8. Grab debug artifacts for this attempt.
211+ # If the machine is hung (needs_power_cycle), it may still
212+ # respond to SSH briefly — try to grab what we can.
211213 try :
212214 grab_hw_worker_journal (machine_ips [0 ],
213215 results_path , suffix = attempt_sfx )
@@ -219,26 +221,32 @@ def test(binfo, rinfo, cbarg): # pylint: disable=unused-argument
219221 except Exception as e :
220222 print (f"Warning: failed to grab SOL logs: { e } " )
221223
222- # 9. Copy back results
223- fetch_results (machine_ips , reservation_id , results_path )
224+ # 9. Check if we need to retry
225+ needs_retry = wait_result .needs_power_cycle
226+ if not needs_retry :
227+ try :
228+ needs_retry = _journal_has_crash_sentinel (machine_ips [0 ])
229+ except Exception :
230+ pass
224231
225- # 10. Check if hw-worker detected a crash and wants a reboot
226- needs_reboot = False
227- try :
228- needs_reboot = _journal_has_crash_sentinel (machine_ips [0 ])
229- except Exception :
230- pass
231-
232- if not needs_reboot :
232+ if not needs_retry :
233233 break
234234
235+ if wait_result .needs_power_cycle :
236+ print (f"Machine hung (attempt { attempt + 1 } ), rebooting" )
237+ else :
238+ print (f"hw-worker detected crash (attempt { attempt + 1 } ), rebooting" )
239+ reboot_machine (config , mc , reservation_id ,
240+ machine_ids , machine_ips )
241+
242+ # Do the reboot even if we are about to give up; otherwise,
243+ # if the machine is hung, we won't be able to fetch results.
235244 if attempt >= max_crash_retries :
236245 print (f"Max crash retries ({ max_crash_retries } ) reached, giving up" )
237246 break
238247
239- print (f"hw-worker detected crash (attempt { attempt + 1 } ), rebooting" )
240- reboot_machine (config , mc , reservation_id ,
241- machine_ids , machine_ips )
248+ # 10. Copy back results
249+ fetch_results (machine_ips , reservation_id , results_path )
242250
243251 # 11. Parse results
244252 cases = parse_results (results_path , link )
0 commit comments