Skip to content

Commit effe087

Browse files
RoryBarnesclaude
andcommitted
Fix lock deadlock, subprocess pipe inheritance, and checkpoint bugs
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 6c77aee commit effe087

1 file changed

Lines changed: 45 additions & 54 deletions

File tree

multiplanet/multiplanet.py

Lines changed: 45 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import argparse
22
import multiprocessing as mp
33
import os
4+
import shutil
45
import subprocess as sub
56
import sys
67

@@ -33,30 +34,27 @@ def fnGetNextSimulation(sCheckpointFile, lockFile):
3334
str or None
3435
Absolute path to simulation folder, or None if all done
3536
"""
36-
lockFile.acquire()
37-
listData = []
37+
with lockFile:
38+
listData = []
39+
with open(sCheckpointFile, "r") as f:
40+
for sLine in f:
41+
listData.append(sLine.strip().split())
3842

39-
with open(sCheckpointFile, "r") as f:
40-
for sLine in f:
41-
listData.append(sLine.strip().split())
42-
43-
sFolder = ""
44-
for listLine in listData:
45-
if len(listLine) > 1 and listLine[1] == "-1":
46-
sFolder = listLine[0]
47-
listLine[1] = "0"
48-
break
43+
sFolder = ""
44+
for listLine in listData:
45+
if len(listLine) > 1 and listLine[1] == "-1":
46+
sFolder = listLine[0]
47+
listLine[1] = "0"
48+
break
4949

50-
if not sFolder:
51-
lockFile.release()
52-
return None
50+
if not sFolder:
51+
return None
5352

54-
with open(sCheckpointFile, "w") as f:
55-
for listLine in listData:
56-
f.writelines(" ".join(listLine) + "\n")
53+
with open(sCheckpointFile, "w") as f:
54+
for listLine in listData:
55+
f.writelines(" ".join(listLine) + "\n")
5756

58-
lockFile.release()
59-
return os.path.abspath(sFolder)
57+
return os.path.abspath(sFolder)
6058

6159

6260
def fnMarkSimulationComplete(sCheckpointFile, sFolder, lockFile):
@@ -78,23 +76,20 @@ def fnMarkSimulationComplete(sCheckpointFile, sFolder, lockFile):
7876
-------
7977
None
8078
"""
81-
lockFile.acquire()
82-
listData = []
83-
84-
with open(sCheckpointFile, "r") as f:
85-
for sLine in f:
86-
listData.append(sLine.strip().split())
79+
with lockFile:
80+
listData = []
81+
with open(sCheckpointFile, "r") as f:
82+
for sLine in f:
83+
listData.append(sLine.strip().split())
8784

88-
for listLine in listData:
89-
if len(listLine) > 1 and listLine[0] == sFolder:
90-
listLine[1] = "1"
91-
break
92-
93-
with open(sCheckpointFile, "w") as f:
9485
for listLine in listData:
95-
f.writelines(" ".join(listLine) + "\n")
86+
if len(listLine) > 1 and listLine[0] == sFolder:
87+
listLine[1] = "1"
88+
break
9689

97-
lockFile.release()
90+
with open(sCheckpointFile, "w") as f:
91+
for listLine in listData:
92+
f.writelines(" ".join(listLine) + "\n")
9893

9994

10095
def fnMarkSimulationFailed(sCheckpointFile, sFolder, lockFile):
@@ -116,23 +111,20 @@ def fnMarkSimulationFailed(sCheckpointFile, sFolder, lockFile):
116111
-------
117112
None
118113
"""
119-
lockFile.acquire()
120-
listData = []
121-
122-
with open(sCheckpointFile, "r") as f:
123-
for sLine in f:
124-
listData.append(sLine.strip().split())
125-
126-
for listLine in listData:
127-
if len(listLine) > 1 and listLine[0] == sFolder:
128-
listLine[1] = "-1"
129-
break
114+
with lockFile:
115+
listData = []
116+
with open(sCheckpointFile, "r") as f:
117+
for sLine in f:
118+
listData.append(sLine.strip().split())
130119

131-
with open(sCheckpointFile, "w") as f:
132120
for listLine in listData:
133-
f.writelines(" ".join(listLine) + "\n")
121+
if len(listLine) > 1 and listLine[0] == sFolder:
122+
listLine[1] = "-1"
123+
break
134124

135-
lockFile.release()
125+
with open(sCheckpointFile, "w") as f:
126+
for listLine in listData:
127+
f.writelines(" ".join(listLine) + "\n")
136128

137129

138130
# --------------------------------------------------------------------
@@ -247,13 +239,13 @@ def ReCreateCP(checkpoint_file, input_file, verbose, sims, folder_name, force):
247239
for newline in datalist:
248240
wr.writelines(" ".join(newline) + "\n")
249241

250-
if all(len(l) > 1 and l[1] == "1" for l in datalist[2:-2]) == True:
242+
if all(len(l) > 1 and l[1] == "1" for l in datalist[2:-1]) == True:
251243
print("All simulations have been ran")
252244

253245
if force:
254246
if verbose:
255247
print("Deleting folder...")
256-
os.remove(folder_name)
248+
shutil.rmtree(folder_name)
257249
if verbose:
258250
print("Deleting Checkpoint File...")
259251
os.remove(checkpoint_file)
@@ -337,6 +329,7 @@ def par_worker(
337329
stdout=sub.PIPE,
338330
stderr=sub.PIPE,
339331
universal_newlines=True,
332+
close_fds=True,
340333
)
341334
# FIXED: Use communicate() to wait for completion and get output
342335
stdout, stderr = vplanet.communicate()
@@ -365,8 +358,7 @@ def par_worker(
365358
)
366359

367360
# STEP 4: Write to HDF5 (WITH LOCK - minimal critical section)
368-
lock.acquire()
369-
try:
361+
with lock:
370362
with h5py.File(h5_file, "a") as Master:
371363
group_name = os.path.basename(sFolder)
372364
if group_name not in Master:
@@ -378,8 +370,6 @@ def par_worker(
378370
group_name,
379371
archive=True,
380372
)
381-
finally:
382-
lock.release()
383373
except Exception as e:
384374
# Log BigPlanet errors but don't fail the simulation
385375
if verbose:
@@ -442,6 +432,7 @@ def parallel_run_planet(input_file, cores, quiet, verbose, bigplanet, force):
442432
logfile = system_name + ".log"
443433
# initalizes the checkpoint file
444434
checkpoint_file = os.getcwd() + "/" + "." + folder_name
435+
os.makedirs(os.path.dirname(checkpoint_file), exist_ok=True)
445436
# checks if the files doesn't exist and if so then it creates it
446437
if os.path.isfile(checkpoint_file) == False:
447438
CreateCP(checkpoint_file, input_file, sims)

0 commit comments

Comments
 (0)