Skip to content

Commit 8d142f0

Browse files
authored
Merge pull request #143 from OpenBioSim/feature_energy_components
Feature energy components
2 parents 2e47223 + 8e23866 commit 8d142f0

5 files changed

Lines changed: 287 additions & 180 deletions

File tree

README.md

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,33 @@ geometry. To override this for all groups:
242242
somd2 perturbable_system.bss --terminal-flip-frequency "1 ps" --terminal-flip-angle "180 degrees"
243243
```
244244

245+
## Debugging with energy components
246+
247+
To help diagnose simulation instabilities, `SOMD2` can record the potential
248+
energy contribution from each OpenMM force group. This is enabled with the
249+
`--save-energy-components` flag:
250+
251+
```
252+
somd2 perturbable_system.bss --save-energy-components
253+
```
254+
255+
One Parquet file per λ window is written to the output directory, named
256+
`energy_components_<lambda>.parquet`. Times are in nanoseconds and energies in
257+
kcal/mol; both are stored as schema metadata in the file.
258+
259+
The recording interval depends on the runner and active samplers:
260+
261+
- **Replica exchange**: always `energy-frequency`
262+
- **Standard runner, no MC**: `energy-frequency`
263+
- **Standard runner, with MC**: the shortest active MC frequency, i.e.
264+
`gcmc-frequency`, `terminal-flip-frequency`, or the smaller of the two
265+
when both are active
266+
267+
> [!NOTE]
268+
> Energy components are written more frequently than checkpoint files and are
269+
> not guarded by the file lock, so they may lead the checkpoint files by up
270+
> to one `checkpoint-frequency` interval when copying output mid-simulation.
271+
245272
## Copying output files during a simulation
246273

247274
When `SOMD2` writes checkpoint files it acquires an exclusive

src/somd2/config/_config.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -506,7 +506,11 @@ def __init__(
506506
Whether to save a crash report if the simulation crashes.
507507
508508
save_energy_components: bool
509-
Whether to save the energy contribution for each force when checkpointing.
509+
Whether to save per-force-group energy contributions to a Parquet file
510+
in the output directory. Energies are recorded at every 'energy_frequency'
511+
interval. When not running replica exchange, the interval is instead the
512+
shortest active MC frequency when running with GCMC or terminal flip moves.
513+
Intended for debugging purposes.
510514
511515
save_xml: bool
512516
Whether to write an XML file for the OpenMM system to the output

src/somd2/runner/_base.py

Lines changed: 54 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -498,6 +498,10 @@ def __init__(self, system, config):
498498
# Check the output directories and create names of output files.
499499
self._filenames = self._prepare_output()
500500

501+
# Per-window cache of the last saved energy-components time (ns),
502+
# used to skip duplicate rows on restart.
503+
self._last_ec_time = {}
504+
501505
# Store the current system as a reference.
502506
self._reference_system = self._system.clone()
503507

@@ -1194,7 +1198,7 @@ def increment_filename(base_filename, suffix):
11941198
filenames["trajectory"] = str(output_directory / f"traj_{lam}.dcd")
11951199
filenames["trajectory_chunk"] = str(output_directory / f"traj_{lam}_")
11961200
filenames["energy_components"] = str(
1197-
output_directory / f"energy_components_{lam}.csv"
1201+
output_directory / f"energy_components_{lam}.parquet"
11981202
)
11991203
filenames["gcmc_ghosts"] = str(output_directory / f"gcmc_ghosts_{lam}.txt")
12001204
filenames["sampler_stats"] = str(output_directory / f"sampler_stats_{lam}.pkl")
@@ -2020,12 +2024,23 @@ def _backup_checkpoint(self, index):
20202024
except Exception as e:
20212025
return index, e
20222026

2027+
try:
2028+
# Backup the existing energy components file, if it exists.
2029+
path = _Path(self._filenames[index]["energy_components"])
2030+
if path.exists() and path.stat().st_size > 0:
2031+
_copyfile(
2032+
self._filenames[index]["energy_components"],
2033+
str(self._filenames[index]["energy_components"]) + ".bak",
2034+
)
2035+
except Exception as e:
2036+
return index, e
2037+
20232038
return index, None
20242039

20252040
def _save_energy_components(self, index, context, time_ns):
20262041
"""
20272042
Internal function to save the energy components for each force group to a
2028-
CSV file.
2043+
Parquet file.
20292044
20302045
Parameters
20312046
----------
@@ -2040,11 +2055,28 @@ def _save_energy_components(self, index, context, time_ns):
20402055
The current simulation time in nanoseconds.
20412056
"""
20422057

2043-
import csv as _csv
2058+
import json as _json
20442059
import openmm
2060+
import pandas as _pd
2061+
import pyarrow as _pa
2062+
import pyarrow.parquet as _pq_local
20452063

20462064
filepath = self._filenames[index]["energy_components"]
2047-
file_exists = _Path(filepath).exists()
2065+
2066+
# Lazy-initialise the last saved time for restart deduplication.
2067+
# On the first call for this window, read the existing file (if any)
2068+
# to find the maximum time already written.
2069+
if index not in self._last_ec_time:
2070+
path = _Path(filepath)
2071+
if path.exists() and path.stat().st_size > 0:
2072+
existing = _pq_local.read_table(filepath).to_pandas()
2073+
self._last_ec_time[index] = float(existing["time"].max())
2074+
else:
2075+
self._last_ec_time[index] = -1.0
2076+
2077+
# Skip rows that have already been written (restart deduplication).
2078+
if time_ns <= self._last_ec_time[index]:
2079+
return
20482080

20492081
# Use the named force groups already assigned by sire_to_openmm_system,
20502082
# sorted alphabetically for a consistent column order across runs.
@@ -2055,18 +2087,25 @@ def _save_energy_components(self, index, context, time_ns):
20552087
openmm.unit.kilocalories_per_mole
20562088
)
20572089

2058-
columns = ["time"] + list(energies.keys())
2059-
row = {"time": round(time_ns, 6)} | {
2060-
name: round(nrg, 4) for name, nrg in energies.items()
2061-
}
2090+
row = {"time": round(time_ns, 6)} | energies
2091+
df = _pd.DataFrame([row])
2092+
2093+
path = _Path(filepath)
2094+
if path.exists() and path.stat().st_size > 0:
2095+
_parquet_append(filepath, df)
2096+
else:
2097+
# First write: embed units as schema metadata under the "somd2" key,
2098+
# consistent with how the energy trajectory parquet files are written.
2099+
table = _pa.Table.from_pandas(df)
2100+
meta = _json.dumps(
2101+
{"time_units": "ns", "energy_units": "kcal/mol"}
2102+
).encode()
2103+
table = table.replace_schema_metadata(
2104+
{b"somd2": meta, **table.schema.metadata}
2105+
)
2106+
_pq_local.write_table(table, filepath)
20622107

2063-
with open(filepath, "a", newline="") as f:
2064-
writer = _csv.DictWriter(f, fieldnames=columns)
2065-
if not file_exists:
2066-
# Write a comment line with units before the header.
2067-
f.write("# time: ns, energy: kcal/mol\n")
2068-
writer.writeheader()
2069-
writer.writerow(row)
2108+
self._last_ec_time[index] = time_ns
20702109

20712110
def _restore_backup_files(self):
20722111
"""

src/somd2/runner/_repex.py

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1108,6 +1108,16 @@ def run(self):
11081108
# Whether a frame is saved at the end of the cycle.
11091109
write_gcmc_ghosts = (i + 1) % cycles_per_frame == 0
11101110

1111+
# Current simulation time in ns for energy components saving.
1112+
time_ns = (
1113+
(
1114+
self._start_block * checkpoint_frequency
1115+
+ (i + 1) * self._config.energy_frequency
1116+
).to("ns")
1117+
if self._config.save_energy_components
1118+
else None
1119+
)
1120+
11111121
# Run a dynamics block for each replica, making sure only each GPU is only
11121122
# oversubscribed by a factor of self._config.oversubscription_factor.
11131123
for j in range(num_batches):
@@ -1121,6 +1131,7 @@ def run(self):
11211131
repeat(is_gcmc),
11221132
repeat(write_gcmc_ghosts),
11231133
repeat(is_terminal_flip),
1134+
repeat(time_ns),
11241135
):
11251136
if not result:
11261137
_logger.error(
@@ -1294,6 +1305,7 @@ def _run_block(
12941305
is_gcmc=False,
12951306
write_gcmc_ghosts=False,
12961307
is_terminal_flip=False,
1308+
time_ns=None,
12971309
):
12981310
"""
12991311
Run a dynamics block for a given replica.
@@ -1321,6 +1333,10 @@ def _run_block(
13211333
Whether a terminal flip MC move should be performed before the
13221334
dynamics block.
13231335
1336+
time_ns: float or None
1337+
The current simulation time in nanoseconds, used when saving energy
1338+
components. If None, energy components are not saved.
1339+
13241340
Returns
13251341
-------
13261342
@@ -1417,6 +1433,10 @@ def _run_block(
14171433
# Save the OpenMM state.
14181434
self._dynamics_cache.save_openmm_state(index)
14191435

1436+
# Save the energy contribution for each force.
1437+
if self._config.save_energy_components and time_ns is not None:
1438+
self._save_energy_components(index, dynamics.context(), time_ns)
1439+
14201440
# Get the energy at each lambda value.
14211441
energies = dynamics._current_energy_array()
14221442

@@ -1781,12 +1801,6 @@ def _checkpoint(self, index, lambdas, block, num_blocks, is_final_block=False):
17811801
# Commit the current system.
17821802
system = dynamics.commit()
17831803

1784-
# Save the energy contribution for each force.
1785-
if self._config.save_energy_components:
1786-
self._save_energy_components(
1787-
index, dynamics.context(), system.time().to("ns")
1788-
)
1789-
17901804
# If performing GCMC, then we need to flag the ghost waters.
17911805
if gcmc_sampler is not None:
17921806
system = gcmc_sampler._flag_ghost_waters(system)

0 commit comments

Comments
 (0)