Skip to content

Commit 8d30e0e

Browse files
authored
Add files via upload
1 parent 86c4d65 commit 8d30e0e

1 file changed

Lines changed: 189 additions & 35 deletions

File tree

multioptpy/Wrapper/mapper.py

Lines changed: 189 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
from __future__ import annotations
1010

11+
import bisect
1112
import glob
1213
import heapq
1314
import json
@@ -1215,9 +1216,30 @@ def __init__(self) -> None:
12151216
self._ref_energy_cache: float | None = _UNSET # type: ignore[assignment]
12161217
self.last_iteration: int = 0
12171218

1219+
# ── Energy-sorted indexes for O(log N) window queries ──────────────
1220+
# Each list stores (energy, id) tuples in ascending energy order,
1221+
# maintained by bisect.insort on every add_node / add_edge call.
1222+
# Nodes/edges whose energy is None are tracked separately in the
1223+
# _*_none_ids lists and always included in window query results
1224+
# (mirroring the original behaviour of skipping the energy pre-filter
1225+
# when either energy is None).
1226+
# The _index_dirty flag is raised by invalidate_energy_cache() and
1227+
# causes a full rebuild on the next window-query call, covering the
1228+
# case where backfill changes an energy from None to a real value.
1229+
self._node_energy_index: list[tuple[float, int]] = []
1230+
self._node_none_ids: list[int] = []
1231+
self._edge_energy_index: list[tuple[float, int]] = []
1232+
self._edge_none_ids: list[int] = []
1233+
self._index_dirty: bool = False
1234+
12181235
def add_node(self, node: EQNode) -> None:
12191236
self._nodes[node.node_id] = node
12201237
self._ref_energy_cache = _UNSET # type: ignore[assignment]
1238+
# Maintain energy index incrementally.
1239+
if node.energy is not None:
1240+
bisect.insort(self._node_energy_index, (node.energy, node.node_id))
1241+
else:
1242+
self._node_none_ids.append(node.node_id)
12211243

12221244
def get_node(self, node_id: int) -> EQNode | None:
12231245
return self._nodes.get(node_id)
@@ -1232,6 +1254,11 @@ def next_node_id(self) -> int:
12321254

12331255
def add_edge(self, edge: TSEdge) -> None:
12341256
self._edges[edge.edge_id] = edge
1257+
# Maintain energy index incrementally.
1258+
if edge.ts_energy is not None:
1259+
bisect.insort(self._edge_energy_index, (edge.ts_energy, edge.edge_id))
1260+
else:
1261+
self._edge_none_ids.append(edge.edge_id)
12351262

12361263
def all_edges(self) -> list[TSEdge]:
12371264
return list(self._edges.values())
@@ -1250,8 +1277,15 @@ def invalidate_energy_cache(self) -> None:
12501277
12511278
This method provides a public, encapsulated alternative to directly
12521279
manipulating the internal ``_ref_energy_cache`` sentinel.
1280+
1281+
Also marks the energy-sorted indexes as dirty so that the next call to
1282+
:meth:`nodes_in_energy_window` or :meth:`edges_in_energy_window`
1283+
triggers a full rebuild. This covers the backfill case where a node's
1284+
energy transitions from ``None`` to a real value, which would otherwise
1285+
leave the node stranded in ``_node_none_ids`` without a sorted entry.
12531286
"""
12541287
self._ref_energy_cache = _UNSET # type: ignore[assignment]
1288+
self._index_dirty = True
12551289

12561290
def reference_energy(self) -> float | None:
12571291
"""Return the unified reference energy for priority/Boltzmann calculations.
@@ -1361,6 +1395,127 @@ def load(self, filepath: str) -> None:
13611395

13621396
self.last_iteration = data.get("metadata", {}).get("last_iteration", 0)
13631397

1398+
# ------------------------------------------------------------------
1399+
# Energy-window index helpers
1400+
# ------------------------------------------------------------------
1401+
1402+
def _rebuild_indexes(self) -> None:
1403+
"""Rebuild both energy-sorted indexes from scratch.
1404+
1405+
Called automatically by the window-query methods when
1406+
``_index_dirty`` is ``True``. The most common trigger is
1407+
:meth:`invalidate_energy_cache` after a backfill pass in
1408+
``_flush_node_energy_updates``, where node energies may have
1409+
transitioned from ``None`` to real values.
1410+
1411+
Complexity: O(N log N) for nodes + O(E log E) for edges.
1412+
Called at most once per mapper iteration (the next backfill
1413+
flush) so the amortised cost per ``_process_profile`` call
1414+
remains O(log N) in steady state.
1415+
"""
1416+
self._node_energy_index = []
1417+
self._node_none_ids = []
1418+
for node in self._nodes.values():
1419+
if node.energy is not None:
1420+
bisect.insort(self._node_energy_index, (node.energy, node.node_id))
1421+
else:
1422+
self._node_none_ids.append(node.node_id)
1423+
1424+
self._edge_energy_index = []
1425+
self._edge_none_ids = []
1426+
for edge in self._edges.values():
1427+
if edge.ts_energy is not None:
1428+
bisect.insort(self._edge_energy_index, (edge.ts_energy, edge.edge_id))
1429+
else:
1430+
self._edge_none_ids.append(edge.edge_id)
1431+
1432+
self._index_dirty = False
1433+
1434+
def nodes_in_energy_window(
1435+
self,
1436+
center: float,
1437+
tol: float,
1438+
) -> list[EQNode]:
1439+
"""Return EQ nodes whose energy lies within *center* ± *tol*.
1440+
1441+
Uses a binary-search on the internal sorted index for O(log N +
1442+
k) complexity, where k is the number of nodes in the window.
1443+
Nodes with ``energy = None`` are always included (the energy
1444+
pre-filter is not applied to them in the caller either).
1445+
1446+
Parameters
1447+
----------
1448+
center:
1449+
The query energy in Hartree (typically the incoming node's
1450+
electronic energy).
1451+
tol:
1452+
Half-width of the energy window in Hartree (``self.energy_tolerance``
1453+
in :class:`ReactionMapper`).
1454+
1455+
Returns
1456+
-------
1457+
list[EQNode]
1458+
Candidate nodes to check. May be empty.
1459+
"""
1460+
if self._index_dirty:
1461+
self._rebuild_indexes()
1462+
1463+
lo = bisect.bisect_left(self._node_energy_index, (center - tol, -1))
1464+
hi = bisect.bisect_right(self._node_energy_index, (center + tol, float("inf")))
1465+
1466+
result: list[EQNode] = [
1467+
self._nodes[nid]
1468+
for _, nid in self._node_energy_index[lo:hi]
1469+
if nid in self._nodes
1470+
]
1471+
# Always include nodes whose energy is None.
1472+
for nid in self._node_none_ids:
1473+
node = self._nodes.get(nid)
1474+
if node is not None:
1475+
result.append(node)
1476+
return result
1477+
1478+
def edges_in_energy_window(
1479+
self,
1480+
center: float,
1481+
tol: float,
1482+
) -> list[TSEdge]:
1483+
"""Return TS edges whose ``ts_energy`` lies within *center* ± *tol*.
1484+
1485+
Analogous to :meth:`nodes_in_energy_window` but for
1486+
:class:`TSEdge` objects. Edges without parsed energies
1487+
(``ts_energy = None``) are always included.
1488+
1489+
Parameters
1490+
----------
1491+
center:
1492+
The incoming TS energy in Hartree.
1493+
tol:
1494+
Half-width of the energy window in Hartree.
1495+
1496+
Returns
1497+
-------
1498+
list[TSEdge]
1499+
Candidate edges to check for geometric duplication.
1500+
"""
1501+
if self._index_dirty:
1502+
self._rebuild_indexes()
1503+
1504+
lo = bisect.bisect_left(self._edge_energy_index, (center - tol, -1))
1505+
hi = bisect.bisect_right(self._edge_energy_index, (center + tol, float("inf")))
1506+
1507+
result: list[TSEdge] = [
1508+
self._edges[eid]
1509+
for _, eid in self._edge_energy_index[lo:hi]
1510+
if eid in self._edges
1511+
]
1512+
# Always include edges whose ts_energy is None.
1513+
for eid in self._edge_none_ids:
1514+
edge = self._edges.get(eid)
1515+
if edge is not None:
1516+
result.append(edge)
1517+
return result
1518+
13641519
def summary(self) -> str:
13651520
lines = [f"[NetworkGraph] nodes={len(self._nodes)} edges={len(self._edges)}"]
13661521
ref = self.reference_energy()
@@ -2449,21 +2604,21 @@ def _process_profile(self, profile_dir: str, run_dir: str) -> None:
24492604

24502605
# ── Step 3: TS duplicate check (TS-vs-TS only) ────────────────────
24512606
if ts_coords.size > 0:
2452-
for existing_edge in self.graph.all_edges():
2607+
# Narrow candidates to edges within the energy window.
2608+
# When ts_energy is None (profile parse failed) fall back to
2609+
# all_edges() so those structures still undergo geometric check.
2610+
ts_candidates: list[TSEdge] = (
2611+
self.graph.edges_in_energy_window(ts_energy, self.energy_tolerance)
2612+
if ts_energy is not None
2613+
else self.graph.all_edges()
2614+
)
2615+
for existing_edge in ts_candidates:
24532616
if not existing_edge.has_coords:
24542617
continue
24552618

2456-
# Energy pre-filter: skip RMSD computation when energies
2457-
# differ by more than the tolerance (fast path).
2458-
# When either energy is None the filter is skipped so that
2459-
# structures without parsed energies still undergo the
2460-
# geometric check.
2461-
if ts_energy is not None and existing_edge.ts_energy is not None:
2462-
energy_diff = abs(ts_energy - existing_edge.ts_energy)
2463-
if energy_diff >= self.energy_tolerance:
2464-
continue
2465-
2466-
# Geometric check via RMSD
2619+
# Geometric check via RMSD.
2620+
# Energy pre-filter is already enforced by the index window,
2621+
# so we proceed directly to the (more expensive) RMSD test.
24672622
if self.checker.are_similar(
24682623
ts_sym, ts_coords,
24692624
existing_edge.symbols, existing_edge.coords,
@@ -2603,15 +2758,24 @@ def _find_or_register_node(
26032758
)
26042759
return None
26052760

2606-
# Search only EQNode objects (graph.all_nodes) — never TSEdge objects.
2607-
# Nodes listed in excluded_node_ids are also skipped from the comparison
2608-
# pool so that a structure identical to an excluded node is treated as a
2609-
# new, independent node rather than being collapsed into the excluded one.
2610-
# Exception: nodes excluded automatically by the bond-rearrangement filter
2611-
# (_bond_rearrangement_excluded_ids) are still included in the comparison
2612-
# pool — they serve as valid deduplication references even though they are
2613-
# not explored further.
2614-
for existing in self.graph.all_nodes():
2761+
# Narrow the candidate pool to nodes within the energy window.
2762+
# When the incoming energy is None we fall back to all_nodes() so
2763+
# that structures without parsed energies still undergo the full
2764+
# geometric check (matching the original behaviour).
2765+
# Nodes listed in excluded_node_ids are also skipped from the
2766+
# comparison pool so that a structure identical to an excluded node
2767+
# is treated as a new, independent node rather than being collapsed
2768+
# into the excluded one.
2769+
# Exception: nodes excluded automatically by the bond-rearrangement
2770+
# filter (_bond_rearrangement_excluded_ids) are still included in
2771+
# the comparison pool — they serve as valid deduplication references
2772+
# even though they are not explored further.
2773+
candidates: list[EQNode] = (
2774+
self.graph.nodes_in_energy_window(energy, self.energy_tolerance)
2775+
if energy is not None
2776+
else self.graph.all_nodes()
2777+
)
2778+
for existing in candidates:
26152779
if (
26162780
existing.node_id in self.excluded_node_ids
26172781
and existing.node_id not in self._bond_rearrangement_excluded_ids
@@ -2628,20 +2792,10 @@ def _find_or_register_node(
26282792
)
26292793
continue
26302794

2631-
# Energy pre-filter (fast path). Skipped when either energy is None.
2632-
if energy is not None and existing.energy is not None:
2633-
delta_ha = abs(energy - existing.energy)
2634-
if delta_ha > self.energy_tolerance:
2635-
logger.debug(
2636-
"_find_or_register_node: EQ%d rejected by energy pre-filter "
2637-
"(|ΔE|=%.6f Ha = %.3f kcal/mol > tol=%.6f Ha = %.3f kcal/mol).",
2638-
existing.node_id,
2639-
delta_ha, delta_ha * HARTREE_TO_KCALMOL,
2640-
self.energy_tolerance,
2641-
self.energy_tolerance * HARTREE_TO_KCALMOL,
2642-
)
2643-
continue
2644-
elif energy is None or existing.energy is None:
2795+
# Energy pre-filter log — the window already enforces the tolerance
2796+
# for nodes with energy, so this branch fires only for the None-energy
2797+
# fallback path (both sides None) or cross-None cases.
2798+
if energy is None or existing.energy is None:
26452799
logger.debug(
26462800
"_find_or_register_node: energy pre-filter skipped for EQ%d "
26472801
"(new_E=%s existing_E=%s).",

0 commit comments

Comments
 (0)