88
99from __future__ import annotations
1010
11+ import bisect
1112import glob
1213import heapq
1314import json
@@ -1215,9 +1216,30 @@ def __init__(self) -> None:
12151216 self ._ref_energy_cache : float | None = _UNSET # type: ignore[assignment]
12161217 self .last_iteration : int = 0
12171218
1219+ # ── Energy-sorted indexes for O(log N) window queries ──────────────
1220+ # Each list stores (energy, id) tuples in ascending energy order,
1221+ # maintained by bisect.insort on every add_node / add_edge call.
1222+ # Nodes/edges whose energy is None are tracked separately in the
1223+ # _*_none_ids lists and always included in window query results
1224+ # (mirroring the original behaviour of skipping the energy pre-filter
1225+ # when either energy is None).
1226+ # The _index_dirty flag is raised by invalidate_energy_cache() and
1227+ # causes a full rebuild on the next window-query call, covering the
1228+ # case where backfill changes an energy from None to a real value.
1229+ self ._node_energy_index : list [tuple [float , int ]] = []
1230+ self ._node_none_ids : list [int ] = []
1231+ self ._edge_energy_index : list [tuple [float , int ]] = []
1232+ self ._edge_none_ids : list [int ] = []
1233+ self ._index_dirty : bool = False
1234+
12181235 def add_node (self , node : EQNode ) -> None :
12191236 self ._nodes [node .node_id ] = node
12201237 self ._ref_energy_cache = _UNSET # type: ignore[assignment]
1238+ # Maintain energy index incrementally.
1239+ if node .energy is not None :
1240+ bisect .insort (self ._node_energy_index , (node .energy , node .node_id ))
1241+ else :
1242+ self ._node_none_ids .append (node .node_id )
12211243
12221244 def get_node (self , node_id : int ) -> EQNode | None :
12231245 return self ._nodes .get (node_id )
@@ -1232,6 +1254,11 @@ def next_node_id(self) -> int:
12321254
12331255 def add_edge (self , edge : TSEdge ) -> None :
12341256 self ._edges [edge .edge_id ] = edge
1257+ # Maintain energy index incrementally.
1258+ if edge .ts_energy is not None :
1259+ bisect .insort (self ._edge_energy_index , (edge .ts_energy , edge .edge_id ))
1260+ else :
1261+ self ._edge_none_ids .append (edge .edge_id )
12351262
12361263 def all_edges (self ) -> list [TSEdge ]:
12371264 return list (self ._edges .values ())
@@ -1250,8 +1277,15 @@ def invalidate_energy_cache(self) -> None:
12501277
12511278 This method provides a public, encapsulated alternative to directly
12521279 manipulating the internal ``_ref_energy_cache`` sentinel.
1280+
1281+ Also marks the energy-sorted indexes as dirty so that the next call to
1282+ :meth:`nodes_in_energy_window` or :meth:`edges_in_energy_window`
1283+ triggers a full rebuild. This covers the backfill case where a node's
1284+ energy transitions from ``None`` to a real value, which would otherwise
1285+ leave the node stranded in ``_node_none_ids`` without a sorted entry.
12531286 """
12541287 self ._ref_energy_cache = _UNSET # type: ignore[assignment]
1288+ self ._index_dirty = True
12551289
12561290 def reference_energy (self ) -> float | None :
12571291 """Return the unified reference energy for priority/Boltzmann calculations.
@@ -1361,6 +1395,127 @@ def load(self, filepath: str) -> None:
13611395
13621396 self .last_iteration = data .get ("metadata" , {}).get ("last_iteration" , 0 )
13631397
1398+ # ------------------------------------------------------------------
1399+ # Energy-window index helpers
1400+ # ------------------------------------------------------------------
1401+
1402+ def _rebuild_indexes (self ) -> None :
1403+ """Rebuild both energy-sorted indexes from scratch.
1404+
1405+ Called automatically by the window-query methods when
1406+ ``_index_dirty`` is ``True``. The most common trigger is
1407+ :meth:`invalidate_energy_cache` after a backfill pass in
1408+ ``_flush_node_energy_updates``, where node energies may have
1409+ transitioned from ``None`` to real values.
1410+
1411+ Complexity: O(N log N) for nodes + O(E log E) for edges.
1412+ Called at most once per mapper iteration (the next backfill
1413+ flush) so the amortised cost per ``_process_profile`` call
1414+ remains O(log N) in steady state.
1415+ """
1416+ self ._node_energy_index = []
1417+ self ._node_none_ids = []
1418+ for node in self ._nodes .values ():
1419+ if node .energy is not None :
1420+ bisect .insort (self ._node_energy_index , (node .energy , node .node_id ))
1421+ else :
1422+ self ._node_none_ids .append (node .node_id )
1423+
1424+ self ._edge_energy_index = []
1425+ self ._edge_none_ids = []
1426+ for edge in self ._edges .values ():
1427+ if edge .ts_energy is not None :
1428+ bisect .insort (self ._edge_energy_index , (edge .ts_energy , edge .edge_id ))
1429+ else :
1430+ self ._edge_none_ids .append (edge .edge_id )
1431+
1432+ self ._index_dirty = False
1433+
1434+ def nodes_in_energy_window (
1435+ self ,
1436+ center : float ,
1437+ tol : float ,
1438+ ) -> list [EQNode ]:
1439+ """Return EQ nodes whose energy lies within *center* ± *tol*.
1440+
1441+ Uses a binary-search on the internal sorted index for O(log N +
1442+ k) complexity, where k is the number of nodes in the window.
1443+ Nodes with ``energy = None`` are always included (the energy
1444+ pre-filter is not applied to them in the caller either).
1445+
1446+ Parameters
1447+ ----------
1448+ center:
1449+ The query energy in Hartree (typically the incoming node's
1450+ electronic energy).
1451+ tol:
1452+ Half-width of the energy window in Hartree (``self.energy_tolerance``
1453+ in :class:`ReactionMapper`).
1454+
1455+ Returns
1456+ -------
1457+ list[EQNode]
1458+ Candidate nodes to check. May be empty.
1459+ """
1460+ if self ._index_dirty :
1461+ self ._rebuild_indexes ()
1462+
1463+ lo = bisect .bisect_left (self ._node_energy_index , (center - tol , - 1 ))
1464+ hi = bisect .bisect_right (self ._node_energy_index , (center + tol , float ("inf" )))
1465+
1466+ result : list [EQNode ] = [
1467+ self ._nodes [nid ]
1468+ for _ , nid in self ._node_energy_index [lo :hi ]
1469+ if nid in self ._nodes
1470+ ]
1471+ # Always include nodes whose energy is None.
1472+ for nid in self ._node_none_ids :
1473+ node = self ._nodes .get (nid )
1474+ if node is not None :
1475+ result .append (node )
1476+ return result
1477+
1478+ def edges_in_energy_window (
1479+ self ,
1480+ center : float ,
1481+ tol : float ,
1482+ ) -> list [TSEdge ]:
1483+ """Return TS edges whose ``ts_energy`` lies within *center* ± *tol*.
1484+
1485+ Analogous to :meth:`nodes_in_energy_window` but for
1486+ :class:`TSEdge` objects. Edges without parsed energies
1487+ (``ts_energy = None``) are always included.
1488+
1489+ Parameters
1490+ ----------
1491+ center:
1492+ The incoming TS energy in Hartree.
1493+ tol:
1494+ Half-width of the energy window in Hartree.
1495+
1496+ Returns
1497+ -------
1498+ list[TSEdge]
1499+ Candidate edges to check for geometric duplication.
1500+ """
1501+ if self ._index_dirty :
1502+ self ._rebuild_indexes ()
1503+
1504+ lo = bisect .bisect_left (self ._edge_energy_index , (center - tol , - 1 ))
1505+ hi = bisect .bisect_right (self ._edge_energy_index , (center + tol , float ("inf" )))
1506+
1507+ result : list [TSEdge ] = [
1508+ self ._edges [eid ]
1509+ for _ , eid in self ._edge_energy_index [lo :hi ]
1510+ if eid in self ._edges
1511+ ]
1512+ # Always include edges whose ts_energy is None.
1513+ for eid in self ._edge_none_ids :
1514+ edge = self ._edges .get (eid )
1515+ if edge is not None :
1516+ result .append (edge )
1517+ return result
1518+
13641519 def summary (self ) -> str :
13651520 lines = [f"[NetworkGraph] nodes={ len (self ._nodes )} edges={ len (self ._edges )} " ]
13661521 ref = self .reference_energy ()
@@ -2449,21 +2604,21 @@ def _process_profile(self, profile_dir: str, run_dir: str) -> None:
24492604
24502605 # ── Step 3: TS duplicate check (TS-vs-TS only) ────────────────────
24512606 if ts_coords .size > 0 :
2452- for existing_edge in self .graph .all_edges ():
2607+ # Narrow candidates to edges within the energy window.
2608+ # When ts_energy is None (profile parse failed) fall back to
2609+ # all_edges() so those structures still undergo geometric check.
2610+ ts_candidates : list [TSEdge ] = (
2611+ self .graph .edges_in_energy_window (ts_energy , self .energy_tolerance )
2612+ if ts_energy is not None
2613+ else self .graph .all_edges ()
2614+ )
2615+ for existing_edge in ts_candidates :
24532616 if not existing_edge .has_coords :
24542617 continue
24552618
2456- # Energy pre-filter: skip RMSD computation when energies
2457- # differ by more than the tolerance (fast path).
2458- # When either energy is None the filter is skipped so that
2459- # structures without parsed energies still undergo the
2460- # geometric check.
2461- if ts_energy is not None and existing_edge .ts_energy is not None :
2462- energy_diff = abs (ts_energy - existing_edge .ts_energy )
2463- if energy_diff >= self .energy_tolerance :
2464- continue
2465-
2466- # Geometric check via RMSD
2619+ # Geometric check via RMSD.
2620+ # Energy pre-filter is already enforced by the index window,
2621+ # so we proceed directly to the (more expensive) RMSD test.
24672622 if self .checker .are_similar (
24682623 ts_sym , ts_coords ,
24692624 existing_edge .symbols , existing_edge .coords ,
@@ -2603,15 +2758,24 @@ def _find_or_register_node(
26032758 )
26042759 return None
26052760
2606- # Search only EQNode objects (graph.all_nodes) — never TSEdge objects.
2607- # Nodes listed in excluded_node_ids are also skipped from the comparison
2608- # pool so that a structure identical to an excluded node is treated as a
2609- # new, independent node rather than being collapsed into the excluded one.
2610- # Exception: nodes excluded automatically by the bond-rearrangement filter
2611- # (_bond_rearrangement_excluded_ids) are still included in the comparison
2612- # pool — they serve as valid deduplication references even though they are
2613- # not explored further.
2614- for existing in self .graph .all_nodes ():
2761+ # Narrow the candidate pool to nodes within the energy window.
2762+ # When the incoming energy is None we fall back to all_nodes() so
2763+ # that structures without parsed energies still undergo the full
2764+ # geometric check (matching the original behaviour).
2765+ # Nodes listed in excluded_node_ids are also skipped from the
2766+ # comparison pool so that a structure identical to an excluded node
2767+ # is treated as a new, independent node rather than being collapsed
2768+ # into the excluded one.
2769+ # Exception: nodes excluded automatically by the bond-rearrangement
2770+ # filter (_bond_rearrangement_excluded_ids) are still included in
2771+ # the comparison pool — they serve as valid deduplication references
2772+ # even though they are not explored further.
2773+ candidates : list [EQNode ] = (
2774+ self .graph .nodes_in_energy_window (energy , self .energy_tolerance )
2775+ if energy is not None
2776+ else self .graph .all_nodes ()
2777+ )
2778+ for existing in candidates :
26152779 if (
26162780 existing .node_id in self .excluded_node_ids
26172781 and existing .node_id not in self ._bond_rearrangement_excluded_ids
@@ -2628,20 +2792,10 @@ def _find_or_register_node(
26282792 )
26292793 continue
26302794
2631- # Energy pre-filter (fast path). Skipped when either energy is None.
2632- if energy is not None and existing .energy is not None :
2633- delta_ha = abs (energy - existing .energy )
2634- if delta_ha > self .energy_tolerance :
2635- logger .debug (
2636- "_find_or_register_node: EQ%d rejected by energy pre-filter "
2637- "(|ΔE|=%.6f Ha = %.3f kcal/mol > tol=%.6f Ha = %.3f kcal/mol)." ,
2638- existing .node_id ,
2639- delta_ha , delta_ha * HARTREE_TO_KCALMOL ,
2640- self .energy_tolerance ,
2641- self .energy_tolerance * HARTREE_TO_KCALMOL ,
2642- )
2643- continue
2644- elif energy is None or existing .energy is None :
2795+ # Energy pre-filter log — the window already enforces the tolerance
2796+ # for nodes with energy, so this branch fires only for the None-energy
2797+ # fallback path (both sides None) or cross-None cases.
2798+ if energy is None or existing .energy is None :
26452799 logger .debug (
26462800 "_find_or_register_node: energy pre-filter skipped for EQ%d "
26472801 "(new_E=%s existing_E=%s)." ,
0 commit comments