From 6b0dd45dae8f77baa5cd4f7f5d2ba36cd16578c1 Mon Sep 17 00:00:00 2001
From: Kevin Boyd <kboyd@nvidia.com>
Date: Thu, 11 Jun 2026 17:44:46 -0400
Subject: [PATCH] Improve conformer RMSD workflow ergonomics

---
 agent-skills/nvmolkit-usage/SKILL.md  |  30 +++++
 nvmolkit/conformerRmsd.py             |  95 +++++++++++----
 nvmolkit/tests/test_conformer_rmsd.py | 164 +++++++++++++++++++++++++-
 3 files changed, 261 insertions(+), 28 deletions(-)

diff --git a/agent-skills/nvmolkit-usage/SKILL.md b/agent-skills/nvmolkit-usage/SKILL.md
index 96204dc9..4ec869fe 100644
--- a/agent-skills/nvmolkit-usage/SKILL.md
+++ b/agent-skills/nvmolkit-usage/SKILL.md
@@ -227,6 +227,36 @@ Inputs are `list[Mol]` with conformers already populated (typically by ETKDG, RD
 
 If any input molecule is `None` or lacks MMFF/UFF atom types, the call raises `ValueError`. The exception's `args[1]` is a dict with keys `"none"` and `"no_params"` listing the offending indices - useful for filtering a noisy input set.
 
+### Conformer RMSD and Butina clustering
+
+```python
+import torch
+from rdkit import Chem
+from rdkit.Chem.rdDistGeom import EmbedMultipleConfs
+from nvmolkit.clustering import butina
+from nvmolkit.conformerRmsd import GetConformerRMSMatrixBatch
+
+mols = [Chem.AddHs(Chem.MolFromSmiles(smi)) for smi in ["CCCCCC", "c1ccccc1"]]
+for mol in mols:
+    EmbedMultipleConfs(mol, numConfs=10)
+
+# Remove hydrogens after embedding for heavy-atom RMSD.
+heavy_mols = [Chem.RemoveHs(mol) for mol in mols]
+
+# Default RMSD output is RDKit-compatible condensed lower-triangle form.
+condensed = GetConformerRMSMatrixBatch(heavy_mols)
+
+# Butina expects a square distance matrix, so request square GPU tensors.
+square = GetConformerRMSMatrixBatch(heavy_mols, output_format="square")
+clusters = [butina(distance_matrix, cutoff=0.5).torch() for distance_matrix in square]
+
+torch.cuda.synchronize()
+for mol_clusters in clusters:
+    print(mol_clusters.cpu().tolist())
+```
+
+`GetConformerRMSMatrix(mol)` and `GetConformerRMSMatrixBatch(mols)` default to `output_format="condensed"`, returning `AsyncGpuResult` objects that wrap RDKit-style flat vectors of length `N * (N - 1) // 2`. Use `output_format="square"` when chaining into `butina()` or any other API that expects an `N x N` distance matrix. Both forms live on the GPU; call `.numpy()` on condensed results or synchronize before moving square tensors to the CPU.
+
 ### Custom forcefield options + constraints (`BatchedForcefield`)
 
 Reach for `MMFFBatchedForcefield` / `UFFBatchedForcefield` instead of the one-shot `MMFFOptimizeMoleculesConfs` / `UFFOptimizeMoleculesConfs` when you need any of:
diff --git a/nvmolkit/conformerRmsd.py b/nvmolkit/conformerRmsd.py
index 8ad49f55..70fd3f4d 100644
--- a/nvmolkit/conformerRmsd.py
+++ b/nvmolkit/conformerRmsd.py
@@ -19,8 +19,8 @@
 
 import torch
 
-from nvmolkit._arrayHelpers import *  # noqa: F403
 from nvmolkit import _conformerRmsd
+from nvmolkit._arrayHelpers import *  # noqa: F403
 from nvmolkit.types import AsyncGpuResult
 
 if TYPE_CHECKING:
@@ -29,11 +29,32 @@
 __all__ = ["GetConformerRMSMatrix", "GetConformerRMSMatrixBatch"]
 
 
+_VALID_OUTPUT_FORMATS = frozenset({"condensed", "square"})
+
+
+def _check_output_format(output_format: str) -> None:
+    if output_format not in _VALID_OUTPUT_FORMATS:
+        raise ValueError(f"output_format must be one of {sorted(_VALID_OUTPUT_FORMATS)}, got {output_format!r}")
+
+
+def _condensed_to_square(rmsd: AsyncGpuResult, num_confs: int) -> torch.Tensor:
+    values = rmsd.torch()
+    square = torch.zeros((num_confs, num_confs), device=values.device, dtype=values.dtype)
+    if num_confs < 2:
+        return square
+
+    idx = torch.tril_indices(num_confs, num_confs, offset=-1, device=values.device)
+    square[idx[0], idx[1]] = values
+    return square + square.T
+
+
 def GetConformerRMSMatrix(
     mol: "Mol",
     prealigned: bool = False,
     stream: torch.cuda.Stream | None = None,
-) -> AsyncGpuResult:
+    *,
+    output_format: str = "condensed",
+) -> AsyncGpuResult | torch.Tensor:
     """Compute the pairwise RMSD matrix between all conformers of a molecule on GPU.
 
     GPU-accelerated equivalent of ``AllChem.GetConformerRMSMatrix(mol,
@@ -50,8 +71,9 @@ def GetConformerRMSMatrix(
     * Results are returned as an :class:`AsyncGpuResult` (device tensor) rather
       than a Python list, to keep the conformer-selection pipeline on the GPU.
 
-    The result can be passed directly to :func:`nvmolkit.clustering.butina` for
-    GPU-accelerated Butina clustering.
+    By default, the result matches RDKit's condensed lower-triangle shape.  Use
+    ``output_format="square"`` when a downstream API expects an ``N x N`` distance
+    matrix, such as :func:`nvmolkit.clustering.butina`.
 
     Args:
         mol: RDKit molecule with two or more conformers.  Strip hydrogens first
@@ -60,11 +82,19 @@ def GetConformerRMSMatrix(
         prealigned: If True, skip Kabsch alignment and compute RMSD on raw
                     coordinates.  If False (default), optimally align each pair.
         stream: CUDA stream to use.  If None, uses the current stream.
+        output_format: ``"condensed"`` returns an :class:`AsyncGpuResult` wrapping
+                       a 1-D RDKit-style lower-triangle vector.  ``"square"``
+                       returns a symmetric ``torch.Tensor`` of shape ``(N, N)``
+                       on the GPU with zeros on the diagonal.
 
     Returns:
-        AsyncGpuResult wrapping a 1-D tensor of shape ``(N*(N-1)/2,)`` containing
-        RMSD values in lower-triangle condensed order.  The RMSD for conformer
-        pair (i, j) with i > j is at index ``i*(i-1)//2 + j``.
+        With ``output_format="condensed"``, an :class:`AsyncGpuResult` wrapping a
+        1-D tensor of shape ``(N*(N-1)/2,)`` containing RMSD values in
+        lower-triangle condensed order.  The RMSD for conformer pair (i, j) with
+        i > j is at index ``i*(i-1)//2 + j``.
+
+        With ``output_format="square"``, a symmetric CUDA ``torch.Tensor`` of
+        shape ``(N, N)``.
 
     Raises:
         ValueError: If ``mol`` is None or has conformers but no atoms.
@@ -83,29 +113,32 @@ def GetConformerRMSMatrix(
         >>> # GPU equivalent of: AllChem.GetConformerRMSMatrix(no_h)
         >>> rmsd_matrix = GetConformerRMSMatrix(no_h)
         >>>
-        >>> # Reshape to square for GPU Butina clustering
-        >>> import torch
-        >>> n = no_h.GetNumConformers()
-        >>> square = torch.zeros(n, n, device='cuda', dtype=torch.float64)
-        >>> idx = torch.tril_indices(n, n, offset=-1)
-        >>> square[idx[0], idx[1]] = rmsd_matrix.torch()
-        >>> square = square + square.T
+        >>> # Request square output for GPU Butina clustering
+        >>> square = GetConformerRMSMatrix(no_h, output_format="square")
         >>> clusters = butina(square, cutoff=0.5)
     """
     if mol is None:
         raise ValueError("mol must not be None")
     if stream is not None and not isinstance(stream, torch.cuda.Stream):
         raise TypeError(f"stream must be a torch.cuda.Stream or None, got {type(stream).__name__}")
+    _check_output_format(output_format)
 
-    stream_ptr = (stream if stream is not None else torch.cuda.current_stream()).cuda_stream
-    return AsyncGpuResult(_conformerRmsd.GetConformerRMSMatrix(mol, prealigned, stream_ptr))
+    execution_stream = stream if stream is not None else torch.cuda.current_stream()
+    stream_ptr = execution_stream.cuda_stream
+    result = AsyncGpuResult(_conformerRmsd.GetConformerRMSMatrix(mol, prealigned, stream_ptr))
+    if output_format == "square":
+        with torch.cuda.stream(execution_stream):
+            return _condensed_to_square(result, mol.GetNumConformers())
+    return result
 
 
 def GetConformerRMSMatrixBatch(
     mols: list["Mol"],
     prealigned: bool = False,
     stream: torch.cuda.Stream | None = None,
-) -> list[AsyncGpuResult]:
+    *,
+    output_format: str = "condensed",
+) -> list[AsyncGpuResult] | list[torch.Tensor]:
     """Compute pairwise RMSD matrices for a batch of molecules on GPU.
 
     All molecules are processed in a single kernel launch, so their conformer
@@ -119,13 +152,21 @@ def GetConformerRMSMatrixBatch(
         prealigned: If True, skip Kabsch alignment and compute RMSD on raw
                     coordinates.  If False (default), optimally align each pair.
         stream: CUDA stream to use.  If None, uses the current stream.
+        output_format: ``"condensed"`` returns one :class:`AsyncGpuResult` per
+                       molecule, each wrapping a 1-D RDKit-style lower-triangle
+                       vector.  ``"square"`` returns one symmetric CUDA
+                       ``torch.Tensor`` of shape ``(N, N)`` per molecule.
 
     Returns:
-        List of :class:`AsyncGpuResult`, one per input molecule, in the same
-        order as ``mols``.  Each element wraps a 1-D tensor of shape
-        ``(N*(N-1)/2,)`` for a molecule with N conformers, or shape ``(0,)``
-        for molecules with fewer than 2 conformers.  The RMSD for conformer
-        pair (i, j) with i > j is at index ``i*(i-1)//2 + j``.
+        With ``output_format="condensed"``, a list of :class:`AsyncGpuResult`,
+        one per input molecule, in the same order as ``mols``.  Each element
+        wraps a 1-D tensor of shape ``(N*(N-1)/2,)`` for a molecule with N
+        conformers, or shape ``(0,)`` for molecules with fewer than 2
+        conformers.  The RMSD for conformer pair (i, j) with i > j is at index
+        ``i*(i-1)//2 + j``.
+
+        With ``output_format="square"``, a list of symmetric CUDA
+        ``torch.Tensor`` objects with shapes ``(N, N)``.
 
     Raises:
         ValueError: If any element of ``mols`` is None.
@@ -146,11 +187,17 @@ def GetConformerRMSMatrixBatch(
     """
     if stream is not None and not isinstance(stream, torch.cuda.Stream):
         raise TypeError(f"stream must be a torch.cuda.Stream or None, got {type(stream).__name__}")
+    _check_output_format(output_format)
 
     for i, mol in enumerate(mols):
         if mol is None:
             raise ValueError(f"mol at index {i} must not be None")
 
-    stream_ptr = (stream if stream is not None else torch.cuda.current_stream()).cuda_stream
+    execution_stream = stream if stream is not None else torch.cuda.current_stream()
+    stream_ptr = execution_stream.cuda_stream
     raw = _conformerRmsd.GetConformerRMSMatrixBatch(mols, prealigned, stream_ptr)
-    return [AsyncGpuResult(r) for r in raw]
+    results = [AsyncGpuResult(r) for r in raw]
+    if output_format == "square":
+        with torch.cuda.stream(execution_stream):
+            return [_condensed_to_square(result, mol.GetNumConformers()) for result, mol in zip(results, mols)]
+    return results
diff --git a/nvmolkit/tests/test_conformer_rmsd.py b/nvmolkit/tests/test_conformer_rmsd.py
index 14df9408..b4d2e8b9 100644
--- a/nvmolkit/tests/test_conformer_rmsd.py
+++ b/nvmolkit/tests/test_conformer_rmsd.py
@@ -13,15 +13,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import copy
-
+import numpy as np
 import pytest
 import torch
-import numpy as np
 from rdkit import Chem
 from rdkit.Chem import AllChem, rdDistGeom
 
 from nvmolkit.conformerRmsd import GetConformerRMSMatrix, GetConformerRMSMatrixBatch
+from nvmolkit.types import AsyncGpuResult
 
 
 def _numpy_kabsch_rmsd(p, q):
@@ -29,7 +28,7 @@ def _numpy_kabsch_rmsd(p, q):
     p_c = p - p.mean(axis=0)
     q_c = q - q.mean(axis=0)
     H = p_c.T @ q_c
-    U, S, Vt = np.linalg.svd(H)
+    _u, S, _vt = np.linalg.svd(H)
     d = np.sign(np.linalg.det(H))
     S[-1] *= d if d != 0.0 else 1.0
     Sp = np.sum(p_c**2)
@@ -132,6 +131,56 @@ def test_rmsd_two_conformers():
     assert abs(gpu_rms[0] - ref_rms[0]) < 0.01
 
 
+def test_rmsd_explicit_condensed_output_matches_default():
+    """Explicit condensed output preserves the default API behavior."""
+    mol = _embed_mol("CCCCCC", num_confs=10)
+    no_h = Chem.RemoveHs(mol)
+
+    default_result = GetConformerRMSMatrix(no_h)
+    condensed_result = GetConformerRMSMatrix(no_h, output_format="condensed")
+    torch.cuda.synchronize()
+
+    assert isinstance(condensed_result, AsyncGpuResult)
+    assert torch.allclose(condensed_result.torch(), default_result.torch(), atol=1e-10)
+
+
+def test_rmsd_square_output_matches_condensed():
+    """Square output expands the default RDKit-style condensed vector."""
+    mol = _embed_mol("CCCCCC", num_confs=10)
+    no_h = Chem.RemoveHs(mol)
+    n = no_h.GetNumConformers()
+
+    condensed = GetConformerRMSMatrix(no_h).torch()
+    square = GetConformerRMSMatrix(no_h, output_format="square")
+    torch.cuda.synchronize()
+
+    expected = torch.zeros((n, n), device=square.device, dtype=square.dtype)
+    idx = torch.tril_indices(n, n, offset=-1, device=square.device)
+    expected[idx[0], idx[1]] = condensed
+    expected = expected + expected.T
+
+    assert isinstance(square, torch.Tensor)
+    assert square.shape == (n, n)
+    assert torch.allclose(square, expected, atol=1e-10)
+    assert torch.allclose(torch.diag(square), torch.zeros(n, device=square.device, dtype=square.dtype))
+
+
+def test_rmsd_square_output_for_fewer_than_two_conformers():
+    """Square output handles the same <2 conformer inputs as condensed output."""
+    mol_zero = Chem.MolFromSmiles("CCO")
+    mol_one = Chem.RemoveHs(_embed_mol("CCO", num_confs=1))
+
+    square_zero = GetConformerRMSMatrix(mol_zero, output_format="square")
+    square_one = GetConformerRMSMatrix(mol_one, output_format="square")
+    torch.cuda.synchronize()
+
+    assert isinstance(square_zero, torch.Tensor)
+    assert square_zero.shape == (0, 0)
+    assert isinstance(square_one, torch.Tensor)
+    assert square_one.shape == (1, 1)
+    assert square_one.item() == 0.0
+
+
 def test_rmsd_rigid_molecule():
     """Rigid molecule (benzene) — all conformers should have near-zero RMSD."""
     mol = _embed_mol("c1ccccc1", num_confs=5)
@@ -160,6 +209,25 @@ def test_rmsd_explicit_stream():
         assert abs(g - r) < 0.01
 
 
+def test_rmsd_square_output_explicit_stream():
+    """Square conversion is enqueued on the caller-provided stream."""
+    mol = _embed_mol("CCCCCC", num_confs=10)
+    no_h = Chem.RemoveHs(mol)
+
+    s = torch.cuda.Stream()
+    square = GetConformerRMSMatrix(no_h, stream=s, output_format="square")
+    s.synchronize()
+
+    ref_rms = _numpy_rmsd_matrix(no_h)
+    n = no_h.GetNumConformers()
+    expected = torch.zeros((n, n), device=square.device, dtype=square.dtype)
+    idx = torch.tril_indices(n, n, offset=-1, device=square.device)
+    expected[idx[0], idx[1]] = torch.tensor(ref_rms, device=square.device, dtype=square.dtype)
+    expected = expected + expected.T
+
+    assert torch.allclose(square, expected, atol=0.01)
+
+
 def test_rmsd_invalid_input_none():
     """None molecule should raise ValueError."""
     with pytest.raises(ValueError, match="mol must not be None"):
@@ -181,6 +249,22 @@ def test_rmsd_invalid_stream_type():
         GetConformerRMSMatrix(no_h, stream=42)
 
 
+def test_rmsd_invalid_output_format():
+    """Unknown output_format should fail before dispatch."""
+    mol = _embed_mol("CCCC", num_confs=2)
+    no_h = Chem.RemoveHs(mol)
+    with pytest.raises(ValueError, match="output_format must be one of"):
+        GetConformerRMSMatrix(no_h, output_format="matrix")
+
+
+def test_rmsd_output_format_is_keyword_only():
+    """output_format is keyword-only to avoid ambiguous positional calls."""
+    mol = _embed_mol("CCCC", num_confs=2)
+    no_h = Chem.RemoveHs(mol)
+    with pytest.raises(TypeError):
+        GetConformerRMSMatrix(no_h, False, None, "square")
+
+
 # ---------------------------------------------------------------------------
 # Batch API tests
 # ---------------------------------------------------------------------------
@@ -220,6 +304,45 @@ def test_batch_mixed_conformer_counts():
     assert results[2].numpy().shape[0] == 0
 
 
+def test_batch_explicit_condensed_output_matches_default():
+    """Explicit batch condensed output preserves the default API behavior."""
+    mols = [Chem.RemoveHs(_embed_mol(s, num_confs=5)) for s in ["CCCC", "CCCCC"]]
+
+    default_results = GetConformerRMSMatrixBatch(mols)
+    condensed_results = GetConformerRMSMatrixBatch(mols, output_format="condensed")
+    torch.cuda.synchronize()
+
+    for default_result, condensed_result in zip(default_results, condensed_results):
+        assert isinstance(condensed_result, AsyncGpuResult)
+        assert torch.allclose(condensed_result.torch(), default_result.torch(), atol=1e-10)
+
+
+def test_batch_square_output():
+    """Batch square output returns one NxN tensor per molecule."""
+    mols = [
+        Chem.RemoveHs(_embed_mol("CCCCCC", num_confs=8)),
+        Chem.RemoveHs(_embed_mol("CC", num_confs=3)),
+        Chem.MolFromSmiles("CCO"),
+        Chem.RemoveHs(_embed_mol("CCO", num_confs=1)),
+    ]
+
+    condensed_results = GetConformerRMSMatrixBatch(mols)
+    square_results = GetConformerRMSMatrixBatch(mols, output_format="square")
+    torch.cuda.synchronize()
+
+    for mol, condensed_result, square in zip(mols, condensed_results, square_results):
+        n = mol.GetNumConformers()
+        expected = torch.zeros((n, n), device=square.device, dtype=square.dtype)
+        if n >= 2:
+            idx = torch.tril_indices(n, n, offset=-1, device=square.device)
+            expected[idx[0], idx[1]] = condensed_result.torch()
+            expected = expected + expected.T
+
+        assert isinstance(square, torch.Tensor)
+        assert square.shape == (n, n)
+        assert torch.allclose(square, expected, atol=1e-10)
+
+
 def test_batch_empty_list():
     """Empty input returns an empty list."""
     results = GetConformerRMSMatrixBatch([])
@@ -251,6 +374,20 @@ def test_batch_invalid_none():
         GetConformerRMSMatrixBatch([mol, None])
 
 
+def test_batch_invalid_output_format():
+    """Unknown batch output_format should fail before dispatch."""
+    mol = Chem.RemoveHs(_embed_mol("CCCC", num_confs=2))
+    with pytest.raises(ValueError, match="output_format must be one of"):
+        GetConformerRMSMatrixBatch([mol], output_format="matrix")
+
+
+def test_batch_output_format_is_keyword_only():
+    """Batch output_format is keyword-only to avoid ambiguous positional calls."""
+    mol = Chem.RemoveHs(_embed_mol("CCCC", num_confs=2))
+    with pytest.raises(TypeError):
+        GetConformerRMSMatrixBatch([mol], False, None, "square")
+
+
 def test_batch_explicit_stream():
     """Batch results are correct on an explicit CUDA stream."""
     mols = [Chem.RemoveHs(_embed_mol(s, num_confs=5)) for s in ["CCCC", "CCCCC"]]
@@ -266,6 +403,25 @@ def test_batch_explicit_stream():
             assert abs(g - r) < 0.01
 
 
+def test_batch_square_output_explicit_stream():
+    """Batch square conversion is enqueued on the caller-provided stream."""
+    mols = [Chem.RemoveHs(_embed_mol(s, num_confs=5)) for s in ["CCCC", "CCCCC"]]
+
+    s = torch.cuda.Stream()
+    squares = GetConformerRMSMatrixBatch(mols, stream=s, output_format="square")
+    s.synchronize()
+
+    for mol, square in zip(mols, squares):
+        ref_rms = _numpy_rmsd_matrix(mol)
+        n = mol.GetNumConformers()
+        expected = torch.zeros((n, n), device=square.device, dtype=square.dtype)
+        idx = torch.tril_indices(n, n, offset=-1, device=square.device)
+        expected[idx[0], idx[1]] = torch.tensor(ref_rms, device=square.device, dtype=square.dtype)
+        expected = expected + expected.T
+
+        assert torch.allclose(square, expected, atol=0.01)
+
+
 def test_rmsd_zero_atoms():
     """0-atom molecule with multiple conformers raises ValueError.