NVIDIA-BioNeMo · scal444 · Jun 11, 2026 · Jun 11, 2026
diff --git a/nvmolkit/clustering.py b/nvmolkit/clustering.py
@@ -32,14 +32,24 @@
 
 from nvmolkit import _clustering
 from nvmolkit._arrayHelpers import *  # noqa: F403
-from nvmolkit._fusedButina import extract_cluster_and_singletons, update_neighbor_counts, _check_fingerprint_matrix
-from nvmolkit.types import AsyncGpuResult
+from nvmolkit._fusedButina import _check_fingerprint_matrix, extract_cluster_and_singletons, update_neighbor_counts
+from nvmolkit.types import ArrayInput, AsyncGpuResult, _as_cuda_tensor, _resolve_cuda_stream, _validate_cuda_stream
 
 _VALID_NEIGHBORLIST_SIZES = frozenset({8, 16, 24, 32, 64, 128})
 
 
+def _check_distance_matrix(name: str, x: torch.Tensor) -> torch.Tensor:
+    if x.ndim != 2 or x.shape[0] != x.shape[1]:
+        raise ValueError(f"{name} must be a square 2D matrix, got shape={tuple(x.shape)}")
+    if x.dtype != torch.float64:
+        raise ValueError(f"{name} must have dtype float64")
+    if not x.is_contiguous():
+        x = x.contiguous()
+    return x
+
+
 def butina(
-    distance_matrix: AsyncGpuResult | torch.Tensor,
+    distance_matrix: ArrayInput,
     cutoff: float,
     neighborlist_max_size: int = 64,
     return_centroids: bool = False,
@@ -56,7 +66,9 @@ def butina(
 
     Args:
         distance_matrix: Square distance matrix of shape (N, N) where N is the number
-                        of items. Can be an AsyncGpuResult or torch.Tensor on GPU.
+                        of items. Can be an AsyncGpuResult, torch.Tensor, or numpy.ndarray.
+                        CPU tensors and NumPy arrays are copied to CUDA. Inputs
+                        must have dtype float64.
         cutoff: Distance threshold for clustering. Items are neighbors if their
                 distance is less than this cutoff.
         neighborlist_max_size: Maximum size of the neighborlist used for small cluster
@@ -80,24 +92,25 @@ def butina(
         raise ValueError(
             f"neighborlist_max_size must be one of {sorted(_VALID_NEIGHBORLIST_SIZES)}, got {neighborlist_max_size}"
         )
-    if stream is not None and not isinstance(stream, torch.cuda.Stream):
-        raise TypeError(f"stream must be a torch.cuda.Stream or None, got {type(stream).__name__}")
-    stream_ptr = (stream if stream is not None else torch.cuda.current_stream()).cuda_stream
-    result = _clustering.butina(
-        distance_matrix.__cuda_array_interface__,
-        cutoff,
-        neighborlist_max_size,
-        return_centroids,
-        stream_ptr,
-    )
+    active_stream = _resolve_cuda_stream(stream, distance_matrix)
+    with torch.cuda.stream(active_stream):
+        distance_matrix_tensor = _as_cuda_tensor("distance_matrix", distance_matrix, stream=active_stream)
+        distance_matrix_tensor = _check_distance_matrix("distance_matrix", distance_matrix_tensor)
+        result = _clustering.butina(
+            distance_matrix_tensor.__cuda_array_interface__,
+            cutoff,
+            neighborlist_max_size,
+            return_centroids,
+            active_stream.cuda_stream,
+        )
     if return_centroids:
         clusters, centroids = result
         return AsyncGpuResult(clusters), AsyncGpuResult(centroids)
     return AsyncGpuResult(result)
 
 
 def fused_butina(
-    x: torch.Tensor,
+    x: ArrayInput,
     cutoff: float,
     return_centroids: bool = False,
     stream: torch.cuda.Stream | None = None,
@@ -110,7 +123,9 @@ def fused_butina(
     the full distance matrix. This makes it suitable for large datasets.
 
     Args:
-        x: Tensor of shape (N, D) containing the fingerprints to cluster.
+        x: Tensor-like object of shape (N, D) containing packed int32 fingerprints
+           to cluster. Can be an AsyncGpuResult, torch.Tensor, or numpy.ndarray.
+           CPU tensors and NumPy arrays are copied to CUDA.
         cutoff: Distance threshold for clustering. Items are neighbors if their
                 distance is less than this cutoff (i.e. similarity > 1 - cutoff).
         return_centroids: Whether to return centroid indices for each cluster.
@@ -125,17 +140,18 @@ def fused_butina(
         If ``return_centroids`` is True, returns a tuple ``(clusters, cluster_sizes, centroids)``
         where *centroids* is a list of centroid indices.
     """
-    _check_fingerprint_matrix("x", x)
     if metric not in ["tanimoto", "cosine"]:
         raise ValueError(f"metric must be one of ['tanimoto', 'cosine'], got {metric}")
 
-    if stream is not None and not isinstance(stream, torch.cuda.Stream):
-        raise TypeError(f"stream must be a torch.cuda.Stream or None, got {type(stream).__name__}")
+    _validate_cuda_stream(stream)
 
     if cutoff < 0 or cutoff > 1:
         raise ValueError(f"cutoff must be in [0, 1], got {cutoff}")
 
-    with torch.cuda.stream(stream):
+    active_stream = _resolve_cuda_stream(stream, x)
+    with torch.cuda.stream(active_stream):
+        x = _as_cuda_tensor("x", x, stream=active_stream)
+        _check_fingerprint_matrix("x", x)
         n_start = x.shape[0]
         device = x.device
         indices = torch.arange(n_start, dtype=torch.int32, device=device)

diff --git a/nvmolkit/similarity.py b/nvmolkit/similarity.py
@@ -24,29 +24,65 @@
 
 from nvmolkit import _DataStructs
 from nvmolkit._arrayHelpers import *  # noqa: F403
-from nvmolkit.types import AsyncGpuResult
+from nvmolkit.types import ArrayInput, AsyncGpuResult, _as_cuda_tensor, _resolve_cuda_stream
 
 # --------------------------------
 # Tanimoto similarity
 # --------------------------------
 
 
+def _check_fingerprint_input(name: str, x: torch.Tensor) -> torch.Tensor:
+    if x.ndim != 2:
+        raise ValueError(f"{name} must be 2D, got shape={tuple(x.shape)}")
+    if x.dtype not in (torch.int32, torch.uint32):
+        raise ValueError(f"{name} must have dtype int32 or uint32")
+    if not x.is_contiguous():
+        x = x.contiguous()
+    return x
+
+
+def _fingerprint_inputs(
+    fingerprint_group_one: ArrayInput,
+    fingerprint_group_two: ArrayInput | None,
+    stream: torch.cuda.Stream | None,
+) -> tuple[torch.Tensor, torch.Tensor, torch.cuda.Stream]:
+    active_stream = (
+        _resolve_cuda_stream(stream, fingerprint_group_one, fingerprint_group_two)
+        if fingerprint_group_two is not None
+        else _resolve_cuda_stream(stream, fingerprint_group_one)
+    )
+    with torch.cuda.stream(active_stream):
+        bits_one = _as_cuda_tensor("fingerprint_group_one", fingerprint_group_one, stream=active_stream)
+        bits_one = _check_fingerprint_input("fingerprint_group_one", bits_one)
+        bits_two = (
+            bits_one
+            if fingerprint_group_two is None
+            else _check_fingerprint_input(
+                "fingerprint_group_two",
+                _as_cuda_tensor("fingerprint_group_two", fingerprint_group_two, stream=active_stream),
+            )
+        )
+    if bits_one.shape[1] != bits_two.shape[1]:
+        raise ValueError("fingerprint_group_one and fingerprint_group_two must have the same feature dimension")
+    return bits_one, bits_two, active_stream
+
+
 def crossTanimotoSimilarity(
-    fingerprint_group_one: AsyncGpuResult | torch.Tensor,
-    fingerprint_group_two: AsyncGpuResult | torch.Tensor | None = None,
+    fingerprint_group_one: ArrayInput,
+    fingerprint_group_two: ArrayInput | None = None,
     stream: torch.cuda.Stream | None = None,
 ) -> AsyncGpuResult:
     """Returns the Tanimoto similarity within a set of fingerprints or between two sets of fingerprints.
 
-    Expects fingerprints generated by nvMolKit or as a torch tensor located on device 0, with the leading dimension corresponding to
+    Expects fingerprints as an AsyncGpuResult generated by nvMolKit, a torch tensor, or a numpy array, with the leading dimension corresponding to
     the number of fingerprints, and the second dimension representing the packed fingerprint
-    bitfield.
+    bitfield. CPU tensors and NumPy arrays are copied to CUDA.
 
     The special case of fingerprint_group_1 as a 1 x n_bits tensor is equivalent to RDKit's BulkTanimotoSimilarity.
 
     Args:
-        fingerprint_group_one: A torch Tensor or AsyncGpuResult computed from nvMolKit fingerprints
-        fingerprint_group_two: A torch Tensor or AsyncGpuResult computed from nvMolKit fingerprints,
+        fingerprint_group_one: A torch Tensor, numpy.ndarray, or AsyncGpuResult computed from nvMolKit fingerprints
+        fingerprint_group_two: A torch Tensor, numpy.ndarray, or AsyncGpuResult computed from nvMolKit fingerprints,
             or None for all-to-all similarity within fingerprint_group_one.
         stream: CUDA stream to use. If None, uses the current stream.
 
@@ -56,50 +92,42 @@ def crossTanimotoSimilarity(
         fingerprint_group_two. If fingerprint_group_two is None, computes all-to-all
         similarity within fingerprint_group_one.
     """
-    if stream is not None and not isinstance(stream, torch.cuda.Stream):
-        raise TypeError(f"stream must be a torch.cuda.Stream or None, got {type(stream).__name__}")
-    bits_two_interface = (
-        fingerprint_group_two.__cuda_array_interface__
-        if fingerprint_group_two is not None
-        else fingerprint_group_one.__cuda_array_interface__
-    )
-    stream_ptr = (stream if stream is not None else torch.cuda.current_stream()).cuda_stream
-    return AsyncGpuResult(
+    bits_one, bits_two, active_stream = _fingerprint_inputs(fingerprint_group_one, fingerprint_group_two, stream)
+    result = AsyncGpuResult(
         _DataStructs.CrossTanimotoSimilarityRawBuffers(
-            fingerprint_group_one.__cuda_array_interface__, bits_two_interface, stream_ptr
+            bits_one.__cuda_array_interface__, bits_two.__cuda_array_interface__, active_stream.cuda_stream
         )
     )
+    result._input_refs = (bits_one, bits_two)
+    return result
 
 
 def crossTanimotoSimilarityMemoryConstrained(
-    fingerprint_group_one: torch.Tensor, fingerprint_group_two: torch.Tensor | None = None
+    fingerprint_group_one: ArrayInput, fingerprint_group_two: ArrayInput | None = None
 ) -> np.ndarray:
     """Returns the Tanimoto similarity within a set of fingerprints or between two sets of fingerprints.
 
     Computes results on the GPU, but returns a numpy array on the CPU. Will perform computation in chunks if necessary to avoid running out of memory on the GPU. Will still
     fail if the resulting matrix is too large to fit on the CPU.
 
-    Expects fingerprints generated by nvMolKit or as a torch tensor located on device 0, with the leading dimension corresponding to
+    Expects fingerprints as an AsyncGpuResult generated by nvMolKit, a torch tensor, or a numpy array, with the leading dimension corresponding to
     the number of fingerprints, and the second dimension representing the packed fingerprint
-    bitfield.
+    bitfield. CPU tensors and NumPy arrays are copied to CUDA.
 
     The special case of fingerprint_group_1 as a 1 x n_bits tensor is equivalent to RDKit's BulkTanimotoSimilarity.
 
     Args:
-        fingerprint_group_one: A torch Tensor or AsyncGpuResult computed from nvMolKit fingerprints
-        fingerprint_group_two: A torch Tensor or AsyncGpuResult computed from nvMolKit fingerprints,
+        fingerprint_group_one: A torch Tensor, numpy.ndarray, or AsyncGpuResult computed from nvMolKit fingerprints
+        fingerprint_group_two: A torch Tensor, numpy.ndarray, or AsyncGpuResult computed from nvMolKit fingerprints,
             or None for all-to-all similarity within fingerprint_group_one.
 
     Returns:
         A numpy array of Tanimoto similarities, with index [i, j] corresponding to the similarity between fingerprint i in fingerprint_group_one and fingerprint j in fingerprint_group_two.
     """
-    bits_two_interface = (
-        fingerprint_group_two.__cuda_array_interface__
-        if fingerprint_group_two is not None
-        else fingerprint_group_one.__cuda_array_interface__
-    )
+    bits_one, bits_two, active_stream = _fingerprint_inputs(fingerprint_group_one, fingerprint_group_two, None)
+    active_stream.synchronize()
     vals = _DataStructs.CrossTanimotoSimilarityCPURawBuffers(
-        fingerprint_group_one.__cuda_array_interface__, bits_two_interface
+        bits_one.__cuda_array_interface__, bits_two.__cuda_array_interface__
     )
     # vals is a numpy ndarray with shape (N, M)
     return vals
@@ -111,74 +139,63 @@ def crossTanimotoSimilarityMemoryConstrained(
 
 
 def crossCosineSimilarity(
-    fingerprint_group_one: AsyncGpuResult | torch.Tensor,
-    fingerprint_group_two: AsyncGpuResult | torch.Tensor | None = None,
+    fingerprint_group_one: ArrayInput,
+    fingerprint_group_two: ArrayInput | None = None,
     stream: torch.cuda.Stream | None = None,
 ) -> AsyncGpuResult:
     """Returns the Cosine similarity between two sets of fingerprints.
 
-    Expects fingerprints generated by nvMolKit or as a torch tensor located on device 0, with the leading dimension corresponding to
+    Expects fingerprints as an AsyncGpuResult generated by nvMolKit, a torch tensor, or a numpy array, with the leading dimension corresponding to
     the number of fingerprints, and the second dimension representing the packed fingerprint
-    bitfield.
+    bitfield. CPU tensors and NumPy arrays are copied to CUDA.
 
     The special case of fingerprint_group_1 as a 1 x n_bits tensor is equivalent to RDKit's BulkCosineSimilarity.
 
     Args:
-        fingerprint_group_one: A torch Tensor computed from nvMolKit fingerprints
-        fingerprint_group_two: A torch Tensor computed from nvMolKit fingerprints,
+        fingerprint_group_one: A torch Tensor, numpy.ndarray, or AsyncGpuResult computed from nvMolKit fingerprints
+        fingerprint_group_two: A torch Tensor, numpy.ndarray, or AsyncGpuResult computed from nvMolKit fingerprints,
             or None for all-to-all similarity within fingerprint_group_one.
         stream: CUDA stream to use. If None, uses the current stream.
 
     Returns:
         An AsyncGpuResult object containing the Cosine similarities, with index [i, j] corresponding to the similarity between
         fingerprint i in fingerprint_group_one and fingerprint j in fingerprint_group_two.
     """
-    if stream is not None and not isinstance(stream, torch.cuda.Stream):
-        raise TypeError(f"stream must be a torch.cuda.Stream or None, got {type(stream).__name__}")
-    stream_ptr = (stream if stream is not None else torch.cuda.current_stream()).cuda_stream
-    if fingerprint_group_two is not None:
-        return AsyncGpuResult(
-            _DataStructs.CrossCosineSimilarityRawBuffers(
-                fingerprint_group_one.__cuda_array_interface__,
-                fingerprint_group_two.__cuda_array_interface__,
-                stream_ptr,
-            )
-        )
-    return AsyncGpuResult(
+    bits_one, bits_two, active_stream = _fingerprint_inputs(fingerprint_group_one, fingerprint_group_two, stream)
+    result = AsyncGpuResult(
         _DataStructs.CrossCosineSimilarityRawBuffers(
-            fingerprint_group_one.__cuda_array_interface__, fingerprint_group_one.__cuda_array_interface__, stream_ptr
+            bits_one.__cuda_array_interface__, bits_two.__cuda_array_interface__, active_stream.cuda_stream
         )
     )
+    result._input_refs = (bits_one, bits_two)
+    return result
 
 
 def crossCosineSimilarityMemoryConstrained(
-    fingerprint_group_one: torch.Tensor, fingerprint_group_two: torch.Tensor | None = None
+    fingerprint_group_one: ArrayInput, fingerprint_group_two: ArrayInput | None = None
 ) -> np.ndarray:
     """Returns the Cosine similarity between two sets of fingerprints.
 
     Computes results on the GPU, but returns a numpy array on the CPU. Will perform computation in chunks if necessary to avoid running out of memory on the GPU. Will still
     fail if the resulting matrix is too large to fit on the CPU.
 
-    Expects fingerprints generated by nvMolKit or as a torch tensor located on device 0, with the leading dimension corresponding to
+    Expects fingerprints as an AsyncGpuResult generated by nvMolKit, a torch tensor, or a numpy array, with the leading dimension corresponding to
     the number of fingerprints, and the second dimension representing the packed fingerprint
-    bitfield.
+    bitfield. CPU tensors and NumPy arrays are copied to CUDA.
 
     The special case of fingerprint_group_1 as a 1 x n_bits tensor is equivalent to RDKit's BulkCosineSimilarity.
 
     Args:
-        fingerprint_group_one: A torch Tensor computed from nvMolKit fingerprints
-        fingerprint_group_two: A torch Tensor computed from nvMolKit fingerprints,
+        fingerprint_group_one: A torch Tensor, numpy.ndarray, or AsyncGpuResult computed from nvMolKit fingerprints
+        fingerprint_group_two: A torch Tensor, numpy.ndarray, or AsyncGpuResult computed from nvMolKit fingerprints,
             or None for all-to-all similarity within fingerprint_group_one.
 
     Returns:
         A numpy array of Cosine similarities, with index [i, j] corresponding to the similarity between fingerprint i in fingerprint_group_one and fingerprint j in fingerprint_group_two.
     """
-    bits_two_interface = (
-        fingerprint_group_two.__cuda_array_interface__
-        if fingerprint_group_two is not None
-        else fingerprint_group_one.__cuda_array_interface__
-    )
+    bits_one, bits_two, active_stream = _fingerprint_inputs(fingerprint_group_one, fingerprint_group_two, None)
+    active_stream.synchronize()
     vals = _DataStructs.CrossCosineSimilarityCPURawBuffers(
-        fingerprint_group_one.__cuda_array_interface__, bits_two_interface
+        bits_one.__cuda_array_interface__, bits_two.__cuda_array_interface__
     )
     return vals