diff --git a/nvmolkit/clustering.py b/nvmolkit/clustering.py
index 58f4a1fa..b9eb3303 100644
--- a/nvmolkit/clustering.py
+++ b/nvmolkit/clustering.py
@@ -32,14 +32,24 @@
 
 from nvmolkit import _clustering
 from nvmolkit._arrayHelpers import *  # noqa: F403
-from nvmolkit._fusedButina import extract_cluster_and_singletons, update_neighbor_counts, _check_fingerprint_matrix
-from nvmolkit.types import AsyncGpuResult
+from nvmolkit._fusedButina import _check_fingerprint_matrix, extract_cluster_and_singletons, update_neighbor_counts
+from nvmolkit.types import ArrayInput, AsyncGpuResult, _as_cuda_tensor, _resolve_cuda_stream, _validate_cuda_stream
 
 _VALID_NEIGHBORLIST_SIZES = frozenset({8, 16, 24, 32, 64, 128})
 
 
+def _check_distance_matrix(name: str, x: torch.Tensor) -> torch.Tensor:
+    """Return *x* contiguous; raise ValueError unless it is a square 2D float64 matrix."""
+    if x.ndim != 2 or x.shape[0] != x.shape[1]:
+        raise ValueError(f"{name} must be a square 2D matrix, got shape={tuple(x.shape)}")
+    if x.dtype != torch.float64:
+        # Report the offending dtype, mirroring the shape message above.
+        raise ValueError(f"{name} must have dtype float64, got {x.dtype}")
+    return x if x.is_contiguous() else x.contiguous()
+
+
 def butina(
-    distance_matrix: AsyncGpuResult | torch.Tensor,
+    distance_matrix: ArrayInput,
     cutoff: float,
     neighborlist_max_size: int = 64,
     return_centroids: bool = False,
@@ -56,7 +66,9 @@ def butina(
 
     Args:
         distance_matrix: Square distance matrix of shape (N, N) where N is the number
-                        of items. Can be an AsyncGpuResult or torch.Tensor on GPU.
+                        of items. Can be an AsyncGpuResult, torch.Tensor, or numpy.ndarray.
+                        CPU tensors and NumPy arrays are copied to CUDA. Inputs
+                        must have dtype float64.
         cutoff: Distance threshold for clustering. Items are neighbors if their
                 distance is less than this cutoff.
         neighborlist_max_size: Maximum size of the neighborlist used for small cluster
@@ -80,16 +92,17 @@ def butina(
         raise ValueError(
             f"neighborlist_max_size must be one of {sorted(_VALID_NEIGHBORLIST_SIZES)}, got {neighborlist_max_size}"
         )
-    if stream is not None and not isinstance(stream, torch.cuda.Stream):
-        raise TypeError(f"stream must be a torch.cuda.Stream or None, got {type(stream).__name__}")
-    stream_ptr = (stream if stream is not None else torch.cuda.current_stream()).cuda_stream
-    result = _clustering.butina(
-        distance_matrix.__cuda_array_interface__,
-        cutoff,
-        neighborlist_max_size,
-        return_centroids,
-        stream_ptr,
-    )
+    active_stream = _resolve_cuda_stream(stream, distance_matrix)
+    with torch.cuda.stream(active_stream):
+        distance_matrix_tensor = _as_cuda_tensor("distance_matrix", distance_matrix, stream=active_stream)
+        distance_matrix_tensor = _check_distance_matrix("distance_matrix", distance_matrix_tensor)
+        result = _clustering.butina(
+            distance_matrix_tensor.__cuda_array_interface__,
+            cutoff,
+            neighborlist_max_size,
+            return_centroids,
+            active_stream.cuda_stream,
+        )
     if return_centroids:
         clusters, centroids = result
         return AsyncGpuResult(clusters), AsyncGpuResult(centroids)
@@ -97,7 +110,7 @@ def butina(
 
 
 def fused_butina(
-    x: torch.Tensor,
+    x: ArrayInput,
     cutoff: float,
     return_centroids: bool = False,
     stream: torch.cuda.Stream | None = None,
@@ -110,7 +123,9 @@ def fused_butina(
     the full distance matrix. This makes it suitable for large datasets.
 
     Args:
-        x: Tensor of shape (N, D) containing the fingerprints to cluster.
+        x: Tensor-like object of shape (N, D) containing packed int32 fingerprints
+           to cluster. Can be an AsyncGpuResult, torch.Tensor, or numpy.ndarray.
+           CPU tensors and NumPy arrays are copied to CUDA.
         cutoff: Distance threshold for clustering. Items are neighbors if their
                 distance is less than this cutoff (i.e. similarity > 1 - cutoff).
         return_centroids: Whether to return centroid indices for each cluster.
@@ -125,17 +140,18 @@ def fused_butina(
         If ``return_centroids`` is True, returns a tuple ``(clusters, cluster_sizes, centroids)``
         where *centroids* is a list of centroid indices.
     """
-    _check_fingerprint_matrix("x", x)
     if metric not in ["tanimoto", "cosine"]:
         raise ValueError(f"metric must be one of ['tanimoto', 'cosine'], got {metric}")
 
-    if stream is not None and not isinstance(stream, torch.cuda.Stream):
-        raise TypeError(f"stream must be a torch.cuda.Stream or None, got {type(stream).__name__}")
+    _validate_cuda_stream(stream)
 
     if cutoff < 0 or cutoff > 1:
         raise ValueError(f"cutoff must be in [0, 1], got {cutoff}")
 
-    with torch.cuda.stream(stream):
+    active_stream = _resolve_cuda_stream(stream, x)
+    with torch.cuda.stream(active_stream):
+        x = _as_cuda_tensor("x", x, stream=active_stream)
+        _check_fingerprint_matrix("x", x)
         n_start = x.shape[0]
         device = x.device
         indices = torch.arange(n_start, dtype=torch.int32, device=device)
diff --git a/nvmolkit/similarity.py b/nvmolkit/similarity.py
index 8ca2b4e3..52733b75 100644
--- a/nvmolkit/similarity.py
+++ b/nvmolkit/similarity.py
@@ -24,29 +24,65 @@
 
 from nvmolkit import _DataStructs
 from nvmolkit._arrayHelpers import *  # noqa: F403
-from nvmolkit.types import AsyncGpuResult
+from nvmolkit.types import ArrayInput, AsyncGpuResult, _as_cuda_tensor, _resolve_cuda_stream
 
 # --------------------------------
 # Tanimoto similarity
 # --------------------------------
 
 
+def _check_fingerprint_input(name: str, x: torch.Tensor) -> torch.Tensor:
+    """Return *x* contiguous; raise ValueError unless it is a 2D int32/uint32 tensor."""
+    if x.ndim != 2:
+        raise ValueError(f"{name} must be 2D, got shape={tuple(x.shape)}")
+    if x.dtype not in (torch.int32, torch.uint32):
+        # Report the offending dtype, mirroring the shape message above.
+        raise ValueError(f"{name} must have dtype int32 or uint32, got {x.dtype}")
+    return x if x.is_contiguous() else x.contiguous()
+
+
+def _fingerprint_inputs(
+    fingerprint_group_one: ArrayInput,
+    fingerprint_group_two: ArrayInput | None,
+    stream: torch.cuda.Stream | None,
+) -> tuple[torch.Tensor, torch.Tensor, torch.cuda.Stream]:
+    """Resolve the working stream and return both groups as validated CUDA tensors.
+
+    When ``fingerprint_group_two`` is None the first tensor is returned for both
+    groups. Raises ValueError if the two feature dimensions differ.
+    """
+    groups = [fingerprint_group_one]
+    if fingerprint_group_two is not None:
+        groups.append(fingerprint_group_two)
+    active_stream = _resolve_cuda_stream(stream, *groups)
+    with torch.cuda.stream(active_stream):
+        first = _as_cuda_tensor("fingerprint_group_one", fingerprint_group_one, stream=active_stream)
+        first = _check_fingerprint_input("fingerprint_group_one", first)
+        second = first
+        if fingerprint_group_two is not None:
+            second = _as_cuda_tensor("fingerprint_group_two", fingerprint_group_two, stream=active_stream)
+            second = _check_fingerprint_input("fingerprint_group_two", second)
+    if first.shape[1] != second.shape[1]:
+        raise ValueError("fingerprint_group_one and fingerprint_group_two must have the same feature dimension")
+    return first, second, active_stream
+
+
 def crossTanimotoSimilarity(
-    fingerprint_group_one: AsyncGpuResult | torch.Tensor,
-    fingerprint_group_two: AsyncGpuResult | torch.Tensor | None = None,
+    fingerprint_group_one: ArrayInput,
+    fingerprint_group_two: ArrayInput | None = None,
     stream: torch.cuda.Stream | None = None,
 ) -> AsyncGpuResult:
     """Returns the Tanimoto similarity within a set of fingerprints or between two sets of fingerprints.
 
-    Expects fingerprints generated by nvMolKit or as a torch tensor located on device 0, with the leading dimension corresponding to
+    Expects fingerprints generated by nvMolKit, a torch tensor, or a numpy array, with the leading dimension corresponding to
     the number of fingerprints, and the second dimension representing the packed fingerprint
-    bitfield.
+    bitfield. CPU tensors and NumPy arrays are copied to CUDA.
 
     The special case of fingerprint_group_1 as a 1 x n_bits tensor is equivalent to RDKit's BulkTanimotoSimilarity.
 
     Args:
-        fingerprint_group_one: A torch Tensor or AsyncGpuResult computed from nvMolKit fingerprints
-        fingerprint_group_two: A torch Tensor or AsyncGpuResult computed from nvMolKit fingerprints,
+        fingerprint_group_one: A torch Tensor, numpy.ndarray, or AsyncGpuResult computed from nvMolKit fingerprints
+        fingerprint_group_two: A torch Tensor, numpy.ndarray, or AsyncGpuResult computed from nvMolKit fingerprints,
             or None for all-to-all similarity within fingerprint_group_one.
         stream: CUDA stream to use. If None, uses the current stream.
 
@@ -56,50 +92,42 @@ def crossTanimotoSimilarity(
         fingerprint_group_two. If fingerprint_group_two is None, computes all-to-all
         similarity within fingerprint_group_one.
     """
-    if stream is not None and not isinstance(stream, torch.cuda.Stream):
-        raise TypeError(f"stream must be a torch.cuda.Stream or None, got {type(stream).__name__}")
-    bits_two_interface = (
-        fingerprint_group_two.__cuda_array_interface__
-        if fingerprint_group_two is not None
-        else fingerprint_group_one.__cuda_array_interface__
-    )
-    stream_ptr = (stream if stream is not None else torch.cuda.current_stream()).cuda_stream
-    return AsyncGpuResult(
+    bits_one, bits_two, active_stream = _fingerprint_inputs(fingerprint_group_one, fingerprint_group_two, stream)
+    result = AsyncGpuResult(
         _DataStructs.CrossTanimotoSimilarityRawBuffers(
-            fingerprint_group_one.__cuda_array_interface__, bits_two_interface, stream_ptr
+            bits_one.__cuda_array_interface__, bits_two.__cuda_array_interface__, active_stream.cuda_stream
         )
     )
+    result._input_refs = (bits_one, bits_two)
+    return result
 
 
 def crossTanimotoSimilarityMemoryConstrained(
-    fingerprint_group_one: torch.Tensor, fingerprint_group_two: torch.Tensor | None = None
+    fingerprint_group_one: ArrayInput, fingerprint_group_two: ArrayInput | None = None
 ) -> np.ndarray:
     """Returns the Tanimoto similarity within a set of fingerprints or between two sets of fingerprints.
 
     Computes results on the GPU, but returns a numpy array on the CPU. Will perform computation in chunks if necessary to avoid running out of memory on the GPU. Will still
     fail if the resulting matrix is too large to fit on the CPU.
 
-    Expects fingerprints generated by nvMolKit or as a torch tensor located on device 0, with the leading dimension corresponding to
+    Expects fingerprints generated by nvMolKit, a torch tensor, or a numpy array, with the leading dimension corresponding to
     the number of fingerprints, and the second dimension representing the packed fingerprint
-    bitfield.
+    bitfield. CPU tensors and NumPy arrays are copied to CUDA.
 
     The special case of fingerprint_group_1 as a 1 x n_bits tensor is equivalent to RDKit's BulkTanimotoSimilarity.
 
     Args:
-        fingerprint_group_one: A torch Tensor or AsyncGpuResult computed from nvMolKit fingerprints
-        fingerprint_group_two: A torch Tensor or AsyncGpuResult computed from nvMolKit fingerprints,
+        fingerprint_group_one: A torch Tensor, numpy.ndarray, or AsyncGpuResult computed from nvMolKit fingerprints
+        fingerprint_group_two: A torch Tensor, numpy.ndarray, or AsyncGpuResult computed from nvMolKit fingerprints,
             or None for all-to-all similarity within fingerprint_group_one.
 
     Returns:
         A numpy array of Tanimoto similarities, with index [i, j] corresponding to the similarity between
     """
-    bits_two_interface = (
-        fingerprint_group_two.__cuda_array_interface__
-        if fingerprint_group_two is not None
-        else fingerprint_group_one.__cuda_array_interface__
-    )
+    bits_one, bits_two, active_stream = _fingerprint_inputs(fingerprint_group_one, fingerprint_group_two, None)
+    active_stream.synchronize()
     vals = _DataStructs.CrossTanimotoSimilarityCPURawBuffers(
-        fingerprint_group_one.__cuda_array_interface__, bits_two_interface
+        bits_one.__cuda_array_interface__, bits_two.__cuda_array_interface__
     )
     # vals is a numpy ndarray with shape (N, M)
     return vals
@@ -111,21 +139,21 @@ def crossTanimotoSimilarityMemoryConstrained(
 
 
 def crossCosineSimilarity(
-    fingerprint_group_one: AsyncGpuResult | torch.Tensor,
-    fingerprint_group_two: AsyncGpuResult | torch.Tensor | None = None,
+    fingerprint_group_one: ArrayInput,
+    fingerprint_group_two: ArrayInput | None = None,
     stream: torch.cuda.Stream | None = None,
 ) -> AsyncGpuResult:
     """Returns the Cosine similarity between two sets of fingerprints.
 
-    Expects fingerprints generated by nvMolKit or as a torch tensor located on device 0, with the leading dimension corresponding to
+    Expects fingerprints generated by nvMolKit, a torch tensor, or a numpy array, with the leading dimension corresponding to
     the number of fingerprints, and the second dimension representing the packed fingerprint
-    bitfield.
+    bitfield. CPU tensors and NumPy arrays are copied to CUDA.
 
     The special case of fingerprint_group_1 as a 1 x n_bits tensor is equivalent to RDKit's BulkCosineSimilarity.
 
     Args:
-        fingerprint_group_one: A torch Tensor computed from nvMolKit fingerprints
-        fingerprint_group_two: A torch Tensor computed from nvMolKit fingerprints,
+        fingerprint_group_one: A torch Tensor, numpy.ndarray, or AsyncGpuResult computed from nvMolKit fingerprints
+        fingerprint_group_two: A torch Tensor, numpy.ndarray, or AsyncGpuResult computed from nvMolKit fingerprints,
             or None for all-to-all similarity within fingerprint_group_one.
         stream: CUDA stream to use. If None, uses the current stream.
 
@@ -133,52 +161,41 @@ def crossCosineSimilarity(
         An AsyncGpuResult object containing the Cosine similarities, with index [i, j] corresponding to the similarity between
         fingerprint i in fingerprint_group_one and fingerprint j in fingerprint_group_two.
     """
-    if stream is not None and not isinstance(stream, torch.cuda.Stream):
-        raise TypeError(f"stream must be a torch.cuda.Stream or None, got {type(stream).__name__}")
-    stream_ptr = (stream if stream is not None else torch.cuda.current_stream()).cuda_stream
-    if fingerprint_group_two is not None:
-        return AsyncGpuResult(
-            _DataStructs.CrossCosineSimilarityRawBuffers(
-                fingerprint_group_one.__cuda_array_interface__,
-                fingerprint_group_two.__cuda_array_interface__,
-                stream_ptr,
-            )
-        )
-    return AsyncGpuResult(
+    bits_one, bits_two, active_stream = _fingerprint_inputs(fingerprint_group_one, fingerprint_group_two, stream)
+    result = AsyncGpuResult(
         _DataStructs.CrossCosineSimilarityRawBuffers(
-            fingerprint_group_one.__cuda_array_interface__, fingerprint_group_one.__cuda_array_interface__, stream_ptr
+            bits_one.__cuda_array_interface__, bits_two.__cuda_array_interface__, active_stream.cuda_stream
         )
     )
+    result._input_refs = (bits_one, bits_two)
+    return result
 
 
 def crossCosineSimilarityMemoryConstrained(
-    fingerprint_group_one: torch.Tensor, fingerprint_group_two: torch.Tensor | None = None
+    fingerprint_group_one: ArrayInput, fingerprint_group_two: ArrayInput | None = None
 ) -> np.ndarray:
     """Returns the Cosine similarity between two sets of fingerprints.
 
     Computes results on the GPU, but returns a numpy array on the CPU. Will perform computation in chunks if necessary to avoid running out of memory on the GPU. Will still
     fail if the resulting matrix is too large to fit on the CPU.
 
-    Expects fingerprints generated by nvMolKit or as a torch tensor located on device 0, with the leading dimension corresponding to
+    Expects fingerprints generated by nvMolKit, a torch tensor, or a numpy array, with the leading dimension corresponding to
     the number of fingerprints, and the second dimension representing the packed fingerprint
-    bitfield.
+    bitfield. CPU tensors and NumPy arrays are copied to CUDA.
 
     The special case of fingerprint_group_1 as a 1 x n_bits tensor is equivalent to RDKit's BulkCosineSimilarity.
 
     Args:
-        fingerprint_group_one: A torch Tensor computed from nvMolKit fingerprints
-        fingerprint_group_two: A torch Tensor computed from nvMolKit fingerprints,
+        fingerprint_group_one: A torch Tensor, numpy.ndarray, or AsyncGpuResult computed from nvMolKit fingerprints
+        fingerprint_group_two: A torch Tensor, numpy.ndarray, or AsyncGpuResult computed from nvMolKit fingerprints,
             or None for all-to-all similarity within fingerprint_group_one.
 
     Returns:
         A numpy array of Cosine similarities, with index [i, j] corresponding to the similarity between
     """
-    bits_two_interface = (
-        fingerprint_group_two.__cuda_array_interface__
-        if fingerprint_group_two is not None
-        else fingerprint_group_one.__cuda_array_interface__
-    )
+    bits_one, bits_two, active_stream = _fingerprint_inputs(fingerprint_group_one, fingerprint_group_two, None)
+    active_stream.synchronize()
     vals = _DataStructs.CrossCosineSimilarityCPURawBuffers(
-        fingerprint_group_one.__cuda_array_interface__, bits_two_interface
+        bits_one.__cuda_array_interface__, bits_two.__cuda_array_interface__
     )
     return vals
diff --git a/nvmolkit/tests/test_clustering.py b/nvmolkit/tests/test_clustering.py
index 56c8fc22..77fdca62 100644
--- a/nvmolkit/tests/test_clustering.py
+++ b/nvmolkit/tests/test_clustering.py
@@ -18,6 +18,7 @@
 import torch
 
 from nvmolkit.clustering import butina, fused_butina
+from nvmolkit.types import AsyncGpuResult
 
 
 def check_butina_correctness(hit_mat, clusts):
@@ -114,6 +115,27 @@ def test_butina_returns_centroids():
             assert adjacency[centroid, member].item()
 
 
+@pytest.mark.parametrize("input_kind", ["async", "cpu_tensor", "numpy"])
+def test_butina_accepts_array_input_types(input_kind):
+    """AsyncGpuResult, CPU tensor and NumPy inputs must cluster like the equivalent CUDA tensor."""
+    cutoff = 0.2
+    np.random.seed(456)
+    raw = np.random.rand(20, 20)
+    dists = np.abs(raw - raw.T)
+    torch_dists = torch.tensor(dists, device="cuda", dtype=torch.float64)
+
+    builders = {
+        "async": lambda: AsyncGpuResult(torch_dists),
+        "cpu_tensor": lambda: torch.tensor(dists, dtype=torch.float64),
+        "numpy": lambda: dists,
+    }
+    # Baseline from the CUDA tensor; the selected variant is built only afterwards.
+    expected = butina(torch_dists, cutoff).torch().cpu()
+    got = butina(builders[input_kind](), cutoff).torch().cpu()
+
+    torch.testing.assert_close(got, expected)
+
+
 def test_butina_on_explicit_stream():
     n = 100
     cutoff = 0.1
@@ -137,6 +159,12 @@ def test_butina_invalid_stream_type():
         butina(dists, 0.1, stream=42)
 
 
+def test_butina_rejects_non_float64_distance_matrix():
+    """A float32 distance matrix must be rejected with a ValueError naming the required dtype."""
+    with pytest.raises(ValueError, match="distance_matrix must have dtype float64"):
+        butina(torch.zeros(10, 10, device="cuda", dtype=torch.float32), 0.1)
+
+
 @pytest.mark.parametrize("invalid_size", [0, 1, 7, 9, 15, 33, 48, 100, 256])
 def test_butina_invalid_neighborlist_max_size(invalid_size):
     """Test that invalid neighborlist_max_size values are rejected before reaching the GPU."""
@@ -274,6 +302,25 @@ def test_fused_butina_return_centroids(n, metric):
                 assert sim[centroid, member] >= threshold - 1e-6
 
 
+@pytest.mark.parametrize("input_kind", ["async", "cpu_tensor", "numpy"])
+def test_fused_butina_accepts_array_input_types(input_kind):
+    """Every supported input container must give the same clustering as passing ``x`` directly."""
+    x = generate_clustered_fingerprints(50, num_words=32, num_clusters=10)
+    cutoff = 0.4
+    expected_clusters, expected_cluster_sizes = fused_butina(x, cutoff=cutoff)
+
+    builders = {
+        "async": lambda: AsyncGpuResult(x),
+        "cpu_tensor": lambda: x.cpu(),
+        "numpy": lambda: x.cpu().numpy(),
+    }
+    clusters, cluster_sizes = fused_butina(builders[input_kind](), cutoff=cutoff)
+
+    assert list(map(frozenset, clusters)) == list(map(frozenset, expected_clusters))
+    assert cluster_sizes == expected_cluster_sizes
+    check_fused_butina_basic(clusters, cluster_sizes, x.shape[0])
+
+
 def test_fused_butina_on_explicit_stream():
     n = 100
     x = generate_clustered_fingerprints(n, num_words=32, num_clusters=10)
diff --git a/nvmolkit/tests/test_similarity.py b/nvmolkit/tests/test_similarity.py
index a7804380..50d5f41b 100644
--- a/nvmolkit/tests/test_similarity.py
+++ b/nvmolkit/tests/test_similarity.py
@@ -13,25 +13,22 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import math
-import pytest
-import psutil
 
 import numpy as np
+import psutil
+import pytest
 import torch
-
 from rdkit.Chem import rdFingerprintGenerator
-from rdkit.DataStructs import BulkCosineSimilarity, BulkTanimotoSimilarity
-from rdkit.DataStructs import ExplicitBitVect
+from rdkit.DataStructs import BulkCosineSimilarity, BulkTanimotoSimilarity, ExplicitBitVect
 
+from nvmolkit.fingerprints import MorganFingerprintGenerator, pack_fingerprint
 from nvmolkit.similarity import (
     crossCosineSimilarity,
+    crossCosineSimilarityMemoryConstrained,
     crossTanimotoSimilarity,
     crossTanimotoSimilarityMemoryConstrained,
-    crossCosineSimilarityMemoryConstrained,
 )
-from nvmolkit.fingerprints import MorganFingerprintGenerator
-from nvmolkit.fingerprints import pack_fingerprint
-
+from nvmolkit.types import AsyncGpuResult
 
 # --------------------------------
 # Test helpers
@@ -72,10 +69,10 @@ def test_cross_similarity_fp_mismatch(simtype, size_limited_mols):
     nvmolkit_fps_cu2 = nvmolkit_fpgen2.GetFingerprints(size_limited_mols, num_threads=1)
     if simtype == "tanimoto":
         with pytest.raises(ValueError):
-            nvmolkit_sims = crossTanimotoSimilarity(nvmolkit_fps_cu, nvmolkit_fps_cu2)
+            crossTanimotoSimilarity(nvmolkit_fps_cu, nvmolkit_fps_cu2)
     else:
         with pytest.raises(ValueError):
-            nvmolkit_sims = crossCosineSimilarity(nvmolkit_fps_cu, nvmolkit_fps_cu2)
+            crossCosineSimilarity(nvmolkit_fps_cu, nvmolkit_fps_cu2)
 
 
 # --------------------------------
@@ -154,6 +151,37 @@ def test_nxm_cross_tanimoto_similarity_from_packing(nxmdims):
     torch.testing.assert_close(nvmolkit_sims, ref_sims)
 
 
+@pytest.mark.parametrize("metric", ("tanimoto", "cosine"))
+@pytest.mark.parametrize("input_kind", ("async", "cpu_tensor", "numpy"))
+def test_cross_similarity_accepts_array_input_types(metric, input_kind):
+    """Check that the cross-similarity APIs accept every supported input container.
+
+    The reference is computed from the packed tensors returned by
+    ``pack_fingerprint``; each variant re-wraps or copies those same values.
+    """
+    similarity = crossTanimotoSimilarity if metric == "tanimoto" else crossCosineSimilarity
+    converters = {
+        "async": AsyncGpuResult,
+        "cpu_tensor": lambda packed: packed.cpu(),
+        "numpy": lambda packed: packed.cpu().numpy(),
+    }
+    convert = converters[input_kind]
+
+    fps_1 = torch.randint(0, 2, (6, 256), dtype=torch.bool, device="cuda")
+    fps_2 = torch.randint(0, 2, (5, 256), dtype=torch.bool, device="cuda")
+    packed_1 = pack_fingerprint(fps_1)
+    packed_2 = pack_fingerprint(fps_2)
+
+    # Baseline first, then the converted inputs, in the same order for both groups.
+    expected = similarity(packed_1, packed_2).torch()
+
+    inp_1 = convert(packed_1)
+    inp_2 = convert(packed_2)
+    got = similarity(inp_1, inp_2).torch()
+
+    torch.testing.assert_close(got, expected)
+
+
 # --------------------------------
 # Cosine similarity tests
 # --------------------------------
@@ -267,6 +295,37 @@ def test_memory_constrained_cosine_cross(size_limited_mols, nxmdims):
     np.testing.assert_allclose(got, ref.cpu().numpy(), rtol=1e-5, atol=1e-5)
 
 
+@pytest.mark.parametrize("metric", ("tanimoto", "cosine"))
+@pytest.mark.parametrize("input_kind", ("async", "cpu_tensor", "numpy"))
+def test_memory_constrained_similarity_accepts_array_input_types(metric, input_kind):
+    """Check that the memory-constrained APIs accept every supported input container.
+
+    The reference is computed from the packed tensors returned by
+    ``pack_fingerprint``; each variant re-wraps or copies those same values.
+    """
+    if metric == "tanimoto":
+        similarity = crossTanimotoSimilarityMemoryConstrained
+    else:
+        similarity = crossCosineSimilarityMemoryConstrained
+    converters = {
+        "async": AsyncGpuResult,
+        "cpu_tensor": lambda packed: packed.cpu(),
+        "numpy": lambda packed: packed.cpu().numpy(),
+    }
+    convert = converters[input_kind]
+
+    fps_1 = torch.randint(0, 2, (6, 256), dtype=torch.bool, device="cuda")
+    fps_2 = torch.randint(0, 2, (5, 256), dtype=torch.bool, device="cuda")
+    packed_1 = pack_fingerprint(fps_1)
+    packed_2 = pack_fingerprint(fps_2)
+
+    # Baseline first, then the converted inputs, in the same order for both groups.
+    expected = similarity(packed_1, packed_2)
+    got = similarity(convert(packed_1), convert(packed_2))
+
+    np.testing.assert_allclose(got, expected, rtol=1e-5, atol=1e-5)
+
+
 # Test large N x M where N != M to exercise segmented path without overwhelming CPU RAM
 # Will skip on many machines.
 @pytest.mark.parametrize("metric", ("tanimoto", "cosine"))
diff --git a/nvmolkit/types.py b/nvmolkit/types.py
index 293181d9..ba06ce99 100644
--- a/nvmolkit/types.py
+++ b/nvmolkit/types.py
@@ -18,6 +18,7 @@
 from enum import Enum
 from typing import Any, Iterable, List, NamedTuple, Optional
 
+import numpy as np
 import torch
 
 from nvmolkit import _embedMolecules  # type: ignore
@@ -43,6 +44,7 @@ def __init__(
         batchesPerGpu: int = -1,
         gpuIds: Iterable[int] | None = None,
     ) -> None:
+        """Create hardware options backed by the native BatchHardwareOptions type."""
         if _embedMolecules is None:  # propagate real import failure early
             raise ImportError("nvmolkit._embedMolecules is not available; build native extensions")
         native = _embedMolecules.BatchHardwareOptions()
@@ -162,6 +164,66 @@ def numpy(self):
         return self.arr.cpu().numpy()
 
 
+ArrayInput = AsyncGpuResult | torch.Tensor | np.ndarray  # input types accepted by _as_cuda_tensor
+
+
+def _validate_cuda_stream(stream: torch.cuda.Stream | None) -> torch.cuda.Stream | None:
+    if stream is None or isinstance(stream, torch.cuda.Stream):  # valid values are returned unchanged
+        return stream
+    raise TypeError(f"stream must be a torch.cuda.Stream or None, got {type(stream).__name__}")
+
+
+def _cuda_device(value: ArrayInput) -> torch.device | None:
+    """Return the device of an AsyncGpuResult or CUDA tensor, or None for any other value."""
+    if isinstance(value, AsyncGpuResult):
+        return value.device
+    is_cuda_tensor = isinstance(value, torch.Tensor) and value.is_cuda
+    return value.device if is_cuda_tensor else None
+
+
+def _resolve_cuda_stream(stream: torch.cuda.Stream | None, *inputs: ArrayInput) -> torch.cuda.Stream:
+    """Return the validated *stream* if given, else the current stream on the first CUDA input's device."""
+    _validate_cuda_stream(stream)
+    if stream is not None:
+        return stream
+    devices = (_cuda_device(candidate) for candidate in inputs)  # lazy: stops at the first hit
+    first_cuda = next((dev for dev in devices if dev is not None), None)
+    if first_cuda is None:
+        return torch.cuda.current_stream()  # no CUDA-resident input: use the current device
+    return torch.cuda.current_stream(first_cuda)
+
+
+def _as_cuda_tensor(
+    name: str,
+    value: ArrayInput,
+    *,
+    stream: torch.cuda.Stream,
+) -> torch.Tensor:
+    """Return *value* as a CUDA tensor on the device of *stream*.
+
+    ``AsyncGpuResult`` and CUDA tensors are zero-copy and must already be on the
+    stream's device. CPU tensors and NumPy arrays are copied there, enqueued on
+    *stream*. Dtype, shape, and layout checks are left to the public wrappers.
+    """
+    if isinstance(value, AsyncGpuResult):
+        tensor = value.torch()
+    elif isinstance(value, torch.Tensor):
+        tensor = value
+    elif isinstance(value, np.ndarray):
+        tensor = torch.as_tensor(value)
+    else:
+        raise TypeError(
+            f"{name} must be an AsyncGpuResult, torch.Tensor, or numpy.ndarray, got {type(value).__name__}"
+        )
+    target_device = stream.device
+    if not tensor.is_cuda:
+        with torch.cuda.stream(stream):  # order the copy on *stream* even if the caller did not select it
+            return tensor.to(device=target_device, non_blocking=True)
+    if tensor.device != target_device:
+        raise ValueError(f"{name} is on {tensor.device}, but stream is on {target_device}")
+    return tensor
+
+
 class CoordinateOutput(Enum):
     """Selects how conformer-producing APIs return optimized coordinates.
 
@@ -227,6 +289,7 @@ def __init__(
         energies: Optional[AsyncGpuResult] = None,
         converged: Optional[AsyncGpuResult] = None,
     ) -> None:
+        """Create a device result from GPU-resident CSR-style buffers."""
         self.values = values
         self.atom_starts = atom_starts
         self.mol_indices = mol_indices