@@ -188,7 +188,15 @@ def _run_throughput_pytorch(self) -> ThroughputResult:
188 188
189 189         from qumat_qdp.torch_ref import encode
190 190
191     -        device = f"cuda:{self._device_id}" if torch.cuda.is_available() else "cpu"
    191 +        if torch.cuda.is_available():
    192 +            if self._device_id < 0 or self._device_id >= torch.cuda.device_count():
    193 +                raise ValueError(
    194 +                    f"Invalid CUDA device_id {self._device_id}; "
    195 +                    f"{torch.cuda.device_count()} device(s) available."
    196 +                )
    197 +            device = f"cuda:{self._device_id}"
    198 +        else:
    199 +            device = "cpu"
192 200         # _validate() guarantees these are not None.
193 201         assert self._num_qubits is not None
194 202         assert self._total_batches is not None
@@ -205,9 +213,11 @@ def _run_throughput_pytorch(self) -> ThroughputResult:
205 213         else:
206 214             sample_dim = 1 << num_qubits
207 215
208     -        # Generate all batch data upfront.
209     -        batches = []
210     -        for b in range(self._total_batches + self._warmup_batches):
    216 +        # Pre-generate a small pool of batch tensors and cycle through them
    217 +        # to keep memory bounded at high qubit counts while still varying data.
    218 +        pool_size = min(8, self._total_batches + self._warmup_batches)
    219 +        pool: list[torch.Tensor] = []
    220 +        for _ in range(pool_size):
211 221             if encoding_method == "basis":
212 222                 data = torch.randint(
213 223                     0, 1 << num_qubits, (batch_size,), device=device
@@ -216,18 +226,18 @@ def _run_throughput_pytorch(self) -> ThroughputResult:
216 226                 data = torch.randn(
217 227                     batch_size, sample_dim, dtype=torch.float64, device=device
218 228                 )
219     -            batches.append(data)
    229 +            pool.append(data)
220 230
221 231         # Warmup.
222 232         for b in range(self._warmup_batches):
223     -            encode(batches[b], num_qubits, encoding_method, device=device)
    233 +            encode(pool[b % pool_size], num_qubits, encoding_method, device=device)
224 234         if device.startswith("cuda"):
225 235             torch.cuda.synchronize()
226 236
227 237         # Timed run.
228 238         start = time.perf_counter()
229     -        for b in range(self._warmup_batches, len(batches)):
230     -            encode(batches[b], num_qubits, encoding_method, device=device)
    239 +        for b in range(self._total_batches):
    240 +            encode(pool[b % pool_size], num_qubits, encoding_method, device=device)
231 241         if device.startswith("cuda"):
232 242             torch.cuda.synchronize()
233 243         duration = time.perf_counter() - start
0 commit comments