Skip to content

Commit 1078ea4

Browse files
committed
perf(qdp): Implement async prefetching and native f32 dispatch pipelines
1 parent d7fd681 commit 1078ea4

14 files changed

Lines changed: 400 additions & 83 deletions

File tree

qdp/qdp-core/src/dlpack.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -257,8 +257,10 @@ impl GpuStateVector {
257257
let (shape, strides) = if let Some(num_samples) = self.num_samples {
258258
// Batch: [num_samples, state_len_per_sample]
259259
debug_assert!(
260-
num_samples > 0 && self.size_elements.is_multiple_of(num_samples),
261-
"Batch state vector size must be divisible by num_samples"
260+
num_samples > 0 && self.size_elements % num_samples == 0,
261+
"Batch mismatch: {} elements cannot be evenly divided into {} samples",
262+
self.size_elements,
263+
num_samples
262264
);
263265
let state_len_per_sample = self.size_elements / num_samples;
264266
let shape = vec![num_samples as i64, state_len_per_sample as i64];

qdp/qdp-core/src/gpu/encodings/amplitude.rs

Lines changed: 210 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -457,6 +457,216 @@ impl QuantumEncoder for AmplitudeEncoder {
457457
Ok(batch_state_vector)
458458
}
459459

460+
/// Encode multiple samples in a single GPU allocation and kernel launch for f32 inputs
461+
#[cfg(target_os = "linux")]
462+
fn encode_batch_f32(
463+
&self,
464+
device: &Arc<CudaDevice>,
465+
batch_data: &[f32],
466+
num_samples: usize,
467+
sample_size: usize,
468+
num_qubits: usize,
469+
) -> Result<GpuStateVector> {
470+
crate::profile_scope!("AmplitudeEncoder::encode_batch_f32");
471+
472+
// Validate inputs. Preprocessor::validate_batch currently accepts f64 only,
473+
// so for f32 batches fall back to a basic length check until f32 validation exists.
474+
let state_len = 1 << num_qubits;
475+
if batch_data.len() != num_samples * sample_size {
476+
return Err(MahoutError::InvalidInput("batch_data length mismatch".into()));
477+
}
478+
479+
let batch_state_vector = {
480+
crate::profile_scope!("GPU::AllocBatch_f32");
481+
GpuStateVector::new_batch(device, num_samples, num_qubits, Precision::Float32)?
482+
};
483+
484+
// Upload input data to GPU
485+
let input_batch_gpu = {
486+
crate::profile_scope!("GPU::H2D_InputBatch_f32");
487+
device.htod_sync_copy(batch_data).map_err(|e| {
488+
MahoutError::MemoryAllocation(format!("Failed to upload batch input: {:?}", e))
489+
})?
490+
};
491+
492+
// Compute inverse norms on GPU using warp-reduced kernel
493+
let inv_norms_gpu = {
494+
crate::profile_scope!("GPU::BatchNormKernel_f32");
495+
use cudarc::driver::DevicePtrMut;
496+
let mut buffer = device.alloc_zeros::<f32>(num_samples).map_err(|e| {
497+
MahoutError::MemoryAllocation(format!("Failed to allocate norm buffer: {:?}", e))
498+
})?;
499+
500+
let ret = unsafe {
501+
launch_l2_norm_batch_f32(
502+
*input_batch_gpu.device_ptr() as *const f32,
503+
num_samples,
504+
sample_size,
505+
*buffer.device_ptr_mut() as *mut f32,
506+
std::ptr::null_mut(), // default stream
507+
)
508+
};
509+
510+
if ret != 0 {
511+
return Err(MahoutError::KernelLaunch(format!(
512+
"Norm reduction kernel failed: {} ({})",
513+
ret,
514+
cuda_error_to_string(ret)
515+
)));
516+
}
517+
buffer
518+
};
519+
520+
// Validate norms on host
521+
{
522+
crate::profile_scope!("GPU::NormValidation_f32");
523+
let host_inv_norms = device
524+
.dtoh_sync_copy(&inv_norms_gpu)
525+
.map_err(|e| MahoutError::Cuda(format!("Failed to copy norms to host: {:?}", e)))?;
526+
527+
if host_inv_norms.iter().any(|v| !v.is_finite() || *v == 0.0) {
528+
return Err(MahoutError::InvalidInput(
529+
"One or more samples have zero or invalid norm".to_string(),
530+
));
531+
}
532+
}
533+
534+
// Launch batch kernel
535+
{
536+
crate::profile_scope!("GPU::BatchKernelLaunch_f32");
537+
use cudarc::driver::DevicePtr;
538+
let state_ptr = batch_state_vector.ptr_f32().ok_or_else(|| {
539+
MahoutError::InvalidInput(
540+
"Batch state vector precision mismatch (expected float32 buffer)".to_string(),
541+
)
542+
})?;
543+
let ret = unsafe {
544+
launch_amplitude_encode_batch_f32(
545+
*input_batch_gpu.device_ptr() as *const f32,
546+
state_ptr as *mut c_void,
547+
*inv_norms_gpu.device_ptr() as *const f32,
548+
num_samples,
549+
sample_size,
550+
state_len,
551+
std::ptr::null_mut(), // default stream
552+
)
553+
};
554+
555+
if ret != 0 {
556+
return Err(MahoutError::KernelLaunch(format!(
557+
"Batch kernel launch failed: {} ({})",
558+
ret,
559+
cuda_error_to_string(ret)
560+
)));
561+
}
562+
}
563+
564+
{
565+
crate::profile_scope!("GPU::Synchronize");
566+
device
567+
.synchronize()
568+
.map_err(|e| MahoutError::Cuda(format!("Sync failed: {:?}", e)))?;
569+
}
570+
571+
Ok(batch_state_vector)
572+
}
573+
574+
#[cfg(target_os = "linux")]
575+
unsafe fn encode_batch_from_gpu_ptr_f32(
576+
&self,
577+
device: &Arc<CudaDevice>,
578+
input_batch_d: *const c_void,
579+
num_samples: usize,
580+
sample_size: usize,
581+
num_qubits: usize,
582+
stream: *mut c_void,
583+
) -> Result<GpuStateVector> {
584+
let state_len = 1 << num_qubits;
585+
if sample_size == 0 {
586+
return Err(MahoutError::InvalidInput(
587+
"Sample size cannot be zero".into(),
588+
));
589+
}
590+
if sample_size > state_len {
591+
return Err(MahoutError::InvalidInput(format!(
592+
"Sample size {} exceeds state vector size {} (2^{} qubits)",
593+
sample_size, state_len, num_qubits
594+
)));
595+
}
596+
let input_batch_d = input_batch_d as *const f32;
597+
let batch_state_vector = {
598+
crate::profile_scope!("GPU::AllocBatch_f32");
599+
GpuStateVector::new_batch(device, num_samples, num_qubits, Precision::Float32)?
600+
};
601+
let inv_norms_gpu = {
602+
crate::profile_scope!("GPU::BatchNormKernel_f32");
603+
use cudarc::driver::DevicePtrMut;
604+
let mut buffer = device.alloc_zeros::<f32>(num_samples).map_err(|e| {
605+
MahoutError::MemoryAllocation(format!("Failed to allocate norm buffer: {:?}", e))
606+
})?;
607+
let ret = unsafe {
608+
launch_l2_norm_batch_f32(
609+
input_batch_d,
610+
num_samples,
611+
sample_size,
612+
*buffer.device_ptr_mut() as *mut f32,
613+
stream,
614+
)
615+
};
616+
if ret != 0 {
617+
return Err(MahoutError::KernelLaunch(format!(
618+
"Norm reduction kernel failed with CUDA error code: {} ({})",
619+
ret,
620+
cuda_error_to_string(ret)
621+
)));
622+
}
623+
buffer
624+
};
625+
{
626+
crate::profile_scope!("GPU::NormValidation_f32");
627+
let host_inv_norms = device
628+
.dtoh_sync_copy(&inv_norms_gpu)
629+
.map_err(|e| MahoutError::Cuda(format!("Failed to copy norms to host: {:?}", e)))?;
630+
if host_inv_norms.iter().any(|v| !v.is_finite() || *v == 0.0) {
631+
return Err(MahoutError::InvalidInput(
632+
"One or more samples have zero or invalid norm".to_string(),
633+
));
634+
}
635+
}
636+
{
637+
crate::profile_scope!("GPU::BatchKernelLaunch_f32");
638+
use cudarc::driver::DevicePtr;
639+
let state_ptr = batch_state_vector.ptr_f32().ok_or_else(|| {
640+
MahoutError::InvalidInput(
641+
"Batch state vector precision mismatch (expected float32 buffer)".to_string(),
642+
)
643+
})?;
644+
let ret = unsafe {
645+
launch_amplitude_encode_batch_f32(
646+
input_batch_d,
647+
state_ptr as *mut c_void,
648+
*inv_norms_gpu.device_ptr() as *const f32,
649+
num_samples,
650+
sample_size,
651+
state_len,
652+
stream,
653+
)
654+
};
655+
if ret != 0 {
656+
return Err(MahoutError::KernelLaunch(format!(
657+
"Batch kernel launch failed with CUDA error code: {} ({})",
658+
ret,
659+
cuda_error_to_string(ret)
660+
)));
661+
}
662+
}
663+
{
664+
crate::profile_scope!("GPU::Synchronize");
665+
sync_cuda_stream(stream, "CUDA stream synchronize failed")?;
666+
}
667+
Ok(batch_state_vector)
668+
}
669+
460670
fn name(&self) -> &'static str {
461671
"amplitude"
462672
}

qdp/qdp-core/src/gpu/encodings/mod.rs

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,41 @@ pub trait QuantumEncoder: Send + Sync {
134134
self.name()
135135
)))
136136
}
137+
138+
/// Encode multiple samples in a single GPU allocation and kernel launch using f32 inputs.
139+
fn encode_batch_f32(
140+
&self,
141+
_device: &Arc<CudaDevice>,
142+
_batch_data: &[f32],
143+
_num_samples: usize,
144+
_sample_size: usize,
145+
_num_qubits: usize,
146+
) -> Result<GpuStateVector> {
147+
Err(MahoutError::NotImplemented(format!(
148+
"encode_batch_f32 not implemented for {}",
149+
self.name()
150+
)))
151+
}
152+
153+
/// Encode batch from existing GPU pointer (zero-copy) for f32 inputs.
154+
///
155+
/// # Safety
156+
/// Caller must ensure `input_batch_d` points to valid GPU memory (f32).
157+
#[cfg(target_os = "linux")]
158+
unsafe fn encode_batch_from_gpu_ptr_f32(
159+
&self,
160+
_device: &Arc<CudaDevice>,
161+
_input_batch_d: *const c_void,
162+
_num_samples: usize,
163+
_sample_size: usize,
164+
_num_qubits: usize,
165+
_stream: *mut c_void,
166+
) -> Result<GpuStateVector> {
167+
Err(MahoutError::NotImplemented(format!(
168+
"encode_batch_from_gpu_ptr_f32 not supported for {}",
169+
self.name()
170+
)))
171+
}
137172
}
138173

139174
// Encoding implementations

qdp/qdp-core/src/gpu/pipeline.rs

Lines changed: 38 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -290,7 +290,7 @@ where
290290
"Alignment must be greater than zero".to_string(),
291291
));
292292
}
293-
if !host_data.len().is_multiple_of(align_elements) {
293+
if host_data.len() % align_elements != 0 {
294294
return Err(MahoutError::InvalidInput(format!(
295295
"Host data length {} is not aligned to {} elements",
296296
host_data.len(),
@@ -403,14 +403,14 @@ where
403403

404404
// Record copy start if overlap tracking enabled
405405
// Note: Overlap tracking is optional observability - failures should not stop the pipeline
406-
if let Some(ref tracker) = overlap_tracker
407-
&& let Err(e) = tracker.record_copy_start(&ctx.stream_copy, event_slot)
408-
{
409-
log::warn!(
410-
"Chunk {}: Failed to record copy start event: {}. Overlap tracking may be incomplete.",
411-
chunk_idx,
412-
e
413-
);
406+
if let Some(ref tracker) = overlap_tracker {
407+
if let Err(e) = tracker.record_copy_start(&ctx.stream_copy, event_slot) {
408+
log::warn!(
409+
"Chunk {}: Failed to record copy start event: {}. Overlap tracking may be incomplete.",
410+
chunk_idx,
411+
e
412+
);
413+
}
414414
}
415415

416416
unsafe {
@@ -422,14 +422,14 @@ where
422422

423423
// Record copy end if overlap tracking enabled
424424
// Note: Overlap tracking is optional observability - failures should not stop the pipeline
425-
if let Some(ref tracker) = overlap_tracker
426-
&& let Err(e) = tracker.record_copy_end(&ctx.stream_copy, event_slot)
427-
{
428-
log::warn!(
429-
"Chunk {}: Failed to record copy end event: {}. Overlap tracking may be incomplete.",
430-
chunk_idx,
431-
e
432-
);
425+
if let Some(ref tracker) = overlap_tracker {
426+
if let Err(e) = tracker.record_copy_end(&ctx.stream_copy, event_slot) {
427+
log::warn!(
428+
"Chunk {}: Failed to record copy end event: {}. Overlap tracking may be incomplete.",
429+
chunk_idx,
430+
e
431+
);
432+
}
433433
}
434434

435435
ctx.record_copy_done(event_slot)?;
@@ -456,28 +456,28 @@ where
456456

457457
// Record compute start if overlap tracking enabled
458458
// Note: Overlap tracking is optional observability - failures should not stop the pipeline
459-
if let Some(ref tracker) = overlap_tracker
460-
&& let Err(e) = tracker.record_compute_start(&ctx.stream_compute, event_slot)
461-
{
462-
log::warn!(
463-
"Chunk {}: Failed to record compute start event: {}. Overlap tracking may be incomplete.",
464-
chunk_idx,
465-
e
466-
);
459+
if let Some(ref tracker) = overlap_tracker {
460+
if let Err(e) = tracker.record_compute_start(&ctx.stream_compute, event_slot) {
461+
log::warn!(
462+
"Chunk {}: Failed to record compute start event: {}. Overlap tracking may be incomplete.",
463+
chunk_idx,
464+
e
465+
);
466+
}
467467
}
468468

469469
kernel_launcher(&ctx.stream_compute, input_ptr, chunk_offset, chunk.len())?;
470470

471471
// Record compute end if overlap tracking enabled
472472
// Note: Overlap tracking is optional observability - failures should not stop the pipeline
473-
if let Some(ref tracker) = overlap_tracker
474-
&& let Err(e) = tracker.record_compute_end(&ctx.stream_compute, event_slot)
475-
{
476-
log::warn!(
477-
"Chunk {}: Failed to record compute end event: {}. Overlap tracking may be incomplete.",
478-
chunk_idx,
479-
e
480-
);
473+
if let Some(ref tracker) = overlap_tracker {
474+
if let Err(e) = tracker.record_compute_end(&ctx.stream_compute, event_slot) {
475+
log::warn!(
476+
"Chunk {}: Failed to record compute end event: {}. Overlap tracking may be incomplete.",
477+
chunk_idx,
478+
e
479+
);
480+
}
481481
}
482482
}
483483

@@ -489,11 +489,10 @@ where
489489
// Note: log_overlap now handles both success and failure cases internally,
490490
// logging at appropriate levels (INFO for visibility, DEBUG for details).
491491
#[allow(clippy::manual_is_multiple_of)]
492-
if let Some(ref tracker) = overlap_tracker
493-
&& (chunk_idx % 10 == 0 || chunk_idx == 0)
494-
{
495-
// Only log every Nth chunk to avoid excessive logging
496-
// Note: log_overlap waits for events to complete, which may take time
492+
if let Some(ref tracker) = overlap_tracker {
493+
if chunk_idx % 10 == 0 || chunk_idx == 0 {
494+
// Only log every Nth chunk to avoid excessive logging
495+
// Note: log_overlap waits for events to complete, which may take time
497496
// If events fail (e.g., invalid resource handle), log_overlap will log
498497
// at INFO level so it's visible in both debug and info modes
499498
if let Err(e) = tracker.log_overlap(chunk_idx) {
@@ -508,6 +507,7 @@ where
508507
}
509508
// Don't fail the pipeline - overlap tracking is optional observability
510509
}
510+
}
511511
}
512512

513513
// Keep buffer alive until synchronization

0 commit comments

Comments
 (0)