diff --git a/qdp/qdp-core/src/gpu/encodings/angle.rs b/qdp/qdp-core/src/gpu/encodings/angle.rs
index 974a7b6c7a..b0c6773ff5 100644
--- a/qdp/qdp-core/src/gpu/encodings/angle.rs
+++ b/qdp/qdp-core/src/gpu/encodings/angle.rs
@@ -373,6 +373,221 @@ impl QuantumEncoder for AngleEncoder {
         Ok(batch_state_vector)
     }
 
+    #[cfg(target_os = "linux")]
+    fn encode_batch_f32(
+        &self,
+        device: &Arc<CudaDevice>,
+        batch_data: &[f32],
+        num_samples: usize,
+        sample_size: usize,
+        num_qubits: usize,
+    ) -> Result<GpuStateVector> {
+        crate::profile_scope!("AngleEncoder::encode_batch_f32");
+
+        if num_samples == 0 {
+            return Err(MahoutError::InvalidInput(
+                "Number of samples cannot be zero".into(),
+            ));
+        }
+        if sample_size == 0 {
+            return Err(MahoutError::InvalidInput(
+                "Sample size cannot be zero".into(),
+            ));
+        }
+        if sample_size != num_qubits {
+            return Err(MahoutError::InvalidInput(format!(
+                "Angle encoding expects sample_size={} (one angle per qubit), got {}",
+                num_qubits, sample_size
+            )));
+        }
+        let expected_len = num_samples
+            .checked_mul(sample_size)
+            .ok_or_else(|| MahoutError::InvalidInput("Angle batch size overflow".to_string()))?;
+        if batch_data.len() != expected_len {
+            return Err(MahoutError::InvalidInput(format!(
+                "Batch data length {} doesn't match num_samples {} * sample_size {}",
+                batch_data.len(),
+                num_samples,
+                sample_size
+            )));
+        }
+
+        validate_qubit_count(num_qubits)?;
+
+        for (i, &val) in batch_data.iter().enumerate() {
+            if !val.is_finite() {
+                let sample_idx = i / sample_size;
+                let angle_idx = i % sample_size;
+                return Err(MahoutError::InvalidInput(format!(
+                    "Sample {} angle {} must be finite, got {}",
+                    sample_idx, angle_idx, val
+                )));
+            }
+        }
+
+        let state_len = 1 << num_qubits;
+        let batch_state_vector = {
+            crate::profile_scope!("GPU::AllocBatchF32");
+            GpuStateVector::new_batch(device, num_samples, num_qubits, Precision::Float32)?
+        };
+
+        let input_bytes = std::mem::size_of_val(batch_data);
+        let angles_gpu = {
+            crate::profile_scope!("GPU::H2D_BatchAnglesF32");
+            device.htod_sync_copy(batch_data).map_err(|e| {
+                map_allocation_error(input_bytes, "angle batch upload", Some(num_qubits), e)
+            })?
+        };
+
+        let state_ptr = batch_state_vector.ptr_f32().ok_or_else(|| {
+            MahoutError::InvalidInput(
+                "Batch state vector precision mismatch (expected float32 buffer)".to_string(),
+            )
+        })?;
+
+        {
+            crate::profile_scope!("GPU::BatchKernelLaunchF32");
+            let ret = unsafe {
+                qdp_kernels::launch_angle_encode_batch_f32(
+                    *angles_gpu.device_ptr() as *const f32,
+                    state_ptr as *mut c_void,
+                    num_samples,
+                    state_len,
+                    num_qubits as u32,
+                    std::ptr::null_mut(),
+                )
+            };
+
+            if ret != 0 {
+                return Err(MahoutError::KernelLaunch(format!(
+                    "Batch angle encoding kernel (f32) failed: {} ({})",
+                    ret,
+                    cuda_error_to_string(ret)
+                )));
+            }
+        }
+
+        {
+            crate::profile_scope!("GPU::Synchronize");
+            device
+                .synchronize()
+                .map_err(|e| MahoutError::Cuda(format!("Sync failed: {:?}", e)))?;
+        }
+
+        Ok(batch_state_vector)
+    }
+
+    #[cfg(target_os = "linux")]
+    unsafe fn encode_batch_from_gpu_ptr_f32(
+        &self,
+        device: &Arc<CudaDevice>,
+        input_batch_d: *const c_void,
+        num_samples: usize,
+        sample_size: usize,
+        num_qubits: usize,
+        stream: *mut c_void,
+    ) -> Result<GpuStateVector> {
+        if num_samples == 0 {
+            return Err(MahoutError::InvalidInput(
+                "Number of samples cannot be zero".into(),
+            ));
+        }
+        if sample_size == 0 {
+            return Err(MahoutError::InvalidInput(
+                "Sample size cannot be zero".into(),
+            ));
+        }
+        if sample_size != num_qubits {
+            return Err(MahoutError::InvalidInput(format!(
+                "Angle encoding expects sample_size={} (one angle per qubit), got {}",
+                num_qubits, sample_size
+            )));
+        }
+
+        validate_qubit_count(num_qubits)?;
+        let state_len = 1 << num_qubits;
+        let input_batch_d = input_batch_d as *const f32;
+        let total_angles = num_samples
+            .checked_mul(sample_size)
+            .ok_or_else(|| MahoutError::InvalidInput("Angle batch size overflow".to_string()))?;
+        let angle_validation_buffer = {
+            crate::profile_scope!("GPU::AngleFiniteCheckBatchF32");
+            use cudarc::driver::DevicePtrMut;
+            let mut buffer = device.alloc_zeros::<i32>(1).map_err(|e| {
+                MahoutError::MemoryAllocation(format!(
+                    "Failed to allocate angle validation buffer: {:?}",
+                    e
+                ))
+            })?;
+            let ret = unsafe {
+                qdp_kernels::launch_check_finite_batch_f32(
+                    input_batch_d,
+                    total_angles,
+                    *buffer.device_ptr_mut() as *mut i32,
+                    stream,
+                )
+            };
+            if ret != 0 {
+                return Err(MahoutError::KernelLaunch(format!(
+                    "Angle finite validation kernel (f32) failed with CUDA error code: {} ({})",
+                    ret,
+                    cuda_error_to_string(ret)
+                )));
+            }
+            buffer
+        };
+        {
+            crate::profile_scope!("GPU::AngleFiniteValidationHostCopyF32");
+            let host_flags = device
+                .dtoh_sync_copy(&angle_validation_buffer)
+                .map_err(|e| {
+                    MahoutError::Cuda(format!(
+                        "Failed to copy angle validation flags to host: {:?}",
+                        e
+                    ))
+                })?;
+            if host_flags.first().copied().unwrap_or_default() != 0 {
+                return Err(MahoutError::InvalidInput(
+                    "Angle encoding batch contains non-finite values (NaN or Inf)".to_string(),
+                ));
+            }
+        }
+        let batch_state_vector = {
+            crate::profile_scope!("GPU::AllocBatchF32");
+            GpuStateVector::new_batch(device, num_samples, num_qubits, Precision::Float32)?
+        };
+        let state_ptr = batch_state_vector.ptr_f32().ok_or_else(|| {
+            MahoutError::InvalidInput(
+                "Batch state vector precision mismatch (expected float32 buffer)".to_string(),
+            )
+        })?;
+        {
+            crate::profile_scope!("GPU::BatchKernelLaunchF32");
+            let ret = unsafe {
+                qdp_kernels::launch_angle_encode_batch_f32(
+                    input_batch_d,
+                    state_ptr as *mut c_void,
+                    num_samples,
+                    state_len,
+                    num_qubits as u32,
+                    stream,
+                )
+            };
+            if ret != 0 {
+                return Err(MahoutError::KernelLaunch(format!(
+                    "Batch angle encoding kernel (f32) failed: {} ({})",
+                    ret,
+                    cuda_error_to_string(ret)
+                )));
+            }
+        }
+        {
+            crate::profile_scope!("GPU::Synchronize");
+            crate::gpu::cuda_sync::sync_cuda_stream(stream, "CUDA stream synchronize failed")?;
+        }
+        Ok(batch_state_vector)
+    }
+
     fn validate_input(&self, data: &[f64], num_qubits: usize) -> Result<()> {
         validate_qubit_count(num_qubits)?;
         if data.len() != num_qubits {
@@ -472,6 +687,36 @@ impl AngleEncoder {
         Ok(state_vector)
     }
 
+    /// Encodes a batch of angle values from a device-resident `f32` buffer into GPU state
+    /// vectors, using the provided CUDA stream for all launched work.
+    ///
+    /// # Safety
+    /// The caller must ensure that `input_batch_d` points to at least
+    /// `num_samples * sample_size` contiguous `f32` values in GPU-accessible memory and remains
+    /// valid for the duration of this call. The caller must also ensure that `stream` is either
+    /// null or a valid CUDA stream handle associated with `device`, and that the memory layout is
+    /// row-major with exactly `sample_size` angles per sample.
+    #[cfg(target_os = "linux")]
+    pub unsafe fn encode_batch_from_gpu_ptr_f32_with_stream(
+        device: &Arc<CudaDevice>,
+        input_batch_d: *const f32,
+        num_samples: usize,
+        sample_size: usize,
+        num_qubits: usize,
+        stream: *mut c_void,
+    ) -> Result<GpuStateVector> {
+        unsafe {
+            AngleEncoder.encode_batch_from_gpu_ptr_f32(
+                device,
+                input_batch_d as *const c_void,
+                num_samples,
+                sample_size,
+                num_qubits,
+                stream,
+            )
+        }
+    }
+
     #[cfg(target_os = "linux")]
     fn encode_batch_async_pipeline(
         device: &Arc<CudaDevice>,
diff --git a/qdp/qdp-core/src/lib.rs b/qdp/qdp-core/src/lib.rs
index 3db9accdc1..799eb7b180 100644
--- a/qdp/qdp-core/src/lib.rs
+++ b/qdp/qdp-core/src/lib.rs
@@ -768,6 +768,78 @@ impl QdpEngine {
         Ok(batch_state_vector.to_dlpack())
     }
 
+    /// Encode an angle batch from an existing GPU pointer (float32 input only).
+    ///
+    /// Zero-copy batch encoding from CUDA float32 tensors. Uses the default CUDA stream.
+    /// For stream interop use `encode_angle_batch_from_gpu_ptr_f32_with_stream`.
+    ///
+    /// # Safety
+    /// The input pointer must:
+    /// - Point to valid GPU memory on the same device as the engine
+    /// - Contain at least `num_samples * sample_size` f32 elements
+    /// - Remain valid for the duration of this call
+    #[cfg(target_os = "linux")]
+    pub unsafe fn encode_angle_batch_from_gpu_ptr_f32(
+        &self,
+        input_batch_d: *const f32,
+        num_samples: usize,
+        sample_size: usize,
+        num_qubits: usize,
+    ) -> Result<*mut DLManagedTensor> {
+        unsafe {
+            self.encode_angle_batch_from_gpu_ptr_f32_with_stream(
+                input_batch_d,
+                num_samples,
+                sample_size,
+                num_qubits,
+                std::ptr::null_mut(),
+            )
+        }
+    }
+
+    /// Encode an angle batch from an existing GPU pointer (float32) on a specified CUDA stream.
+    ///
+    /// # Safety
+    /// In addition to the `encode_angle_batch_from_gpu_ptr_f32` requirements, the stream pointer
+    /// must remain valid for the duration of this call.
+    #[cfg(target_os = "linux")]
+    pub unsafe fn encode_angle_batch_from_gpu_ptr_f32_with_stream(
+        &self,
+        input_batch_d: *const f32,
+        num_samples: usize,
+        sample_size: usize,
+        num_qubits: usize,
+        stream: *mut c_void,
+    ) -> Result<*mut DLManagedTensor> {
+        crate::profile_scope!("Mahout::EncodeAngleBatchFromGpuPtrF32");
+
+        if num_samples == 0 {
+            return Err(MahoutError::InvalidInput(
+                "Number of samples cannot be zero".into(),
+            ));
+        }
+        if sample_size == 0 {
+            return Err(MahoutError::InvalidInput(
+                "Sample size cannot be zero".into(),
+            ));
+        }
+
+        validate_cuda_input_ptr(&self.device, input_batch_d as *const c_void)?;
+
+        let batch_state_vector = unsafe {
+            gpu::AngleEncoder::encode_batch_from_gpu_ptr_f32_with_stream(
+                &self.device,
+                input_batch_d,
+                num_samples,
+                sample_size,
+                num_qubits,
+                stream,
+            )
+        }?;
+        let batch_state_vector = batch_state_vector.to_precision(&self.device, self.precision)?;
+        Ok(batch_state_vector.to_dlpack())
+    }
+
     /// Encode batch from existing GPU pointer (zero-copy for CUDA tensors)
     ///
     /// This method enables zero-copy batch encoding from PyTorch CUDA tensors.
diff --git a/qdp/qdp-core/tests/gpu_angle_encoding.rs b/qdp/qdp-core/tests/gpu_angle_encoding.rs
index c66e5eda60..6b60d51537 100644
--- a/qdp/qdp-core/tests/gpu_angle_encoding.rs
+++ b/qdp/qdp-core/tests/gpu_angle_encoding.rs
@@ -143,3 +143,133 @@ fn test_angle_successful_encoding_from_parquet() {
         );
     }
 }
+
+#[test]
+fn test_angle_batch_f32_success() {
+    let Some(engine) = common::qdp_engine_with_precision(qdp_core::Precision::Float32) else {
+        println!("SKIP: No GPU available");
+        return;
+    };
+
+    let num_qubits = 3;
+    let num_samples = 2;
+    let data = vec![
+        0.0_f32,
+        std::f32::consts::FRAC_PI_2,
+        std::f32::consts::FRAC_PI_4,
+        0.2_f32,
+        0.4_f32,
+        0.6_f32,
+    ];
+
+    let dlpack_ptr = engine
+        .encode_batch_f32(&data, num_samples, num_qubits, num_qubits, "angle")
+        .expect("angle batch encode f32 should succeed");
+
+    unsafe {
+        common::assert_dlpack_shape_2d_and_delete(dlpack_ptr, num_samples as i64, 8);
+    }
+}
+
+#[test]
+fn test_angle_batch_f32_rejects_sample_size_mismatch() {
+    let Some(engine) = common::qdp_engine_with_precision(qdp_core::Precision::Float32) else {
+        println!("SKIP: No GPU available");
+        return;
+    };
+
+    let data = vec![0.1_f32, 0.2, 0.3, 0.4];
+    let result = engine.encode_batch_f32(&data, 2, 2, 3, "angle");
+
+    assert!(result.is_err());
+    match result {
+        Err(MahoutError::InvalidInput(msg)) => {
+            assert!(
+                msg.contains("sample_size=3") || msg.contains("got 2"),
+                "msg: {msg}"
+            );
+        }
+        _ => panic!("expected InvalidInput, got {:?}", result),
+    }
+}
+
+#[test]
+fn test_angle_batch_f32_rejects_nan() {
+    let Some(engine) = common::qdp_engine_with_precision(qdp_core::Precision::Float32) else {
+        println!("SKIP: No GPU available");
+        return;
+    };
+
+    let data = vec![0.0_f32, f32::NAN, 0.2, 0.3];
+    let result = engine.encode_batch_f32(&data, 2, 2, 2, "angle");
+
+    assert!(result.is_err());
+    match result {
+        Err(MahoutError::InvalidInput(msg)) => {
+            assert!(msg.contains("Sample 0"), "msg: {msg}");
+            assert!(msg.contains("angle 1"), "msg: {msg}");
+            assert!(msg.contains("finite"), "msg: {msg}");
+        }
+        _ => panic!("expected InvalidInput, got {:?}", result),
+    }
+}
+
+#[test]
+fn test_angle_batch_f32_rejects_infinity() {
+    let Some(engine) = common::qdp_engine_with_precision(qdp_core::Precision::Float32) else {
+        println!("SKIP: No GPU available");
+        return;
+    };
+
+    let data = vec![0.0_f32, f32::INFINITY, 0.2, 0.3];
+    let result = engine.encode_batch_f32(&data, 2, 2, 2, "angle");
+
+    assert!(result.is_err());
+    match result {
+        Err(MahoutError::InvalidInput(msg)) => {
+            assert!(msg.contains("Sample 0"), "msg: {msg}");
+            assert!(msg.contains("angle 1"), "msg: {msg}");
+            assert!(msg.contains("finite"), "msg: {msg}");
+        }
+        _ => panic!("expected InvalidInput, got {:?}", result),
+    }
+}
+
+#[test]
+fn test_angle_batch_f32_rejects_zero_samples() {
+    let Some(engine) = common::qdp_engine_with_precision(qdp_core::Precision::Float32) else {
+        println!("SKIP: No GPU available");
+        return;
+    };
+
+    let result = engine.encode_batch_f32(&[], 0, 2, 2, "angle");
+
+    assert!(result.is_err());
+    match result {
+        Err(MahoutError::InvalidInput(msg)) => {
+            assert!(
+                msg.contains("zero") || msg.contains("samples"),
+                "msg: {msg}"
+            );
+        }
+        _ => panic!("expected InvalidInput, got {:?}", result),
+    }
+}
+
+#[test]
+fn test_angle_batch_f32_rejects_length_overflow() {
+    let Some(engine) = common::qdp_engine_with_precision(qdp_core::Precision::Float32) else {
+        println!("SKIP: No GPU available");
+        return;
+    };
+
+    let result = engine.encode_batch_f32(&[], usize::MAX, 2, 2, "angle");
+
+    assert!(result.is_err());
+    match result {
+        Err(MahoutError::InvalidInput(msg)) => {
+            assert!(msg.contains("overflow"), "msg: {msg}");
+        }
+        _ => panic!("expected InvalidInput, got {:?}", result),
+    }
+}
diff --git a/qdp/qdp-core/tests/gpu_ptr_encoding.rs b/qdp/qdp-core/tests/gpu_ptr_encoding.rs
index 80a054a693..88c65e752d 100644
--- a/qdp/qdp-core/tests/gpu_ptr_encoding.rs
+++ b/qdp/qdp-core/tests/gpu_ptr_encoding.rs
@@ -1293,3 +1293,236 @@ fn test_encode_batch_from_gpu_ptr_f32_odd_sample_size_success() {
         )
     };
 }
+
+#[test]
+fn test_encode_angle_batch_from_gpu_ptr_f32_success() {
+    let engine = match engine_f32() {
+        Some(e) => e,
+        None => {
+            println!("SKIP: No GPU");
+            return;
+        }
+    };
+    let num_samples = 2;
+    let num_qubits = 3;
+    let (_device, input_d) = match common::copy_f32_to_device(&[
+        0.0,
+        std::f32::consts::FRAC_PI_2,
+        std::f32::consts::FRAC_PI_4,
+        0.2,
+        0.4,
+        0.6,
+    ]) {
+        Some(t) => t,
+        None => {
+            println!("SKIP: No CUDA device");
+            return;
+        }
+    };
+    let dlpack_ptr = unsafe {
+        engine
+            .encode_angle_batch_from_gpu_ptr_f32(
+                *input_d.device_ptr() as *const f32,
+                num_samples,
+                num_qubits,
+                num_qubits,
+            )
+            .expect("encode_angle_batch_from_gpu_ptr_f32")
+    };
+    unsafe { common::assert_dlpack_shape_2d_and_delete(dlpack_ptr, num_samples as i64, 8) };
+}
+
+#[test]
+fn test_encode_angle_batch_from_gpu_ptr_f32_with_stream_success() {
+    let engine = match engine_f32() {
+        Some(e) => e,
+        None => {
+            println!("SKIP: No GPU");
+            return;
+        }
+    };
+    let (device, input_d) = match common::copy_f32_to_device(&[
+        0.0_f32,
+        std::f32::consts::FRAC_PI_2,
+        std::f32::consts::FRAC_PI_4,
+        0.2_f32,
+        0.4_f32,
+        0.6_f32,
+    ]) {
+        Some(t) => t,
+        None => {
+            println!("SKIP: No CUDA device");
+            return;
+        }
+    };
+    let stream = device.fork_default_stream().expect("fork_default_stream");
+    let dlpack_ptr = unsafe {
+        engine
+            .encode_angle_batch_from_gpu_ptr_f32_with_stream(
+                *input_d.device_ptr() as *const f32,
+                2,
+                3,
+                3,
+                stream.stream as *mut c_void,
+            )
+            .expect("encode_angle_batch_from_gpu_ptr_f32_with_stream")
+    };
+    unsafe { common::assert_dlpack_shape_2d_and_delete(dlpack_ptr, 2, 8) };
+}
+
+#[test]
+fn test_encode_angle_batch_from_gpu_ptr_f32_null_pointer() {
+    let engine = match engine_f32() {
+        Some(e) => e,
+        None => {
+            println!("SKIP: No GPU");
+            return;
+        }
+    };
+    let result = unsafe { engine.encode_angle_batch_from_gpu_ptr_f32(std::ptr::null(), 2, 2, 2) };
+    assert!(result.is_err());
+    match &result.unwrap_err() {
+        MahoutError::InvalidInput(msg) => assert!(msg.contains("null")),
+        e => panic!("Expected InvalidInput, got {:?}", e),
+    }
+}
+
+#[test]
+fn test_encode_angle_batch_from_gpu_ptr_f32_sample_size_mismatch() {
+    let engine = match engine_f32() {
+        Some(e) => e,
+        None => {
+            println!("SKIP: No GPU");
+            return;
+        }
+    };
+    let (_device, input_d) = match common::copy_f32_to_device(&[0.0_f32, 0.1, 0.2, 0.3, 0.4, 0.5]) {
+        Some(t) => t,
+        None => {
+            println!("SKIP: No CUDA device");
+            return;
+        }
+    };
+    let result = unsafe {
+        engine.encode_angle_batch_from_gpu_ptr_f32(*input_d.device_ptr() as *const f32, 2, 2, 3)
+    };
+    assert!(result.is_err());
+    match &result.unwrap_err() {
+        MahoutError::InvalidInput(msg) => {
+            assert!(
+                msg.contains("sample_size=3") || msg.contains("got 2"),
+                "msg: {msg}"
+            );
+        }
+        e => panic!("Expected InvalidInput, got {:?}", e),
+    }
+}
+
+#[test]
+fn test_encode_angle_batch_from_gpu_ptr_f32_zero_samples() {
+    let engine = match engine_f32() {
+        Some(e) => e,
+        None => {
+            println!("SKIP: No GPU");
+            return;
+        }
+    };
+    let result = unsafe { engine.encode_angle_batch_from_gpu_ptr_f32(std::ptr::null(), 0, 2, 2) };
+    assert!(result.is_err());
+    match &result.unwrap_err() {
+        MahoutError::InvalidInput(msg) => assert!(msg.contains("zero") || msg.contains("samples")),
+        e => panic!("Expected InvalidInput, got {:?}", e),
+    }
+}
+
+#[test]
+fn test_encode_angle_batch_from_gpu_ptr_f32_non_finite_rejected() {
+    let engine = match engine_f32() {
+        Some(e) => e,
+        None => {
+            println!("SKIP: No GPU");
+            return;
+        }
+    };
+    let (_device, input_d) =
+        match common::copy_f32_to_device(&[0.0_f32, f32::NAN, 0.2_f32, 0.3_f32]) {
+            Some(t) => t,
+            None => {
+                println!("SKIP: No CUDA device");
+                return;
+            }
+        };
+    let result = unsafe {
+        engine.encode_angle_batch_from_gpu_ptr_f32(*input_d.device_ptr() as *const f32, 2, 2, 2)
+    };
+    assert!(result.is_err());
+    match &result.unwrap_err() {
+        MahoutError::InvalidInput(msg) => {
+            assert!(
+                msg.contains("non-finite") || msg.contains("NaN"),
+                "msg: {msg}"
+            );
+        }
+        e => panic!("Expected InvalidInput, got {:?}", e),
+    }
+}
+
+#[test]
+fn test_encode_angle_batch_from_gpu_ptr_f32_infinity_rejected() {
+    let engine = match engine_f32() {
+        Some(e) => e,
+        None => {
+            println!("SKIP: No GPU");
+            return;
+        }
+    };
+    let (_device, input_d) =
+        match common::copy_f32_to_device(&[0.0_f32, f32::INFINITY, 0.2_f32, 0.3_f32]) {
+            Some(t) => t,
+            None => {
+                println!("SKIP: No CUDA device");
+                return;
+            }
+        };
+    let result = unsafe {
+        engine.encode_angle_batch_from_gpu_ptr_f32(*input_d.device_ptr() as *const f32, 2, 2, 2)
+    };
+    assert!(result.is_err());
+    match &result.unwrap_err() {
+        MahoutError::InvalidInput(msg) => {
+            assert!(
+                msg.contains("non-finite") || msg.contains("Inf"),
+                "msg: {msg}"
+            );
+        }
+        e => panic!("Expected InvalidInput, got {:?}", e),
+    }
+}
+
+#[test]
+fn test_encode_angle_batch_from_gpu_ptr_f32_success_f64_engine() {
+    let Some(engine) = common::qdp_engine_with_precision(Precision::Float64) else {
+        println!("SKIP: No GPU");
+        return;
+    };
+    let (_device, input_d) = match common::copy_f32_to_device(&[
+        0.0_f32,
+        std::f32::consts::FRAC_PI_2,
+        std::f32::consts::FRAC_PI_4,
+        0.2_f32,
+        0.4_f32,
+        0.6_f32,
+    ]) {
+        Some(t) => t,
+        None => {
+            println!("SKIP: No CUDA device");
+            return;
+        }
+    };
+    let dlpack_ptr = unsafe {
+        engine
+            .encode_angle_batch_from_gpu_ptr_f32(*input_d.device_ptr() as *const f32, 2, 3, 3)
+            .expect("encode_angle_batch_from_gpu_ptr_f32 (Float64 engine)")
+    };
+    unsafe { common::assert_dlpack_shape_2d_and_delete(dlpack_ptr, 2, 8) };
+}
diff --git a/qdp/qdp-kernels/build.rs b/qdp/qdp-kernels/build.rs
index 40abf9a1a7..097b57e15f 100644
--- a/qdp/qdp-kernels/build.rs
+++ b/qdp/qdp-kernels/build.rs
@@ -34,6 +34,7 @@ fn main() {
     println!("cargo:rerun-if-changed=src/amplitude.cu");
     println!("cargo:rerun-if-changed=src/basis.cu");
     println!("cargo:rerun-if-changed=src/angle.cu");
+    println!("cargo:rerun-if-changed=src/validation.cu");
     println!("cargo:rerun-if-changed=src/iqp.cu");
     println!("cargo:rerun-if-changed=src/phase.cu");
     println!("cargo:rerun-if-env-changed=QDP_NO_CUDA");
@@ -100,6 +101,7 @@ fn main() {
         .file("src/amplitude.cu")
         .file("src/basis.cu")
         .file("src/angle.cu")
+        .file("src/validation.cu")
         .file("src/iqp.cu")
         .file("src/phase.cu")
         .compile("kernels");
diff --git a/qdp/qdp-kernels/src/angle.cu b/qdp/qdp-kernels/src/angle.cu
index 2a2d129689..57990be841 100644
--- a/qdp/qdp-kernels/src/angle.cu
+++ b/qdp/qdp-kernels/src/angle.cu
@@ -88,6 +88,34 @@ __global__ void angle_encode_batch_kernel(
     }
 }
 
+__global__ void angle_encode_batch_kernel_f32(
+    const float* __restrict__ angles_batch,
+    cuComplex* __restrict__ state_batch,
+    size_t num_samples,
+    size_t state_len,
+    unsigned int num_qubits
+) {
+    const size_t total_elements = num_samples * state_len;
+    const size_t stride = gridDim.x * blockDim.x;
+    const size_t state_mask = state_len - 1;
+
+    for (size_t global_idx = blockIdx.x * blockDim.x + threadIdx.x;
+         global_idx < total_elements;
+         global_idx += stride) {
+        const size_t sample_idx = global_idx >> num_qubits;
+        const size_t element_idx = global_idx & state_mask;
+        const float* angles = angles_batch + sample_idx * num_qubits;
+
+        float amplitude = 1.0f;
+        for (unsigned int bit = 0; bit < num_qubits; ++bit) {
+            const float angle = angles[bit];
+            amplitude *= ((element_idx >> bit) & 1U) ? sinf(angle) : cosf(angle);
+        }
+
+        state_batch[global_idx] = make_cuComplex(amplitude, 0.0f);
+    }
+}
+
 extern "C" {
 
 /// Launch angle encoding kernel
@@ -207,4 +235,47 @@ int launch_angle_encode_batch(
     return (int)cudaGetLastError();
 }
 
+/// Launch batch angle encoding kernel for float32 input
+///
+/// # Arguments
+/// * angles_batch_d - Device pointer to batch angles (num_samples * num_qubits)
+/// * state_batch_d - Device pointer to output batch state vectors
+/// * num_samples - Number of samples in batch
+/// * state_len - State vector size per sample (2^num_qubits)
+/// * num_qubits - Number of qubits (angles length)
+/// * stream - CUDA stream for async execution
+///
+/// # Returns
+/// CUDA error code (0 = cudaSuccess)
+int launch_angle_encode_batch_f32(
+    const float* angles_batch_d,
+    void* state_batch_d,
+    size_t num_samples,
+    size_t state_len,
+    unsigned int num_qubits,
+    cudaStream_t stream
+) {
+    if (num_samples == 0 || state_len == 0 || num_qubits == 0) {
+        return cudaErrorInvalidValue;
+    }
+
+    cuComplex* state_complex_d = static_cast<cuComplex*>(state_batch_d);
+
+    const int blockSize = DEFAULT_BLOCK_SIZE;
+    const size_t total_elements = num_samples * state_len;
+    const size_t blocks_needed = (total_elements + blockSize - 1) / blockSize;
+    const size_t max_blocks = MAX_GRID_BLOCKS;
+    const size_t gridSize = (blocks_needed < max_blocks) ? blocks_needed : max_blocks;
+
+    angle_encode_batch_kernel_f32<<<gridSize, blockSize, 0, stream>>>(
+        angles_batch_d,
+        state_complex_d,
+        num_samples,
+        state_len,
+        num_qubits
+    );
+
+    return (int)cudaGetLastError();
+}
+
 } // extern "C"
diff --git a/qdp/qdp-kernels/src/lib.rs b/qdp/qdp-kernels/src/lib.rs
index 4649092863..271039fb12 100644
--- a/qdp/qdp-kernels/src/lib.rs
+++ b/qdp/qdp-kernels/src/lib.rs
@@ -254,6 +254,32 @@ unsafe extern "C" {
         stream: *mut c_void,
     ) -> i32;
 
+    /// Launch batch angle encoding kernel for float32 inputs.
+    /// Returns CUDA error code (0 = success)
+    ///
+    /// # Safety
+    /// Requires valid GPU pointers, must sync before freeing
+    pub fn launch_angle_encode_batch_f32(
+        angles_batch_d: *const f32,
+        state_batch_d: *mut c_void,
+        num_samples: usize,
+        state_len: usize,
+        num_qubits: u32,
+        stream: *mut c_void,
+    ) -> i32;
+
+    /// Launch float32 batch finite-value validation.
+    /// Returns CUDA error code (0 = success)
+    ///
+    /// # Safety
+    /// Requires valid GPU pointers, must sync before reading the output flag.
+    pub fn launch_check_finite_batch_f32(
+        input_batch_d: *const f32,
+        total_angles: usize,
+        has_non_finite_d: *mut i32,
+        stream: *mut c_void,
+    ) -> i32;
+
     /// Launch IQP encoding kernel
     /// Returns CUDA error code (0 = success)
     ///
@@ -496,6 +522,30 @@ pub extern "C" fn launch_angle_encode_batch(
     999
 }
 
+#[cfg(any(not(target_os = "linux"), qdp_no_cuda))]
+#[unsafe(no_mangle)]
+pub extern "C" fn launch_angle_encode_batch_f32(
+    _angles_batch_d: *const f32,
+    _state_batch_d: *mut c_void,
+    _num_samples: usize,
+    _state_len: usize,
+    _num_qubits: u32,
+    _stream: *mut c_void,
+) -> i32 {
+    999
+}
+
+#[cfg(any(not(target_os = "linux"), qdp_no_cuda))]
+#[unsafe(no_mangle)]
+pub extern "C" fn launch_check_finite_batch_f32(
+    _input_batch_d: *const f32,
+    _total_angles: usize,
+    _has_non_finite_d: *mut i32,
+    _stream: *mut c_void,
+) -> i32 {
+    999
+}
+
 #[cfg(any(not(target_os = "linux"), qdp_no_cuda))]
 #[unsafe(no_mangle)]
 pub extern "C" fn launch_iqp_encode(
diff --git a/qdp/qdp-kernels/src/validation.cu b/qdp/qdp-kernels/src/validation.cu
new file mode 100644
index 0000000000..e38b65665d
--- /dev/null
+++ b/qdp/qdp-kernels/src/validation.cu
@@ -0,0 +1,82 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements.  See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License.  You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Shared validation CUDA kernels.
+
+#include <cuda_runtime.h>
+#include <math.h>
+#include "kernel_config.h"
+
+__global__ void check_finite_batch_kernel_f32(
+    const float* __restrict__ input_batch,
+    size_t total_values,
+    int* __restrict__ has_non_finite
+) {
+    const size_t stride = gridDim.x * blockDim.x;
+    for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+         idx < total_values;
+         idx += stride) {
+        if (!isfinite(input_batch[idx])) {
+            atomicExch(has_non_finite, 1);
+            return;
+        }
+    }
+}
+
+extern "C" {
+
+/// Launch batch finite-value validation for float32 input.
+///
+/// Writes 1 to `has_non_finite_d` if any NaN/Inf is found, else leaves it at 0.
+int launch_check_finite_batch_f32(
+    const float* input_batch_d,
+    size_t total_values,
+    int* has_non_finite_d,
+    cudaStream_t stream
+) {
+    if (total_values == 0 || has_non_finite_d == nullptr) {
+        return cudaErrorInvalidValue;
+    }
+
+    cudaError_t memset_status = cudaMemsetAsync(
+        has_non_finite_d,
+        0,
+        sizeof(int),
+        stream
+    );
+    if (memset_status != cudaSuccess) {
+        return memset_status;
+    }
+
+    const int blockSize = DEFAULT_BLOCK_SIZE;
+    size_t gridSize = (total_values + blockSize - 1) / blockSize;
+    if (gridSize == 0) {
+        gridSize = 1;
+    }
+    if (gridSize > MAX_GRID_BLOCKS) {
+        gridSize = MAX_GRID_BLOCKS;
+    }
+
+    check_finite_batch_kernel_f32<<<gridSize, blockSize, 0, stream>>>(
+        input_batch_d,
+        total_values,
+        has_non_finite_d
+    );
+
+    return (int)cudaGetLastError();
+}
+
+} // extern "C"
diff --git a/qdp/qdp-kernels/tests/angle_encode.rs b/qdp/qdp-kernels/tests/angle_encode.rs
index f42b3cd50a..a4af609af1 100644
--- a/qdp/qdp-kernels/tests/angle_encode.rs
+++ b/qdp/qdp-kernels/tests/angle_encode.rs
@@ -21,10 +21,25 @@
 #[cfg(target_os = "linux")]
 use cudarc::driver::{CudaDevice, DevicePtr, DevicePtrMut};
 #[cfg(target_os = "linux")]
-use qdp_kernels::{CuComplex, launch_angle_encode_f32};
+use qdp_kernels::{
+    CuComplex, launch_angle_encode_batch_f32, launch_angle_encode_f32,
+    launch_check_finite_batch_f32,
+};
 
 const EPSILON_F32: f32 = 1e-5;
 
+#[cfg(target_os = "linux")]
+fn expected_amplitude_f32(angles: &[f32], basis_idx: usize) -> f32 {
+    angles.iter().enumerate().fold(1.0f32, |acc, (bit, angle)| {
+        let factor = if ((basis_idx >> bit) & 1) == 1 {
+            angle.sin()
+        } else {
+            angle.cos()
+        };
+        acc * factor
+    })
+}
+
 #[test]
 #[cfg(target_os = "linux")]
 fn test_angle_encode_basic_f32() {
@@ -147,3 +162,237 @@ fn test_angle_encode_f32_rejects_zero_qubits() {
 
     assert_ne!(result, 0, "Zero-qubit launch should fail");
 }
+
+#[test]
+#[cfg(target_os = "linux")]
+fn test_angle_encode_batch_f32_matches_expected_product_states() {
+    let device = match CudaDevice::new(0) {
+        Ok(d) => d,
+        Err(_) => {
+            println!("SKIP: No CUDA device available");
+            return;
+        }
+    };
+
+    let num_qubits = 3usize;
+    let num_samples = 2usize;
+    let state_len = 1usize << num_qubits;
+    let angles = vec![
+        0.0_f32,
+        std::f32::consts::FRAC_PI_2,
+        std::f32::consts::FRAC_PI_4,
+        0.2_f32,
+        0.4_f32,
+        0.6_f32,
+    ];
+
+    let input_d = device.htod_copy(angles.clone()).unwrap();
+    let mut state_d = device
+        .alloc_zeros::<CuComplex>(num_samples * state_len)
+        .unwrap();
+
+    let result = unsafe {
+        launch_angle_encode_batch_f32(
+            *input_d.device_ptr() as *const f32,
+            *state_d.device_ptr_mut() as *mut std::ffi::c_void,
+            num_samples,
+            state_len,
+            num_qubits as u32,
+            std::ptr::null_mut(),
+        )
+    };
+    assert_eq!(result, 0, "Batch kernel launch should succeed");
+
+    let state_h = device.dtoh_sync_copy(&state_d).unwrap();
+    for sample_idx in 0..num_samples {
+        let sample_angles = &angles[sample_idx * num_qubits..(sample_idx + 1) * num_qubits];
+        for basis_idx in 0..state_len {
+            let actual = state_h[sample_idx * state_len + basis_idx];
+            let expected = expected_amplitude_f32(sample_angles, basis_idx);
+            assert!(
+                (actual.x - expected).abs() < EPSILON_F32,
+                "sample {sample_idx} basis {basis_idx} expected {expected}, got {}",
+                actual.x
+            );
+            assert!(
+                actual.y.abs() < EPSILON_F32,
+                "sample {sample_idx} basis {basis_idx} imaginary expected 0, got {}",
+                actual.y
+            );
+        }
+    }
+}
+
+#[test]
+#[cfg(target_os = "linux")]
+fn test_angle_encode_batch_f32_rejects_zero_samples() {
+    let device = match CudaDevice::new(0) {
+        Ok(d) => d,
+        Err(_) => {
+            println!("SKIP: No CUDA device available");
+            return;
+        }
+    };
+
+    let input_d = device.htod_copy(vec![0.0_f32, 1.0_f32]).unwrap();
+    let mut state_d = device.alloc_zeros::<CuComplex>(4).unwrap();
+
+    let result = unsafe {
+        launch_angle_encode_batch_f32(
+            *input_d.device_ptr() as *const f32,
+            *state_d.device_ptr_mut() as *mut std::ffi::c_void,
+            0,
+            4,
+            2,
+            std::ptr::null_mut(),
+        )
+    };
+
+    assert_ne!(result, 0, "Zero-sample batch launch should fail");
+}
+
+#[test]
+#[cfg(target_os = "linux")]
+fn test_check_finite_batch_f32_reports_non_finite() {
+    let device = match CudaDevice::new(0) {
+        Ok(d) => d,
+        Err(_) => {
+            println!("SKIP: No CUDA device available");
+            return;
+        }
+    };
+
+    let input_d = device
+        .htod_copy(vec![0.0_f32, f32::INFINITY, 0.2_f32, 0.3_f32])
+        .unwrap();
+    let mut status_d = device.alloc_zeros::<i32>(1).unwrap();
+
+    let result = unsafe {
+        launch_check_finite_batch_f32(
+            *input_d.device_ptr() as *const f32,
+            4,
+            *status_d.device_ptr_mut() as *mut i32,
+            std::ptr::null_mut(),
+        )
+    };
+    assert_eq!(result, 0, "Finite-check launch should succeed");
+
+    let status_h = device.dtoh_sync_copy(&status_d).unwrap();
+    assert_eq!(status_h, vec![1], "Expected non-finite flag to be set");
+}
+
+#[test]
+#[cfg(target_os = "linux")]
+fn test_check_finite_batch_f32_reports_nan() {
+    let device = match CudaDevice::new(0) {
+        Ok(d) => d,
+        Err(_) => {
+            println!("SKIP: No CUDA device available");
+            return;
+        }
+    };
+
+    let input_d = device
+        .htod_copy(vec![0.0_f32, f32::NAN, 0.2_f32, 0.3_f32])
+        .unwrap();
+    let mut status_d = device.alloc_zeros::<i32>(1).unwrap();
+
+    let result = unsafe {
+        launch_check_finite_batch_f32(
+            *input_d.device_ptr() as *const f32,
+            4,
+            *status_d.device_ptr_mut() as *mut i32,
+            std::ptr::null_mut(),
+        )
+    };
+    assert_eq!(result, 0, "Finite-check launch should succeed");
+
+    let status_h = device.dtoh_sync_copy(&status_d).unwrap();
+    assert_eq!(status_h, vec![1], "Expected NaN flag to be set");
+}
+
+#[test]
+#[cfg(target_os = "linux")]
+fn test_check_finite_batch_f32_all_finite_stays_clear() {
+    let device = match CudaDevice::new(0) {
+        Ok(d) => d,
+        Err(_) => {
+            println!("SKIP: No CUDA device available");
+            return;
+        }
+    };
+
+    let input_d = device
+        .htod_copy(vec![0.0_f32, 0.1_f32, 0.2_f32, 0.3_f32])
+        .unwrap();
+    let mut status_d = device.alloc_zeros::<i32>(1).unwrap();
+
+    let result = unsafe {
+        launch_check_finite_batch_f32(
+            *input_d.device_ptr() as *const f32,
+            4,
+            *status_d.device_ptr_mut() as *mut i32,
+            std::ptr::null_mut(),
+        )
+    };
+    assert_eq!(result, 0, "Finite-check launch should succeed");
+
+    let status_h = device.dtoh_sync_copy(&status_d).unwrap();
+    assert_eq!(status_h, vec![0], "Expected finite flag to remain clear");
+}
+
+#[test]
+#[cfg(target_os = "linux")]
+fn test_angle_encode_batch_f32_rejects_zero_state_len() {
+    let device = match CudaDevice::new(0) {
+        Ok(d) => d,
+        Err(_) => {
+            println!("SKIP: No CUDA device available");
+            return;
+        }
+    };
+
+    let input_d = device.htod_copy(vec![0.0_f32, 1.0_f32]).unwrap();
+    let mut state_d = device.alloc_zeros::<CuComplex>(1).unwrap();
+
+    let result = unsafe {
+        launch_angle_encode_batch_f32(
+            *input_d.device_ptr() as *const f32,
+            *state_d.device_ptr_mut() as *mut std::ffi::c_void,
+            1,
+            0,
+            1,
+            std::ptr::null_mut(),
+        )
+    };
+
+    assert_ne!(result, 0, "Zero state-len batch launch should fail");
+}
+
+#[test]
+#[cfg(target_os = "linux")]
+fn test_angle_encode_batch_f32_rejects_zero_qubits() {
+    let device = match CudaDevice::new(0) {
+        Ok(d) => d,
+        Err(_) => {
+            println!("SKIP: No CUDA device available");
+            return;
+        }
+    };
+
+    let input_d = device.htod_copy(vec![0.0_f32, 1.0_f32]).unwrap();
+    let mut state_d = device.alloc_zeros::<CuComplex>(1).unwrap();
+
+    let result = unsafe {
+        launch_angle_encode_batch_f32(
+            *input_d.device_ptr() as *const f32,
+            *state_d.device_ptr_mut() as *mut std::ffi::c_void,
+            1,
+            1,
+            0,
+            std::ptr::null_mut(),
+        )
+    };
+
+    assert_ne!(result, 0, "Zero-qubit batch launch should fail");
+}