diff --git a/qdp/qdp-core/src/gpu/encodings/angle.rs b/qdp/qdp-core/src/gpu/encodings/angle.rs index 974a7b6c7a..b0c6773ff5 100644 --- a/qdp/qdp-core/src/gpu/encodings/angle.rs +++ b/qdp/qdp-core/src/gpu/encodings/angle.rs @@ -373,6 +373,221 @@ impl QuantumEncoder for AngleEncoder { Ok(batch_state_vector) } + #[cfg(target_os = "linux")] + fn encode_batch_f32( + &self, + device: &Arc, + batch_data: &[f32], + num_samples: usize, + sample_size: usize, + num_qubits: usize, + ) -> Result { + crate::profile_scope!("AngleEncoder::encode_batch_f32"); + + if num_samples == 0 { + return Err(MahoutError::InvalidInput( + "Number of samples cannot be zero".into(), + )); + } + if sample_size == 0 { + return Err(MahoutError::InvalidInput( + "Sample size cannot be zero".into(), + )); + } + if sample_size != num_qubits { + return Err(MahoutError::InvalidInput(format!( + "Angle encoding expects sample_size={} (one angle per qubit), got {}", + num_qubits, sample_size + ))); + } + let expected_len = num_samples + .checked_mul(sample_size) + .ok_or_else(|| MahoutError::InvalidInput("Angle batch size overflow".to_string()))?; + if batch_data.len() != expected_len { + return Err(MahoutError::InvalidInput(format!( + "Batch data length {} doesn't match num_samples {} * sample_size {}", + batch_data.len(), + num_samples, + sample_size + ))); + } + + validate_qubit_count(num_qubits)?; + + for (i, &val) in batch_data.iter().enumerate() { + if !val.is_finite() { + let sample_idx = i / sample_size; + let angle_idx = i % sample_size; + return Err(MahoutError::InvalidInput(format!( + "Sample {} angle {} must be finite, got {}", + sample_idx, angle_idx, val + ))); + } + } + + let state_len = 1 << num_qubits; + let batch_state_vector = { + crate::profile_scope!("GPU::AllocBatchF32"); + GpuStateVector::new_batch(device, num_samples, num_qubits, Precision::Float32)? + }; + + let input_bytes = std::mem::size_of_val(batch_data); + let angles_gpu = { + crate::profile_scope!("GPU::H2D_BatchAnglesF32"); + device.htod_sync_copy(batch_data).map_err(|e| { + map_allocation_error(input_bytes, "angle batch upload", Some(num_qubits), e) + })? + }; + + let state_ptr = batch_state_vector.ptr_f32().ok_or_else(|| { + MahoutError::InvalidInput( + "Batch state vector precision mismatch (expected float32 buffer)".to_string(), + ) + })?; + + { + crate::profile_scope!("GPU::BatchKernelLaunchF32"); + let ret = unsafe { + qdp_kernels::launch_angle_encode_batch_f32( + *angles_gpu.device_ptr() as *const f32, + state_ptr as *mut c_void, + num_samples, + state_len, + num_qubits as u32, + std::ptr::null_mut(), + ) + }; + + if ret != 0 { + return Err(MahoutError::KernelLaunch(format!( + "Batch angle encoding kernel (f32) failed: {} ({})", + ret, + cuda_error_to_string(ret) + ))); + } + } + + { + crate::profile_scope!("GPU::Synchronize"); + device + .synchronize() + .map_err(|e| MahoutError::Cuda(format!("Sync failed: {:?}", e)))?; + } + + Ok(batch_state_vector) + } + + #[cfg(target_os = "linux")] + unsafe fn encode_batch_from_gpu_ptr_f32( + &self, + device: &Arc, + input_batch_d: *const c_void, + num_samples: usize, + sample_size: usize, + num_qubits: usize, + stream: *mut c_void, + ) -> Result { + if num_samples == 0 { + return Err(MahoutError::InvalidInput( + "Number of samples cannot be zero".into(), + )); + } + if sample_size == 0 { + return Err(MahoutError::InvalidInput( + "Sample size cannot be zero".into(), + )); + } + if sample_size != num_qubits { + return Err(MahoutError::InvalidInput(format!( + "Angle encoding expects sample_size={} (one angle per qubit), got {}", + num_qubits, sample_size + ))); + } + + validate_qubit_count(num_qubits)?; + let state_len = 1 << num_qubits; + let input_batch_d = input_batch_d as *const f32; + let total_angles = num_samples + .checked_mul(sample_size) + .ok_or_else(|| MahoutError::InvalidInput("Angle batch size overflow".to_string()))?; + let angle_validation_buffer = { + crate::profile_scope!("GPU::AngleFiniteCheckBatchF32"); + use cudarc::driver::DevicePtrMut; + let mut buffer = device.alloc_zeros::(1).map_err(|e| { + MahoutError::MemoryAllocation(format!( + "Failed to allocate angle validation buffer: {:?}", + e + )) + })?; + let ret = unsafe { + qdp_kernels::launch_check_finite_batch_f32( + input_batch_d, + total_angles, + *buffer.device_ptr_mut() as *mut i32, + stream, + ) + }; + if ret != 0 { + return Err(MahoutError::KernelLaunch(format!( + "Angle finite validation kernel (f32) failed with CUDA error code: {} ({})", + ret, + cuda_error_to_string(ret) + ))); + } + buffer + }; + { + crate::profile_scope!("GPU::AngleFiniteValidationHostCopyF32"); + let host_flags = device + .dtoh_sync_copy(&angle_validation_buffer) + .map_err(|e| { + MahoutError::Cuda(format!( + "Failed to copy angle validation flags to host: {:?}", + e + )) + })?; + if host_flags.first().copied().unwrap_or_default() != 0 { + return Err(MahoutError::InvalidInput( + "Angle encoding batch contains non-finite values (NaN or Inf)".to_string(), + )); + } + } + let batch_state_vector = { + crate::profile_scope!("GPU::AllocBatchF32"); + GpuStateVector::new_batch(device, num_samples, num_qubits, Precision::Float32)? + }; + let state_ptr = batch_state_vector.ptr_f32().ok_or_else(|| { + MahoutError::InvalidInput( + "Batch state vector precision mismatch (expected float32 buffer)".to_string(), + ) + })?; + { + crate::profile_scope!("GPU::BatchKernelLaunchF32"); + let ret = unsafe { + qdp_kernels::launch_angle_encode_batch_f32( + input_batch_d, + state_ptr as *mut c_void, + num_samples, + state_len, + num_qubits as u32, + stream, + ) + }; + if ret != 0 { + return Err(MahoutError::KernelLaunch(format!( + "Batch angle encoding kernel (f32) failed: {} ({})", + ret, + cuda_error_to_string(ret) + ))); + } + } + { + crate::profile_scope!("GPU::Synchronize"); + crate::gpu::cuda_sync::sync_cuda_stream(stream, "CUDA stream synchronize failed")?; + } + Ok(batch_state_vector) + } + fn validate_input(&self, data: &[f64], num_qubits: usize) -> Result<()> { validate_qubit_count(num_qubits)?; if data.len() != num_qubits { @@ -472,6 +687,36 @@ impl AngleEncoder { Ok(state_vector) } + /// Encodes a batch of angle values from a device-resident `f32` buffer into GPU state + /// vectors, using the provided CUDA stream for all launched work. + /// + /// # Safety + /// The caller must ensure that `input_batch_d` points to at least + /// `num_samples * sample_size` contiguous `f32` values in GPU-accessible memory and remains + /// valid for the duration of this call. The caller must also ensure that `stream` is either + /// null or a valid CUDA stream handle associated with `device`, and that the memory layout is + /// row-major with exactly `sample_size` angles per sample. + #[cfg(target_os = "linux")] + pub unsafe fn encode_batch_from_gpu_ptr_f32_with_stream( + device: &Arc, + input_batch_d: *const f32, + num_samples: usize, + sample_size: usize, + num_qubits: usize, + stream: *mut c_void, + ) -> Result { + unsafe { + AngleEncoder.encode_batch_from_gpu_ptr_f32( + device, + input_batch_d as *const c_void, + num_samples, + sample_size, + num_qubits, + stream, + ) + } + } + #[cfg(target_os = "linux")] fn encode_batch_async_pipeline( device: &Arc, diff --git a/qdp/qdp-core/src/lib.rs b/qdp/qdp-core/src/lib.rs index 3db9accdc1..799eb7b180 100644 --- a/qdp/qdp-core/src/lib.rs +++ b/qdp/qdp-core/src/lib.rs @@ -768,6 +768,78 @@ impl QdpEngine { Ok(batch_state_vector.to_dlpack()) } + /// Encode an angle batch from an existing GPU pointer (float32 input only). + /// + /// Zero-copy batch encoding from CUDA float32 tensors. Uses the default CUDA stream. + /// For stream interop use `encode_angle_batch_from_gpu_ptr_f32_with_stream`. + /// + /// # Safety + /// The input pointer must: + /// - Point to valid GPU memory on the same device as the engine + /// - Contain at least `num_samples * sample_size` f32 elements + /// - Remain valid for the duration of this call + #[cfg(target_os = "linux")] + pub unsafe fn encode_angle_batch_from_gpu_ptr_f32( + &self, + input_batch_d: *const f32, + num_samples: usize, + sample_size: usize, + num_qubits: usize, + ) -> Result<*mut DLManagedTensor> { + unsafe { + self.encode_angle_batch_from_gpu_ptr_f32_with_stream( + input_batch_d, + num_samples, + sample_size, + num_qubits, + std::ptr::null_mut(), + ) + } + } + + /// Encode an angle batch from an existing GPU pointer (float32) on a specified CUDA stream. + /// + /// # Safety + /// In addition to the `encode_angle_batch_from_gpu_ptr_f32` requirements, the stream pointer + /// must remain valid for the duration of this call. + #[cfg(target_os = "linux")] + pub unsafe fn encode_angle_batch_from_gpu_ptr_f32_with_stream( + &self, + input_batch_d: *const f32, + num_samples: usize, + sample_size: usize, + num_qubits: usize, + stream: *mut c_void, + ) -> Result<*mut DLManagedTensor> { + crate::profile_scope!("Mahout::EncodeAngleBatchFromGpuPtrF32"); + + if num_samples == 0 { + return Err(MahoutError::InvalidInput( + "Number of samples cannot be zero".into(), + )); + } + if sample_size == 0 { + return Err(MahoutError::InvalidInput( + "Sample size cannot be zero".into(), + )); + } + + validate_cuda_input_ptr(&self.device, input_batch_d as *const c_void)?; + + let batch_state_vector = unsafe { + gpu::AngleEncoder::encode_batch_from_gpu_ptr_f32_with_stream( + &self.device, + input_batch_d, + num_samples, + sample_size, + num_qubits, + stream, + ) + }?; + let batch_state_vector = batch_state_vector.to_precision(&self.device, self.precision)?; + Ok(batch_state_vector.to_dlpack()) + } + /// Encode batch from existing GPU pointer (zero-copy for CUDA tensors) /// /// This method enables zero-copy batch encoding from PyTorch CUDA tensors. diff --git a/qdp/qdp-core/tests/gpu_angle_encoding.rs b/qdp/qdp-core/tests/gpu_angle_encoding.rs index c66e5eda60..6b60d51537 100644 --- a/qdp/qdp-core/tests/gpu_angle_encoding.rs +++ b/qdp/qdp-core/tests/gpu_angle_encoding.rs @@ -143,3 +143,133 @@ fn test_angle_successful_encoding_from_parquet() { ); } } + +#[test] +fn test_angle_batch_f32_success() { + let Some(engine) = common::qdp_engine_with_precision(qdp_core::Precision::Float32) else { + println!("SKIP: No GPU available"); + return; + }; + + let num_qubits = 3; + let num_samples = 2; + let data = vec![ + 0.0_f32, + std::f32::consts::FRAC_PI_2, + std::f32::consts::FRAC_PI_4, + 0.2_f32, + 0.4_f32, + 0.6_f32, + ]; + + let dlpack_ptr = engine + .encode_batch_f32(&data, num_samples, num_qubits, num_qubits, "angle") + .expect("angle batch encode f32 should succeed"); + + unsafe { + common::assert_dlpack_shape_2d_and_delete(dlpack_ptr, num_samples as i64, 8); + } +} + +#[test] +fn test_angle_batch_f32_rejects_sample_size_mismatch() { + let Some(engine) = common::qdp_engine_with_precision(qdp_core::Precision::Float32) else { + println!("SKIP: No GPU available"); + return; + }; + + let data = vec![0.1_f32, 0.2, 0.3, 0.4]; + let result = engine.encode_batch_f32(&data, 2, 2, 3, "angle"); + + assert!(result.is_err()); + match result { + Err(MahoutError::InvalidInput(msg)) => { + assert!( + msg.contains("sample_size=3") || msg.contains("got 2"), + "msg: {msg}" + ); + } + _ => panic!("expected InvalidInput, got {:?}", result), + } +} + +#[test] +fn test_angle_batch_f32_rejects_nan() { + let Some(engine) = common::qdp_engine_with_precision(qdp_core::Precision::Float32) else { + println!("SKIP: No GPU available"); + return; + }; + + let data = vec![0.0_f32, f32::NAN, 0.2, 0.3]; + let result = engine.encode_batch_f32(&data, 2, 2, 2, "angle"); + + assert!(result.is_err()); + match result { + Err(MahoutError::InvalidInput(msg)) => { + assert!(msg.contains("Sample 0"), "msg: {msg}"); + assert!(msg.contains("angle 1"), "msg: {msg}"); + assert!(msg.contains("finite"), "msg: {msg}"); + } + _ => panic!("expected InvalidInput, got {:?}", result), + } +} + +#[test] +fn test_angle_batch_f32_rejects_infinity() { + let Some(engine) = common::qdp_engine_with_precision(qdp_core::Precision::Float32) else { + println!("SKIP: No GPU available"); + return; + }; + + let data = vec![0.0_f32, f32::INFINITY, 0.2, 0.3]; + let result = engine.encode_batch_f32(&data, 2, 2, 2, "angle"); + + assert!(result.is_err()); + match result { + Err(MahoutError::InvalidInput(msg)) => { + assert!(msg.contains("Sample 0"), "msg: {msg}"); + assert!(msg.contains("angle 1"), "msg: {msg}"); + assert!(msg.contains("finite"), "msg: {msg}"); + } + _ => panic!("expected InvalidInput, got {:?}", result), + } +} + +#[test] +fn test_angle_batch_f32_rejects_zero_samples() { + let Some(engine) = common::qdp_engine_with_precision(qdp_core::Precision::Float32) else { + println!("SKIP: No GPU available"); + return; + }; + + let result = engine.encode_batch_f32(&[], 0, 2, 2, "angle"); + + assert!(result.is_err()); + match result { + Err(MahoutError::InvalidInput(msg)) => { + assert!( + msg.contains("zero") || msg.contains("samples"), + "msg: {msg}" + ); + } + _ => panic!("expected InvalidInput, got {:?}", result), + } +} + +#[test] +fn test_angle_batch_f32_rejects_length_overflow() { + let Some(engine) = common::qdp_engine_with_precision(qdp_core::Precision::Float32) else { + println!("SKIP: No GPU available"); + return; + }; + + let result = engine.encode_batch_f32(&[], usize::MAX, 2, 2, "angle"); + + assert!(result.is_err()); + match result { + Err(MahoutError::InvalidInput(msg)) => { + assert!(msg.contains("overflow"), "msg: {msg}"); + } + _ => panic!("expected InvalidInput, got {:?}", result), + } +} diff --git a/qdp/qdp-core/tests/gpu_ptr_encoding.rs b/qdp/qdp-core/tests/gpu_ptr_encoding.rs index 80a054a693..88c65e752d 100644 --- a/qdp/qdp-core/tests/gpu_ptr_encoding.rs +++ b/qdp/qdp-core/tests/gpu_ptr_encoding.rs @@ -1293,3 +1293,236 @@ fn test_encode_batch_from_gpu_ptr_f32_odd_sample_size_success() { ) }; } + +#[test] +fn test_encode_angle_batch_from_gpu_ptr_f32_success() { + let engine = match engine_f32() { + Some(e) => e, + None => { + println!("SKIP: No GPU"); + return; + } + }; + let num_samples = 2; + let num_qubits = 3; + let (_device, input_d) = match common::copy_f32_to_device(&[ + 0.0, + std::f32::consts::FRAC_PI_2, + std::f32::consts::FRAC_PI_4, + 0.2, + 0.4, + 0.6, + ]) { + Some(t) => t, + None => { + println!("SKIP: No CUDA device"); + return; + } + }; + let dlpack_ptr = unsafe { + engine + .encode_angle_batch_from_gpu_ptr_f32( + *input_d.device_ptr() as *const f32, + num_samples, + num_qubits, + num_qubits, + ) + .expect("encode_angle_batch_from_gpu_ptr_f32") + }; + unsafe { common::assert_dlpack_shape_2d_and_delete(dlpack_ptr, num_samples as i64, 8) }; +} + +#[test] +fn test_encode_angle_batch_from_gpu_ptr_f32_with_stream_success() { + let engine = match engine_f32() { + Some(e) => e, + None => { + println!("SKIP: No GPU"); + return; + } + }; + let (device, input_d) = match common::copy_f32_to_device(&[ + 0.0_f32, + std::f32::consts::FRAC_PI_2, + std::f32::consts::FRAC_PI_4, + 0.2_f32, + 0.4_f32, + 0.6_f32, + ]) { + Some(t) => t, + None => { + println!("SKIP: No CUDA device"); + return; + } + }; + let stream = device.fork_default_stream().expect("fork_default_stream"); + let dlpack_ptr = unsafe { + engine + .encode_angle_batch_from_gpu_ptr_f32_with_stream( + *input_d.device_ptr() as *const f32, + 2, + 3, + 3, + stream.stream as *mut c_void, + ) + .expect("encode_angle_batch_from_gpu_ptr_f32_with_stream") + }; + unsafe { common::assert_dlpack_shape_2d_and_delete(dlpack_ptr, 2, 8) }; +} + +#[test] +fn test_encode_angle_batch_from_gpu_ptr_f32_null_pointer() { + let engine = match engine_f32() { + Some(e) => e, + None => { + println!("SKIP: No GPU"); + return; + } + }; + let result = unsafe { engine.encode_angle_batch_from_gpu_ptr_f32(std::ptr::null(), 2, 2, 2) }; + assert!(result.is_err()); + match &result.unwrap_err() { + MahoutError::InvalidInput(msg) => assert!(msg.contains("null")), + e => panic!("Expected InvalidInput, got {:?}", e), + } +} + +#[test] +fn test_encode_angle_batch_from_gpu_ptr_f32_sample_size_mismatch() { + let engine = match engine_f32() { + Some(e) => e, + None => { + println!("SKIP: No GPU"); + return; + } + }; + let (_device, input_d) = match common::copy_f32_to_device(&[0.0_f32, 0.1, 0.2, 0.3, 0.4, 0.5]) { + Some(t) => t, + None => { + println!("SKIP: No CUDA device"); + return; + } + }; + let result = unsafe { + engine.encode_angle_batch_from_gpu_ptr_f32(*input_d.device_ptr() as *const f32, 2, 2, 3) + }; + assert!(result.is_err()); + match &result.unwrap_err() { + MahoutError::InvalidInput(msg) => { + assert!( + msg.contains("sample_size=3") || msg.contains("got 2"), + "msg: {msg}" + ); + } + e => panic!("Expected InvalidInput, got {:?}", e), + } +} + +#[test] +fn test_encode_angle_batch_from_gpu_ptr_f32_zero_samples() { + let engine = match engine_f32() { + Some(e) => e, + None => { + println!("SKIP: No GPU"); + return; + } + }; + let result = unsafe { engine.encode_angle_batch_from_gpu_ptr_f32(std::ptr::null(), 0, 2, 2) }; + assert!(result.is_err()); + match &result.unwrap_err() { + MahoutError::InvalidInput(msg) => assert!(msg.contains("zero") || msg.contains("samples")), + e => panic!("Expected InvalidInput, got {:?}", e), + } +} + +#[test] +fn test_encode_angle_batch_from_gpu_ptr_f32_non_finite_rejected() { + let engine = match engine_f32() { + Some(e) => e, + None => { + println!("SKIP: No GPU"); + return; + } + }; + let (_device, input_d) = + match common::copy_f32_to_device(&[0.0_f32, f32::NAN, 0.2_f32, 0.3_f32]) { + Some(t) => t, + None => { + println!("SKIP: No CUDA device"); + return; + } + }; + let result = unsafe { + engine.encode_angle_batch_from_gpu_ptr_f32(*input_d.device_ptr() as *const f32, 2, 2, 2) + }; + assert!(result.is_err()); + match &result.unwrap_err() { + MahoutError::InvalidInput(msg) => { + assert!( + msg.contains("non-finite") || msg.contains("NaN"), + "msg: {msg}" + ); + } + e => panic!("Expected InvalidInput, got {:?}", e), + } +} + +#[test] +fn test_encode_angle_batch_from_gpu_ptr_f32_infinity_rejected() { + let engine = match engine_f32() { + Some(e) => e, + None => { + println!("SKIP: No GPU"); + return; + } + }; + let (_device, input_d) = + match common::copy_f32_to_device(&[0.0_f32, f32::INFINITY, 0.2_f32, 0.3_f32]) { + Some(t) => t, + None => { + println!("SKIP: No CUDA device"); + return; + } + }; + let result = unsafe { + engine.encode_angle_batch_from_gpu_ptr_f32(*input_d.device_ptr() as *const f32, 2, 2, 2) + }; + assert!(result.is_err()); + match &result.unwrap_err() { + MahoutError::InvalidInput(msg) => { + assert!( + msg.contains("non-finite") || msg.contains("Inf"), + "msg: {msg}" + ); + } + e => panic!("Expected InvalidInput, got {:?}", e), + } +} + +#[test] +fn test_encode_angle_batch_from_gpu_ptr_f32_success_f64_engine() { + let Some(engine) = common::qdp_engine_with_precision(Precision::Float64) else { + println!("SKIP: No GPU"); + return; + }; + let (_device, input_d) = match common::copy_f32_to_device(&[ + 0.0_f32, + std::f32::consts::FRAC_PI_2, + std::f32::consts::FRAC_PI_4, + 0.2_f32, + 0.4_f32, + 0.6_f32, + ]) { + Some(t) => t, + None => { + println!("SKIP: No CUDA device"); + return; + } + }; + let dlpack_ptr = unsafe { + engine + .encode_angle_batch_from_gpu_ptr_f32(*input_d.device_ptr() as *const f32, 2, 3, 3) + .expect("encode_angle_batch_from_gpu_ptr_f32 (Float64 engine)") + }; + unsafe { common::assert_dlpack_shape_2d_and_delete(dlpack_ptr, 2, 8) }; +} diff --git a/qdp/qdp-kernels/build.rs b/qdp/qdp-kernels/build.rs index 40abf9a1a7..097b57e15f 100644 --- a/qdp/qdp-kernels/build.rs +++ b/qdp/qdp-kernels/build.rs @@ -34,6 +34,7 @@ fn main() { println!("cargo:rerun-if-changed=src/amplitude.cu"); println!("cargo:rerun-if-changed=src/basis.cu"); println!("cargo:rerun-if-changed=src/angle.cu"); + println!("cargo:rerun-if-changed=src/validation.cu"); println!("cargo:rerun-if-changed=src/iqp.cu"); println!("cargo:rerun-if-changed=src/phase.cu"); println!("cargo:rerun-if-env-changed=QDP_NO_CUDA"); @@ -100,6 +101,7 @@ fn main() { .file("src/amplitude.cu") .file("src/basis.cu") .file("src/angle.cu") + .file("src/validation.cu") .file("src/iqp.cu") .file("src/phase.cu") .compile("kernels"); diff --git a/qdp/qdp-kernels/src/angle.cu b/qdp/qdp-kernels/src/angle.cu index 2a2d129689..57990be841 100644 --- a/qdp/qdp-kernels/src/angle.cu +++ b/qdp/qdp-kernels/src/angle.cu @@ -88,6 +88,34 @@ __global__ void angle_encode_batch_kernel( } } +__global__ void angle_encode_batch_kernel_f32( + const float* __restrict__ angles_batch, + cuComplex* __restrict__ state_batch, + size_t num_samples, + size_t state_len, + unsigned int num_qubits +) { + const size_t total_elements = num_samples * state_len; + const size_t stride = gridDim.x * blockDim.x; + const size_t state_mask = state_len - 1; + + for (size_t global_idx = blockIdx.x * blockDim.x + threadIdx.x; + global_idx < total_elements; + global_idx += stride) { + const size_t sample_idx = global_idx >> num_qubits; + const size_t element_idx = global_idx & state_mask; + const float* angles = angles_batch + sample_idx * num_qubits; + + float amplitude = 1.0f; + for (unsigned int bit = 0; bit < num_qubits; ++bit) { + const float angle = angles[bit]; + amplitude *= ((element_idx >> bit) & 1U) ? sinf(angle) : cosf(angle); + } + + state_batch[global_idx] = make_cuComplex(amplitude, 0.0f); + } +} + extern "C" { /// Launch angle encoding kernel @@ -207,4 +235,47 @@ int launch_angle_encode_batch( return (int)cudaGetLastError(); } +/// Launch batch angle encoding kernel for float32 input +/// +/// # Arguments +/// * angles_batch_d - Device pointer to batch angles (num_samples * num_qubits) +/// * state_batch_d - Device pointer to output batch state vectors +/// * num_samples - Number of samples in batch +/// * state_len - State vector size per sample (2^num_qubits) +/// * num_qubits - Number of qubits (angles length) +/// * stream - CUDA stream for async execution +/// +/// # Returns +/// CUDA error code (0 = cudaSuccess) +int launch_angle_encode_batch_f32( + const float* angles_batch_d, + void* state_batch_d, + size_t num_samples, + size_t state_len, + unsigned int num_qubits, + cudaStream_t stream +) { + if (num_samples == 0 || state_len == 0 || num_qubits == 0) { + return cudaErrorInvalidValue; + } + + cuComplex* state_complex_d = static_cast(state_batch_d); + + const int blockSize = DEFAULT_BLOCK_SIZE; + const size_t total_elements = num_samples * state_len; + const size_t blocks_needed = (total_elements + blockSize - 1) / blockSize; + const size_t max_blocks = MAX_GRID_BLOCKS; + const size_t gridSize = (blocks_needed < max_blocks) ? blocks_needed : max_blocks; + + angle_encode_batch_kernel_f32<<>>( + angles_batch_d, + state_complex_d, + num_samples, + state_len, + num_qubits + ); + + return (int)cudaGetLastError(); +} + } // extern "C" diff --git a/qdp/qdp-kernels/src/lib.rs b/qdp/qdp-kernels/src/lib.rs index 4649092863..271039fb12 100644 --- a/qdp/qdp-kernels/src/lib.rs +++ b/qdp/qdp-kernels/src/lib.rs @@ -254,6 +254,32 @@ unsafe extern "C" { stream: *mut c_void, ) -> i32; + /// Launch batch angle encoding kernel for float32 inputs. + /// Returns CUDA error code (0 = success) + /// + /// # Safety + /// Requires valid GPU pointers, must sync before freeing + pub fn launch_angle_encode_batch_f32( + angles_batch_d: *const f32, + state_batch_d: *mut c_void, + num_samples: usize, + state_len: usize, + num_qubits: u32, + stream: *mut c_void, + ) -> i32; + + /// Launch float32 batch finite-value validation. + /// Returns CUDA error code (0 = success) + /// + /// # Safety + /// Requires valid GPU pointers, must sync before reading the output flag. + pub fn launch_check_finite_batch_f32( + input_batch_d: *const f32, + total_angles: usize, + has_non_finite_d: *mut i32, + stream: *mut c_void, + ) -> i32; + /// Launch IQP encoding kernel /// Returns CUDA error code (0 = success) /// @@ -496,6 +522,30 @@ pub extern "C" fn launch_angle_encode_batch( 999 } +#[cfg(any(not(target_os = "linux"), qdp_no_cuda))] +#[unsafe(no_mangle)] +pub extern "C" fn launch_angle_encode_batch_f32( + _angles_batch_d: *const f32, + _state_batch_d: *mut c_void, + _num_samples: usize, + _state_len: usize, + _num_qubits: u32, + _stream: *mut c_void, +) -> i32 { + 999 +} + +#[cfg(any(not(target_os = "linux"), qdp_no_cuda))] +#[unsafe(no_mangle)] +pub extern "C" fn launch_check_finite_batch_f32( + _input_batch_d: *const f32, + _total_angles: usize, + _has_non_finite_d: *mut i32, + _stream: *mut c_void, +) -> i32 { + 999 +} + #[cfg(any(not(target_os = "linux"), qdp_no_cuda))] #[unsafe(no_mangle)] pub extern "C" fn launch_iqp_encode( diff --git a/qdp/qdp-kernels/src/validation.cu b/qdp/qdp-kernels/src/validation.cu new file mode 100644 index 0000000000..e38b65665d --- /dev/null +++ b/qdp/qdp-kernels/src/validation.cu @@ -0,0 +1,82 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Shared validation CUDA kernels. + +#include +#include +#include "kernel_config.h" + +__global__ void check_finite_batch_kernel_f32( + const float* __restrict__ input_batch, + size_t total_values, + int* __restrict__ has_non_finite +) { + const size_t stride = gridDim.x * blockDim.x; + for (size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + idx < total_values; + idx += stride) { + if (!isfinite(input_batch[idx])) { + atomicExch(has_non_finite, 1); + return; + } + } +} + +extern "C" { + +/// Launch batch finite-value validation for float32 input. +/// +/// Writes 1 to `has_non_finite_d` if any NaN/Inf is found, else leaves it at 0. +int launch_check_finite_batch_f32( + const float* input_batch_d, + size_t total_values, + int* has_non_finite_d, + cudaStream_t stream +) { + if (total_values == 0 || has_non_finite_d == nullptr) { + return cudaErrorInvalidValue; + } + + cudaError_t memset_status = cudaMemsetAsync( + has_non_finite_d, + 0, + sizeof(int), + stream + ); + if (memset_status != cudaSuccess) { + return memset_status; + } + + const int blockSize = DEFAULT_BLOCK_SIZE; + size_t gridSize = (total_values + blockSize - 1) / blockSize; + if (gridSize == 0) { + gridSize = 1; + } + if (gridSize > MAX_GRID_BLOCKS) { + gridSize = MAX_GRID_BLOCKS; + } + + check_finite_batch_kernel_f32<<>>( + input_batch_d, + total_values, + has_non_finite_d + ); + + return (int)cudaGetLastError(); +} + +} // extern "C" diff --git a/qdp/qdp-kernels/tests/angle_encode.rs b/qdp/qdp-kernels/tests/angle_encode.rs index f42b3cd50a..a4af609af1 100644 --- a/qdp/qdp-kernels/tests/angle_encode.rs +++ b/qdp/qdp-kernels/tests/angle_encode.rs @@ -21,10 +21,25 @@ #[cfg(target_os = "linux")] use cudarc::driver::{CudaDevice, DevicePtr, DevicePtrMut}; #[cfg(target_os = "linux")] -use qdp_kernels::{CuComplex, launch_angle_encode_f32}; +use qdp_kernels::{ + CuComplex, launch_angle_encode_batch_f32, launch_angle_encode_f32, + launch_check_finite_batch_f32, +}; const EPSILON_F32: f32 = 1e-5; +#[cfg(target_os = "linux")] +fn expected_amplitude_f32(angles: &[f32], basis_idx: usize) -> f32 { + angles.iter().enumerate().fold(1.0f32, |acc, (bit, angle)| { + let factor = if ((basis_idx >> bit) & 1) == 1 { + angle.sin() + } else { + angle.cos() + }; + acc * factor + }) +} + #[test] #[cfg(target_os = "linux")] fn test_angle_encode_basic_f32() { @@ -147,3 +162,237 @@ fn test_angle_encode_f32_rejects_zero_qubits() { assert_ne!(result, 0, "Zero-qubit launch should fail"); } + +#[test] +#[cfg(target_os = "linux")] +fn test_angle_encode_batch_f32_matches_expected_product_states() { + let device = match CudaDevice::new(0) { + Ok(d) => d, + Err(_) => { + println!("SKIP: No CUDA device available"); + return; + } + }; + + let num_qubits = 3usize; + let num_samples = 2usize; + let state_len = 1usize << num_qubits; + let angles = vec![ + 0.0_f32, + std::f32::consts::FRAC_PI_2, + std::f32::consts::FRAC_PI_4, + 0.2_f32, + 0.4_f32, + 0.6_f32, + ]; + + let input_d = device.htod_copy(angles.clone()).unwrap(); + let mut state_d = device + .alloc_zeros::(num_samples * state_len) + .unwrap(); + + let result = unsafe { + launch_angle_encode_batch_f32( + *input_d.device_ptr() as *const f32, + *state_d.device_ptr_mut() as *mut std::ffi::c_void, + num_samples, + state_len, + num_qubits as u32, + std::ptr::null_mut(), + ) + }; + assert_eq!(result, 0, "Batch kernel launch should succeed"); + + let state_h = device.dtoh_sync_copy(&state_d).unwrap(); + for sample_idx in 0..num_samples { + let sample_angles = &angles[sample_idx * num_qubits..(sample_idx + 1) * num_qubits]; + for basis_idx in 0..state_len { + let actual = state_h[sample_idx * state_len + basis_idx]; + let expected = expected_amplitude_f32(sample_angles, basis_idx); + assert!( + (actual.x - expected).abs() < EPSILON_F32, + "sample {sample_idx} basis {basis_idx} expected {expected}, got {}", + actual.x + ); + assert!( + actual.y.abs() < EPSILON_F32, + "sample {sample_idx} basis {basis_idx} imaginary expected 0, got {}", + actual.y + ); + } + } +} + +#[test] +#[cfg(target_os = "linux")] +fn test_angle_encode_batch_f32_rejects_zero_samples() { + let device = match CudaDevice::new(0) { + Ok(d) => d, + Err(_) => { + println!("SKIP: No CUDA device available"); + return; + } + }; + + let input_d = device.htod_copy(vec![0.0_f32, 1.0_f32]).unwrap(); + let mut state_d = device.alloc_zeros::(4).unwrap(); + + let result = unsafe { + launch_angle_encode_batch_f32( + *input_d.device_ptr() as *const f32, + *state_d.device_ptr_mut() as *mut std::ffi::c_void, + 0, + 4, + 2, + std::ptr::null_mut(), + ) + }; + + assert_ne!(result, 0, "Zero-sample batch launch should fail"); +} + +#[test] +#[cfg(target_os = "linux")] +fn test_check_finite_batch_f32_reports_non_finite() { + let device = match CudaDevice::new(0) { + Ok(d) => d, + Err(_) => { + println!("SKIP: No CUDA device available"); + return; + } + }; + + let input_d = device + .htod_copy(vec![0.0_f32, f32::INFINITY, 0.2_f32, 0.3_f32]) + .unwrap(); + let mut status_d = device.alloc_zeros::(1).unwrap(); + + let result = unsafe { + launch_check_finite_batch_f32( + *input_d.device_ptr() as *const f32, + 4, + *status_d.device_ptr_mut() as *mut i32, + std::ptr::null_mut(), + ) + }; + assert_eq!(result, 0, "Finite-check launch should succeed"); + + let status_h = device.dtoh_sync_copy(&status_d).unwrap(); + assert_eq!(status_h, vec![1], "Expected non-finite flag to be set"); +} + +#[test] +#[cfg(target_os = "linux")] +fn test_check_finite_batch_f32_reports_nan() { + let device = match CudaDevice::new(0) { + Ok(d) => d, + Err(_) => { + println!("SKIP: No CUDA device available"); + return; + } + }; + + let input_d = device + .htod_copy(vec![0.0_f32, f32::NAN, 0.2_f32, 0.3_f32]) + .unwrap(); + let mut status_d = device.alloc_zeros::(1).unwrap(); + + let result = unsafe { + launch_check_finite_batch_f32( + *input_d.device_ptr() as *const f32, + 4, + *status_d.device_ptr_mut() as *mut i32, + std::ptr::null_mut(), + ) + }; + assert_eq!(result, 0, "Finite-check launch should succeed"); + + let status_h = device.dtoh_sync_copy(&status_d).unwrap(); + assert_eq!(status_h, vec![1], "Expected NaN flag to be set"); +} + +#[test] +#[cfg(target_os = "linux")] +fn test_check_finite_batch_f32_all_finite_stays_clear() { + let device = match CudaDevice::new(0) { + Ok(d) => d, + Err(_) => { + println!("SKIP: No CUDA device available"); + return; + } + }; + + let input_d = device + .htod_copy(vec![0.0_f32, 0.1_f32, 0.2_f32, 0.3_f32]) + .unwrap(); + let mut status_d = device.alloc_zeros::(1).unwrap(); + + let result = unsafe { + launch_check_finite_batch_f32( + *input_d.device_ptr() as *const f32, + 4, + *status_d.device_ptr_mut() as *mut i32, + std::ptr::null_mut(), + ) + }; + assert_eq!(result, 0, "Finite-check launch should succeed"); + + let status_h = device.dtoh_sync_copy(&status_d).unwrap(); + assert_eq!(status_h, vec![0], "Expected finite flag to remain clear"); +} + +#[test] +#[cfg(target_os = "linux")] +fn test_angle_encode_batch_f32_rejects_zero_state_len() { + let device = match CudaDevice::new(0) { + Ok(d) => d, + Err(_) => { + println!("SKIP: No CUDA device available"); + return; + } + }; + + let input_d = device.htod_copy(vec![0.0_f32, 1.0_f32]).unwrap(); + let mut state_d = device.alloc_zeros::(1).unwrap(); + + let result = unsafe { + launch_angle_encode_batch_f32( + *input_d.device_ptr() as *const f32, + *state_d.device_ptr_mut() as *mut std::ffi::c_void, + 1, + 0, + 1, + std::ptr::null_mut(), + ) + }; + + assert_ne!(result, 0, "Zero state-len batch launch should fail"); +} + +#[test] +#[cfg(target_os = "linux")] +fn test_angle_encode_batch_f32_rejects_zero_qubits() { + let device = match CudaDevice::new(0) { + Ok(d) => d, + Err(_) => { + println!("SKIP: No CUDA device available"); + return; + } + }; + + let input_d = device.htod_copy(vec![0.0_f32, 1.0_f32]).unwrap(); + let mut state_d = device.alloc_zeros::(1).unwrap(); + + let result = unsafe { + launch_angle_encode_batch_f32( + *input_d.device_ptr() as *const f32, + *state_d.device_ptr_mut() as *mut std::ffi::c_void, + 1, + 1, + 0, + std::ptr::null_mut(), + ) + }; + + assert_ne!(result, 0, "Zero-qubit batch launch should fail"); +}