diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6c1d4aa..b61dbe1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -52,6 +52,42 @@ jobs: - uses: julia-actions/julia-buildpkg@v1 - uses: julia-actions/julia-docdeploy@v1 env: - GKSwstype: nul # Fix for Plots with GR backend. + GKSwstype: nul GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }} + gpu-tests: + name: GPU Tests - Julia ${{ matrix.version }} + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + version: ['1.10'] + steps: + - uses: actions/checkout@v5 + - uses: julia-actions/setup-julia@v1 + with: + version: ${{ matrix.version }} + arch: x64 + - uses: julia-actions/cache@v2 + - uses: julia-actions/julia-buildpkg@v1 + - uses: julia-actions/julia-runtest@v1 + env: + CUDA_TEST: true + JULIA_NUM_THREADS: 2 + static-analysis: + name: Static Analysis + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v5 + - uses: julia-actions/setup-julia@v1 + with: + version: '1.10' + arch: x64 + - uses: julia-actions/cache@v2 + - uses: julia-actions/julia-buildpkg@v1 + - name: Add JET.jl + run: julia -e 'using Pkg; Pkg.add("JET")' + - uses: julia-actions/julia-runtest@v1 + env: + JET_TEST: true + JULIA_NUM_THREADS: 2 \ No newline at end of file diff --git a/Project.toml b/Project.toml index 482cc75..e6c5426 100644 --- a/Project.toml +++ b/Project.toml @@ -12,12 +12,15 @@ SymplecticMatrices = "d07eab47-f98b-4620-8fe1-ee63e153d4ef" [weakdeps] Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" [extensions] +CUDAExt = "CUDA" MakieExt = "Makie" StaticArraysExt = "StaticArrays" [compat] +CUDA = "5" LinearAlgebra = "1.9" Makie = "0.21, 0.22, 0.23, 0.24" QuantumInterface = "0.3.7, 0.3.8, 0.4.1" diff --git a/ext/CUDAExt/CUDAExt.jl b/ext/CUDAExt/CUDAExt.jl new file mode 100644 index 0000000..3ab1191 --- /dev/null +++ b/ext/CUDAExt/CUDAExt.jl @@ -0,0 +1,234 @@ +module CUDAExt +using CUDA +using CUDA: CuArray, CuVector, CuMatrix, @cuda, threadIdx, blockIdx, blockDim, gridDim +using Gabs +using Gabs: SymplecticBasis, QuadPairBasis, QuadBlockBasis, GaussianState, GaussianUnitary, GaussianChannel, GaussianLinearCombination, _promote_output_vector, _promote_output_matrix, _vacuumstate, _coherentstate, _squeezedstate, _thermalstate, _eprstate, _displace, _squeeze, _twosqueeze, _phaseshift, _beamsplitter, _attenuator, _amplifier, symplecticform, WIGNER_ERROR, ACTION_ERROR, HBAR_ERROR, INDEX_ERROR, SYMPLECTIC_ERROR, cross_wigner, cross_wignerchar, vacuumstate, randstate, randunitary, randchannel, randsymplectic +using LinearAlgebra +using LinearAlgebra: I, det, mul!, eigvals, Diagonal, logdet, dot, inv, diag, cholesky, Symmetric, qr, Hermitian +using Random +using Random: randn! +import Gabs: purity, entropy_vn, vacuumstate, changebasis, randstate, randunitary, randchannel, randsymplectic, ptrace, cross_wigner, cross_wignerchar, wigner, wignerchar, catstate_even, catstate_odd, catstate, gkpstate, _promote_output_vector, _promote_output_matrix, _is_gpu_array, _detect_device, coherentstate, squeezedstate, thermalstate, eprstate, displace, squeeze, twosqueeze, phaseshift, beamsplitter, attenuator, amplifier + + +const CUDA_AVAILABLE = CUDA.functional() + +function __init__() + if CUDA_AVAILABLE + @info "CUDA.jl extension loaded successfully. GPU acceleration enabled." + else + @warn "CUDA not available. GPU operations will fall back to CPU." + end +end + +function Gabs._gpu_impl(state::GaussianState, precision) + gpu_mean = CuArray{precision}(state.mean) + gpu_covar = CuArray{precision}(state.covar) + return GaussianState(state.basis, gpu_mean, gpu_covar; ħ = state.ħ) +end + +function Gabs._gpu_impl(op::GaussianUnitary, precision) + gpu_disp = CuArray{precision}(op.disp) + gpu_symplectic = CuArray{precision}(op.symplectic) + return GaussianUnitary(op.basis, gpu_disp, gpu_symplectic; ħ = op.ħ) +end + +function Gabs._gpu_impl(op::GaussianChannel, precision) + gpu_disp = CuArray{precision}(op.disp) + gpu_transform = CuArray{precision}(op.transform) + gpu_noise = CuArray{precision}(op.noise) + return GaussianChannel(op.basis, gpu_disp, gpu_transform, gpu_noise; ħ = op.ħ) +end + +function Gabs._gpu_impl(lc::GaussianLinearCombination, precision) + cpu_coeffs = Vector{precision}(Array(lc.coeffs)) + gpu_states = [Gabs._gpu_impl(state, precision) for state in lc.states] + return GaussianLinearCombination(lc.basis, cpu_coeffs, gpu_states) +end + +function Gabs._gpu_impl(x::AbstractArray, precision) + return CuArray{precision}(x) +end + +function Gabs._detect_device(x::CuArray) + return :gpu +end + +function Gabs._randstate_gpu_impl(basis::SymplecticBasis, precision; pure=false, ħ=2) + cpu_state = randstate(basis; pure=pure, ħ=ħ) + return Gabs._gpu_impl(cpu_state, precision) +end + +function Gabs._randunitary_gpu_impl(basis::SymplecticBasis, precision; passive=false, ħ=2) + cpu_unitary = randunitary(basis; passive=passive, ħ=ħ) + return Gabs._gpu_impl(cpu_unitary, precision) +end + +function Gabs._randchannel_gpu_impl(basis::SymplecticBasis, precision; ħ=2) + cpu_channel = randchannel(basis; ħ=ħ) + return Gabs._gpu_impl(cpu_channel, precision) +end + +function Gabs._randsymplectic_gpu_impl(basis::SymplecticBasis, precision; passive=false) + cpu_symplectic = randsymplectic(basis; passive=passive) + return CuArray{precision}(cpu_symplectic) +end + +function Gabs._batch_randstate_gpu_impl(basis::SymplecticBasis, n::Int, precision; pure=false, ħ=2) + cpu_states = [randstate(basis; pure=pure, ħ=ħ) for _ in 1:n] + return [Gabs._gpu_impl(state, precision) for state in cpu_states] +end + +function Gabs._batch_randunitary_gpu_impl(basis::SymplecticBasis, n::Int, precision; passive=false, ħ=2) + cpu_unitaries = [randunitary(basis; passive=passive, ħ=ħ) for _ in 1:n] + return [Gabs._gpu_impl(unitary, precision) for unitary in cpu_unitaries] +end + +function _extract_gpu_scalar(x::CuArray{T}) where T + if ndims(x) == 0 + return T(Array(x)[]) + elseif length(x) == 1 + return T(Array(x)[1]) + else + return Vector{T}(Array(x)) + end +end + +function Gabs.coherentstate(basis::SymplecticBasis{N}, alpha::CuArray; ħ=2) where {N<:Int} + T = real(eltype(alpha)) + alpha_val = _extract_gpu_scalar(alpha) + return coherentstate(CuVector{T}, CuMatrix{T}, basis, alpha_val; ħ=ħ) +end + +function Gabs.squeezedstate(basis::SymplecticBasis{N}, r::CuArray, theta; ħ=2) where {N<:Int} + T = eltype(r) + r_val = _extract_gpu_scalar(r) + return squeezedstate(CuVector{T}, CuMatrix{T}, basis, r_val, theta; ħ=ħ) +end + +function Gabs.thermalstate(basis::SymplecticBasis{N}, photons::CuArray; ħ=2) where {N<:Int} + T = eltype(photons) + photons_val = _extract_gpu_scalar(photons) + return thermalstate(CuVector{T}, CuMatrix{T}, basis, photons_val; ħ=ħ) +end + +function Gabs.eprstate(basis::SymplecticBasis{N}, r::CuArray, theta; ħ=2) where {N<:Int} + T = eltype(r) + r_val = _extract_gpu_scalar(r) + return eprstate(CuVector{T}, CuMatrix{T}, basis, r_val, theta; ħ=ħ) +end + +function Gabs.displace(basis::SymplecticBasis{N}, alpha::CuArray; ħ=2) where {N<:Int} + T = real(eltype(alpha)) + alpha_val = _extract_gpu_scalar(alpha) + return displace(CuVector{T}, CuMatrix{T}, basis, alpha_val; ħ=ħ) +end + +function Gabs.squeeze(basis::SymplecticBasis{N}, r::CuArray, theta; ħ=2) where {N<:Int} + T = eltype(r) + r_val = _extract_gpu_scalar(r) + return squeeze(CuVector{T}, CuMatrix{T}, basis, r_val, theta; ħ=ħ) +end + +function Gabs.twosqueeze(basis::SymplecticBasis{N}, r::CuArray, theta; ħ=2) where {N<:Int} + T = eltype(r) + r_val = _extract_gpu_scalar(r) + return twosqueeze(CuVector{T}, CuMatrix{T}, basis, r_val, theta; ħ=ħ) +end + +function Gabs.phaseshift(basis::SymplecticBasis{N}, theta::CuArray; ħ=2) where {N<:Int} + T = eltype(theta) + theta_val = _extract_gpu_scalar(theta) + return phaseshift(CuVector{T}, CuMatrix{T}, basis, theta_val; ħ=ħ) +end + +function Gabs.beamsplitter(basis::SymplecticBasis{N}, transmit::CuArray; ħ=2) where {N<:Int} + T = eltype(transmit) + transmit_val = _extract_gpu_scalar(transmit) + return beamsplitter(CuVector{T}, CuMatrix{T}, basis, transmit_val; ħ=ħ) +end + +function Gabs.attenuator(basis::SymplecticBasis{N}, theta::CuArray, n; ħ=2) where {N<:Int} + T = eltype(theta) + theta_val = _extract_gpu_scalar(theta) + return attenuator(CuVector{T}, CuMatrix{T}, basis, theta_val, n; ħ=ħ) +end + +function Gabs.amplifier(basis::SymplecticBasis{N}, r::CuArray, n; ħ=2) where {N<:Int} + T = eltype(r) + r_val = _extract_gpu_scalar(r) + return amplifier(CuVector{T}, CuMatrix{T}, basis, r_val, n; ħ=ħ) +end + +function Gabs.tensor(state1::GaussianState{B,M1,V1}, state2::GaussianState{B,M2,V2}) where {B<:SymplecticBasis, M1<:CuArray, V1<:CuArray, M2<:CuArray, V2<:CuArray} + T = promote_type(eltype(M1), eltype(M2)) + return tensor(CuVector{T}, CuMatrix{T}, state1, state2) +end + +function Gabs.tensor(::Type{T}, op1::GaussianUnitary{B,D1,S1}, op2::GaussianUnitary{B,D2,S2}) where { + T<:CuVector, B<:SymplecticBasis, D1<:CuArray, S1<:CuArray, D2<:CuArray, S2<:CuArray} + return tensor(T, CuMatrix{eltype(T)}, op1, op2) +end + +function Gabs.tensor(op1::GaussianUnitary{B,D1,S1}, op2::GaussianUnitary{B,D2,S2}) where { + B<:SymplecticBasis, D1<:CuArray, S1<:CuArray, D2<:CuArray, S2<:CuArray} + T = promote_type(eltype(D1), eltype(D2)) + return tensor(CuVector{T}, CuMatrix{T}, op1, op2) +end + +function Gabs.tensor(::Type{T}, op1::GaussianChannel{B,D1,T1}, op2::GaussianChannel{B,D2,T2}) where { + T<:CuVector, B<:SymplecticBasis, D1<:CuArray, T1<:CuArray, D2<:CuArray, T2<:CuArray} + return tensor(T, CuMatrix{eltype(T)}, op1, op2) +end + +function Gabs.tensor(op1::GaussianChannel{B,D1,T1}, op2::GaussianChannel{B,D2,T2}) where { + B<:SymplecticBasis, D1<:CuArray, T1<:CuArray, D2<:CuArray, T2<:CuArray} + T = promote_type(eltype(D1), eltype(D2)) + return tensor(CuVector{T}, CuMatrix{T}, op1, op2) +end + +function Gabs._adapt_device_gpu(target_constructor, source_obj, args...) + if source_obj isa GaussianState + T = real(eltype(source_obj.mean)) + elseif source_obj isa GaussianUnitary + T = real(eltype(source_obj.disp)) + elseif source_obj isa GaussianChannel + T = real(eltype(source_obj.disp)) + elseif source_obj isa GaussianLinearCombination && !isempty(source_obj.states) + T = real(eltype(source_obj.states[1].mean)) + elseif source_obj isa AbstractArray + T = real(eltype(source_obj)) + else + T = Float32 + end + cpu_obj = target_constructor(args...) + return Gabs._gpu_impl(cpu_obj, T) +end + +function Gabs.isgaussian(state::GaussianState{B,M,V}; atol::R1 = 0, rtol::R2 = atol) where {B<:SymplecticBasis, M<:CuArray, V<:CuArray, R1<:Real, R2<:Real} + cpu_state = GaussianState(state.basis, Array(state.mean), Array(state.covar); ħ = state.ħ) + return isgaussian(cpu_state; atol = atol, rtol = rtol) +end + +function Gabs.isgaussian(op::GaussianUnitary{B,D,S}; atol::R1 = 0, rtol::R2 = atol) where {B<:SymplecticBasis, D<:CuArray, S<:CuArray, R1<:Real, R2<:Real} + return issymplectic(op.basis, Array(op.symplectic); atol = atol, rtol = rtol) +end + +function Gabs.isgaussian(op::GaussianChannel{B,D,T}; atol::R1 = 0, rtol::R2 = atol) where {B<:SymplecticBasis, D<:CuArray, T<:CuArray, R1<:Real, R2<:Real} + cpu_op = GaussianChannel(op.basis, Array(op.disp), Array(op.transform), Array(op.noise); ħ = op.ħ) + return isgaussian(cpu_op; atol = atol, rtol = rtol) +end + +function Gabs.issymplectic(basis::SymplecticBasis, x::CuMatrix; atol::R1 = 0, rtol::R2 = atol) where {R1<:Real, R2<:Real} + return issymplectic(basis, Array(x); atol = atol, rtol = rtol) +end + +include("utils.jl") +include("state_operations.jl") +include("unitary_operations.jl") +include("wigner_kernels.jl") +include("cross_wigner_kernels.jl") +include("interference_wigner.jl") +include("linear_combinations.jl") +include("basis_operations.jl") +include("ptrace_operations.jl") +end \ No newline at end of file diff --git a/ext/CUDAExt/basis_operations.jl b/ext/CUDAExt/basis_operations.jl new file mode 100644 index 0000000..7efe64e --- /dev/null +++ b/ext/CUDAExt/basis_operations.jl @@ -0,0 +1,209 @@ +""" + _create_pair_to_block_permutation(nmodes::Int, ::Type{T}) where T + +Create GPU permutation indices for QuadPairBasis -> QuadBlockBasis conversion. +QuadPair: [x1,p1,x2,p2,...] -> QuadBlock: [x1,x2,...,p1,p2,...] +""" +function _create_pair_to_block_permutation(nmodes::Int, ::Type{T}) where T + x_indices = collect(1:2:2*nmodes) + p_indices = collect(2:2:2*nmodes) + perm_indices = vcat(x_indices, p_indices) + return CuArray{Int}(perm_indices) +end + +""" + _create_block_to_pair_permutation(nmodes::Int, ::Type{T}) where T + +Create GPU permutation indices for QuadBlockBasis -> QuadPairBasis conversion. +QuadBlock: [x1,x2,...,p1,p2,...] -> QuadPair: [x1,p1,x2,p2,...] +""" +function _create_block_to_pair_permutation(nmodes::Int, ::Type{T}) where T + perm_indices = Vector{Int}(undef, 2*nmodes) + for i in 1:nmodes + perm_indices[2*i-1] = i + perm_indices[2*i] = i + nmodes + end + return CuArray{Int}(perm_indices) +end + +""" + _convert_mean_pair_to_block!(mean_out::CuVector{T}, mean_in::CuVector{T}, nmodes::Int) where T + +Convert mean vector from QuadPairBasis to QuadBlockBasis using vectorized GPU operations. +""" +function _convert_mean_pair_to_block!(mean_out::CuVector{T}, mean_in::CuVector{T}, nmodes::Int) where T + x_view = @view mean_in[1:2:2*nmodes] + p_view = @view mean_in[2:2:2*nmodes] + @views mean_out[1:nmodes] .= x_view + @views mean_out[nmodes+1:2*nmodes] .= p_view + return mean_out +end + +""" + _convert_mean_block_to_pair!(mean_out::CuVector{T}, mean_in::CuVector{T}, nmodes::Int) where T + +Convert mean vector from QuadBlockBasis to QuadPairBasis using vectorized GPU operations. +""" +function _convert_mean_block_to_pair!(mean_out::CuVector{T}, mean_in::CuVector{T}, nmodes::Int) where T + x_view = @view mean_in[1:nmodes] + p_view = @view mean_in[nmodes+1:2*nmodes] + @views mean_out[1:2:2*nmodes] .= x_view + @views mean_out[2:2:2*nmodes] .= p_view + return mean_out +end + +""" + _convert_matrix_with_permutation(matrix::CuMatrix{T}, perm_indices::CuVector{Int}) where T + +Apply permutation to both rows and columns of a matrix using GPU operations. +""" +function _convert_matrix_with_permutation(matrix::CuMatrix{T}, perm_indices::CuVector{Int}) where T + return matrix[perm_indices, perm_indices] +end + +""" + changebasis(::Type{QuadBlockBasis}, state::GaussianState{QuadPairBasis, CuArray, CuArray}) + +Convert GaussianState from QuadPairBasis to QuadBlockBasis on GPU. +""" +function changebasis(::Type{B1}, state::GaussianState{B2,M,V}) where {B1<:QuadBlockBasis,B2<:QuadPairBasis,M<:CuArray,V<:CuArray} + if !CUDA_AVAILABLE + gpu_fallback_warning() + cpu_state = GaussianState(state.basis, Array(state.mean), Array(state.covar); ħ=state.ħ) + cpu_result = changebasis(B1, cpu_state) + return GaussianState(cpu_result.basis, CuArray(cpu_result.mean), CuArray(cpu_result.covar); ħ=cpu_result.ħ) + end + nmodes = state.basis.nmodes + T = eltype(M) + mean_new = CUDA.zeros(T, 2*nmodes) + covar_new = CUDA.zeros(T, 2*nmodes, 2*nmodes) + _convert_mean_pair_to_block!(mean_new, state.mean, nmodes) + perm_indices = _create_pair_to_block_permutation(nmodes, T) + covar_new = _convert_matrix_with_permutation(state.covar, perm_indices) + return GaussianState(B1(nmodes), mean_new, covar_new; ħ = state.ħ) +end + +""" + changebasis(::Type{QuadPairBasis}, state::GaussianState{QuadBlockBasis, CuArray, CuArray}) + +Convert GaussianState from QuadBlockBasis to QuadPairBasis on GPU. +""" +function changebasis(::Type{B1}, state::GaussianState{B2,M,V}) where {B1<:QuadPairBasis,B2<:QuadBlockBasis,M<:CuArray,V<:CuArray} + if !CUDA_AVAILABLE + gpu_fallback_warning() + cpu_state = GaussianState(state.basis, Array(state.mean), Array(state.covar); ħ=state.ħ) + cpu_result = changebasis(B1, cpu_state) + return GaussianState(cpu_result.basis, CuArray(cpu_result.mean), CuArray(cpu_result.covar); ħ=cpu_result.ħ) + end + nmodes = state.basis.nmodes + T = eltype(M) + mean_new = CUDA.zeros(T, 2*nmodes) + covar_new = CUDA.zeros(T, 2*nmodes, 2*nmodes) + _convert_mean_block_to_pair!(mean_new, state.mean, nmodes) + perm_indices = _create_block_to_pair_permutation(nmodes, T) + covar_new = _convert_matrix_with_permutation(state.covar, perm_indices) + return GaussianState(B1(nmodes), mean_new, covar_new; ħ = state.ħ) +end + +changebasis(::Type{<:QuadBlockBasis}, state::GaussianState{<:QuadBlockBasis,<:CuArray,<:CuArray}) = state +changebasis(::Type{<:QuadPairBasis}, state::GaussianState{<:QuadPairBasis,<:CuArray,<:CuArray}) = state + +""" + changebasis(::Type{QuadBlockBasis}, op::GaussianUnitary{QuadPairBasis, CuArray, CuArray}) + +Convert GaussianUnitary from QuadPairBasis to QuadBlockBasis on GPU. +""" +function changebasis(::Type{B1}, op::GaussianUnitary{B2,D,S}) where {B1<:QuadBlockBasis,B2<:QuadPairBasis,D<:CuArray,S<:CuArray} + if !CUDA_AVAILABLE + gpu_fallback_warning() + cpu_op = GaussianUnitary(op.basis, Array(op.disp), Array(op.symplectic); ħ=op.ħ) + cpu_result = changebasis(B1, cpu_op) + return GaussianUnitary(cpu_result.basis, CuArray(cpu_result.disp), CuArray(cpu_result.symplectic); ħ=cpu_result.ħ) + end + + nmodes = op.basis.nmodes + T = eltype(D) + disp_new = CUDA.zeros(T, 2*nmodes) + symp_new = CUDA.zeros(T, 2*nmodes, 2*nmodes) + _convert_mean_pair_to_block!(disp_new, op.disp, nmodes) + perm_indices = _create_pair_to_block_permutation(nmodes, T) + symp_new = _convert_matrix_with_permutation(op.symplectic, perm_indices) + return GaussianUnitary(B1(nmodes), disp_new, symp_new; ħ = op.ħ) +end + +""" + changebasis(::Type{QuadPairBasis}, op::GaussianUnitary{QuadBlockBasis, CuArray, CuArray}) + +Convert GaussianUnitary from QuadBlockBasis to QuadPairBasis on GPU. +""" +function changebasis(::Type{B1}, op::GaussianUnitary{B2,D,S}) where {B1<:QuadPairBasis,B2<:QuadBlockBasis,D<:CuArray,S<:CuArray} + if !CUDA_AVAILABLE + gpu_fallback_warning() + cpu_op = GaussianUnitary(op.basis, Array(op.disp), Array(op.symplectic); ħ=op.ħ) + cpu_result = changebasis(B1, cpu_op) + return GaussianUnitary(cpu_result.basis, CuArray(cpu_result.disp), CuArray(cpu_result.symplectic); ħ=cpu_result.ħ) + end + nmodes = op.basis.nmodes + T = eltype(D) + disp_new = CUDA.zeros(T, 2*nmodes) + symp_new = CUDA.zeros(T, 2*nmodes, 2*nmodes) + _convert_mean_block_to_pair!(disp_new, op.disp, nmodes) + perm_indices = _create_block_to_pair_permutation(nmodes, T) + symp_new = _convert_matrix_with_permutation(op.symplectic, perm_indices) + return GaussianUnitary(B1(nmodes), disp_new, symp_new; ħ = op.ħ) +end + +changebasis(::Type{<:QuadBlockBasis}, op::GaussianUnitary{<:QuadBlockBasis,<:CuArray,<:CuArray}) = op +changebasis(::Type{<:QuadPairBasis}, op::GaussianUnitary{<:QuadPairBasis,<:CuArray,<:CuArray}) = op + +""" + changebasis(::Type{QuadBlockBasis}, op::GaussianChannel{QuadPairBasis, CuArray, CuArray}) + +Convert GaussianChannel from QuadPairBasis to QuadBlockBasis on GPU. +""" +function changebasis(::Type{B1}, op::GaussianChannel{B2,D,T}) where {B1<:QuadBlockBasis,B2<:QuadPairBasis,D<:CuArray,T<:CuArray} + if !CUDA_AVAILABLE + gpu_fallback_warning() + cpu_op = GaussianChannel(op.basis, Array(op.disp), Array(op.transform), Array(op.noise); ħ=op.ħ) + cpu_result = changebasis(B1, cpu_op) + return GaussianChannel(cpu_result.basis, CuArray(cpu_result.disp), + CuArray(cpu_result.transform), CuArray(cpu_result.noise); ħ=cpu_result.ħ) + end + nmodes = op.basis.nmodes + Td = eltype(D) + disp_new = CUDA.zeros(Td, 2*nmodes) + transform_new = CUDA.zeros(Td, 2*nmodes, 2*nmodes) + noise_new = CUDA.zeros(Td, 2*nmodes, 2*nmodes) + _convert_mean_pair_to_block!(disp_new, op.disp, nmodes) + perm_indices = _create_pair_to_block_permutation(nmodes, Td) + transform_new = _convert_matrix_with_permutation(op.transform, perm_indices) + noise_new = _convert_matrix_with_permutation(op.noise, perm_indices) + return GaussianChannel(B1(nmodes), disp_new, transform_new, noise_new; ħ = op.ħ) +end + +""" + changebasis(::Type{QuadPairBasis}, op::GaussianChannel{QuadBlockBasis, CuArray, CuArray}) + +Convert GaussianChannel from QuadBlockBasis to QuadPairBasis on GPU. +""" +function changebasis(::Type{B1}, op::GaussianChannel{B2,D,T}) where {B1<:QuadPairBasis,B2<:QuadBlockBasis,D<:CuArray,T<:CuArray} + if !CUDA_AVAILABLE + gpu_fallback_warning() + cpu_op = GaussianChannel(op.basis, Array(op.disp), Array(op.transform), Array(op.noise); ħ=op.ħ) + cpu_result = changebasis(B1, cpu_op) + return GaussianChannel(cpu_result.basis, CuArray(cpu_result.disp), + CuArray(cpu_result.transform), CuArray(cpu_result.noise); ħ=cpu_result.ħ) + end + nmodes = op.basis.nmodes + Td = eltype(D) + disp_new = CUDA.zeros(Td, 2*nmodes) + transform_new = CUDA.zeros(Td, 2*nmodes, 2*nmodes) + noise_new = CUDA.zeros(Td, 2*nmodes, 2*nmodes) + _convert_mean_block_to_pair!(disp_new, op.disp, nmodes) + perm_indices = _create_block_to_pair_permutation(nmodes, Td) + transform_new = _convert_matrix_with_permutation(op.transform, perm_indices) + noise_new = _convert_matrix_with_permutation(op.noise, perm_indices) + return GaussianChannel(B1(nmodes), disp_new, transform_new, noise_new; ħ = op.ħ) +end +changebasis(::Type{<:QuadBlockBasis}, op::GaussianChannel{<:QuadBlockBasis,<:CuArray,<:CuArray}) = op +changebasis(::Type{<:QuadPairBasis}, op::GaussianChannel{<:QuadPairBasis,<:CuArray,<:CuArray}) = op diff --git a/ext/CUDAExt/cross_wigner_kernels.jl b/ext/CUDAExt/cross_wigner_kernels.jl new file mode 100644 index 0000000..93c9c30 --- /dev/null +++ b/ext/CUDAExt/cross_wigner_kernels.jl @@ -0,0 +1,200 @@ +""" + cross_wigner_kernel!(result, x_point, mean1, mean2, avg_covar_inv, + omega, log_det_avg_covar, nmodes, hbar) + +GPU kernel for computing cross-Wigner function between two Gaussian states. +Implements: W₁₂(x) = norm × exp[-½(x-μ̄)ᵀV̄⁻¹(x-μ̄)] × exp[i(μ₁-μ₂)ᵀΩ(x-μ̄)/ħ] +where μ̄ = (μ₁+μ₂)/2, V̄ = (V₁+V₂)/2 +""" +function cross_wigner_kernel!(result::CuDeviceArray{Complex{T}}, + x_point::CuDeviceVector{T}, + mean1::CuDeviceVector{T}, + mean2::CuDeviceVector{T}, + avg_covar_inv::CuDeviceMatrix{T}, + omega::CuDeviceMatrix{T}, + log_det_avg_covar::T, + nmodes::Int, hbar::T) where T + + if threadIdx().x == 1 && blockIdx().x == 1 + quadratic_form = zero(T) + for i in 1:length(mean1) + avg_mean_i = (mean1[i] + mean2[i]) * T(0.5) + dx_i = x_point[i] - avg_mean_i + for j in 1:length(mean1) + avg_mean_j = (mean1[j] + mean2[j]) * T(0.5) + dx_j = x_point[j] - avg_mean_j + quadratic_form += dx_i * avg_covar_inv[i, j] * dx_j + end + end + phase_arg = zero(T) + for i in 1:length(mean1) + mu_diff_i = mean1[i] - mean2[i] + avg_mean_i = (mean1[i] + mean2[i]) * T(0.5) + dx_i = x_point[i] - avg_mean_i + omega_dx_i = zero(T) + for j in 1:length(mean1) + avg_mean_j = (mean1[j] + mean2[j]) * T(0.5) + dx_j = x_point[j] - avg_mean_j + omega_dx_i += omega[i, j] * dx_j + end + phase_arg += mu_diff_i * omega_dx_i + end + log_normalization = -T(nmodes) * log(T(2π)) - T(0.5) * log_det_avg_covar + normalization = exp(log_normalization) + gaussian_part = exp(-T(0.5) * quadratic_form) + phase_factor = Complex{T}(cos(phase_arg / hbar), sin(phase_arg / hbar)) + result[1] = normalization * gaussian_part * phase_factor + end + return nothing +end + +""" + cross_wignerchar_kernel!(result, xi_point, quadratic_matrix, linear_vector) + +GPU kernel for computing cross-Wigner characteristic function between two Gaussian states. +Implements: χ₁₂(ξ) = exp(-½ξᵀΩV̄Ωᵀξ - iΩμ̄·ξ) +where μ̄ = (μ₁+μ₂)/2, V̄ = (V₁+V₂)/2 +""" +function cross_wignerchar_kernel!(result::CuDeviceArray{Complex{T}}, + xi_point::CuDeviceVector{T}, + quadratic_matrix::CuDeviceMatrix{T}, + linear_vector::CuDeviceVector{T}) where T + + if threadIdx().x == 1 && blockIdx().x == 1 + quadratic_term = zero(T) + for i in 1:length(xi_point) + for j in 1:length(xi_point) + quadratic_term += xi_point[i] * quadratic_matrix[i, j] * xi_point[j] + end + end + linear_term = zero(T) + for i in 1:length(xi_point) + linear_term += linear_vector[i] * xi_point[i] + end + real_part = -T(0.5) * quadratic_term + imag_part = -linear_term + result[1] = Complex{T}(cos(imag_part), sin(imag_part)) * exp(real_part) + end + return nothing +end + +""" + gpu_safe_logdet(A::CuMatrix{T}) where T + +Compute log determinant of positive definite matrix using Cholesky decomposition. +""" +function gpu_safe_logdet(A::CuMatrix{T}) where T + try + L = cholesky(Symmetric(A)).L + diag_L = diag(L) + log_det_L = sum(log.(diag_L)) + return T(2) * log_det_L + catch e + A_cpu = Array(A) + return T(logdet(A_cpu)) + end +end + +""" + gpu_safe_inv(A::CuMatrix{T}) where T + +Compute matrix inverse using GPU-safe operations. +""" +function gpu_safe_inv(A::CuMatrix{T}) where T + try + return inv(A) + catch e + A_cpu = Array(A) + return CuArray{T}(inv(A_cpu)) + end +end + +""" + gpu_safe_length_check(x::AbstractVector, expected_len::Int) +""" +function gpu_safe_length_check(x::AbstractVector, expected_len::Int) + actual_len = size(x, 1) + return actual_len == expected_len +end + +function Gabs.cross_wigner(state1::GaussianState{<:SymplecticBasis,<:CuArray,<:CuArray}, + state2::GaussianState{<:SymplecticBasis,<:CuArray,<:CuArray}, + x::AbstractVector) + + if !CUDA_AVAILABLE + @warn "CUDA not available for cross_wigner. This should not happen in CUDAExt." + error("GPU cross_wigner called but CUDA not available") + end + typeof(state1.basis) == typeof(state2.basis) || throw(ArgumentError(SYMPLECTIC_ERROR)) + state1.basis == state2.basis || throw(ArgumentError(SYMPLECTIC_ERROR)) + state1.ħ == state2.ħ || throw(ArgumentError(HBAR_ERROR)) + expected_len = size(state1.mean, 1) + actual_len = size(x, 1) + actual_len == expected_len || throw(ArgumentError(WIGNER_ERROR)) + if state1 === state2 + wigner_result = wigner(state1, x) + T = real(eltype(state1.mean)) + return Complex{T}(wigner_result, zero(T)) + end + T = real(eltype(state1.mean)) + nmodes = state1.basis.nmodes + x_gpu = CuArray{T}(x) + avg_covar = (state1.covar .+ state2.covar) ./ T(2) + avg_covar_inv = gpu_safe_inv(avg_covar) + log_det_avg_covar = gpu_safe_logdet(avg_covar) + omega_cpu = symplecticform(state1.basis) + omega_gpu = CuArray{T}(omega_cpu) + result = CuArray{Complex{T}}(undef, 1) + @cuda threads=1 blocks=1 cross_wigner_kernel!( + result, x_gpu, state1.mean, state2.mean, avg_covar_inv, + omega_gpu, log_det_avg_covar, nmodes, T(state1.ħ) + ) + CUDA.synchronize() + return Array(result)[1] +end + +function Gabs.cross_wignerchar(state1::GaussianState{<:SymplecticBasis,<:CuArray,<:CuArray}, + state2::GaussianState{<:SymplecticBasis,<:CuArray,<:CuArray}, + xi::AbstractVector) + + if !CUDA_AVAILABLE + @warn "CUDA not available for cross_wignerchar. This should not happen in CUDAExt." + error("GPU cross_wignerchar called but CUDA not available") + end + typeof(state1.basis) == typeof(state2.basis) || throw(ArgumentError(SYMPLECTIC_ERROR)) + state1.basis == state2.basis || throw(ArgumentError(SYMPLECTIC_ERROR)) + state1.ħ == state2.ħ || throw(ArgumentError(HBAR_ERROR)) + expected_len = size(state1.mean, 1) + actual_len = size(xi, 1) + actual_len == expected_len || throw(ArgumentError(WIGNER_ERROR)) + if state1 === state2 + return wignerchar(state1, xi) + end + T = real(eltype(state1.mean)) + xi_gpu = CuArray{T}(xi) + avg_mean = (state1.mean .+ state2.mean) ./ T(2) + avg_covar = (state1.covar .+ state2.covar) ./ T(2) + omega_cpu = symplecticform(state1.basis) + omega_gpu = CuArray{T}(omega_cpu) + temp_matrix = omega_gpu * avg_covar + quadratic_matrix = temp_matrix * transpose(omega_gpu) + linear_vector = omega_gpu * avg_mean + result = CuArray{Complex{T}}(undef, 1) + @cuda threads=1 blocks=1 cross_wignerchar_kernel!( + result, xi_gpu, quadratic_matrix, linear_vector + ) + CUDA.synchronize() + return Array(result)[1] +end + +function Gabs.cross_wigner(state1::GaussianState{<:SymplecticBasis,<:CuArray,<:CuArray}, + state2::GaussianState{<:SymplecticBasis,<:CuArray,<:CuArray}, + x::Vector{T}) where T<:Real + return cross_wigner(state1, state2, CuArray{real(eltype(state1.mean))}(x)) +end + +function Gabs.cross_wignerchar(state1::GaussianState{<:SymplecticBasis,<:CuArray,<:CuArray}, + state2::GaussianState{<:SymplecticBasis,<:CuArray,<:CuArray}, + xi::Vector{T}) where T<:Real + return cross_wignerchar(state1, state2, CuArray{real(eltype(state1.mean))}(xi)) +end \ No newline at end of file diff --git a/ext/CUDAExt/interference_wigner.jl b/ext/CUDAExt/interference_wigner.jl new file mode 100644 index 0000000..b6157e1 --- /dev/null +++ b/ext/CUDAExt/interference_wigner.jl @@ -0,0 +1,213 @@ +""" + wigner(lc::GaussianLinearCombination{GPU states}, x::AbstractVector) + +Implements: W(x) = Σᵢ |cᵢ|² Wᵢ(x) + 2 Σᵢ<ⱼ Re(cᵢ*cⱼ W_cross(ψᵢ,ψⱼ,x)) +""" +function wigner(lc::GaussianLinearCombination{B,C,S}, x::AbstractVector) where { + B<:SymplecticBasis, C, S<:GaussianState{B,<:CuArray,<:CuArray}} + if !CUDA_AVAILABLE + error("GPU interference wigner called but CUDA not available") + end + if !isempty(lc.states) + expected_len = size(lc.states[1].mean, 1) + actual_len = length(x) + actual_len == expected_len || throw(ArgumentError(WIGNER_ERROR)) + else + throw(ArgumentError("Cannot compute Wigner function of empty linear combination")) + end + T = real(eltype(lc.states[1].mean)) + result = zero(T) + x_gpu = CuArray{T}(x) + n_states = length(lc.states) + @inbounds for i in 1:n_states + ci = lc.coeffs[i] + state_i = lc.states[i] + wigner_i = wigner(state_i, x_gpu) + result += abs2(ci) * wigner_i + end + @inbounds for i in 1:(n_states-1) + ci = lc.coeffs[i] + state_i = lc.states[i] + @inbounds for j in (i+1):n_states + cj = lc.coeffs[j] + state_j = lc.states[j] + cross_w = cross_wigner(state_i, state_j, x_gpu) + interference_coeff = conj(ci) * cj + cross_term = 2 * real(interference_coeff * cross_w) + result += cross_term + end + end + return result +end + +""" + wignerchar(lc::GaussianLinearCombination{GPU states}, xi::AbstractVector) + +Implements: χ(ξ) = Σᵢ |cᵢ|² χᵢ(ξ) + 2 Σᵢ<ⱼ Re(cᵢ*cⱼ χ_cross(ψᵢ,ψⱼ,ξ)) +""" +function wignerchar(lc::GaussianLinearCombination{B,C,S}, xi::AbstractVector) where { + B<:SymplecticBasis, C, S<:GaussianState{B,<:CuArray,<:CuArray}} + if !CUDA_AVAILABLE + error("GPU interference wignerchar called but CUDA not available") + end + if isempty(lc.states) + throw(ArgumentError("Cannot compute Wigner characteristic function of empty linear combination")) + end + + expected_len = size(lc.states[1].mean, 1) + actual_len = length(xi) + actual_len == expected_len || throw(ArgumentError("Evaluation point dimension ($actual_len) does not match state dimension ($expected_len)")) + T = real(eltype(lc.states[1].mean)) + result = complex(zero(T), zero(T)) + xi_gpu = CuArray{T}(xi) + n_states = length(lc.states) + @inbounds for i in 1:n_states + ci = lc.coeffs[i] + state_i = lc.states[i] + wignerchar_i = wignerchar(state_i, xi_gpu) + result += abs2(ci) * wignerchar_i + end + @inbounds for i in 1:(n_states-1) + ci = lc.coeffs[i] + state_i = lc.states[i] + @inbounds for j in (i+1):n_states + cj = lc.coeffs[j] + state_j = lc.states[j] + cross_char = cross_wignerchar(state_i, state_j, xi_gpu) + interference_coeff = conj(ci) * cj + cross_term = 2 * real(interference_coeff * cross_char) + result += cross_term + end + end + return result +end + +""" + wigner(lc::GaussianLinearCombination{GPU states}, x::Vector{T}) where T<:Real +""" +function wigner(lc::GaussianLinearCombination{B,C,S}, x::Vector{T}) where { + B<:SymplecticBasis, C, S<:GaussianState{B,<:CuArray,<:CuArray}, T<:Real} + target_precision = real(eltype(lc.states[1].mean)) + return wigner(lc, CuArray{target_precision}(x)) +end + +""" + wignerchar(lc::GaussianLinearCombination{GPU states}, xi::Vector{T}) where T<:Real +""" +function wignerchar(lc::GaussianLinearCombination{B,C,S}, xi::Vector{T}) where { + B<:SymplecticBasis, C, S<:GaussianState{B,<:CuArray,<:CuArray}, T<:Real} + target_precision = real(eltype(lc.states[1].mean)) + return wignerchar(lc, CuArray{target_precision}(xi)) +end + +""" + wigner(lc::GaussianLinearCombination{GPU states}, x_points::CuMatrix{T}) + +Implements batched version of: W(x) = Σᵢ |cᵢ|² Wᵢ(x) + 2 Σᵢ<ⱼ Re(cᵢ*cⱼ W_cross(ψᵢ,ψⱼ,x)) + +# Arguments +- `lc::GaussianLinearCombination`: Linear combination with GPU states +- `x_points::CuMatrix{T}`: Phase space points (dimension × num_points) + +# Returns +- `CuArray{T}`: Wigner function values at all points +""" +function wigner(lc::GaussianLinearCombination{B,C,S}, x_points::CuMatrix{T}) where { + B<:SymplecticBasis, C, S<:GaussianState{B,<:CuArray,<:CuArray}, T} + if !CUDA_AVAILABLE + error("GPU batched interference wigner called but CUDA not available") + end + if isempty(lc.states) + throw(ArgumentError("Cannot compute Wigner function of empty linear combination")) + end + expected_len = size(lc.states[1].mean, 1) + actual_len = size(x_points, 1) + actual_len == expected_len || throw(ArgumentError("Phase space point dimension ($actual_len) does not match state dimension ($expected_len)")) + num_points = size(x_points, 2) + n_states = length(lc.states) + results = CUDA.zeros(T, num_points) + for i in 1:n_states + ci = lc.coeffs[i] + state_i = lc.states[i] + wigner_values = wigner(state_i, x_points) + results .+= abs2(ci) .* wigner_values + end + for i in 1:(n_states-1) + ci = lc.coeffs[i] + state_i = lc.states[i] + for j in (i+1):n_states + cj = lc.coeffs[j] + state_j = lc.states[j] + cross_values = cross_wigner_batch(state_i, state_j, x_points) + interference_coeff = conj(ci) * cj + cross_contributions = 2 .* real.(interference_coeff .* cross_values) + results .+= cross_contributions + end + end + return results +end + +""" + wignerchar(lc::GaussianLinearCombination{GPU states}, xi_points::CuMatrix{T}) + +Implements batched version of: χ(ξ) = Σᵢ |cᵢ|² χᵢ(ξ) + 2 Σᵢ<ⱼ Re(cᵢ*cⱼ χ_cross(ψᵢ,ψⱼ,ξ)) + +# Arguments +- `lc::GaussianLinearCombination`: Linear combination with GPU states +- `xi_points::CuMatrix{T}`: Evaluation points in phase space (dimension × num_points) + +# Returns +- `CuArray{Complex{T}}`: Characteristic function values at all points +""" +function wignerchar(lc::GaussianLinearCombination{B,C,S}, xi_points::CuMatrix{T}) where { + B<:SymplecticBasis, C, S<:GaussianState{B,<:CuArray,<:CuArray}, T} + if !CUDA_AVAILABLE + error("GPU batched interference wignerchar called but CUDA not available") + end + if isempty(lc.states) + throw(ArgumentError("Cannot compute Wigner characteristic function of empty linear combination")) + end + expected_len = size(lc.states[1].mean, 1) + actual_len = size(xi_points, 1) + actual_len == expected_len || throw(ArgumentError("Evaluation point dimension ($actual_len) does not match state dimension ($expected_len)")) + num_points = size(xi_points, 2) + n_states = length(lc.states) + results = CUDA.zeros(Complex{T}, num_points) + for i in 1:n_states + ci = lc.coeffs[i] + state_i = lc.states[i] + wignerchar_values = wignerchar(state_i, xi_points) + results .+= abs2(ci) .* wignerchar_values + end + for i in 1:(n_states-1) + ci = lc.coeffs[i] + state_i = lc.states[i] + for j in (i+1):n_states + cj = lc.coeffs[j] + state_j = lc.states[j] + cross_values = cross_wignerchar_batch(state_i, state_j, xi_points) + interference_coeff = conj(ci) * cj + cross_contributions = 2 .* real.(interference_coeff .* cross_values) + results .+= cross_contributions + end + end + return results +end + +""" + wigner(lc::GaussianLinearCombination{GPU states}, x_points::Matrix{T}) where T<:Real +""" +function wigner(lc::GaussianLinearCombination{B,C,S}, x_points::Matrix{T}) where { + B<:SymplecticBasis, C, S<:GaussianState{B,<:CuArray,<:CuArray}, T<:Real} + target_precision = real(eltype(lc.states[1].mean)) + return wigner(lc, CuArray{target_precision}(x_points)) +end + +""" + wignerchar(lc::GaussianLinearCombination{GPU states}, xi_points::Matrix{T}) where T<:Real +""" +function wignerchar(lc::GaussianLinearCombination{B,C,S}, xi_points::Matrix{T}) where { + B<:SymplecticBasis, C, S<:GaussianState{B,<:CuArray,<:CuArray}, T<:Real} + target_precision = real(eltype(lc.states[1].mean)) + return wignerchar(lc, CuArray{target_precision}(xi_points)) +end \ No newline at end of file diff --git a/ext/CUDAExt/linear_combinations.jl b/ext/CUDAExt/linear_combinations.jl new file mode 100644 index 0000000..0e4697f --- /dev/null +++ b/ext/CUDAExt/linear_combinations.jl @@ -0,0 +1,58 @@ +Gabs._is_gpu_array(x::CuArray) = true + +function Gabs._detect_device(lc::GaussianLinearCombination) + if isempty(lc.states) + return :cpu + end + return Gabs._detect_device(lc.states[1].mean) +end + +function Gabs._gpu_impl(lc::GaussianLinearCombination, precision::Type{T}) where T + if !CUDA_AVAILABLE + @warn "CUDA not available. Cannot transfer to GPU." + return lc + end + gpu_states = [Gabs._gpu_impl(state, precision) for state in lc.states] + cpu_coeffs = Vector{T}(Array(lc.coeffs)) + return GaussianLinearCombination(lc.basis, cpu_coeffs, gpu_states) +end + +function Base.:(*)(op::GaussianUnitary{B,<:CuArray,<:CuArray}, lc::GaussianLinearCombination{B,C,S}) where {B<:SymplecticBasis,C,S<:GaussianState{B,<:CuArray,<:CuArray}} + op.basis == lc.basis || throw(ArgumentError(ACTION_ERROR)) + op.ħ == lc.ħ || throw(ArgumentError(HBAR_ERROR)) + new_states = [op * state for state in lc.states] + return GaussianLinearCombination(lc.basis, Vector(lc.coeffs), new_states) +end + +function Base.:(*)(op::GaussianChannel{B,<:CuArray,<:CuArray}, lc::GaussianLinearCombination{B,C,S}) where {B<:SymplecticBasis,C,S<:GaussianState{B,<:CuArray,<:CuArray}} + op.basis == lc.basis || throw(ArgumentError(ACTION_ERROR)) + op.ħ == lc.ħ || throw(ArgumentError(HBAR_ERROR)) + new_states = [op * state for state in lc.states] + return GaussianLinearCombination(lc.basis, Vector(lc.coeffs), new_states) +end + +function Base.:(*)(op::GaussianUnitary{B,<:Array,<:Array}, lc::GaussianLinearCombination{B,C,S}) where {B<:SymplecticBasis,C,S<:GaussianState{B,<:CuArray,<:CuArray}} + op.basis == lc.basis || throw(ArgumentError(ACTION_ERROR)) + op.ħ == lc.ħ || throw(ArgumentError(HBAR_ERROR)) + T = real(eltype(lc.states[1].mean)) + gpu_op = Gabs._gpu_impl(op, T) + new_states = [gpu_op * state for state in lc.states] + return GaussianLinearCombination(lc.basis, Vector(lc.coeffs), new_states) +end + +function Base.:(*)(op::GaussianChannel{B,<:Array,<:Array}, lc::GaussianLinearCombination{B,C,S}) where {B<:SymplecticBasis,C,S<:GaussianState{B,<:CuArray,<:CuArray}} + op.basis == lc.basis || throw(ArgumentError(ACTION_ERROR)) + op.ħ == lc.ħ || throw(ArgumentError(HBAR_ERROR)) + T = real(eltype(lc.states[1].mean)) + gpu_op = Gabs._gpu_impl(op, T) + new_states = [gpu_op * state for state in lc.states] + return GaussianLinearCombination(lc.basis, Vector(lc.coeffs), new_states) +end + +function purity(lc::GaussianLinearCombination{B,C,S}) where {B<:SymplecticBasis,C,S<:GaussianState{B,<:CuArray,<:CuArray}} + return 1.0 +end + +function entropy_vn(lc::GaussianLinearCombination{B,C,S}) where {B<:SymplecticBasis,C,S<:GaussianState{B,<:CuArray,<:CuArray}} + return 0.0 +end \ No newline at end of file diff --git a/ext/CUDAExt/ptrace_operations.jl b/ext/CUDAExt/ptrace_operations.jl new file mode 100644 index 0000000..350899c --- /dev/null +++ b/ext/CUDAExt/ptrace_operations.jl @@ -0,0 +1,60 @@ +""" + ptrace(state::GaussianState{B,M,V}, indices::Union{Int, AbstractVector{<:Int}}) where {B<:SymplecticBasis, M<:CuArray, V<:CuArray} +""" +function ptrace(state::GaussianState{B,M,V}, indices::Union{Int, AbstractVector{<:Int}}) where { + B<:SymplecticBasis, M<:CuArray, V<:CuArray} + if !CUDA_AVAILABLE + gpu_fallback_warning() + cpu_state = GaussianState(state.basis, Array(state.mean), Array(state.covar); ħ=state.ħ) + cpu_result = ptrace(cpu_state, indices) + return GaussianState(cpu_result.basis, CuArray(cpu_result.mean), CuArray(cpu_result.covar); ħ=cpu_result.ħ) + end + indices_vec = isa(indices, Int) ? [indices] : collect(indices) + basis = state.basis + nmodes = basis.nmodes + length(indices_vec) < nmodes || throw(ArgumentError(Gabs.INDEX_ERROR)) + all_indices = collect(1:nmodes) + keep_indices = setdiff(all_indices, indices_vec) + n_keep = length(keep_indices) + T = eltype(M) + if basis isa QuadPairBasis + mean_indices = Vector{Int}() + for mode_idx in keep_indices + push!(mean_indices, 2*mode_idx - 1) + push!(mean_indices, 2*mode_idx) + end + new_mean = state.mean[CuArray(mean_indices)] + covar_indices = Vector{Int}() + for mode_idx in keep_indices + push!(covar_indices, 2*mode_idx - 1) + push!(covar_indices, 2*mode_idx) + end + covar_indices_gpu = CuArray(covar_indices) + new_covar = state.covar[covar_indices_gpu, covar_indices_gpu] + elseif basis isa QuadBlockBasis + x_indices = keep_indices + p_indices = keep_indices .+ nmodes + mean_indices = vcat(x_indices, p_indices) + new_mean = state.mean[CuArray(mean_indices)] + all_covar_indices = vcat(x_indices, p_indices) + covar_indices_gpu = CuArray(all_covar_indices) + new_covar = state.covar[covar_indices_gpu, covar_indices_gpu] + else + error("Unknown basis type: $(typeof(basis))") + end + new_basis = typeof(basis)(n_keep) + return GaussianState(new_basis, new_mean, new_covar; ħ = state.ħ) +end + +function ptrace(::Type{Tm}, ::Type{Tc}, state::GaussianState{B,<:CuArray,<:CuArray}, indices::Union{Int, AbstractVector{<:Int}}) where {Tm<:CuArray, Tc<:CuArray, B<:SymplecticBasis} + result = ptrace(state, indices) + return GaussianState(result.basis, Tm(result.mean), Tc(result.covar); ħ = result.ħ) +end + +function ptrace(::Type{T}, state::GaussianState{B,<:CuArray,<:CuArray}, indices::Union{Int, AbstractVector{<:Int}}) where {T<:CuArray, B<:SymplecticBasis} + if T <: AbstractMatrix + return ptrace(CuVector{eltype(T)}, T, state, indices) + else + return ptrace(T, T, state, indices) + end +end \ No newline at end of file diff --git a/ext/CUDAExt/state_operations.jl b/ext/CUDAExt/state_operations.jl new file mode 100644 index 0000000..70c9466 --- /dev/null +++ b/ext/CUDAExt/state_operations.jl @@ -0,0 +1,126 @@ +function vacuumstate(::Type{Tm}, ::Type{Tc}, basis::SymplecticBasis{N}; ħ = 2) where {Tm<:CuVector, Tc<:CuMatrix, N<:Int} + if !CUDA_AVAILABLE + gpu_fallback_warning() + return vacuumstate(Vector{eltype(Tm)}, Matrix{eltype(Tc)}, basis; ħ = ħ) + end + nmodes = basis.nmodes + T = eltype(Tm) + mean = CUDA.zeros(T, 2*nmodes) + covar = CuMatrix{T}((T(ħ)/2) * I(2*nmodes)) + return GaussianState(basis, mean, covar; ħ = ħ) +end + +function thermalstate(::Type{Tm}, ::Type{Tc}, basis::SymplecticBasis{N}, photons::P; ħ = 2) where {Tm<:CuVector, Tc<:CuMatrix, N<:Int, P} + if !CUDA_AVAILABLE + gpu_fallback_warning() + return thermalstate(Vector{eltype(Tm)}, Matrix{eltype(Tc)}, basis, photons; ħ = ħ) + end + T = eltype(Tm) + mean, covar = _thermalstate(basis, photons; ħ = ħ) + gpu_mean = CuArray{T}(mean) + gpu_covar = CuArray{T}(covar) + return GaussianState(basis, gpu_mean, gpu_covar; ħ = ħ) +end + +function coherentstate(::Type{Tm}, ::Type{Tc}, basis::SymplecticBasis{N}, alpha::A; ħ = 2) where {Tm<:CuVector, Tc<:CuMatrix, N<:Int, A} + if !CUDA_AVAILABLE + gpu_fallback_warning() + return coherentstate(Vector{eltype(Tm)}, Matrix{eltype(Tc)}, basis, alpha; ħ = ħ) + end + T = eltype(Tm) + mean, covar = _coherentstate(basis, alpha; ħ = ħ) + gpu_mean = CuArray{T}(mean) + gpu_covar = CuArray{T}(covar) + return GaussianState(basis, gpu_mean, gpu_covar; ħ = ħ) +end + +function squeezedstate(::Type{Tm}, ::Type{Tc}, basis::SymplecticBasis{N}, r::R, theta::R; ħ = 2) where {Tm<:CuVector, Tc<:CuMatrix, N<:Int, R} + if !CUDA_AVAILABLE + gpu_fallback_warning() + return squeezedstate(Vector{eltype(Tm)}, Matrix{eltype(Tc)}, basis, r, theta; ħ = ħ) + end + T = eltype(Tm) + mean, covar = _squeezedstate(basis, r, theta; ħ = ħ) + gpu_mean = CuArray{T}(mean) + gpu_covar = CuArray{T}(covar) + return GaussianState(basis, gpu_mean, gpu_covar; ħ = ħ) +end + +function eprstate(::Type{Tm}, ::Type{Tc}, basis::SymplecticBasis{N}, r::R, theta::R; ħ = 2) where {Tm<:CuVector, Tc<:CuMatrix, N<:Int, R} + if !CUDA_AVAILABLE + gpu_fallback_warning() + return eprstate(Vector{eltype(Tm)}, Matrix{eltype(Tc)}, basis, r, theta; ħ = ħ) + end + T = eltype(Tm) + mean, covar = _eprstate(basis, r, theta; ħ = ħ) + gpu_mean = CuArray{T}(mean) + gpu_covar = CuArray{T}(covar) + + return GaussianState(basis, gpu_mean, gpu_covar; ħ = ħ) +end + +function vacuumstate(::Type{T}, basis::SymplecticBasis{N}; ħ = 2) where {T<:CuVector, N<:Int} + return vacuumstate(T, CuMatrix{eltype(T)}, basis; ħ = ħ) +end + +function coherentstate(::Type{T}, basis::SymplecticBasis{N}, alpha::A; ħ = 2) where {T<:CuVector, N<:Int, A} + return coherentstate(T, CuMatrix{eltype(T)}, basis, alpha; ħ = ħ) +end + +function squeezedstate(::Type{T}, basis::SymplecticBasis{N}, r::R, theta::R; ħ = 2) where {T<:CuVector, N<:Int, R} + return squeezedstate(T, CuMatrix{eltype(T)}, basis, r, theta; ħ = ħ) +end + +function thermalstate(::Type{T}, basis::SymplecticBasis{N}, photons::P; ħ = 2) where {T<:CuVector, N<:Int, P} + return thermalstate(T, CuMatrix{eltype(T)}, basis, photons; ħ = ħ) +end + +function eprstate(::Type{T}, basis::SymplecticBasis{N}, r::R, theta::R; ħ = 2) where {T<:CuVector, N<:Int, R} + return eprstate(T, CuMatrix{eltype(T)}, basis, r, theta; ħ = ħ) +end +function Gabs.tensor(::Type{Tm}, ::Type{Tc}, state1::GaussianState{B,M1,V1}, state2::GaussianState{B,M2,V2}) where { + Tm<:CuVector, Tc<:CuMatrix, B<:SymplecticBasis, M1<:CuArray, V1<:CuArray, M2<:CuArray, V2<:CuArray} + if !CUDA_AVAILABLE + gpu_fallback_warning() + cpu_state1 = GaussianState(state1.basis, Array(state1.mean), Array(state1.covar); ħ=state1.ħ) + cpu_state2 = GaussianState(state2.basis, Array(state2.mean), Array(state2.covar); ħ=state2.ħ) + result_cpu = tensor(Vector{eltype(Tm)}, Matrix{eltype(Tc)}, cpu_state1, cpu_state2) + return GaussianState(result_cpu.basis, CuArray(result_cpu.mean), CuArray(result_cpu.covar); ħ=result_cpu.ħ) + end + typeof(state1.basis) == typeof(state2.basis) || throw(ArgumentError(Gabs.SYMPLECTIC_ERROR)) + state1.ħ == state2.ħ || throw(ArgumentError(Gabs.HBAR_ERROR)) + combined_basis = state1.basis ⊕ state2.basis + T = eltype(Tm) + mean_combined = vcat(state1.mean, state2.mean) + n1 = length(state1.mean) + n2 = length(state2.mean) + total_dim = n1 + n2 + covar_combined = CUDA.zeros(T, total_dim, total_dim) + covar_combined[1:n1, 1:n1] .= state1.covar + covar_combined[n1+1:end, n1+1:end] .= state2.covar + return GaussianState(combined_basis, mean_combined, covar_combined; ħ = state1.ħ) +end + +function Gabs.tensor(state1::GaussianState{B,<:Array,<:Array}, state2::GaussianState{B,<:CuArray,<:CuArray}) where {B<:SymplecticBasis} + if !CUDA_AVAILABLE + gpu_fallback_warning() + return tensor(state1, GaussianState(state2.basis, Array(state2.mean), Array(state2.covar); ħ=state2.ħ)) + end + typeof(state1.basis) == typeof(state2.basis) || throw(ArgumentError(Gabs.SYMPLECTIC_ERROR)) + state1.ħ == state2.ħ || throw(ArgumentError(Gabs.HBAR_ERROR)) + T = real(eltype(state2.mean)) + gpu_state1 = GaussianState(state1.basis, CuArray{T}(state1.mean), CuArray{T}(state1.covar); ħ = state1.ħ) + return tensor(gpu_state1, state2) +end + +function Gabs.tensor(state1::GaussianState{B,<:CuArray,<:CuArray}, state2::GaussianState{B,<:Array,<:Array}) where {B<:SymplecticBasis} + if !CUDA_AVAILABLE + gpu_fallback_warning() + return tensor(GaussianState(state1.basis, Array(state1.mean), Array(state1.covar); ħ=state1.ħ), state2) + end + typeof(state1.basis) == typeof(state2.basis) || throw(ArgumentError(Gabs.SYMPLECTIC_ERROR)) + state1.ħ == state2.ħ || throw(ArgumentError(Gabs.HBAR_ERROR)) + T = real(eltype(state1.mean)) + gpu_state2 = GaussianState(state2.basis, CuArray{T}(state2.mean), CuArray{T}(state2.covar); ħ = state2.ħ) + return tensor(state1, gpu_state2) +end \ No newline at end of file diff --git a/ext/CUDAExt/unitary_operations.jl b/ext/CUDAExt/unitary_operations.jl new file mode 100644 index 0000000..1227915 --- /dev/null +++ b/ext/CUDAExt/unitary_operations.jl @@ -0,0 +1,213 @@ +function displace(::Type{Td}, ::Type{Ts}, basis::SymplecticBasis{N}, alpha::A; ħ = 2) where {Td<:CuVector, Ts<:CuMatrix, N<:Int, A} + if !CUDA_AVAILABLE + gpu_fallback_warning() + return displace(Vector{eltype(Td)}, Matrix{eltype(Ts)}, basis, alpha; ħ = ħ) + end + T = eltype(Td) + disp, symplectic = _displace(basis, alpha; ħ = ħ) + gpu_disp = CuArray{T}(disp) + gpu_symplectic = CuArray{T}(symplectic) + return GaussianUnitary(basis, gpu_disp, gpu_symplectic; ħ = ħ) +end + +function squeeze(::Type{Td}, ::Type{Ts}, basis::SymplecticBasis{N}, r::R, theta::R; ħ = 2) where {Td<:CuVector, Ts<:CuMatrix, N<:Int, R} + if !CUDA_AVAILABLE + gpu_fallback_warning() + return squeeze(Vector{eltype(Td)}, Matrix{eltype(Ts)}, basis, r, theta; ħ = ħ) + end + T = eltype(Td) + disp, symplectic = _squeeze(basis, r, theta) + gpu_disp = CuArray{T}(disp) + gpu_symplectic = CuArray{T}(symplectic) + return GaussianUnitary(basis, gpu_disp, gpu_symplectic; ħ = ħ) +end + +function twosqueeze(::Type{Td}, ::Type{Ts}, basis::SymplecticBasis{N}, r::R, theta::R; ħ = 2) where {Td<:CuVector, Ts<:CuMatrix, N<:Int, R} + if !CUDA_AVAILABLE + gpu_fallback_warning() + return twosqueeze(Vector{eltype(Td)}, Matrix{eltype(Ts)}, basis, r, theta; ħ = ħ) + end + T = eltype(Td) + disp, symplectic = _twosqueeze(basis, r, theta) + gpu_disp = CuArray{T}(disp) + gpu_symplectic = CuArray{T}(symplectic) + return GaussianUnitary(basis, gpu_disp, gpu_symplectic; ħ = ħ) +end + +function phaseshift(::Type{Td}, ::Type{Ts}, basis::SymplecticBasis{N}, theta::R; ħ = 2) where {Td<:CuVector, Ts<:CuMatrix, N<:Int, R} + if !CUDA_AVAILABLE + gpu_fallback_warning() + return phaseshift(Vector{eltype(Td)}, Matrix{eltype(Ts)}, basis, theta; ħ = ħ) + end + T = eltype(Td) + disp, symplectic = _phaseshift(basis, theta) + gpu_disp = CuArray{T}(disp) + gpu_symplectic = CuArray{T}(symplectic) + return GaussianUnitary(basis, gpu_disp, gpu_symplectic; ħ = ħ) +end + +function beamsplitter(::Type{Td}, ::Type{Ts}, basis::SymplecticBasis{N}, transmit::R; ħ = 2) where {Td<:CuVector, Ts<:CuMatrix, N<:Int, R} + if !CUDA_AVAILABLE + gpu_fallback_warning() + return beamsplitter(Vector{eltype(Td)}, Matrix{eltype(Ts)}, basis, transmit; ħ = ħ) + end + T = eltype(Td) + disp, symplectic = _beamsplitter(basis, transmit) + gpu_disp = CuArray{T}(disp) + gpu_symplectic = CuArray{T}(symplectic) + return GaussianUnitary(basis, gpu_disp, gpu_symplectic; ħ = ħ) +end + +function attenuator(::Type{Td}, ::Type{Tt}, basis::SymplecticBasis{N}, theta::R, n::M; ħ = 2) where {Td<:CuVector, Tt<:CuMatrix, N<:Int, R, M} + if !CUDA_AVAILABLE + gpu_fallback_warning() + return attenuator(Vector{eltype(Td)}, Matrix{eltype(Tt)}, basis, theta, n; ħ = ħ) + end + T = eltype(Td) + disp, transform, noise = _attenuator(basis, theta, n) + gpu_disp = CuArray{T}(disp) + gpu_transform = CuArray{T}(transform) + gpu_noise = CuArray{T}(noise) + return GaussianChannel(basis, gpu_disp, gpu_transform, gpu_noise; ħ = ħ) +end + +function amplifier(::Type{Td}, ::Type{Tt}, basis::SymplecticBasis{N}, r::R, n::M; ħ = 2) where {Td<:CuVector, Tt<:CuMatrix, N<:Int, R, M} + if !CUDA_AVAILABLE + gpu_fallback_warning() + return amplifier(Vector{eltype(Td)}, Matrix{eltype(Tt)}, basis, r, n; ħ = ħ) + end + T = eltype(Td) + disp, transform, noise = _amplifier(basis, r, n) + gpu_disp = CuArray{T}(disp) + gpu_transform = CuArray{T}(transform) + gpu_noise = CuArray{T}(noise) + return GaussianChannel(basis, gpu_disp, gpu_transform, gpu_noise; ħ = ħ) +end + +function displace(::Type{T}, basis::SymplecticBasis{N}, alpha::A; ħ = 2) where {T<:CuVector, N<:Int, A} + return displace(T, CuMatrix{eltype(T)}, basis, alpha; ħ = ħ) +end + +function squeeze(::Type{T}, basis::SymplecticBasis{N}, r::R, theta::R; ħ = 2) where {T<:CuVector, N<:Int, R} + return squeeze(T, CuMatrix{eltype(T)}, basis, r, theta; ħ = ħ) +end + +function twosqueeze(::Type{T}, basis::SymplecticBasis{N}, r::R, theta::R; ħ = 2) where {T<:CuVector, N<:Int, R} + return twosqueeze(T, CuMatrix{eltype(T)}, basis, r, theta; ħ = ħ) +end + +function phaseshift(::Type{T}, basis::SymplecticBasis{N}, theta::R; ħ = 2) where {T<:CuVector, N<:Int, R} + return phaseshift(T, CuMatrix{eltype(T)}, basis, theta; ħ = ħ) +end + +function beamsplitter(::Type{T}, basis::SymplecticBasis{N}, transmit::R; ħ = 2) where {T<:CuVector, N<:Int, R} + return beamsplitter(T, CuMatrix{eltype(T)}, basis, transmit; ħ = ħ) +end + +function attenuator(::Type{T}, basis::SymplecticBasis{N}, theta::R, n::M; ħ = 2) where {T<:CuVector, N<:Int, R, M} + return attenuator(T, CuMatrix{eltype(T)}, basis, theta, n; ħ = ħ) +end + +function amplifier(::Type{T}, basis::SymplecticBasis{N}, r::R, n::M; ħ = 2) where {T<:CuVector, N<:Int, R, M} + return amplifier(T, CuMatrix{eltype(T)}, basis, r, n; ħ = ħ) +end +function Gabs.tensor(::Type{Td}, ::Type{Ts}, op1::GaussianUnitary{B,D1,S1}, op2::GaussianUnitary{B,D2,S2}) where { + Td<:CuVector, Ts<:CuMatrix, B<:SymplecticBasis, D1<:CuArray, S1<:CuArray, D2<:CuArray, S2<:CuArray} + if !CUDA_AVAILABLE + gpu_fallback_warning() + cpu_op1 = GaussianUnitary(op1.basis, Array(op1.disp), Array(op1.symplectic); ħ=op1.ħ) + cpu_op2 = GaussianUnitary(op2.basis, Array(op2.disp), Array(op2.symplectic); ħ=op2.ħ) + result_cpu = tensor(Vector{eltype(Td)}, Matrix{eltype(Ts)}, cpu_op1, cpu_op2) + return GaussianUnitary(result_cpu.basis, CuArray{eltype(Td)}(result_cpu.disp), CuArray{eltype(Ts)}(result_cpu.symplectic); ħ=result_cpu.ħ) + end + typeof(op1.basis) == typeof(op2.basis) || throw(ArgumentError(Gabs.SYMPLECTIC_ERROR)) + op1.ħ == op2.ħ || throw(ArgumentError(Gabs.HBAR_ERROR)) + combined_basis = op1.basis ⊕ op2.basis + T = eltype(Td) + n1 = length(op1.disp) + n2 = length(op2.disp) + total_dim = n1 + n2 + disp_combined = vcat(CuArray{T}(op1.disp), CuArray{T}(op2.disp)) + symplectic_combined = CUDA.zeros(T, total_dim, total_dim) + symplectic_combined[1:n1, 1:n1] .= CuArray{T}(op1.symplectic) + symplectic_combined[n1+1:end, n1+1:end] .= CuArray{T}(op2.symplectic) + return GaussianUnitary(combined_basis, disp_combined, symplectic_combined; ħ = op1.ħ) +end + +function Gabs.tensor(::Type{Td}, ::Type{Tt}, op1::GaussianChannel{B,D1,T1}, op2::GaussianChannel{B,D2,T2}) where { + Td<:CuVector, Tt<:CuMatrix, B<:SymplecticBasis, D1<:CuArray, T1<:CuArray, D2<:CuArray, T2<:CuArray} + if !CUDA_AVAILABLE + gpu_fallback_warning() + cpu_op1 = GaussianChannel(op1.basis, Array(op1.disp), Array(op1.transform), Array(op1.noise); ħ=op1.ħ) + cpu_op2 = GaussianChannel(op2.basis, Array(op2.disp), Array(op2.transform), Array(op2.noise); ħ=op2.ħ) + result_cpu = tensor(Vector{eltype(Td)}, Matrix{eltype(Tt)}, cpu_op1, cpu_op2) + return GaussianChannel(result_cpu.basis, CuArray{eltype(Td)}(result_cpu.disp), CuArray{eltype(Tt)}(result_cpu.transform), CuArray{eltype(Tt)}(result_cpu.noise); ħ=result_cpu.ħ) + end + typeof(op1.basis) == typeof(op2.basis) || throw(ArgumentError(Gabs.SYMPLECTIC_ERROR)) + op1.ħ == op2.ħ || throw(ArgumentError(Gabs.HBAR_ERROR)) + combined_basis = op1.basis ⊕ op2.basis + T = eltype(Td) + n1 = length(op1.disp) + n2 = length(op2.disp) + total_dim = n1 + n2 + disp_combined = vcat(CuArray{T}(op1.disp), CuArray{T}(op2.disp)) + transform_combined = CUDA.zeros(T, total_dim, total_dim) + transform_combined[1:n1, 1:n1] .= CuArray{T}(op1.transform) + transform_combined[n1+1:end, n1+1:end] .= CuArray{T}(op2.transform) + noise_combined = CUDA.zeros(T, total_dim, total_dim) + noise_combined[1:n1, 1:n1] .= CuArray{T}(op1.noise) + noise_combined[n1+1:end, n1+1:end] .= CuArray{T}(op2.noise) + return GaussianChannel(combined_basis, disp_combined, transform_combined, noise_combined; ħ = op1.ħ) +end + +function Base.:(*)(op::GaussianUnitary{B,<:CuArray,<:CuArray}, state::GaussianState{B,<:Array,<:Array}) where {B<:SymplecticBasis} + if !CUDA_AVAILABLE + gpu_fallback_warning() + cpu_op = GaussianUnitary(op.basis, Array(op.disp), Array(op.symplectic); ħ=op.ħ) + return cpu_op * state + end + op.basis == state.basis || throw(ArgumentError(ACTION_ERROR)) + op.ħ == state.ħ || throw(ArgumentError(HBAR_ERROR)) + T = real(eltype(op.disp)) + gpu_state = GaussianState(state.basis, CuArray{T}(state.mean), CuArray{T}(state.covar); ħ = state.ħ) + return op * gpu_state +end + +function Base.:(*)(op::GaussianUnitary{B,<:Array,<:Array}, state::GaussianState{B,<:CuArray,<:CuArray}) where {B<:SymplecticBasis} + if !CUDA_AVAILABLE + gpu_fallback_warning() + cpu_state = GaussianState(state.basis, Array(state.mean), Array(state.covar); ħ=state.ħ) + return op * cpu_state + end + op.basis == state.basis || throw(ArgumentError(ACTION_ERROR)) + op.ħ == state.ħ || throw(ArgumentError(HBAR_ERROR)) + T = real(eltype(state.mean)) + gpu_op = GaussianUnitary(op.basis, CuArray{T}(op.disp), CuArray{T}(op.symplectic); ħ = op.ħ) + return gpu_op * state +end + +function Base.:(*)(op::GaussianChannel{B,<:CuArray,<:CuArray}, state::GaussianState{B,<:Array,<:Array}) where {B<:SymplecticBasis} + if !CUDA_AVAILABLE + gpu_fallback_warning() + cpu_op = GaussianChannel(op.basis, Array(op.disp), Array(op.transform), Array(op.noise); ħ=op.ħ) + return cpu_op * state + end + op.basis == state.basis || throw(ArgumentError(ACTION_ERROR)) + op.ħ == state.ħ || throw(ArgumentError(HBAR_ERROR)) + T = real(eltype(op.disp)) + gpu_state = GaussianState(state.basis, CuArray{T}(state.mean), CuArray{T}(state.covar); ħ = state.ħ) + return op * gpu_state +end + +function Base.:(*)(op::GaussianChannel{B,<:Array,<:Array}, state::GaussianState{B,<:CuArray,<:CuArray}) where {B<:SymplecticBasis} + if !CUDA_AVAILABLE + gpu_fallback_warning() + cpu_state = GaussianState(state.basis, Array(state.mean), Array(state.covar); ħ=state.ħ) + return op * cpu_state + end + op.basis == state.basis || throw(ArgumentError(ACTION_ERROR)) + op.ħ == state.ħ || throw(ArgumentError(HBAR_ERROR)) + T = real(eltype(state.mean)) + gpu_op = GaussianChannel(op.basis, CuArray{T}(op.disp), CuArray{T}(op.transform), CuArray{T}(op.noise); ħ = op.ħ) + return gpu_op * state +end \ No newline at end of file diff --git a/ext/CUDAExt/utils.jl b/ext/CUDAExt/utils.jl new file mode 100644 index 0000000..d2ffd08 --- /dev/null +++ b/ext/CUDAExt/utils.jl @@ -0,0 +1,63 @@ +Base.@propagate_inbounds function _promote_output_vector(::Type{T1}, ::Type{T2}, vec_out) where {T1<:CuVector, T2<:CuVector} + return CuArray(vec_out) +end + +Base.@propagate_inbounds function _promote_output_vector(::Type{T1}, ::Type{T2}, vec_out) where {T1<:CuVector, T2<:Vector} + return CuArray(vec_out) +end + +Base.@propagate_inbounds function _promote_output_vector(::Type{T1}, ::Type{T2}, vec_out) where {T1<:Vector, T2<:CuVector} + return CuArray(vec_out) +end + +Base.@propagate_inbounds function _promote_output_vector(::Type{<:CuVector}, vec_out, vec_length::Int) + return CuArray(vec_out) +end + +Base.@propagate_inbounds function _promote_output_matrix(::Type{T1}, ::Type{T2}, mat_out) where {T1<:CuMatrix, T2<:CuMatrix} + return CuArray(mat_out) +end + +Base.@propagate_inbounds function _promote_output_matrix(::Type{T1}, ::Type{T2}, mat_out) where {T1<:CuMatrix, T2<:Matrix} + return CuArray(mat_out) +end + +Base.@propagate_inbounds function _promote_output_matrix(::Type{T1}, ::Type{T2}, mat_out) where {T1<:Matrix, T2<:CuMatrix} + return CuArray(mat_out) +end + +Base.@propagate_inbounds function _promote_output_matrix(::Type{<:CuMatrix}, mat_out, out_dim::Int) + return CuArray(mat_out) +end + +Base.@propagate_inbounds function _promote_output_matrix(::Type{<:CuMatrix}, mat_out, out_dim::Tuple) + return CuArray(mat_out) +end + +""" + gpu_fallback_warning() + +Issue warning when falling back to CPU due to CUDA unavailability. +""" +function gpu_fallback_warning() + if !CUDA_AVAILABLE + @warn "CUDA not available. Falling back to CPU computation. Install CUDA.jl and ensure GPU drivers are properly configured for GPU acceleration." + end +end + +""" + detect_array_device_type(x::AbstractArray) + +Detect device type of array using type introspection. +""" +function detect_array_device_type(x::CuArray) + return :gpu, eltype(x), size(x) +end + +function detect_array_device_type(x::AbstractArray) + return :cpu, eltype(x), size(x) +end + +function device(x::CuArray) + return :gpu +end \ No newline at end of file diff --git a/ext/CUDAExt/wigner_kernels.jl b/ext/CUDAExt/wigner_kernels.jl new file mode 100644 index 0000000..da77fe3 --- /dev/null +++ b/ext/CUDAExt/wigner_kernels.jl @@ -0,0 +1,383 @@ +""" + wigner_kernel!(output, x_points, mean, covar_inv, det_covar, nmodes) + +CUDA kernel for computing Wigner function values at multiple phase space points. +""" +function wigner_kernel!(output::CuDeviceVector{T}, x_points::CuDeviceMatrix{T}, + mean::CuDeviceVector{T}, covar_inv::CuDeviceMatrix{T}, + det_covar::T, nmodes::Int) where T + + idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x + + if idx <= size(x_points, 2) + diff_sum = zero(T) + + for i in 1:length(mean) + diff_i = x_points[i, idx] - mean[i] + temp_sum = zero(T) + + for j in 1:length(mean) + diff_j = x_points[j, idx] - mean[j] + temp_sum += covar_inv[i, j] * diff_j + end + + diff_sum += diff_i * temp_sum + end + + + normalization = (2 * T(π))^nmodes * sqrt(det_covar) + exponent = -0.5 * diff_sum + + output[idx] = exp(exponent) / normalization + end + + return nothing +end + +""" + wignerchar_kernel!(output, xi_points, mean, covar, omega, nmodes, hbar) + +CUDA kernel for computing Wigner characteristic function values at multiple points. +""" +function wignerchar_kernel!(output::CuDeviceVector{Complex{T}}, xi_points::CuDeviceMatrix{T}, + precomputed_quadratic::CuDeviceMatrix{T}, precomputed_linear::CuDeviceVector{T}) where T + + idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x + + if idx <= size(xi_points, 2) + quadratic_term = zero(T) + for i in 1:size(xi_points, 1) + for j in 1:size(xi_points, 1) + quadratic_term += xi_points[i, idx] * precomputed_quadratic[i, j] * xi_points[j, idx] + end + end + + linear_term = zero(T) + for i in 1:size(xi_points, 1) + linear_term += precomputed_linear[i] * xi_points[i, idx] + end + + + real_part = -0.5 * quadratic_term + imag_part = -linear_term + + output[idx] = Complex{T}(cos(imag_part), sin(imag_part)) * exp(real_part) + end + + return nothing +end + +function Gabs.wigner(state::GaussianState{B,M,V}, x::AbstractVector) where {B<:SymplecticBasis, M<:CuArray, V<:CuArray} + if !CUDA_AVAILABLE + gpu_fallback_warning() + cpu_state = GaussianState(state.basis, Array(state.mean), Array(state.covar); ħ = state.ħ) + return Gabs.wigner(cpu_state, x) + end + + basis = state.basis + nmodes = basis.nmodes + mean = state.mean + covar = state.covar + + length(mean) == length(x) || throw(ArgumentError(WIGNER_ERROR)) + + T = eltype(mean) + x_gpu = CuArray(T.(x)) + + diff = x_gpu .- mean + covar_inv = inv(covar) + + quad_form = dot(diff, covar_inv * diff) + + det_covar = det(covar) + normalization = (2 * T(π))^nmodes * sqrt(det_covar) + + result = exp(-0.5 * quad_form) / normalization + + return result +end + +function Gabs.wigner(state::GaussianState{B,M,V}, x_points::CuMatrix{T}) where {B<:SymplecticBasis, M<:CuArray, V<:CuArray, T} + if !CUDA_AVAILABLE + gpu_fallback_warning() + cpu_state = GaussianState(state.basis, Array(state.mean), Array(state.covar); ħ = state.ħ) + return [Gabs.wigner(cpu_state, Array(x_points[:, i])) for i in 1:size(x_points, 2)] + end + basis = state.basis + nmodes = basis.nmodes + size(x_points, 1) == size(state.mean, 1) || throw(ArgumentError(WIGNER_ERROR)) + num_points = size(x_points, 2) + mean_cpu = Array(state.mean) + covar_cpu = Array(state.covar) + covar_inv_cpu = inv(covar_cpu) + det_covar = det(covar_cpu) + mean_gpu = CuArray{T}(mean_cpu) + covar_inv_gpu = CuArray{T}(covar_inv_cpu) + x_points_gpu = CuArray{T}(x_points) + output = CUDA.zeros(T, num_points) + threads_per_block = min(256, num_points) + num_blocks = cld(num_points, threads_per_block) + @cuda threads=threads_per_block blocks=num_blocks wigner_kernel!( + output, x_points_gpu, mean_gpu, covar_inv_gpu, T(det_covar), nmodes + ) + return output +end + +function Gabs.wignerchar(state::GaussianState{B,M,V}, xi::AbstractVector) where {B<:SymplecticBasis, M<:CuArray, V<:CuArray} + if !CUDA_AVAILABLE + gpu_fallback_warning() + cpu_state = GaussianState(state.basis, Array(state.mean), Array(state.covar); ħ = state.ħ) + return Gabs.wignerchar(cpu_state, xi) + end + basis = state.basis + nmodes = basis.nmodes + mean = state.mean + covar = state.covar + length(mean) == length(xi) || throw(ArgumentError(WIGNER_ERROR)) + T = eltype(mean) + xi_gpu = CuArray(T.(xi)) + omega = CuArray(symplecticform(basis)) + temp1 = omega * covar + temp2 = temp1 * transpose(omega) + quadratic_term = dot(xi_gpu, temp2 * xi_gpu) + omega_mean = omega * mean + linear_term = dot(omega_mean, xi_gpu) + real_part = -0.5 * quadratic_term + imag_part = -linear_term + result = Complex{T}(cos(imag_part), sin(imag_part)) * exp(real_part) + return result +end + +function Gabs.wignerchar(state::GaussianState{B,M,V}, xi_points::CuMatrix{T}) where {B<:SymplecticBasis, M<:CuArray, V<:CuArray, T} + if !CUDA_AVAILABLE + gpu_fallback_warning() + cpu_state = GaussianState(state.basis, Array(state.mean), Array(state.covar); ħ = state.ħ) + return [Gabs.wignerchar(cpu_state, Array(xi_points[:, i])) for i in 1:size(xi_points, 2)] + end + basis = state.basis + nmodes = basis.nmodes + size(xi_points, 1) == size(state.mean, 1) || throw(ArgumentError(WIGNER_ERROR)) + num_points = size(xi_points, 2) + mean_cpu = Array(state.mean) + covar_cpu = Array(state.covar) + omega_cpu = symplecticform(basis) + temp_matrix = omega_cpu * covar_cpu * transpose(omega_cpu) + precomputed_linear = omega_cpu * mean_cpu + precomputed_quadratic_gpu = CuArray{T}(temp_matrix) + precomputed_linear_gpu = CuArray{T}(precomputed_linear) + xi_points_gpu = CuArray{T}(xi_points) + output = CUDA.zeros(Complex{T}, num_points) + threads_per_block = min(256, num_points) + num_blocks = cld(num_points, threads_per_block) + @cuda threads=threads_per_block blocks=num_blocks wignerchar_kernel!( + output, xi_points_gpu, precomputed_quadratic_gpu, precomputed_linear_gpu + ) + return output +end + +""" + wigner_grid(state::GaussianState, x_range, p_range, nx::Int, np::Int) + +Create a phase space grid for Wigner function evaluation. +Returns points as a 2×(nx*np) CuMatrix for efficient GPU evaluation. +""" +function wigner_grid(state::GaussianState{B,M,V}, x_range, p_range, nx::Int, np::Int) where {B<:SymplecticBasis, M<:CuArray, V<:CuArray} + T = eltype(M) + x_vals = range(x_range[1], x_range[2], length=nx) + p_vals = range(p_range[1], p_range[2], length=np) + total_points = nx * np + points = CUDA.zeros(T, 2, total_points) + for i in 1:nx + for j in 1:np + idx = (i-1) * np + j + points[1, idx] = T(x_vals[i]) + points[2, idx] = T(p_vals[j]) + end + end + return points +end + +""" + cross_wigner_batch(state1::GaussianState{CuArray}, state2::GaussianState{CuArray}, + x_points::CuMatrix) + + Batch evaluation of cross-Wigner function using existing GPU kernels. +""" +function cross_wigner_batch(state1::GaussianState{B,M,V}, state2::GaussianState{B,M,V}, + x_points::CuMatrix{T}) where {B<:SymplecticBasis, M<:CuArray, V<:CuArray, T} + if !CUDA_AVAILABLE + error("GPU cross_wigner_batch called but CUDA not available") + end + typeof(state1.basis) == typeof(state2.basis) || throw(ArgumentError(SYMPLECTIC_ERROR)) + state1.basis == state2.basis || throw(ArgumentError(SYMPLECTIC_ERROR)) + state1.ħ == state2.ħ || throw(ArgumentError(HBAR_ERROR)) + + num_points = size(x_points, 2) + results = CUDA.zeros(Complex{T}, num_points) + _launch_cross_wigner_batch_kernel!(results, x_points, state1, state2) + return results +end + +""" + cross_wignerchar_batch(state1::GaussianState{CuArray}, state2::GaussianState{CuArray}, + xi_points::CuMatrix) + +Batch evaluation of cross-Wigner characteristic function using GPU kernels. +""" +function cross_wignerchar_batch(state1::GaussianState{B,M,V}, state2::GaussianState{B,M,V}, + xi_points::CuMatrix{T}) where {B<:SymplecticBasis, M<:CuArray, V<:CuArray, T} + if !CUDA_AVAILABLE + error("GPU cross_wignerchar_batch called but CUDA not available") + end + typeof(state1.basis) == typeof(state2.basis) || throw(ArgumentError(SYMPLECTIC_ERROR)) + state1.basis == state2.basis || throw(ArgumentError(SYMPLECTIC_ERROR)) + state1.ħ == state2.ħ || throw(ArgumentError(HBAR_ERROR)) + num_points = size(xi_points, 2) + results = CUDA.zeros(Complex{T}, num_points) + _launch_cross_wignerchar_batch_kernel!(results, xi_points, state1, state2) + return results +end + +""" + _launch_cross_wigner_batch_kernel!(results, x_points, state1, state2) + +Launch GPU kernel for batch cross-Wigner evaluation. +""" +function _launch_cross_wigner_batch_kernel!(results::CuArray{Complex{T}}, + x_points::CuMatrix{T}, + state1::GaussianState, + state2::GaussianState) where T + num_points = size(x_points, 2) + nmodes = state1.basis.nmodes + mean1_cpu = Array(state1.mean) + mean2_cpu = Array(state2.mean) + covar1_cpu = Array(state1.covar) + covar2_cpu = Array(state2.covar) + omega_cpu = symplecticform(state1.basis) + avg_covar_cpu = (covar1_cpu + covar2_cpu) / 2 + avg_covar_inv_cpu = inv(avg_covar_cpu) + log_det_avg_covar = logdet(avg_covar_cpu) + mean1_gpu = CuArray{T}(mean1_cpu) + mean2_gpu = CuArray{T}(mean2_cpu) + avg_covar_inv_gpu = CuArray{T}(avg_covar_inv_cpu) + omega_gpu = CuArray{T}(omega_cpu) + threads_per_block = min(256, num_points) + num_blocks = cld(num_points, threads_per_block) + @cuda threads=threads_per_block blocks=num_blocks _cross_wigner_batch_kernel!( + results, x_points, mean1_gpu, mean2_gpu, avg_covar_inv_gpu, + omega_gpu, T(log_det_avg_covar), nmodes, T(state1.ħ) + ) + CUDA.synchronize() +end + +""" + _launch_cross_wignerchar_batch_kernel!(results, xi_points, state1, state2) + +Launch optimized GPU kernel for batch cross-Wigner characteristic function evaluation. +""" +function _launch_cross_wignerchar_batch_kernel!(results::CuArray{Complex{T}}, + xi_points::CuMatrix{T}, + state1::GaussianState, + state2::GaussianState) where T + + num_points = size(xi_points, 2) + mean1_cpu = Array(state1.mean) + mean2_cpu = Array(state2.mean) + covar1_cpu = Array(state1.covar) + covar2_cpu = Array(state2.covar) + omega_cpu = symplecticform(state1.basis) + avg_mean_cpu = (mean1_cpu + mean2_cpu) / 2 + avg_covar_cpu = (covar1_cpu + covar2_cpu) / 2 + temp_matrix = omega_cpu * avg_covar_cpu * transpose(omega_cpu) + linear_vector = omega_cpu * avg_mean_cpu + quadratic_matrix_gpu = CuArray{T}(temp_matrix) + linear_vector_gpu = CuArray{T}(linear_vector) + threads_per_block = min(256, num_points) + num_blocks = cld(num_points, threads_per_block) + @cuda threads=threads_per_block blocks=num_blocks _cross_wignerchar_batch_kernel!( + results, xi_points, quadratic_matrix_gpu, linear_vector_gpu + ) + CUDA.synchronize() +end + +""" + _cross_wigner_batch_kernel!(results, x_points, mean1, mean2, avg_covar_inv, + omega, log_det_avg_covar, nmodes, hbar) + + GPU kernel for batch cross-Wigner evaluation. +""" +function _cross_wigner_batch_kernel!(results::CuDeviceArray{Complex{T}}, + x_points::CuDeviceMatrix{T}, + mean1::CuDeviceVector{T}, + mean2::CuDeviceVector{T}, + avg_covar_inv::CuDeviceMatrix{T}, + omega::CuDeviceMatrix{T}, + log_det_avg_covar::T, + nmodes::Int, hbar::T) where T + + idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x + if idx <= size(x_points, 2) + dim = size(x_points, 1) + quadratic_form = zero(T) + for i in 1:dim + avg_mean_i = (mean1[i] + mean2[i]) * T(0.5) + dx_i = x_points[i, idx] - avg_mean_i + for j in 1:dim + avg_mean_j = (mean1[j] + mean2[j]) * T(0.5) + dx_j = x_points[j, idx] - avg_mean_j + quadratic_form += dx_i * avg_covar_inv[i, j] * dx_j + end + end + phase_arg = zero(T) + for i in 1:dim + mu_diff_i = mean1[i] - mean2[i] + avg_mean_i = (mean1[i] + mean2[i]) * T(0.5) + dx_i = x_points[i, idx] - avg_mean_i + omega_dx_i = zero(T) + for j in 1:dim + avg_mean_j = (mean1[j] + mean2[j]) * T(0.5) + dx_j = x_points[j, idx] - avg_mean_j + omega_dx_i += omega[i, j] * dx_j + end + phase_arg += mu_diff_i * omega_dx_i + end + log_normalization = -T(nmodes) * log(T(2π)) - T(0.5) * log_det_avg_covar + normalization = exp(log_normalization) + gaussian_part = exp(-T(0.5) * quadratic_form) + sin_val, cos_val = sincos(phase_arg / hbar) + phase_factor = Complex{T}(cos_val, sin_val) + results[idx] = normalization * gaussian_part * phase_factor + end + return nothing +end + +""" + _cross_wignerchar_batch_kernel!(results, xi_points, quadratic_matrix, linear_vector) + + GPU kernel for batch cross-Wigner characteristic function evaluation. +""" +function _cross_wignerchar_batch_kernel!(results::CuDeviceArray{Complex{T}}, + xi_points::CuDeviceMatrix{T}, + quadratic_matrix::CuDeviceMatrix{T}, + linear_vector::CuDeviceVector{T}) where T + + idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x + if idx <= size(xi_points, 2) + dim = size(xi_points, 1) + quadratic_term = zero(T) + for i in 1:dim + for j in 1:dim + quadratic_term += xi_points[i, idx] * quadratic_matrix[i, j] * xi_points[j, idx] + end + end + linear_term = zero(T) + for i in 1:dim + linear_term += linear_vector[i] * xi_points[i, idx] + end + real_part = -T(0.5) * quadratic_term + imag_part = -linear_term + sin_val, cos_val = sincos(imag_part) + results[idx] = Complex{T}(cos_val, sin_val) * exp(real_part) + end + return nothing +end \ No newline at end of file diff --git a/src/Gabs.jl b/src/Gabs.jl index d627996..3ff895d 100644 --- a/src/Gabs.jl +++ b/src/Gabs.jl @@ -38,7 +38,14 @@ export williamson, Williamson, polar, Polar, blochmessiah, BlochMessiah, # metrics purity, entropy_vn, fidelity, logarithmic_negativity, - cross_wigner, cross_wignerchar + cross_wigner, cross_wignerchar, + # GPU device management + gpu, cpu, device, adapt_device, + # optimized hybrid GPU random functions (industry standard approach) + randstate_gpu, randunitary_gpu, randchannel_gpu, randsymplectic_gpu, + batch_randstate_gpu, batch_randunitary_gpu, + #GPU simulation setup + random_ensemble_gpu, random_simulation_setup_gpu include("errors.jl") @@ -72,4 +79,5 @@ include("linearcombinations.jl") include("nongaussian_states.jl") -end +include("gpu_convenience.jl") +end \ No newline at end of file diff --git a/src/gpu_convenience.jl b/src/gpu_convenience.jl new file mode 100644 index 0000000..aa3bad3 --- /dev/null +++ b/src/gpu_convenience.jl @@ -0,0 +1,243 @@ +""" + gpu(x; precision=Float32) + +Transfer data to GPU. Works with states, operators, and arrays. +Requires CUDA.jl to be loaded. +""" +function gpu(state::GaussianState; precision=Float32) + if !isdefined(Base, :get_extension) || isnothing(Base.get_extension(@__MODULE__, :CUDAExt)) + error("GPU functionality requires CUDA.jl. Please run: using CUDA") + end + return _gpu_impl(state, precision) +end + +function gpu(op::GaussianUnitary; precision=Float32) + if !isdefined(Base, :get_extension) || isnothing(Base.get_extension(@__MODULE__, :CUDAExt)) + error("GPU functionality requires CUDA.jl. Please run: using CUDA") + end + return _gpu_impl(op, precision) +end + +function gpu(op::GaussianChannel; precision=Float32) + if !isdefined(Base, :get_extension) || isnothing(Base.get_extension(@__MODULE__, :CUDAExt)) + error("GPU functionality requires CUDA.jl. Please run: using CUDA") + end + return _gpu_impl(op, precision) +end + +function gpu(lc::GaussianLinearCombination; precision=Float32) + if !isdefined(Base, :get_extension) || isnothing(Base.get_extension(@__MODULE__, :CUDAExt)) + error("GPU functionality requires CUDA.jl. Please run: using CUDA") + end + return _gpu_impl(lc, precision) +end + +function gpu(basis::SymplecticBasis) + return basis +end + +function gpu(x::AbstractArray; precision=Float32) + if !isdefined(Base, :get_extension) || isnothing(Base.get_extension(@__MODULE__, :CUDAExt)) + error("GPU functionality requires CUDA.jl. Please run: using CUDA") + end + return _gpu_impl(x, precision) +end + +""" + cpu(x) + +Transfer data to CPU. Works with states, operators, and arrays. +""" +function cpu(state::GaussianState) + cpu_mean = isa(state.mean, AbstractArray) ? Array(state.mean) : state.mean + cpu_covar = isa(state.covar, AbstractArray) ? Array(state.covar) : state.covar + return GaussianState(state.basis, cpu_mean, cpu_covar; ħ = state.ħ) +end + +function cpu(op::GaussianUnitary) + cpu_disp = isa(op.disp, AbstractArray) ? Array(op.disp) : op.disp + cpu_symplectic = isa(op.symplectic, AbstractArray) ? Array(op.symplectic) : op.symplectic + return GaussianUnitary(op.basis, cpu_disp, cpu_symplectic; ħ = op.ħ) +end + +function cpu(op::GaussianChannel) + cpu_disp = isa(op.disp, AbstractArray) ? Array(op.disp) : op.disp + cpu_transform = isa(op.transform, AbstractArray) ? Array(op.transform) : op.transform + cpu_noise = isa(op.noise, AbstractArray) ? Array(op.noise) : op.noise + return GaussianChannel(op.basis, cpu_disp, cpu_transform, cpu_noise; ħ = op.ħ) +end + +function cpu(lc::GaussianLinearCombination) + cpu_coeffs = isa(lc.coeffs, AbstractArray) ? Array(lc.coeffs) : lc.coeffs + cpu_states = [cpu(state) for state in lc.states] + return GaussianLinearCombination(lc.basis, cpu_coeffs, cpu_states) +end + +function cpu(x::AbstractArray) + return Array(x) +end + +""" + device(x) + +Detect device of arrays/objects. +""" +device(x::AbstractArray) = _detect_device(x) +device(state::GaussianState) = _detect_device(state.mean) +device(op::GaussianUnitary) = _detect_device(op.disp) +device(op::GaussianChannel) = _detect_device(op.disp) +device(lc::GaussianLinearCombination) = _detect_device(lc) +function _detect_device(x) + return :cpu +end + +""" + adapt_device(target_constructor, source_obj, args...) + +Create target object on same device as source. +""" +function adapt_device(target_constructor, source_obj, args...) + source_device = device(source_obj) + if source_device == :gpu + if !isdefined(Base, :get_extension) || isnothing(Base.get_extension(@__MODULE__, :CUDAExt)) + @warn "GPU source object detected but CUDA extension not available. Creating CPU object instead." + return target_constructor(args...) + end + return _adapt_device_gpu(target_constructor, source_obj, args...) + else + return target_constructor(args...) + end +end + +""" + randstate_gpu(basis; pure=false, ħ=2, precision=Float32) + +Generate random Gaussian state optimized for GPU. +""" +function randstate_gpu(basis::SymplecticBasis; pure=false, ħ=2, precision=Float32) + if !isdefined(Base, :get_extension) || isnothing(Base.get_extension(@__MODULE__, :CUDAExt)) + error("GPU functionality requires CUDA.jl. Please run: using CUDA") + end + return _randstate_gpu_impl(basis, precision; pure=pure, ħ=ħ) +end + +""" + randunitary_gpu(basis; passive=false, ħ=2, precision=Float32) + +Generate random Gaussian unitary optimized for GPU. Uses hybrid CPU generation + GPU transfer. +""" +function randunitary_gpu(basis::SymplecticBasis; passive=false, ħ=2, precision=Float32) + if !isdefined(Base, :get_extension) || isnothing(Base.get_extension(@__MODULE__, :CUDAExt)) + error("GPU functionality requires CUDA.jl. Please run: using CUDA") + end + return _randunitary_gpu_impl(basis, precision; passive=passive, ħ=ħ) +end + +""" + randchannel_gpu(basis; ħ=2, precision=Float32) + +Generate random Gaussian channel optimized for GPU. Uses hybrid CPU generation + GPU transfer. +""" +function randchannel_gpu(basis::SymplecticBasis; ħ=2, precision=Float32) + if !isdefined(Base, :get_extension) || isnothing(Base.get_extension(@__MODULE__, :CUDAExt)) + error("GPU functionality requires CUDA.jl. Please run: using CUDA") + end + return _randchannel_gpu_impl(basis, precision; ħ=ħ) +end + +""" + randsymplectic_gpu(basis; passive=false, precision=Float32) + +Generate random symplectic matrix optimized for GPU. Uses hybrid CPU generation + GPU transfer. +""" +function randsymplectic_gpu(basis::SymplecticBasis; passive=false, precision=Float32) + if !isdefined(Base, :get_extension) || isnothing(Base.get_extension(@__MODULE__, :CUDAExt)) + error("GPU functionality requires CUDA.jl. Please run: using CUDA") + end + return _randsymplectic_gpu_impl(basis, precision; passive=passive) +end + +""" + batch_randstate_gpu(basis, n::Int; pure=false, ħ=2, precision=Float32) + +Generate n random states optimized for GPU in batch. +""" +function batch_randstate_gpu(basis::SymplecticBasis, n::Int; pure=false, ħ=2, precision=Float32) + if !isdefined(Base, :get_extension) || isnothing(Base.get_extension(@__MODULE__, :CUDAExt)) + error("GPU functionality requires CUDA.jl. Please run: using CUDA") + end + return _batch_randstate_gpu_impl(basis, n, precision; pure=pure, ħ=ħ) +end + +""" + batch_randunitary_gpu(basis, n::Int; passive=false, ħ=2, precision=Float32) + +Generate n random unitaries optimized for GPU in batch. +""" +function batch_randunitary_gpu(basis::SymplecticBasis, n::Int; passive=false, ħ=2, precision=Float32) + if !isdefined(Base, :get_extension) || isnothing(Base.get_extension(@__MODULE__, :CUDAExt)) + error("GPU functionality requires CUDA.jl. Please run: using CUDA") + end + return _batch_randunitary_gpu_impl(basis, n, precision; passive=passive, ħ=ħ) +end + +""" + random_ensemble_gpu(basis, n_states::Int, n_unitaries::Int; precision=Float32) + +Create a complete random ensemble for GPU quantum simulations. Generates: +- n_states random pure states on GPU +- n_unitaries random unitaries on GPU +- Returns tuple (states, unitaries) ready for batched GPU operations +""" +function random_ensemble_gpu(basis::SymplecticBasis, n_states::Int, n_unitaries::Int; precision=Float32) + if !isdefined(Base, :get_extension) || isnothing(Base.get_extension(@__MODULE__, :CUDAExt)) + error("GPU functionality requires CUDA.jl. Please run: using CUDA") + end + states = batch_randstate_gpu(basis, n_states; pure=true, precision=precision) + unitaries = batch_randunitary_gpu(basis, n_unitaries; passive=false, precision=precision) + return (states, unitaries) +end + +""" + random_simulation_setup_gpu(basis, n_samples::Int; + mixed_fraction=0.3, precision=Float32) + +Setup complete random simulation environment on GPU: +- Pure states (70% by default) +- Mixed states (30% by default) +- Random channels for decoherence +- Random unitaries for dynamics + +Returns NamedTuple with all components ready for GPU simulation. +""" +function random_simulation_setup_gpu(basis::SymplecticBasis, n_samples::Int; + mixed_fraction=0.3, precision=Float32) + if !isdefined(Base, :get_extension) || isnothing(Base.get_extension(@__MODULE__, :CUDAExt)) + error("GPU functionality requires CUDA.jl. Please run: using CUDA") + end + n_pure = round(Int, n_samples * (1 - mixed_fraction)) + n_mixed = n_samples - n_pure + pure_states = batch_randstate_gpu(basis, n_pure; pure=true, precision=precision) + mixed_states = n_mixed > 0 ? batch_randstate_gpu(basis, n_mixed; pure=false, precision=precision) : typeof(pure_states[1])[] + unitaries = batch_randunitary_gpu(basis, n_samples; passive=false, precision=precision) + n_channels = max(1, n_samples ÷ 10) + channels_cpu = [randchannel(basis) for _ in 1:n_channels] + channels = [gpu(ch; precision=precision) for ch in channels_cpu] + return ( + pure_states = pure_states, + mixed_states = mixed_states, + unitaries = unitaries, + channels = channels, + basis = basis, + precision = precision + ) +end + +function _randstate_gpu_impl end +function _randunitary_gpu_impl end +function _randchannel_gpu_impl end +function _randsymplectic_gpu_impl end +function _batch_randstate_gpu_impl end +function _batch_randunitary_gpu_impl end +function _gpu_impl end +function _adapt_device_gpu end \ No newline at end of file diff --git a/src/linearcombinations.jl b/src/linearcombinations.jl index c5c21a2..eda40a0 100644 --- a/src/linearcombinations.jl +++ b/src/linearcombinations.jl @@ -51,10 +51,10 @@ GaussianLinearCombination with 2 terms for 1 mode. """ mutable struct GaussianLinearCombination{B<:SymplecticBasis,C,S} basis::B - coeffs::Vector{C} + coeffs::C states::Vector{S} ħ::Number - function GaussianLinearCombination(basis::B, coeffs::Vector{C}, states::Vector{S}) where {B<:SymplecticBasis,C,S} + function GaussianLinearCombination(basis::B, coeffs::AbstractVector{C}, states::Vector{S}) where {B<:SymplecticBasis,C<:Number,S} length(coeffs) == length(states) || throw(DimensionMismatch("Number of coefficients ($(length(coeffs))) must match number of states ($(length(states)))")) isempty(states) && throw(ArgumentError("Cannot create an empty linear combination")) ħ = first(states).ħ @@ -67,7 +67,8 @@ mutable struct GaussianLinearCombination{B<:SymplecticBasis,C,S} throw(ArgumentError("State $i has different ħ: expected $ħ, got $(state.ħ)")) end end - return new{B,C,S}(basis, coeffs, states, ħ) + return new{B,typeof(coeffs),S}(basis, coeffs, states, ħ) + end end """ @@ -99,7 +100,7 @@ end Create a linear combination from separate vectors of coefficients and states. """ -function GaussianLinearCombination(coeffs::Vector{<:Number}, states::Vector{<:GaussianState}) +function GaussianLinearCombination(coeffs::AbstractVector{<:Number}, states::Vector{<:GaussianState}) isempty(states) && throw(ArgumentError("Cannot create an empty linear combination")) basis = first(states).basis return GaussianLinearCombination(basis, coeffs, states) @@ -125,10 +126,37 @@ Add two linear combinations of Gaussian states. Both must have the same symplect function Base.:+(lc1::GaussianLinearCombination{B}, lc2::GaussianLinearCombination{B}) where {B<:SymplecticBasis} lc1.basis == lc2.basis || throw(ArgumentError(SYMPLECTIC_ERROR)) lc1.ħ == lc2.ħ || throw(ArgumentError(HBAR_ERROR)) - coeffs = vcat(lc1.coeffs, lc2.coeffs) - states = vcat(lc1.states, lc2.states) + lc1_has_gpu_states = !isempty(lc1.states) && _is_gpu_array(lc1.states[1].mean) + lc2_has_gpu_states = !isempty(lc2.states) && _is_gpu_array(lc2.states[1].mean) + promote_to_gpu = lc1_has_gpu_states || lc2_has_gpu_states + if promote_to_gpu + T = promote_type(eltype(lc1.coeffs), eltype(lc2.coeffs)) + coeffs1 = Vector{T}(Array(lc1.coeffs)) + coeffs2 = Vector{T}(Array(lc2.coeffs)) + if lc1_has_gpu_states + states1 = lc1.states + else + precision = T <: Complex ? real(T) : T + states1 = [gpu(s, precision = precision) for s in lc1.states] + end + if lc2_has_gpu_states + states2 = lc2.states + else + precision = T <: Complex ? real(T) : T + states2 = [gpu(s, precision = precision) for s in lc2.states] + end + coeffs = vcat(coeffs1, coeffs2) + states = vcat(states1, states2) + else + T = promote_type(eltype(lc1.coeffs), eltype(lc2.coeffs)) + coeffs1 = Vector{T}(lc1.coeffs) + coeffs2 = Vector{T}(lc2.coeffs) + coeffs = vcat(coeffs1, coeffs2) + states = vcat(lc1.states, lc2.states) + end return GaussianLinearCombination(lc1.basis, coeffs, states) end + function Base.:+(lc1::GaussianLinearCombination{B1}, lc2::GaussianLinearCombination{B2}) where {B1<:SymplecticBasis,B2<:SymplecticBasis} throw(ArgumentError(SYMPLECTIC_ERROR)) end @@ -156,7 +184,8 @@ Base.:-(lc::GaussianLinearCombination) = (-1) * lc Multiply a linear combination by a scalar from the left. """ function Base.:*(α::Number, lc::GaussianLinearCombination) - new_coeffs = α .* lc.coeffs + T = promote_type(typeof(α), eltype(lc.coeffs)) + new_coeffs = Vector{T}(α .* Array(lc.coeffs)) return GaussianLinearCombination(lc.basis, new_coeffs, copy(lc.states)) end """ @@ -171,8 +200,9 @@ Base.:*(lc::GaussianLinearCombination, α::Number) = α * lc Multiply a Gaussian state by a scalar to create a linear combination. """ function Base.:*(α::Number, state::GaussianState) - coeff_type = promote_type(typeof(α), eltype(state.mean), eltype(state.covar)) - return GaussianLinearCombination(state.basis, [convert(coeff_type, α)], [state]) + T = promote_type(typeof(α), eltype(state.mean), eltype(state.covar)) + coeff_array = Vector{T}([convert(T, α)]) + return GaussianLinearCombination(state.basis, coeff_array, [state]) end """ *(state::GaussianState, α::Number) @@ -188,9 +218,22 @@ Add two Gaussian states to create a linear combination. function Base.:+(state1::GaussianState, state2::GaussianState) state1.basis == state2.basis || throw(ArgumentError(SYMPLECTIC_ERROR)) state1.ħ == state2.ħ || throw(ArgumentError(HBAR_ERROR)) - coeff_type = promote_type(eltype(state1.mean), eltype(state1.covar), - eltype(state2.mean), eltype(state2.covar)) - return GaussianLinearCombination(state1.basis, [one(coeff_type), one(coeff_type)], [state1, state2]) + state1_gpu = _is_gpu_array(state1.mean) + state2_gpu = _is_gpu_array(state2.mean) + T = promote_type(eltype(state1.mean), eltype(state1.covar), + eltype(state2.mean), eltype(state2.covar)) + coeffs = Vector{T}([one(T), one(T)]) + if state1_gpu || state2_gpu + precision = T <: Complex ? real(T) : T + states = [ + state1_gpu ? state1 : gpu(state1, precision = precision), + state2_gpu ? state2 : gpu(state2, precision = precision) + ] + else + states = [state1, state2] + end + + return GaussianLinearCombination(state1.basis, coeffs, states) end """ +(state::GaussianState, lc::GaussianLinearCombination) @@ -200,9 +243,19 @@ Add a Gaussian state to a linear combination. function Base.:+(state::GaussianState, lc::GaussianLinearCombination) state.basis == lc.basis || throw(ArgumentError(SYMPLECTIC_ERROR)) state.ħ == lc.ħ || throw(ArgumentError(HBAR_ERROR)) - coeff_type = promote_type(eltype(state.mean), eltype(state.covar), eltype(lc.coeffs)) - new_coeffs = vcat(one(coeff_type), convert(Vector{coeff_type}, lc.coeffs)) - new_states = vcat(state, lc.states) + state_gpu = _is_gpu_array(state.mean) + lc_has_gpu = !isempty(lc.states) && _is_gpu_array(lc.states[1].mean) + T = promote_type(eltype(state.mean), eltype(state.covar), eltype(lc.coeffs)) + new_coeffs = Vector{T}(vcat([one(T)], Array(lc.coeffs))) + if state_gpu || lc_has_gpu + precision = T <: Complex ? real(T) : T + promoted_state = state_gpu ? state : gpu(state, precision = precision) + promoted_states = lc_has_gpu ? lc.states : [gpu(s, precision = precision) for s in lc.states] + new_states = vcat([promoted_state], promoted_states) + else + new_states = vcat([state], lc.states) + end + return GaussianLinearCombination(lc.basis, new_coeffs, new_states) end """ @@ -213,9 +266,18 @@ Add a linear combination to a Gaussian state. function Base.:+(lc::GaussianLinearCombination, state::GaussianState) lc.basis == state.basis || throw(ArgumentError(SYMPLECTIC_ERROR)) lc.ħ == state.ħ || throw(ArgumentError(HBAR_ERROR)) - coeff_type = promote_type(eltype(state.mean), eltype(state.covar), eltype(lc.coeffs)) - new_coeffs = vcat(convert(Vector{coeff_type}, lc.coeffs), one(coeff_type)) - new_states = vcat(lc.states, state) + state_gpu = _is_gpu_array(state.mean) + lc_has_gpu = !isempty(lc.states) && _is_gpu_array(lc.states[1].mean) + T = promote_type(eltype(state.mean), eltype(state.covar), eltype(lc.coeffs)) + new_coeffs = Vector{T}(vcat(Array(lc.coeffs), [one(T)])) + if state_gpu || lc_has_gpu + precision = T <: Complex ? real(T) : T + promoted_state = state_gpu ? state : gpu(state, precision = precision) + promoted_states = lc_has_gpu ? lc.states : [gpu(s, precision = precision) for s in lc.states] + new_states = vcat(promoted_states, [promoted_state]) + else + new_states = vcat(lc.states, [state]) + end return GaussianLinearCombination(lc.basis, new_coeffs, new_states) end """ @@ -313,44 +375,63 @@ function simplify!(lc::GaussianLinearCombination; atol::Real=1e-14) if isempty(lc.coeffs) return lc end - keep_mask = abs.(lc.coeffs) .> atol + cpu_coeffs = Vector(Array(lc.coeffs)) + lc_has_gpu_states = !isempty(lc.states) && _is_gpu_array(lc.states[1].mean) + keep_mask = abs.(cpu_coeffs) .> atol if !any(keep_mask) - vac = vacuumstate(lc.basis, ħ = lc.ħ) - coeff_type = eltype(lc.coeffs) - lc.coeffs = [coeff_type(atol)] + vac = _device_aware_vacuum(lc) + coeff_type = eltype(cpu_coeffs) + lc.coeffs = Vector{coeff_type}([coeff_type(atol)]) lc.states = [vac] return lc end - coeffs = lc.coeffs[keep_mask] - states = lc.states[keep_mask] - unique_states = Vector{typeof(states[1])}(undef, length(states)) - combined_coeffs = Vector{eltype(coeffs)}(undef, length(states)) - n_unique = 0 - for (coeff, state) in zip(coeffs, states) - existing_idx = findfirst(s -> isapprox(s, state, atol=1e-12), @view(unique_states[1:n_unique])) + surviving_coeffs = cpu_coeffs[keep_mask] + surviving_states = lc.states[keep_mask] + if isempty(surviving_states) + return lc + end + unique_states = Vector{typeof(surviving_states[1])}() + combined_coeffs = Vector{eltype(surviving_coeffs)}() + + for (coeff, state) in zip(surviving_coeffs, surviving_states) + existing_idx = findfirst(unique_states) do existing_state + return states_approximately_equal(existing_state, state, 1e-12) + end if existing_idx === nothing - n_unique += 1 - unique_states[n_unique] = state - combined_coeffs[n_unique] = coeff + push!(unique_states, state) + push!(combined_coeffs, coeff) else combined_coeffs[existing_idx] += coeff end end - resize!(unique_states, n_unique) - resize!(combined_coeffs, n_unique) final_mask = abs.(combined_coeffs) .> atol if !any(final_mask) - vac = vacuumstate(lc.basis, ħ = lc.ħ) + vac = _device_aware_vacuum(lc) coeff_type = eltype(combined_coeffs) - lc.coeffs = [coeff_type(atol)] + lc.coeffs = Vector{coeff_type}([coeff_type(atol)]) lc.states = [vac] else - lc.coeffs = combined_coeffs[final_mask] + final_coeffs = combined_coeffs[final_mask] + lc.coeffs = Vector{eltype(final_coeffs)}(final_coeffs) lc.states = unique_states[final_mask] end return lc end +""" + states_approximately_equal(state1::GaussianState, state2::GaussianState, atol::Real) +""" +function states_approximately_equal(state1::GaussianState, state2::GaussianState, atol::Real) + state1.basis != state2.basis && return false + state1.ħ != state2.ħ && return false + try + mean_diff = maximum(abs.(Array(state1.mean) .- Array(state2.mean))) + covar_diff = maximum(abs.(Array(state1.covar) .- Array(state2.covar))) + return mean_diff <= atol && covar_diff <= atol + catch e + return false + end +end function Base.show(io::IO, mime::MIME"text/plain", lc::GaussianLinearCombination) basis_name = nameof(typeof(lc.basis)) nmodes = lc.basis.nmodes @@ -398,8 +479,22 @@ The unitary is applied to each component state while preserving coefficients. function Base.:(*)(op::GaussianUnitary, lc::GaussianLinearCombination) op.basis == lc.basis || throw(ArgumentError(ACTION_ERROR)) op.ħ == lc.ħ || throw(ArgumentError(HBAR_ERROR)) - new_states = [op * state for state in lc.states] - return GaussianLinearCombination(lc.basis, copy(lc.coeffs), new_states) + op_gpu = _is_gpu_array(op.disp) + lc_gpu = (device(lc) == :gpu) + if op_gpu && !lc_gpu + T = real(eltype(op.disp)) + gpu_states = [gpu(s, precision = T) for s in lc.states] + new_states = [op * state for state in gpu_states] + return GaussianLinearCombination(lc.basis, copy(lc.coeffs), new_states) + elseif !op_gpu && lc_gpu + T = real(eltype(lc.coeffs)) + gpu_op = gpu(op, precision = T) + new_states = [gpu_op * state for state in lc.states] + return GaussianLinearCombination(lc.basis, copy(lc.coeffs), new_states) + else + new_states = [op * state for state in lc.states] + return GaussianLinearCombination(lc.basis, copy(lc.coeffs), new_states) + end end """ @@ -411,10 +506,22 @@ The channel is applied to each component state while preserving coefficients. function Base.:(*)(op::GaussianChannel, lc::GaussianLinearCombination) op.basis == lc.basis || throw(ArgumentError(ACTION_ERROR)) op.ħ == lc.ħ || throw(ArgumentError(HBAR_ERROR)) - new_states = [op * state for state in lc.states] - return GaussianLinearCombination(lc.basis, copy(lc.coeffs), new_states) + op_gpu = _is_gpu_array(op.disp) + lc_has_gpu = !isempty(lc.states) && _is_gpu_array(lc.states[1].mean) + cpu_coeffs = Vector(Array(lc.coeffs)) + if op_gpu && !lc_has_gpu + T = real(eltype(op.disp)) + gpu_states = [gpu(s, precision = T) for s in lc.states] + new_states = [op * state for state in gpu_states] + elseif !op_gpu && lc_has_gpu + T = real(eltype(lc.states[1].mean)) + gpu_op = gpu(op, precision = T) + new_states = [gpu_op * state for state in lc.states] + else + new_states = [op * state for state in lc.states] + end + return GaussianLinearCombination(lc.basis, cpu_coeffs, new_states) end - """ tensor(::Type{Tc}, ::Type{Ts}, lc1::GaussianLinearCombination, lc2::GaussianLinearCombination) @@ -684,4 +791,38 @@ function entropy_vn(lc::GaussianLinearCombination) # A GaussianLinearCombination represents a pure superposition state # |ψ⟩ = Σᵢ cᵢ|ψᵢ⟩, so S(ρ) = -Tr(ρ log ρ) = 0 return 0.0 +end + +""" + _is_gpu_array(x::AbstractArray) + +Check if an array is a GPU array. Extended by CUDA extension. +""" +function _is_gpu_array(x::AbstractArray) + return false +end + +""" + _device_aware_vacuum(lc::GaussianLinearCombination) + +Create a vacuum state on the same device as the linear combination's states. +""" +function _device_aware_vacuum(lc::GaussianLinearCombination) + vac = vacuumstate(lc.basis, ħ = lc.ħ) + if !isempty(lc.states) && _is_gpu_array(lc.states[1].mean) + T = real(eltype(lc.states[1].mean)) + return gpu(vac, precision = T) + else + return vac + end +end + +""" + _create_device_array(data, reference_array) + +Create array with proper device placement for coefficients. +""" +function _create_device_array(data, reference_array) + T = eltype(data) + return Vector{T}(Array(data)) end \ No newline at end of file diff --git a/src/nongaussian_states.jl b/src/nongaussian_states.jl index 3a17a16..5057ceb 100644 --- a/src/nongaussian_states.jl +++ b/src/nongaussian_states.jl @@ -20,6 +20,9 @@ function catstate_even(basis::SymplecticBasis, α::Number; squeeze_params=nothin if squeeze_params === nothing state_plus = coherentstate(basis, α, ħ=ħ) state_minus = coherentstate(basis, -α, ħ=ħ) + overlap = exp(-2 * abs2(α)) + norm_factor = 1 / sqrt(2 * (1 + overlap)) + else r, θ = squeeze_params squeezed_vac = squeezedstate(basis, r, θ, ħ=ħ) @@ -27,9 +30,11 @@ function catstate_even(basis::SymplecticBasis, α::Number; squeeze_params=nothin displace_minus = displace(basis, -α, ħ=ħ) state_plus = displace_plus * squeezed_vac state_minus = displace_minus * squeezed_vac + overlap_val = _overlap(state_plus, state_minus) + overlap = real(overlap_val) + norm_factor = 1 / sqrt(2 * (1 + overlap)) end - overlap = exp(-2 * abs2(α)) - norm_factor = 1 / sqrt(2 * (1 + overlap)) + coeffs = [norm_factor, norm_factor] states = [state_plus, state_minus] return GaussianLinearCombination(basis, coeffs, states) @@ -56,6 +61,8 @@ function catstate_odd(basis::SymplecticBasis, α::Number; squeeze_params=nothing if squeeze_params === nothing state_plus = coherentstate(basis, α, ħ=ħ) state_minus = coherentstate(basis, -α, ħ=ħ) + overlap = exp(-2 * abs2(α)) + norm_factor = 1 / sqrt(2 * (1 - overlap)) else r, θ = squeeze_params squeezed_vac = squeezedstate(basis, r, θ, ħ=ħ) @@ -63,14 +70,15 @@ function catstate_odd(basis::SymplecticBasis, α::Number; squeeze_params=nothing displace_minus = displace(basis, -α, ħ=ħ) state_plus = displace_plus * squeezed_vac state_minus = displace_minus * squeezed_vac + overlap_val = _overlap(state_plus, state_minus) + overlap = real(overlap_val) + norm_factor = 1 / sqrt(2 * (1 - overlap)) end - overlap = exp(-2 * abs2(α)) - norm_factor = 1 / sqrt(2 * (1 - overlap)) + coeffs = [norm_factor, -norm_factor] states = [state_plus, state_minus] return GaussianLinearCombination(basis, coeffs, states) end - """ catstate(basis::SymplecticBasis, α::Number, phase::Real=0; squeeze_params=nothing, ħ=2) @@ -93,6 +101,9 @@ function catstate(basis::SymplecticBasis, α::Number, phase::Real=0; squeeze_par if squeeze_params === nothing state_plus = coherentstate(basis, α, ħ=ħ) state_minus = coherentstate(basis, -α, ħ=ħ) + overlap = exp(-2 * abs2(α)) + phase_factor = exp(1im * phase) + norm_factor = 1 / sqrt(2 * (1 + real(phase_factor * overlap))) else r, θ = squeeze_params squeezed_vac = squeezedstate(basis, r, θ, ħ=ħ) @@ -100,18 +111,18 @@ function catstate(basis::SymplecticBasis, α::Number, phase::Real=0; squeeze_par displace_minus = displace(basis, -α, ħ=ħ) state_plus = displace_plus * squeezed_vac state_minus = displace_minus * squeezed_vac + overlap_val = _overlap(state_plus, state_minus) + phase_factor = exp(1im * phase) + norm_factor = 1 / sqrt(2 * (1 + real(phase_factor * overlap_val))) end - overlap = exp(-2 * abs2(α)) - phase_factor = exp(1im * phase) - norm_factor = 1 / sqrt(2 * (1 + real(phase_factor * overlap))) - coeffs = [norm_factor, norm_factor * phase_factor] + coeffs = [norm_factor, norm_factor * exp(1im * phase)] if abs(phase - π) < 1e-14 || abs(phase) < 1e-14 coeffs = real.(coeffs) end + states = [state_plus, state_minus] return GaussianLinearCombination(basis, coeffs, states) end - """ gkpstate(basis::SymplecticBasis; lattice=:square, delta=0.1, nmax=5, ħ=2) @@ -322,7 +333,7 @@ function catstate_odd(basis::SymplecticBasis, αs::AbstractVector; squeeze_param @inbounds for i in 1:nmodes α = αs[i] squeeze_param = squeeze_params === nothing ? nothing : squeeze_params[i] - cat = catstate_even(single_mode_basis, α, squeeze_params=squeeze_param, ħ=ħ) + cat = catstate_odd(single_mode_basis, α, squeeze_params=squeeze_param, ħ=ħ) cat_states[i] = cat end result = cat_states[1] @@ -331,7 +342,6 @@ function catstate_odd(basis::SymplecticBasis, αs::AbstractVector; squeeze_param end return result end - """ catstate(basis::SymplecticBasis, αs::AbstractVector, phases::AbstractVector=zeros(length(αs)); squeeze_params=nothing, ħ=2) @@ -348,8 +358,9 @@ function catstate(basis::SymplecticBasis, αs::AbstractVector, phases::AbstractV cat_states = Vector{GaussianLinearCombination}(undef, nmodes) @inbounds for i in 1:nmodes α = αs[i] + phase = phases[i] squeeze_param = squeeze_params === nothing ? nothing : squeeze_params[i] - cat = catstate_even(single_mode_basis, α, squeeze_params=squeeze_param, ħ=ħ) + cat = catstate(single_mode_basis, α, phase, squeeze_params=squeeze_param, ħ=ħ) cat_states[i] = cat end result = cat_states[1] diff --git a/test/Project.toml b/test/Project.toml index 8c69541..5d042c6 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -10,3 +10,5 @@ StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" Symbolics = "0c5d862f-8b57-4792-8d23-62f2024744c7" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" TestItemRunner = "f8b46487-2199-4994-9208-9a1283c18c0a" +Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" +Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index 3c2b989..15ee87c 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,21 +1,40 @@ +CUDA_flag = false + +if Sys.iswindows() + @info "Skipping GPU tests -- only executed on *NIX platforms." +else + CUDA_flag = get(ENV, "CUDA_TEST", "") == "true" + CUDA_flag && @info "Running with CUDA tests." + if !CUDA_flag + @info "Skipping GPU tests -- must be explicitly enabled." + @info "Environment must set CUDA_TEST=true." + end +end + +using Pkg +CUDA_flag && Pkg.add("CUDA") + using TestItemRunner using Gabs - -# filter for the test testfilter = ti -> begin - exclude = Symbol[:jet] - if !(VERSION >= v"1.10") - push!(exclude, :doctests) - push!(exclude, :aqua) - end - - return all(!in(exclude), ti.tags) + exclude = Symbol[:jet] + if get(ENV, "JET_TEST", "") == "true" + return :jet in ti.tags + else + push!(exclude, :jet) + end + if CUDA_flag + return :cuda in ti.tags + else + push!(exclude, :cuda) + end + if !(VERSION >= v"1.10") + push!(exclude, :doctests) + push!(exclude, :aqua) + end + return all(!in(exclude), ti.tags) end println("Starting tests with $(Threads.nthreads()) threads out of `Sys.CPU_THREADS = $(Sys.CPU_THREADS)`...") @run_package_tests filter=testfilter - -if get(ENV,"JET_TEST","")=="true" - @run_package_tests filter=(ti -> :jet in ti.tags) -end diff --git a/test/test_CUDAExt.jl b/test/test_CUDAExt.jl new file mode 100644 index 0000000..f43950e --- /dev/null +++ b/test/test_CUDAExt.jl @@ -0,0 +1,2723 @@ +@testitem "GPU Foundation - State Creation" tags=[:cuda] begin + using CUDA + using Gabs + using Gabs: device + using LinearAlgebra + using Statistics + + @testset "GPU Vacuum State" begin + basis = QuadPairBasis(1) + vac_cpu = vacuumstate(basis) + vac_gpu = vacuumstate(basis) |> gpu + @test vac_gpu.mean isa CuVector{Float32} + @test vac_gpu.covar isa CuMatrix{Float32} + @test vac_gpu.basis == basis + @test vac_gpu.ħ == 2 + @test device(vac_gpu) == :gpu + @test device(vac_cpu) == :cpu + @test Array(vac_gpu.mean) ≈ vac_cpu.mean + @test Array(vac_gpu.covar) ≈ vac_cpu.covar + vac_gpu_f64 = gpu(vacuumstate(basis), precision=Float64) + @test vac_gpu_f64.mean isa CuVector{Float64} + @test vac_gpu_f64.covar isa CuMatrix{Float64} + vac_back_to_cpu = vac_gpu |> cpu + @test device(vac_back_to_cpu) == :cpu + @test vac_back_to_cpu.mean ≈ vac_cpu.mean + @test vac_back_to_cpu.covar ≈ vac_cpu.covar + end + + @testset "GPU Coherent State" begin + basis = QuadPairBasis(1) + α = 1.0f0 + 0.5f0im + coh_cpu = coherentstate(basis, α) + coh_gpu = coherentstate(basis, α) |> gpu + @test coh_gpu.mean isa CuVector{Float32} + @test coh_gpu.covar isa CuMatrix{Float32} + @test device(coh_gpu) == :gpu + @test Array(coh_gpu.mean) ≈ Float32.(coh_cpu.mean) + @test Array(coh_gpu.covar) ≈ Float32.(coh_cpu.covar) + α_gpu = CuArray([α]) + coh_auto = coherentstate(basis, α_gpu) + @test device(coh_auto) == :gpu + @test Array(coh_auto.mean) ≈ Float32.(coh_cpu.mean) + basis_multi = QuadPairBasis(2) + α_multi = [1.0f0 + 0.5f0im, -0.3f0 + 0.8f0im] + coh_cpu_multi = coherentstate(basis_multi, α_multi) + coh_gpu_multi = coherentstate(basis_multi, α_multi) |> gpu + @test Array(coh_gpu_multi.mean) ≈ Float32.(coh_cpu_multi.mean) + @test Array(coh_gpu_multi.covar) ≈ Float32.(coh_cpu_multi.covar) + α_multi_gpu = CuArray(α_multi) + coh_multi_auto = coherentstate(basis_multi, α_multi_gpu) + @test device(coh_multi_auto) == :gpu + @test Array(coh_multi_auto.mean) ≈ Float32.(coh_cpu_multi.mean) + end + + @testset "GPU Squeezed State" begin + basis = QuadPairBasis(1) + r, θ = 0.3f0, Float32(π/4) + sq_cpu = squeezedstate(basis, r, θ) + sq_gpu = squeezedstate(basis, r, θ) |> gpu + @test sq_gpu.mean isa CuVector{Float32} + @test sq_gpu.covar isa CuMatrix{Float32} + @test device(sq_gpu) == :gpu + @test Array(sq_gpu.mean) ≈ Float32.(sq_cpu.mean) + @test Array(sq_gpu.covar) ≈ Float32.(sq_cpu.covar) rtol=1e-6 + r_gpu = CuArray([r]) + sq_auto = squeezedstate(basis, r_gpu, θ) + @test device(sq_auto) == :gpu + @test Array(sq_auto.mean) ≈ Float32.(sq_cpu.mean) + basis_multi = QuadPairBasis(2) + r_vec = [0.3f0, 0.5f0] + θ_vec = [Float32(π/4), Float32(π/6)] + sq_cpu_multi = squeezedstate(basis_multi, r_vec, θ_vec) + sq_gpu_multi = squeezedstate(basis_multi, r_vec, θ_vec) |> gpu + @test Array(sq_gpu_multi.mean) ≈ Float32.(sq_cpu_multi.mean) + @test Array(sq_gpu_multi.covar) ≈ Float32.(sq_cpu_multi.covar) rtol=1e-6 + end + + @testset "GPU Thermal State" begin + basis = QuadPairBasis(1) + n = 2.0f0 + thermal_cpu = thermalstate(basis, n) + thermal_gpu = thermalstate(basis, n) |> gpu + @test thermal_gpu.mean isa CuVector{Float32} + @test thermal_gpu.covar isa CuMatrix{Float32} + @test device(thermal_gpu) == :gpu + @test Array(thermal_gpu.mean) ≈ Float32.(thermal_cpu.mean) + @test Array(thermal_gpu.covar) ≈ Float32.(thermal_cpu.covar) + n_gpu = CuArray([n]) + thermal_auto = thermalstate(basis, n_gpu) + @test device(thermal_auto) == :gpu + @test Array(thermal_auto.mean) ≈ Float32.(thermal_cpu.mean) + basis_multi = QuadPairBasis(2) + n_vec = [2.0f0, 3.0f0] + thermal_cpu_multi = thermalstate(basis_multi, n_vec) + thermal_gpu_multi = thermalstate(basis_multi, n_vec) |> gpu + @test Array(thermal_gpu_multi.mean) ≈ Float32.(thermal_cpu_multi.mean) + @test Array(thermal_gpu_multi.covar) ≈ Float32.(thermal_cpu_multi.covar) + end + + @testset "GPU EPR State" begin + basis = QuadPairBasis(2) + r, θ = 0.5f0, Float32(π/3) + epr_cpu = eprstate(basis, r, θ) + epr_gpu = eprstate(basis, r, θ) |> gpu + @test epr_gpu.mean isa CuVector{Float32} + @test epr_gpu.covar isa CuMatrix{Float32} + @test device(epr_gpu) == :gpu + @test Array(epr_gpu.mean) ≈ Float32.(epr_cpu.mean) + @test Array(epr_gpu.covar) ≈ Float32.(epr_cpu.covar) rtol=1e-6 + end +end + +@testitem "GPU Foundation - Unitary Operations" tags=[:cuda] begin + using Gabs + using Gabs: device + using LinearAlgebra + using CUDA + + @testset "GPU Displacement" begin + basis = QuadPairBasis(1) + α = 0.5f0 + 0.3f0im + disp_cpu = displace(basis, α) + disp_gpu = displace(basis, α) |> gpu + @test disp_gpu.disp isa CuVector{Float32} + @test disp_gpu.symplectic isa CuMatrix{Float32} + @test device(disp_gpu) == :gpu + @test Array(disp_gpu.disp) ≈ Float32.(disp_cpu.disp) + @test Array(disp_gpu.symplectic) ≈ Float32.(disp_cpu.symplectic) + α_gpu = CuArray([α]) + disp_auto = displace(basis, α_gpu) + @test device(disp_auto) == :gpu + @test Array(disp_auto.disp) ≈ Float32.(disp_cpu.disp) + vac_gpu = vacuumstate(basis) |> gpu + displaced_gpu = disp_gpu * vac_gpu + vac_cpu = vacuumstate(basis) + displaced_cpu = disp_cpu * vac_cpu + @test Array(displaced_gpu.mean) ≈ Float32.(displaced_cpu.mean) rtol=1e-6 + @test Array(displaced_gpu.covar) ≈ Float32.(displaced_cpu.covar) rtol=1e-6 + end + + @testset "Mixed Device Operations" begin + basis = QuadPairBasis(1) + vac_cpu = vacuumstate(basis) + disp_gpu = displace(basis, 1.0f0) |> gpu + result = disp_gpu * vac_cpu + @test device(result) == :gpu + vac_gpu = vacuumstate(basis) |> gpu + disp_cpu = displace(basis, 1.0f0) + result2 = disp_cpu * vac_gpu + @test device(result2) == :gpu + end + + @testset "GPU Squeeze" begin + basis = QuadPairBasis(1) + r, θ = 0.3f0, Float32(π/4) + squeeze_cpu = squeeze(basis, r, θ) + squeeze_gpu = squeeze(basis, r, θ) |> gpu + @test Array(squeeze_gpu.disp) ≈ Float32.(squeeze_cpu.disp) + @test Array(squeeze_gpu.symplectic) ≈ Float32.(squeeze_cpu.symplectic) rtol=1e-6 + @test device(squeeze_gpu) == :gpu + r_gpu = CuArray([r]) + squeeze_auto = squeeze(basis, r_gpu, θ) + @test device(squeeze_auto) == :gpu + vac_gpu = vacuumstate(basis) |> gpu + squeezed_gpu = squeeze_gpu * vac_gpu + vac_cpu = vacuumstate(basis) + squeezed_cpu = squeeze_cpu * vac_cpu + @test Array(squeezed_gpu.mean) ≈ Float32.(squeezed_cpu.mean) rtol=1e-6 + @test Array(squeezed_gpu.covar) ≈ Float32.(squeezed_cpu.covar) rtol=1e-5 + end + + @testset "GPU Phase Shift" begin + basis = QuadPairBasis(1) + θ = Float32(π/3) + phase_cpu = phaseshift(basis, θ) + phase_gpu = phaseshift(basis, θ) |> gpu + @test Array(phase_gpu.disp) ≈ Float32.(phase_cpu.disp) + @test Array(phase_gpu.symplectic) ≈ Float32.(phase_cpu.symplectic) rtol=1e-6 + @test device(phase_gpu) == :gpu + end + + @testset "GPU Beam Splitter" begin + basis = QuadPairBasis(2) + transmit = 0.7f0 + bs_cpu = beamsplitter(basis, transmit) + bs_gpu = beamsplitter(basis, transmit) |> gpu + @test Array(bs_gpu.disp) ≈ Float32.(bs_cpu.disp) + @test Array(bs_gpu.symplectic) ≈ Float32.(bs_cpu.symplectic) rtol=1e-6 + @test device(bs_gpu) == :gpu + end + + @testset "GPU Two-Mode Squeeze" begin + basis = QuadPairBasis(2) + r, θ = 0.2f0, Float32(π/6) + twosq_cpu = twosqueeze(basis, r, θ) + twosq_gpu = twosqueeze(basis, r, θ) |> gpu + @test Array(twosq_gpu.disp) ≈ Float32.(twosq_cpu.disp) + @test Array(twosq_gpu.symplectic) ≈ Float32.(twosq_cpu.symplectic) rtol=1e-6 + @test device(twosq_gpu) == :gpu + end +end + +@testitem "GPU Foundation - Channel Operations" tags=[:cuda] begin + using Gabs + using Gabs: device + using LinearAlgebra + using CUDA + + @testset "GPU Attenuator Channel" begin + basis = QuadPairBasis(1) + θ, n = Float32(π/6), 2.0f0 + att_cpu = attenuator(basis, θ, n) + att_gpu = attenuator(basis, θ, n) |> gpu + @test att_gpu.disp isa CuVector{Float32} + @test att_gpu.transform isa CuMatrix{Float32} + @test att_gpu.noise isa CuMatrix{Float32} + @test device(att_gpu) == :gpu + @test Array(att_gpu.disp) ≈ Float32.(att_cpu.disp) + @test Array(att_gpu.transform) ≈ Float32.(att_cpu.transform) rtol=1e-6 + @test Array(att_gpu.noise) ≈ Float32.(att_cpu.noise) rtol=1e-6 + coh_gpu = coherentstate(basis, 1.0f0) |> gpu + attenuated_gpu = att_gpu * coh_gpu + coh_cpu = coherentstate(basis, 1.0f0) + attenuated_cpu = att_cpu * coh_cpu + @test Array(attenuated_gpu.mean) ≈ Float32.(attenuated_cpu.mean) rtol=1e-6 + @test Array(attenuated_gpu.covar) ≈ Float32.(attenuated_cpu.covar) rtol=1e-5 + end + + @testset "GPU Amplifier Channel" begin + basis = QuadPairBasis(1) + r, n = 0.2f0, 1.5f0 + amp_cpu = amplifier(basis, r, n) + amp_gpu = amplifier(basis, r, n) |> gpu + @test Array(amp_gpu.disp) ≈ Float32.(amp_cpu.disp) + @test Array(amp_gpu.transform) ≈ Float32.(amp_cpu.transform) rtol=1e-6 + @test Array(amp_gpu.noise) ≈ Float32.(amp_cpu.noise) rtol=1e-5 + @test device(amp_gpu) == :gpu + end +end + +@testitem "GPU Foundation - Wigner Functions" tags=[:cuda] begin + using Gabs + using Gabs: device + using LinearAlgebra + using CUDA + + @testset "GPU Single-Point Wigner" begin + basis = QuadPairBasis(1) + vac_cpu = vacuumstate(basis) + vac_gpu = vacuumstate(basis) |> gpu + coh_cpu = coherentstate(basis, 1.0f0) + coh_gpu = coherentstate(basis, 1.0f0) |> gpu + test_points = [[0.0f0, 0.0f0], [1.0f0, 0.5f0], [-0.5f0, 1.0f0]] + for x in test_points + w_cpu_vac = wigner(vac_cpu, x) + w_gpu_vac = wigner(vac_gpu, x) + @test w_gpu_vac ≈ w_cpu_vac rtol=1e-5 + w_cpu_coh = wigner(coh_cpu, Float32.(x)) + w_gpu_coh = wigner(coh_gpu, x) + @test w_gpu_coh ≈ w_cpu_coh rtol=1e-5 + end + end + + @testset "GPU Batch Wigner Evaluation" begin + basis = QuadPairBasis(1) + coh_gpu = coherentstate(basis, 1.0f0) |> gpu + n_points = 100 + x_points = CuMatrix{Float32}(randn(Float32, 2, n_points)) + w_batch = wigner(coh_gpu, x_points) + @test w_batch isa CuVector{Float32} + @test length(w_batch) == n_points + @test all(isfinite.(Array(w_batch))) + coh_cpu = coherentstate(basis, 1.0f0) + for i in 1:min(10, n_points) + x_point = Array(x_points[:, i]) + w_individual = wigner(coh_cpu, x_point) + w_batch_individual = Array(w_batch)[i] + @test w_batch_individual ≈ w_individual rtol=1e-4 + end + end + + @testset "GPU Single-Point Wigner Characteristic" begin + basis = QuadPairBasis(1) + vac_cpu = vacuumstate(basis) + vac_gpu = vacuumstate(basis) |> gpu + test_points = [[0.0f0, 0.0f0], [0.5f0, 0.3f0], [-0.2f0, 0.8f0]] + for xi in test_points + chi_cpu = wignerchar(vac_cpu, xi) + chi_gpu = wignerchar(vac_gpu, xi) + @test real(chi_gpu) ≈ real(chi_cpu) rtol=1e-5 + @test imag(chi_gpu) ≈ imag(chi_cpu) rtol=1e-5 + end + end + + @testset "GPU Batch Wigner Characteristic" begin + basis = QuadPairBasis(1) + sq_gpu = squeezedstate(basis, 0.3f0, Float32(π/4)) |> gpu + n_points = 50 + xi_points = CuMatrix{Float32}(randn(Float32, 2, n_points) * 0.5f0) + chi_batch = wignerchar(sq_gpu, xi_points) + @test chi_batch isa CuVector{ComplexF32} + @test length(chi_batch) == n_points + @test all(isfinite.(real(Array(chi_batch)))) + @test all(isfinite.(imag(Array(chi_batch)))) + end + + @testset "GPU Wigner Grid Utility" begin + basis = QuadPairBasis(1) + coh_gpu = coherentstate(basis, 0.5f0) |> gpu + x_range = (-2.0f0, 2.0f0) + p_range = (-2.0f0, 2.0f0) + nx, np = 20, 25 + x_vals = collect(range(x_range[1], x_range[2], length=nx)) + p_vals = collect(range(p_range[1], p_range[2], length=np)) + x_grid = repeat(x_vals, 1, np)[:] + p_grid = repeat(p_vals', nx, 1)[:] + points = CuArray(Float32.([x_grid'; p_grid'])) + w_grid = wigner(coh_gpu, points) + @test length(w_grid) == nx * np + @test all(Array(w_grid) .> 0) + end +end + +@testitem "GPU Foundation - Tensor Products and Partial Traces" tags=[:cuda] begin + using Gabs + using Gabs: device + using LinearAlgebra + using CUDA + + @testset "GPU State Tensor Products" begin + basis1 = QuadPairBasis(1) + basis2 = QuadPairBasis(1) + coh1_gpu = coherentstate(basis1, 1.0f0) |> gpu + vac2_gpu = vacuumstate(basis2) |> gpu + coh1_cpu = coherentstate(basis1, 1.0f0) + vac2_cpu = vacuumstate(basis2) + tensor_gpu = tensor(coh1_gpu, vac2_gpu) + tensor_cpu = tensor(coh1_cpu, vac2_cpu) + @test tensor_gpu.basis.nmodes == 2 + @test device(tensor_gpu) == :gpu + @test Array(tensor_gpu.mean) ≈ Float32.(tensor_cpu.mean) rtol=1e-6 + @test Array(tensor_gpu.covar) ≈ Float32.(tensor_cpu.covar) rtol=1e-6 + end + + @testset "Mixed Device Tensor Products" begin + basis1 = QuadPairBasis(1) + coh_cpu = coherentstate(basis1, 1.0f0) + vac_gpu = vacuumstate(basis1) |> gpu + tensor_mixed = tensor(coh_cpu, vac_gpu) + @test device(tensor_mixed) == :gpu + end + + @testset "GPU Unitary Tensor Products" begin + basis1 = QuadPairBasis(1) + disp1_gpu = displace(basis1, 0.5f0) |> gpu + disp2_gpu = displace(basis1, -0.3f0) |> gpu + disp1_cpu = displace(basis1, 0.5f0) + disp2_cpu = displace(basis1, -0.3f0) + tensor_gpu = tensor(disp1_gpu, disp2_gpu) + tensor_cpu = tensor(disp1_cpu, disp2_cpu) + @test device(tensor_gpu) == :gpu + @test Array(tensor_gpu.disp) ≈ Float32.(tensor_cpu.disp) rtol=1e-6 + @test Array(tensor_gpu.symplectic) ≈ Float32.(tensor_cpu.symplectic) rtol=1e-6 + end + + @testset "GPU Partial Traces" begin + basis = QuadPairBasis(2) + epr_gpu = eprstate(basis, 0.3f0, Float32(π/4)) |> gpu + epr_cpu = eprstate(basis, 0.3f0, Float32(π/4)) + traced_gpu = ptrace(epr_gpu, [1]) + traced_cpu = ptrace(epr_cpu, [1]) + @test traced_gpu.basis.nmodes == 1 + @test device(traced_gpu) == :gpu + @test Array(traced_gpu.mean) ≈ Float32.(traced_cpu.mean) rtol=1e-6 + @test Array(traced_gpu.covar) ≈ Float32.(traced_cpu.covar) rtol=1e-5 + end +end + +@testitem "GPU Foundation - Device Management" tags=[:cuda] begin + using Gabs + using Gabs: device + using LinearAlgebra + using CUDA + + @testset "Device Transfer Functions" begin + basis = QuadPairBasis(1) + vac_cpu = vacuumstate(basis) + @test device(vac_cpu) == :cpu + vac_gpu = vac_cpu |> gpu + @test device(vac_gpu) == :gpu + @test vac_gpu.mean isa CuVector{Float32} + vac_back = vac_gpu |> cpu + @test device(vac_back) == :cpu + @test vac_back.mean isa Vector{Float32} + disp_cpu = displace(basis, 1.0) + disp_gpu = disp_cpu |> gpu + @test device(disp_gpu) == :gpu + disp_back = disp_gpu |> cpu + @test device(disp_back) == :cpu + att_cpu = attenuator(basis, π/4, 1.0) + att_gpu = att_cpu |> gpu + @test device(att_gpu) == :gpu + att_back = att_gpu |> cpu + @test device(att_back) == :cpu + end + + @testset "Precision Control" begin + basis = QuadPairBasis(1) + vac = vacuumstate(basis) + vac_f32 = gpu(vac, precision=Float32) + vac_f64 = gpu(vac, precision=Float64) + @test eltype(vac_f32.mean) == Float32 + @test eltype(vac_f64.mean) == Float64 + disp = displace(basis, 1.0) + disp_f32 = gpu(disp, precision=Float32) + disp_f64 = gpu(disp, precision=Float64) + @test eltype(disp_f32.disp) == Float32 + @test eltype(disp_f64.disp) == Float64 + end + + @testset "Array Device Functions" begin + x_cpu = randn(10) + x_gpu = x_cpu |> gpu + @test x_gpu isa CuVector{Float32} + @test device(x_gpu) == :gpu + x_back = x_gpu |> cpu + @test x_back isa Vector{Float32} + @test device(x_back) == :cpu + @test x_back ≈ Float32.(x_cpu) + end +end + +@testitem "GPU Tensor Products - Operators" tags=[:cuda] begin + using CUDA + using Gabs + using Gabs: device + using LinearAlgebra + + @testset "GPU GaussianUnitary Tensor Products" begin + basis1 = QuadPairBasis(1) + basis2 = QuadPairBasis(1) + disp1_cpu = displace(basis1, 0.5f0) + disp2_cpu = displace(basis2, -0.3f0) + disp1_gpu = disp1_cpu |> gpu + disp2_gpu = disp2_cpu |> gpu + @test device(disp1_gpu) == :gpu + @test device(disp2_gpu) == :gpu + tensor_gpu = tensor(disp1_gpu, disp2_gpu) + tensor_cpu = tensor(disp1_cpu, disp2_cpu) + @test device(tensor_gpu) == :gpu + @test tensor_gpu.basis.nmodes == 2 + @test Array(tensor_gpu.disp) ≈ Float32.(tensor_cpu.disp) rtol=1e-6 + @test Array(tensor_gpu.symplectic) ≈ Float32.(tensor_cpu.symplectic) rtol=1e-6 + tensor_typed = tensor(CuVector{Float64}, CuMatrix{Float64}, disp1_gpu, disp2_gpu) + @test eltype(tensor_typed.disp) == Float64 + @test eltype(tensor_typed.symplectic) == Float64 + vac1_gpu = vacuumstate(basis1) |> gpu + vac2_gpu = vacuumstate(basis2) |> gpu + tensor_state_gpu = tensor(vac1_gpu, vac2_gpu) + result_gpu = tensor_gpu * tensor_state_gpu + vac1_cpu = vacuumstate(basis1) + vac2_cpu = vacuumstate(basis2) + tensor_state_cpu = tensor(vac1_cpu, vac2_cpu) + result_cpu = tensor_cpu * tensor_state_cpu + @test Array(result_gpu.mean) ≈ Float32.(result_cpu.mean) rtol=1e-5 + @test Array(result_gpu.covar) ≈ Float32.(result_cpu.covar) rtol=1e-5 + end + + @testset "GPU GaussianChannel Tensor Products" begin + basis1 = QuadPairBasis(1) + basis2 = QuadPairBasis(1) + att1_cpu = attenuator(basis1, π/6, 1.5) + att2_cpu = attenuator(basis2, π/4, 2.0) + att1_gpu = att1_cpu |> gpu + att2_gpu = att2_cpu |> gpu + @test device(att1_gpu) == :gpu + @test device(att2_gpu) == :gpu + tensor_gpu = tensor(att1_gpu, att2_gpu) + tensor_cpu = tensor(att1_cpu, att2_cpu) + @test device(tensor_gpu) == :gpu + @test tensor_gpu.basis.nmodes == 2 + @test Array(tensor_gpu.disp) ≈ Float32.(tensor_cpu.disp) rtol=1e-6 + @test Array(tensor_gpu.transform) ≈ Float32.(tensor_cpu.transform) rtol=1e-5 + @test Array(tensor_gpu.noise) ≈ Float32.(tensor_cpu.noise) rtol=1e-5 + tensor_typed = tensor(CuVector{Float64}, CuMatrix{Float64}, att1_gpu, att2_gpu) + @test eltype(tensor_typed.disp) == Float64 + @test eltype(tensor_typed.transform) == Float64 + @test eltype(tensor_typed.noise) == Float64 + end + + @testset "Mixed Operator Types" begin + basis = QuadPairBasis(1) + disp_gpu = displace(basis, 1.0f0) |> gpu + att_gpu = attenuator(basis, π/4, 1.0) |> gpu + @test_throws ArgumentError tensor(disp_gpu, att_gpu) + end + + @testset "Error Handling" begin + basis1 = QuadPairBasis(1) + basis2 = QuadBlockBasis(1) + disp1_gpu = displace(basis1, 1.0f0) |> gpu + disp2_gpu = displace(basis2, 1.0f0) |> gpu + @test_throws ArgumentError tensor(disp1_gpu, disp2_gpu) + disp1_h1 = displace(basis1, 1.0f0, ħ=1) |> gpu + disp1_h2 = displace(basis1, 1.0f0, ħ=2) |> gpu + @test_throws ArgumentError tensor(disp1_h1, disp1_h2) + end +end + +@testitem "GPU Linear Combinations - Core Functionality" tags=[:cuda] begin + using CUDA + using Gabs + using Gabs: device, GaussianLinearCombination + using LinearAlgebra + + @testset "Device Management & Transfer" begin + basis = QuadPairBasis(1) + state1 = coherentstate(basis, 1.0f0 + 0.5f0im) + state2 = coherentstate(basis, -1.0f0 + 0.3f0im) + coeffs = [0.6f0, 0.8f0] + lc_cpu = GaussianLinearCombination(basis, coeffs, [state1, state2]) + @test device(lc_cpu) == :cpu + @test lc_cpu.coeffs isa Vector{Float32} + @test device(lc_cpu.states[1]) == :cpu + lc_gpu = lc_cpu |> gpu + @test device(lc_gpu) == :gpu + @test lc_gpu.coeffs isa Vector{Float32} + @test device(lc_gpu.states[1]) == :gpu + @test device(lc_gpu.states[2]) == :gpu + @test lc_gpu.states[1].mean isa CuVector{Float32} + @test lc_gpu.states[1].covar isa CuMatrix{Float32} + @test lc_gpu.coeffs ≈ lc_cpu.coeffs + lc_back = lc_gpu |> cpu + @test device(lc_back) == :cpu + @test lc_back.coeffs ≈ lc_cpu.coeffs + @test Array(lc_gpu.states[1].mean) ≈ lc_back.states[1].mean rtol=1e-6 + lc_gpu_f64 = gpu(lc_cpu, precision=Float64) + @test eltype(lc_gpu_f64.coeffs) == Float64 + @test lc_gpu_f64.states[1].mean isa CuVector{Float64} + end + + @testset "Creating GPU Linear Combinations Directly" begin + basis = QuadPairBasis(1) + state1_cpu = coherentstate(basis, 1.0f0) + state2_cpu = squeezedstate(basis, 0.3f0, Float32(π/4)) + lc1 = GaussianLinearCombination(basis, [0.7f0, 0.3f0], [state1_cpu, state2_cpu]) |> gpu + @test device(lc1) == :gpu + state1_gpu = coherentstate(basis, 1.0f0) |> gpu + state2_gpu = squeezedstate(basis, 0.3f0, Float32(π/4)) |> gpu + lc2 = GaussianLinearCombination(basis, [0.7f0, 0.3f0], [state1_gpu, state2_gpu]) + @test device(lc2) == :gpu + @test lc1.coeffs ≈ lc2.coeffs + @test Array(lc1.states[1].mean) ≈ Array(lc2.states[1].mean) rtol=1e-6 + end + + @testset "Arithmetic Operations" begin + basis = QuadPairBasis(1) + state1 = coherentstate(basis, 1.0f0) |> gpu + state2 = coherentstate(basis, -1.0f0) |> gpu + state3 = squeezedstate(basis, 0.2f0, 0.0f0) |> gpu + lc1 = GaussianLinearCombination(basis, [0.6f0, 0.4f0], [state1, state2]) + lc2 = GaussianLinearCombination(basis, [0.3f0], [state3]) + @test device(lc1) == :gpu + @test device(lc2) == :gpu + lc_sum = lc1 + lc2 + @test length(lc_sum) == 3 + @test device(lc_sum) == :gpu + @test lc_sum.coeffs ≈ [0.6f0, 0.4f0, 0.3f0] + lc_diff = lc1 - lc2 + @test length(lc_diff) == 3 + @test device(lc_diff) == :gpu + @test lc_diff.coeffs ≈ [0.6f0, 0.4f0, -0.3f0] + lc_scaled = 2.0f0 * lc1 + @test length(lc_scaled) == 2 + @test device(lc_scaled) == :gpu + @test lc_scaled.coeffs ≈ [1.2f0, 0.8f0] + lc_scaled2 = lc1 * 0.5f0 + @test lc_scaled2.coeffs ≈ [0.3f0, 0.2f0] + lc_neg = -lc1 + @test lc_neg.coeffs ≈ [-0.6f0, -0.4f0] + @test device(lc_neg) == :gpu + end + + @testset "Normalization" begin + basis = QuadPairBasis(1) + state1 = vacuumstate(basis) |> gpu + state2 = coherentstate(basis, 1.0f0) |> gpu + coeffs = [3.0f0, 4.0f0] + lc = GaussianLinearCombination(basis, coeffs, [state1, state2]) + @test device(lc) == :gpu + initial_norm = sqrt(sum(abs2, lc.coeffs)) + @test initial_norm ≈ 5.0f0 + Gabs.normalize!(lc) + @test device(lc) == :gpu + final_norm = sqrt(sum(abs2, lc.coeffs)) + @test final_norm ≈ 1.0f0 rtol=1e-6 + @test lc.coeffs ≈ [0.6f0, 0.8f0] rtol=1e-6 + lc_zero = GaussianLinearCombination(basis, [0.0f0, 0.0f0], [state1, state2]) + Gabs.normalize!(lc_zero) + @test lc_zero.coeffs == [0.0f0, 0.0f0] + end + + @testset "Simplification" begin + basis = QuadPairBasis(1) + vac = vacuumstate(basis) |> gpu + coh = coherentstate(basis, 1.0f0) |> gpu + coeffs1 = CuArray([0.9f0, Float32(1e-15), 0.1f0]) + lc1 = GaussianLinearCombination(basis, coeffs1, [vac, coh, vac]) + @test length(lc1) == 3 + Gabs.simplify!(lc1) + @test length(lc1) == 1 + @test device(lc1) == :gpu + coh2 = coherentstate(basis, 1.0f0) |> gpu + coeffs2 = CuArray([0.5f0, 0.3f0, 0.2f0]) + lc2 = GaussianLinearCombination(basis, coeffs2, [vac, coh, coh2]) + @test length(lc2) == 3 + Gabs.simplify!(lc2) + @test length(lc2) == 2 + @test device(lc2) == :gpu + cpu_coeffs = Array(lc2.coeffs) + cpu_vac_mean = Array(vac.mean) + cpu_coh_mean = Array(coh.mean) + vac_idx = findfirst(i -> Array(lc2.states[i].mean) ≈ cpu_vac_mean, 1:length(lc2)) + coh_idx = findfirst(i -> Array(lc2.states[i].mean) ≈ cpu_coh_mean, 1:length(lc2)) + @test vac_idx !== nothing + @test coh_idx !== nothing + @test cpu_coeffs[vac_idx] ≈ 0.5f0 + @test cpu_coeffs[coh_idx] ≈ 0.5f0 + coeffs3 = CuArray([Float32(1e-16), Float32(1e-17)]) + lc3 = GaussianLinearCombination(basis, coeffs3, [vac, coh]) + Gabs.simplify!(lc3) + @test length(lc3) == 1 + @test device(lc3) == :gpu + end + + @testset "GPU Operations - Gaussian Unitaries" begin + basis = QuadPairBasis(1) + state1 = coherentstate(basis, 1.0f0) |> gpu + state2 = coherentstate(basis, -1.0f0) |> gpu + lc_gpu = GaussianLinearCombination(basis, [0.7f0, 0.3f0], [state1, state2]) + @test device(lc_gpu) == :gpu + disp_gpu = displace(basis, 0.5f0) |> gpu + result_gpu = disp_gpu * lc_gpu + @test device(result_gpu) == :gpu + @test length(result_gpu) == 2 + @test result_gpu.coeffs ≈ lc_gpu.coeffs + expected1 = disp_gpu * state1 + expected2 = disp_gpu * state2 + @test Array(result_gpu.states[1].mean) ≈ Array(expected1.mean) rtol=1e-6 + @test Array(result_gpu.states[2].mean) ≈ Array(expected2.mean) rtol=1e-6 + disp_cpu = displace(basis, 0.2f0) + result_mixed = disp_cpu * lc_gpu + @test device(result_mixed) == :gpu + squeeze_gpu = squeeze(basis, 0.3f0, Float32(π/4)) |> gpu + squeezed_lc = squeeze_gpu * lc_gpu + @test device(squeezed_lc) == :gpu + @test length(squeezed_lc) == 2 + phase_gpu = phaseshift(basis, π/3) |> gpu + phase_shifted = phase_gpu * lc_gpu + @test device(phase_shifted) == :gpu + end + + @testset "GPU Operations - Gaussian Channels" begin + basis = QuadPairBasis(1) + coh1 = coherentstate(basis, 1.0f0) |> gpu + coh2 = coherentstate(basis, 0.5f0) |> gpu + lc_gpu = GaussianLinearCombination(basis, [0.8f0, 0.2f0], [coh1, coh2]) + att_gpu = attenuator(basis, π/6, 2.0f0) |> gpu + attenuated = att_gpu * lc_gpu + @test device(attenuated) == :gpu + @test length(attenuated) == 2 + @test attenuated.coeffs ≈ lc_gpu.coeffs + expected1 = att_gpu * coh1 + expected2 = att_gpu * coh2 + @test Array(attenuated.states[1].mean) ≈ Array(expected1.mean) rtol=1e-6 + @test Array(attenuated.states[2].covar) ≈ Array(expected2.covar) rtol=1e-5 + amp_gpu = amplifier(basis, 0.2f0, 1.5f0) |> gpu + amplified = amp_gpu * lc_gpu + @test device(amplified) == :gpu + att_cpu = attenuator(basis, π/8, 1.0f0) + result_mixed = att_cpu * lc_gpu + @test device(result_mixed) == :gpu + end + + @testset "State Metrics" begin + basis = QuadPairBasis(1) + state1 = coherentstate(basis, 1.0f0) |> gpu + state2 = squeezedstate(basis, 0.3f0, Float32(π/4)) |> gpu + lc_gpu = GaussianLinearCombination(basis, [0.6f0, 0.8f0], [state1, state2]) + @test device(lc_gpu) == :gpu + p = purity(lc_gpu) + @test p == 1.0 + s = entropy_vn(lc_gpu) + @test s == 0.0 + basis_multi = QuadPairBasis(2) + epr = eprstate(basis_multi, 0.5f0, Float32(π/3)) |> gpu + lc_multi = GaussianLinearCombination(basis_multi, [1.0f0], [epr]) + @test purity(lc_multi) == 1.0 + @test entropy_vn(lc_multi) == 0.0 + end + + @testset "Mixed Device Operations" begin + basis = QuadPairBasis(1) + state1_cpu = coherentstate(basis, 1.0f0) + state2_cpu = coherentstate(basis, -1.0f0) + lc_cpu = GaussianLinearCombination(basis, [0.7f0, 0.3f0], [state1_cpu, state2_cpu]) + state3_gpu = squeezedstate(basis, 0.2f0, 0.0f0) |> gpu + lc_gpu = GaussianLinearCombination(basis, [0.5f0], [state3_gpu]) + @test device(lc_cpu) == :cpu + @test device(lc_gpu) == :gpu + lc_sum = lc_cpu + lc_gpu + disp_cpu = displace(basis, 0.3f0) + disp_gpu = displace(basis, 0.4f0) |> gpu + result1 = disp_cpu * lc_gpu + @test device(result1) == :gpu + result2 = disp_gpu * lc_cpu + @test device(result2) == :gpu + end + + @testset "Error Handling & Edge Cases" begin + basis1 = QuadPairBasis(1) + basis2 = QuadBlockBasis(1) + state1 = coherentstate(basis1, 1.0f0) |> gpu + state2 = coherentstate(basis2, 1.0f0) |> gpu + lc1 = GaussianLinearCombination(basis1, [0.5f0], [state1]) + lc2 = GaussianLinearCombination(basis2, [0.5f0], [state2]) + @test_throws ArgumentError lc1 + lc2 + state3 = GaussianState(basis1, CuArray([0.0f0, 0.0f0]), CuMatrix{Float32}(I(2)); ħ = 4) + lc3 = GaussianLinearCombination(basis1, [1.0f0], [state3]) + @test_throws ArgumentError lc1 + lc3 + disp_wrong = displace(basis2, 1.0f0) |> gpu + @test_throws ArgumentError disp_wrong * lc1 + @test_throws ArgumentError GaussianLinearCombination(basis1, Float32[], typeof(state1)[]) + tiny_coeffs = CuArray([Float32(1e-20), Float32(1e-21)]) + tiny_states = [state1, coherentstate(basis1, 2.0f0) |> gpu] + lc_tiny = GaussianLinearCombination(basis1, tiny_coeffs, tiny_states) + Gabs.simplify!(lc_tiny) + @test length(lc_tiny) == 1 + @test device(lc_tiny) == :gpu + if CUDA.functional() + @test true + else + @warn "CUDA not functional, testing fallback behavior" + end + end + + @testset "Type Stability & Performance" begin + basis = QuadPairBasis(1) + state1 = coherentstate(basis, 1.0f0) |> gpu + state2 = squeezedstate(basis, 0.3f0, Float32(π/4)) |> gpu + lc = GaussianLinearCombination(basis, [0.6f0, 0.8f0], [state1, state2]) + @inferred device(lc) + @inferred purity(lc) + @inferred entropy_vn(lc) + disp = displace(basis, 0.5f0) |> gpu + result = disp * lc + @test result.states[1].mean isa CuVector{Float32} + @test result.states[1].covar isa CuMatrix{Float32} + lc_scaled = 2.0f0 * lc + @test lc_scaled.coeffs isa Vector{Float32} + @test lc_scaled.states[1].mean isa CuVector{Float32} + lc_copy = GaussianLinearCombination(basis, copy(lc.coeffs), copy(lc.states)) + Gabs.normalize!(lc_copy) + @test lc_copy.coeffs isa Vector{Float32} + @test device(lc_copy) == :gpu + end + + @testset "Multi-Mode GPU Linear Combinations" begin + basis = QuadPairBasis(2) + α_vec1 = [1.0f0 + 0.5f0im, -0.3f0 + 0.8f0im] + α_vec2 = [0.5f0, 0.2f0 - 0.4f0im] + coh1 = coherentstate(basis, α_vec1) |> gpu + coh2 = coherentstate(basis, α_vec2) |> gpu + epr = eprstate(basis, 0.3f0, Float32(π/6)) |> gpu + coeffs = [0.5f0, 0.3f0, 0.2f0] + lc_multi = GaussianLinearCombination(basis, coeffs, [coh1, coh2, epr]) + @test device(lc_multi) == :gpu + @test length(lc_multi) == 3 + @test lc_multi.basis.nmodes == 2 + squeeze_op = squeeze(basis, [0.2f0, 0.3f0], [0.0f0, Float32(π/4)]) |> gpu + squeezed_multi = squeeze_op * lc_multi + @test device(squeezed_multi) == :gpu + @test length(squeezed_multi) == 3 + @test squeezed_multi.coeffs ≈ coeffs + bs = beamsplitter(basis, 0.7f0) |> gpu + bs_result = bs * lc_multi + @test device(bs_result) == :gpu + twosq = twosqueeze(basis, 0.2f0, Float32(π/3)) |> gpu + twosq_result = twosq * lc_multi + @test device(twosq_result) == :gpu + end + + @testset "Professional API Validation" begin + @info "=== Testing Professional GPU API for Linear Combinations ===" + basis = QuadPairBasis(1) + state1 = coherentstate(basis, 1.0f0) |> gpu + state2 = squeezedstate(basis, 0.3f0, Float32(π/4)) |> gpu + lc = GaussianLinearCombination(basis, [0.7f0, 0.3f0], [state1, state2]) + op = displace(basis, 0.5f0) |> gpu + result = op * lc + @test device(lc) == :gpu + @test device(result) == :gpu + @test result.coeffs ≈ lc.coeffs + α_gpu = CuArray([1.0f0 + 0.5f0im]) + auto_state = coherentstate(basis, α_gpu) + lc_auto = GaussianLinearCombination(basis, [1.0f0], [auto_state]) + @test device(lc_auto) == :gpu + cpu_op = squeeze(basis, 0.2f0, 0.0f0) + mixed_result = cpu_op * lc + @test device(mixed_result) == :gpu + end +end + +@testitem "GPU Linear Combinations Further Coverage" tags=[:cuda] begin + using CUDA + using Gabs + using Gabs: device, GaussianLinearCombination + using LinearAlgebra + using Test + + @testset "Basic GPU LC Creation and Device Detection" begin + basis = QuadPairBasis(1) + cpu_state = coherentstate(basis, 1.0f0) + cpu_lc = GaussianLinearCombination(basis, [0.7f0], [cpu_state]) + gpu_lc = cpu_lc |> gpu + @test device(cpu_lc) == :cpu + @test device(gpu_lc) == :gpu + @test gpu_lc.coeffs isa Vector{Float32} + @test Gabs._is_gpu_array(gpu_lc.coeffs) == false + @test gpu_lc.states[1].mean isa CuVector{Float32} + gpu_state = coherentstate(basis, 1.0f0) |> gpu + direct_gpu_lc = GaussianLinearCombination(basis, [0.8f0], [gpu_state]) + @test device(direct_gpu_lc) == :gpu + @test direct_gpu_lc.coeffs isa Vector{Float32} + cpu_state2 = squeezedstate(basis, 0.3f0, Float32(π/4)) + gpu_state2 = cpu_state2 |> gpu + mixed_lc = GaussianLinearCombination(basis, [0.6f0, 0.4f0], [cpu_state, gpu_state2]) + @test device(mixed_lc) == :cpu + full_gpu_lc = mixed_lc |> gpu + @test device(full_gpu_lc) == :gpu + @test all(device(s) == :gpu for s in full_gpu_lc.states) + end + + @testset "Arithmetic Operations - Comprehensive Device Promotion" begin + basis = QuadPairBasis(1) + cpu_state1 = coherentstate(basis, 1.0f0) + cpu_state2 = coherentstate(basis, -1.0f0) + gpu_state1 = coherentstate(basis, 0.5f0) |> gpu + gpu_state2 = squeezedstate(basis, 0.2f0, 0.0f0) |> gpu + cpu_lc = GaussianLinearCombination(basis, [0.7f0, 0.3f0], [cpu_state1, cpu_state2]) + gpu_lc = GaussianLinearCombination(basis, [0.6f0, 0.4f0], [gpu_state1, gpu_state2]) + result1 = cpu_lc + gpu_lc + @test device(result1) == :gpu + @test length(result1) == 4 + @test result1.coeffs isa Vector{Float32} + @test all(device(s) == :gpu for s in result1.states) + result2 = gpu_lc + cpu_lc + @test device(result2) == :gpu + @test length(result2) == 4 + cpu_lc2 = GaussianLinearCombination(basis, [0.5f0], [cpu_state1]) + result3 = cpu_lc + cpu_lc2 + @test device(result3) == :cpu + @test length(result3) == 3 + gpu_lc2 = GaussianLinearCombination(basis, [0.8f0], [gpu_state1]) + result4 = gpu_lc + gpu_lc2 + @test device(result4) == :gpu + @test length(result4) == 3 + scaled_cpu = 2.0f0 * cpu_lc + scaled_gpu = 2.0f0 * gpu_lc + @test device(scaled_cpu) == :cpu + @test device(scaled_gpu) == :gpu + @test scaled_gpu.coeffs ≈ [1.2f0, 0.8f0] + result5 = cpu_state1 + gpu_lc + @test device(result5) == :gpu + @test length(result5) == 3 + result6 = gpu_state1 + cpu_lc + @test device(result6) == :gpu + result7 = cpu_lc - gpu_lc + @test device(result7) == :gpu + result8 = gpu_state1 - cpu_lc + @test device(result8) == :gpu + end + + @testset "GPU Operator Applications - All Combinations" begin + basis = QuadPairBasis(1) + cpu_state1 = coherentstate(basis, 1.0f0) + cpu_state2 = squeezedstate(basis, 0.2f0, Float32(π/4)) + gpu_lc = GaussianLinearCombination(basis, [0.7f0, 0.3f0], [cpu_state1, cpu_state2]) |> gpu + cpu_lc = GaussianLinearCombination(basis, [0.6f0, 0.4f0], [cpu_state1, cpu_state2]) + gpu_disp = displace(basis, 0.5f0) |> gpu + result1 = gpu_disp * gpu_lc + @test device(result1) == :gpu + @test result1.coeffs ≈ gpu_lc.coeffs + @test length(result1) == 2 + cpu_disp = displace(basis, 0.3f0) + result2 = cpu_disp * gpu_lc + @test device(result2) == :gpu + @test result2.coeffs ≈ gpu_lc.coeffs + result3 = gpu_disp * cpu_lc + @test device(result3) == :gpu + @test result3.coeffs ≈ cpu_lc.coeffs + result4 = cpu_disp * cpu_lc + @test device(result4) == :cpu + gpu_squeeze = squeeze(basis, 0.2f0, Float32(π/3)) |> gpu + gpu_phase = phaseshift(basis, Float32(π/6)) |> gpu + chained = gpu_phase * (gpu_squeeze * gpu_lc) + @test device(chained) == :gpu + @test chained.coeffs ≈ gpu_lc.coeffs + gpu_atten = attenuator(basis, π/4, 2.0f0) |> gpu + cpu_amp = amplifier(basis, 0.1f0, 1.5f0) + channel_result1 = gpu_atten * gpu_lc + @test device(channel_result1) == :gpu + channel_result2 = cpu_amp * gpu_lc + @test device(channel_result2) == :gpu + channel_result3 = gpu_atten * cpu_lc + @test device(channel_result3) == :gpu + cpu_result = cpu_disp * cpu_lc + gpu_equivalent = cpu_disp * (cpu_lc |> gpu) + cpu_back = gpu_equivalent |> cpu + @test cpu_back.coeffs ≈ cpu_result.coeffs rtol=1e-6 + @test cpu_back.states[1].mean ≈ cpu_result.states[1].mean rtol=1e-6 + @test cpu_back.states[1].covar ≈ cpu_result.states[1].covar rtol=1e-5 + end + + @testset "Memory Management and Efficiency" begin + basis = QuadPairBasis(1) + states = [coherentstate(basis, Float32(i)) for i in 1:10] + coeffs = Float32.(rand(10)) + cpu_lc = GaussianLinearCombination(basis, coeffs, states) + gpu_lc = cpu_lc |> gpu + @test sizeof(gpu_lc.coeffs) == sizeof(coeffs) + @test gpu_lc.coeffs isa Vector{Float32} + large_states = [coherentstate(basis, Float32(i)*0.1f0) for i in 1:100] + large_coeffs = Float32.(randn(100)) + large_lc = GaussianLinearCombination(basis, large_coeffs, large_states) + large_gpu_lc = large_lc |> gpu + @test device(large_gpu_lc) == :gpu + @test length(large_gpu_lc) == 100 + @test large_gpu_lc.coeffs isa Vector{Float32} + transferred_back = large_gpu_lc |> cpu + retransferred = transferred_back |> gpu + @test device(transferred_back) == :cpu + @test device(retransferred) == :gpu + @test transferred_back.coeffs ≈ large_lc.coeffs + end + + @testset "Normalization and Simplification on GPU" begin + basis = QuadPairBasis(1) + state1 = coherentstate(basis, 1.0f0) |> gpu + state2 = squeezedstate(basis, 0.3f0, Float32(π/4)) |> gpu + unnormalized_lc = GaussianLinearCombination(basis, [3.0f0, 4.0f0], [state1, state2]) + @test device(unnormalized_lc) == :gpu + initial_norm = sqrt(sum(abs2, unnormalized_lc.coeffs)) + @test initial_norm ≈ 5.0f0 + Gabs.normalize!(unnormalized_lc) + @test device(unnormalized_lc) == :gpu + final_norm = sqrt(sum(abs2, unnormalized_lc.coeffs)) + @test final_norm ≈ 1.0f0 rtol=1e-6 + @test unnormalized_lc.coeffs ≈ [0.6f0, 0.8f0] rtol=1e-6 + small_coeffs = [0.9f0, Float32(1e-15), 0.1f0] + small_states = [state1, state2, coherentstate(basis, 2.0f0) |> gpu] + to_simplify = GaussianLinearCombination(basis, small_coeffs, small_states) + @test length(to_simplify) == 3 + @test device(to_simplify) == :gpu + Gabs.simplify!(to_simplify) + @test device(to_simplify) == :gpu + @test length(to_simplify) == 2 + identical_state = coherentstate(basis, 1.5f0) |> gpu + combine_coeffs = [0.3f0, 0.7f0, 0.2f0] + combine_states = [state1, identical_state, identical_state] + to_combine = GaussianLinearCombination(basis, combine_coeffs, combine_states) + @test length(to_combine) == 3 + Gabs.simplify!(to_combine) + @test length(to_combine) == 2 + @test device(to_combine) == :gpu + cpu_coeffs = Array(to_combine.coeffs) + @test any(c -> isapprox(c, 0.9f0, rtol=1e-6), cpu_coeffs) + end + + @testset "Complex Coefficients and Precision Handling" begin + basis = QuadPairBasis(1) + state1 = coherentstate(basis, 1.0f0) |> gpu + state2 = squeezedstate(basis, 0.2f0, Float32(π/4)) |> gpu + complex_coeffs = ComplexF32[0.6f0 + 0.8f0im, 0.3f0 - 0.4f0im] + complex_lc = GaussianLinearCombination(basis, complex_coeffs, [state1, state2]) + @test device(complex_lc) == :gpu + @test complex_lc.coeffs isa Vector{ComplexF32} + @test eltype(complex_lc.coeffs) == ComplexF32 + float64_state = coherentstate(basis, 1.0) |> gpu + mixed_lc = GaussianLinearCombination(basis, [0.5f0], [float64_state]) + @test device(mixed_lc) == :gpu + precise_op = displace(basis, 1.0) |> gpu + result = precise_op * complex_lc + @test device(result) == :gpu + @test result.coeffs ≈ complex_coeffs + large_coeffs = ComplexF32[1000.0f0, 0.001f0] + large_lc = GaussianLinearCombination(basis, large_coeffs, [state1, state2]) + Gabs.normalize!(large_lc) + @test device(large_lc) == :gpu + @test abs(sqrt(sum(abs2, large_lc.coeffs)) - 1.0f0) < 1e-6 + end + + @testset "Multi-Mode GPU Linear Combinations" begin + basis2 = QuadPairBasis(2) + basis3 = QuadPairBasis(3) + α_vec = [1.0f0 + 0.5f0im, -0.3f0 + 0.8f0im] + coh2mode = coherentstate(basis2, α_vec) |> gpu + epr_state = eprstate(basis2, 0.4f0, Float32(π/3)) |> gpu + lc_2mode = GaussianLinearCombination(basis2, [0.7f0, 0.3f0], [coh2mode, epr_state]) + @test device(lc_2mode) == :gpu + @test lc_2mode.basis.nmodes == 2 + bs = beamsplitter(basis2, 0.6f0) |> gpu + bs_result = bs * lc_2mode + @test device(bs_result) == :gpu + @test bs_result.basis.nmodes == 2 + α_vec3 = [1.0f0, 0.5f0im, -0.3f0 + 0.2f0im] + coh3mode = coherentstate(basis3, α_vec3) |> gpu + vac3mode = vacuumstate(basis3) |> gpu + lc_3mode = GaussianLinearCombination(basis3, [0.8f0, 0.2f0], [coh3mode, vac3mode]) + @test device(lc_3mode) == :gpu + @test lc_3mode.basis.nmodes == 3 + squeeze_multi = squeeze(basis3, [0.1f0, 0.2f0, 0.15f0], [0.0f0, Float32(π/4), Float32(π/2)]) |> gpu + multi_result = squeeze_multi * lc_3mode + @test device(multi_result) == :gpu + @test length(multi_result) == 2 + end + + @testset "Error Handling and Edge Cases" begin + basis1 = QuadPairBasis(1) + basis2 = QuadBlockBasis(1) + state1 = coherentstate(basis1, 1.0f0) |> gpu + state2 = coherentstate(basis2, 1.0f0) |> gpu + lc1 = GaussianLinearCombination(basis1, [0.5f0], [state1]) + lc2 = GaussianLinearCombination(basis2, [0.5f0], [state2]) + @test_throws ArgumentError lc1 + lc2 + @test_throws ArgumentError lc1 - lc2 + state_different_hbar = GaussianState(basis1, CuArray([0.0f0, 0.0f0]), + CuMatrix{Float32}(I(2)); ħ = 4) + lc_different = GaussianLinearCombination(basis1, [1.0f0], [state_different_hbar]) + @test_throws ArgumentError lc1 + lc_different + @test_throws ArgumentError GaussianLinearCombination(basis1, Float32[], typeof(state1)[]) + wrong_op = displace(basis2, 1.0f0) |> gpu + @test_throws ArgumentError wrong_op * lc1 + tiny_lc = GaussianLinearCombination(basis1, [Float32(1e-20)], [state1]) + Gabs.simplify!(tiny_lc) + @test length(tiny_lc) == 1 + @test device(tiny_lc) == :gpu + if CUDA.functional() + @test device(lc1) == :gpu + else + @warn "CUDA not functional - some GPU tests may not be meaningful" + end + end + + @testset "Performance and Type Stability" begin + basis = QuadPairBasis(1) + state1 = coherentstate(basis, 1.0f0) |> gpu + state2 = squeezedstate(basis, 0.3f0, Float32(π/4)) |> gpu + gpu_lc = GaussianLinearCombination(basis, [0.6f0, 0.8f0], [state1, state2]) + @inferred device(gpu_lc) + @inferred length(gpu_lc) + @inferred gpu_lc[1] + @inferred purity(gpu_lc) + @inferred entropy_vn(gpu_lc) + lc2 = GaussianLinearCombination(basis, [0.4f0], [state1]) + op = displace(basis, 0.5f0) |> gpu + sum_result = gpu_lc + lc2 + mul_result = 2.0f0 * gpu_lc + op_result = op * gpu_lc + @test device(sum_result) == :gpu + @test device(mul_result) == :gpu + @test device(op_result) == :gpu + @test sum_result.coeffs isa Vector{Float32} + @test mul_result.coeffs isa Vector{Float32} + @test op_result.coeffs isa Vector{Float32} + @test op_result.states[1].mean isa CuVector{Float32} + @test op_result.states[1].covar isa CuMatrix{Float32} + many_states = [coherentstate(basis, Float32(i)*0.1f0) |> gpu for i in 1:50] + many_coeffs = Float32.(randn(50)) + big_lc = GaussianLinearCombination(basis, many_coeffs, many_states) + batch_op = phaseshift(basis, π/3) |> gpu + batch_result = batch_op * big_lc + @test device(batch_result) == :gpu + @test length(batch_result) == 50 + @test batch_result.coeffs isa Vector{Float32} + end + + @testset "Integration with Existing Gabs Functionality" begin + basis = QuadPairBasis(1) + state1 = coherentstate(basis, 1.0f0) |> gpu + state2 = squeezedstate(basis, 0.2f0, Float32(π/4)) |> gpu + gpu_lc = GaussianLinearCombination(basis, [0.6f0, 0.8f0], [state1, state2]) + @test purity(gpu_lc) == 1.0 + @test entropy_vn(gpu_lc) == 0.0 + block_basis = QuadBlockBasis(1) + pair_state = coherentstate(basis, 1.0f0) |> gpu + block_state = changebasis(QuadBlockBasis, pair_state) + mixed_basis_lc = GaussianLinearCombination(block_basis, [1.0f0], [block_state]) + @test device(mixed_basis_lc) == :gpu + random_basis = QuadPairBasis(1) + random_state = randstate(random_basis; pure=true) |> gpu + random_lc = GaussianLinearCombination(random_basis, [1.0f0], [random_state]) + @test device(random_lc) == :gpu + att = attenuator(basis, π/6, 2.0f0) |> gpu + amp = amplifier(basis, 0.1f0, 1.2f0) |> gpu + composed_result = amp * (att * gpu_lc) + @test device(composed_result) == :gpu + @test composed_result.coeffs ≈ gpu_lc.coeffs + x_test = [0.0f0, 0.0f0] + end + + @testset "Professional API Usage Examples" begin + basis = QuadPairBasis(1) + state1 = coherentstate(basis, 1.0f0) |> gpu + state2 = squeezedstate(basis, 0.3f0, Float32(π/4)) |> gpu + cat_like = GaussianLinearCombination(basis, [0.7f0, 0.3f0], [state1, state2]) + @test device(cat_like) == :gpu + cpu_disp = displace(basis, 0.5f0) + result = cpu_disp * cat_like + @test device(result) == :gpu + cpu_state = thermalstate(basis, 1.0f0) + mixed_result = cpu_state + cat_like + @test device(mixed_result) == :gpu + @test cat_like.coeffs isa Vector{Float32} + @test Gabs._is_gpu_array(cat_like.coeffs) == false + @test cat_like.states[1].mean isa CuVector{Float32} + ops = [squeeze(basis, Float32(0.1f0*i), Float32(π/6*i)) |> gpu for i in 1:5] + results = [op * cat_like for op in ops] + @test all(device(r) == :gpu for r in results) + end +end + +@testitem "GPU Basis Operations" tags=[:cuda] begin + using Gabs + using Gabs: device, GaussianLinearCombination + using CUDA + using LinearAlgebra + using Test + + if !CUDA.functional() + @warn "CUDA not functional. Skipping GPU basis operations tests." + return + end + + @testset "Public API Device Management" begin + + @testset "Device Detection with Public API" begin + cpu_array = rand(Float32, 10) + gpu_array = CuArray(cpu_array) + @test device(cpu_array) == :cpu + @test device(gpu_array) == :gpu + basis = QuadPairBasis(2) + cpu_state = coherentstate(basis, 1.0 + 0.5im) + gpu_state = gpu(cpu_state, precision=Float32) + @test device(cpu_state) == :cpu + @test device(gpu_state) == :gpu + cpu_op = displace(basis, 0.5 + 0.3im) + gpu_op = gpu(cpu_op, precision=Float32) + @test device(cpu_op) == :cpu + @test device(gpu_op) == :gpu + end + + @testset "Device Transfer with Public API" begin + basis = QuadPairBasis(2) + cpu_state = coherentstate(basis, 1.0 + 0.5im) + gpu_state_f32 = gpu(cpu_state, precision=Float32) + gpu_state_f64 = gpu(cpu_state, precision=Float64) + @test device(gpu_state_f32) == :gpu + @test device(gpu_state_f64) == :gpu + @test eltype(gpu_state_f32.mean) == Float32 + @test eltype(gpu_state_f64.mean) == Float64 + cpu_back_f32 = cpu(gpu_state_f32) + cpu_back_f64 = cpu(gpu_state_f64) + @test device(cpu_back_f32) == :cpu + @test device(cpu_back_f64) == :cpu + @test isapprox(cpu_back_f32.mean, cpu_state.mean, rtol=1e-6) + @test isapprox(cpu_back_f64.mean, cpu_state.mean, rtol=1e-12) + end + + @testset "Adapt Device Functionality" begin + basis = QuadPairBasis(1) + cpu_state = coherentstate(basis, 1.0) + gpu_state = gpu(cpu_state, precision=Float32) + new_cpu_state = adapt_device(coherentstate, cpu_state, basis, 2.0) + new_gpu_state = adapt_device(coherentstate, gpu_state, basis, 2.0) + @test device(new_cpu_state) == :cpu + @test device(new_gpu_state) == :gpu + end + end + + @testset "GPU GaussianState Basis Conversions" begin + nmodes_list = [1, 2, 5] + precisions = [Float32, Float64] + + for nmodes in nmodes_list + for T in precisions + @testset "$(nmodes) modes, $(T) precision" begin + pair_basis = QuadPairBasis(nmodes) + block_basis = QuadBlockBasis(nmodes) + cpu_coherent_pair = coherentstate(pair_basis, 1.0 + 0.5im) + cpu_coherent_block = coherentstate(block_basis, 1.0 + 0.5im) + cpu_squeezed_pair = squeezedstate(pair_basis, 0.3, π/4) + cpu_squeezed_block = squeezedstate(block_basis, 0.3, π/4) + gpu_coherent_pair = gpu(cpu_coherent_pair, precision=T) + gpu_coherent_block = gpu(cpu_coherent_block, precision=T) + gpu_squeezed_pair = gpu(cpu_squeezed_pair, precision=T) + gpu_squeezed_block = gpu(cpu_squeezed_block, precision=T) + + @testset "QuadPairBasis -> QuadBlockBasis" begin + gpu_converted = changebasis(QuadBlockBasis, gpu_coherent_pair) + cpu_converted = changebasis(QuadBlockBasis, cpu_coherent_pair) + @test gpu_converted.basis isa QuadBlockBasis + @test gpu_converted.basis.nmodes == nmodes + @test device(gpu_converted) == :gpu + @test eltype(gpu_converted.mean) == T + @test eltype(gpu_converted.covar) == T + @test gpu_converted.ħ == cpu_coherent_pair.ħ + @test isapprox(Array(gpu_converted.mean), cpu_converted.mean, atol=1e-6) + @test isapprox(Array(gpu_converted.covar), cpu_converted.covar, atol=1e-6) + gpu_squeezed_converted = changebasis(QuadBlockBasis, gpu_squeezed_pair) + cpu_squeezed_converted = changebasis(QuadBlockBasis, cpu_squeezed_pair) + @test device(gpu_squeezed_converted) == :gpu + @test isapprox(Array(gpu_squeezed_converted.mean), cpu_squeezed_converted.mean, atol=1e-6) + @test isapprox(Array(gpu_squeezed_converted.covar), cpu_squeezed_converted.covar, atol=1e-6) + end + + @testset "QuadBlockBasis -> QuadPairBasis" begin + gpu_converted = changebasis(QuadPairBasis, gpu_coherent_block) + cpu_converted = changebasis(QuadPairBasis, cpu_coherent_block) + @test gpu_converted.basis isa QuadPairBasis + @test gpu_converted.basis.nmodes == nmodes + @test device(gpu_converted) == :gpu + @test eltype(gpu_converted.mean) == T + @test eltype(gpu_converted.covar) == T + @test isapprox(Array(gpu_converted.mean), cpu_converted.mean, atol=1e-6) + @test isapprox(Array(gpu_converted.covar), cpu_converted.covar, atol=1e-6) + gpu_squeezed_converted = changebasis(QuadPairBasis, gpu_squeezed_block) + cpu_squeezed_converted = changebasis(QuadPairBasis, cpu_squeezed_block) + @test device(gpu_squeezed_converted) == :gpu + @test isapprox(Array(gpu_squeezed_converted.mean), cpu_squeezed_converted.mean, atol=1e-6) + @test isapprox(Array(gpu_squeezed_converted.covar), cpu_squeezed_converted.covar, atol=1e-6) + end + + @testset "Same basis no-op conversions" begin + same_basis_pair = changebasis(QuadPairBasis, gpu_coherent_pair) + same_basis_block = changebasis(QuadBlockBasis, gpu_coherent_block) + @test device(same_basis_pair) == :gpu + @test device(same_basis_block) == :gpu + @test same_basis_pair.basis isa QuadPairBasis + @test same_basis_block.basis isa QuadBlockBasis + @test isapprox(Array(same_basis_pair.mean), Array(gpu_coherent_pair.mean), atol=1e-12) + @test isapprox(Array(same_basis_block.mean), Array(gpu_coherent_block.mean), atol=1e-12) + end + + @testset "Round-trip conversion consistency" begin + gpu_pair_to_block = changebasis(QuadBlockBasis, gpu_coherent_pair) + gpu_round_trip = changebasis(QuadPairBasis, gpu_pair_to_block) + @test device(gpu_round_trip) == :gpu + @test isapprox(Array(gpu_round_trip.mean), Array(gpu_coherent_pair.mean), atol=1e-6) + @test isapprox(Array(gpu_round_trip.covar), Array(gpu_coherent_pair.covar), atol=1e-6) + gpu_block_to_pair = changebasis(QuadPairBasis, gpu_coherent_block) + gpu_round_trip2 = changebasis(QuadBlockBasis, gpu_block_to_pair) + @test device(gpu_round_trip2) == :gpu + @test isapprox(Array(gpu_round_trip2.mean), Array(gpu_coherent_block.mean), atol=1e-6) + @test isapprox(Array(gpu_round_trip2.covar), Array(gpu_coherent_block.covar), atol=1e-6) + end + end + end + end + end + + @testset "GPU GaussianUnitary Basis Conversions" begin + for nmodes in [1, 2] + for T in [Float32, Float64] + @testset "$(nmodes) modes, $(T) precision" begin + pair_basis = QuadPairBasis(nmodes) + block_basis = QuadBlockBasis(nmodes) + cpu_displace_pair = displace(pair_basis, 0.5 + 0.3im) + cpu_displace_block = displace(block_basis, 0.5 + 0.3im) + cpu_squeeze_pair = squeeze(pair_basis, 0.2, π/3) + cpu_squeeze_block = squeeze(block_basis, 0.2, π/3) + gpu_displace_pair = gpu(cpu_displace_pair, precision=T) + gpu_displace_block = gpu(cpu_displace_block, precision=T) + gpu_squeeze_pair = gpu(cpu_squeeze_pair, precision=T) + gpu_squeeze_block = gpu(cpu_squeeze_block, precision=T) + + @testset "Unitary basis conversions" begin + gpu_converted = changebasis(QuadBlockBasis, gpu_displace_pair) + cpu_converted = changebasis(QuadBlockBasis, cpu_displace_pair) + @test gpu_converted.basis isa QuadBlockBasis + @test device(gpu_converted) == :gpu + @test isapprox(Array(gpu_converted.disp), cpu_converted.disp, atol=1e-6) + @test isapprox(Array(gpu_converted.symplectic), cpu_converted.symplectic, atol=1e-6) + gpu_converted2 = changebasis(QuadPairBasis, gpu_displace_block) + cpu_converted2 = changebasis(QuadPairBasis, cpu_displace_block) + @test gpu_converted2.basis isa QuadPairBasis + @test device(gpu_converted2) == :gpu + @test isapprox(Array(gpu_converted2.disp), cpu_converted2.disp, atol=1e-6) + @test isapprox(Array(gpu_converted2.symplectic), cpu_converted2.symplectic, atol=1e-6) + gpu_squeeze_converted = changebasis(QuadBlockBasis, gpu_squeeze_pair) + cpu_squeeze_converted = changebasis(QuadBlockBasis, cpu_squeeze_pair) + @test device(gpu_squeeze_converted) == :gpu + @test isapprox(Array(gpu_squeeze_converted.disp), cpu_squeeze_converted.disp, atol=1e-6) + @test isapprox(Array(gpu_squeeze_converted.symplectic), cpu_squeeze_converted.symplectic, atol=1e-6) + end + + @testset "Unitary no-op conversions" begin + same_basis_pair = changebasis(QuadPairBasis, gpu_displace_pair) + same_basis_block = changebasis(QuadBlockBasis, gpu_displace_block) + @test device(same_basis_pair) == :gpu + @test device(same_basis_block) == :gpu + @test same_basis_pair.basis isa QuadPairBasis + @test same_basis_block.basis isa QuadBlockBasis + end + end + end + end + end + + @testset "GPU GaussianChannel Basis Conversions" begin + for nmodes in [1, 2] + for T in [Float32, Float64] + @testset "$(nmodes) modes, $(T) precision" begin + pair_basis = QuadPairBasis(nmodes) + block_basis = QuadBlockBasis(nmodes) + cpu_attenuator_pair = attenuator(pair_basis, π/4, 2) + cpu_attenuator_block = attenuator(block_basis, π/4, 2) + cpu_amplifier_pair = amplifier(pair_basis, 0.3, 3) + cpu_amplifier_block = amplifier(block_basis, 0.3, 3) + gpu_attenuator_pair = gpu(cpu_attenuator_pair, precision=T) + gpu_attenuator_block = gpu(cpu_attenuator_block, precision=T) + gpu_amplifier_pair = gpu(cpu_amplifier_pair, precision=T) + gpu_amplifier_block = gpu(cpu_amplifier_block, precision=T) + + @testset "Channel basis conversions" begin + gpu_converted = changebasis(QuadBlockBasis, gpu_attenuator_pair) + cpu_converted = changebasis(QuadBlockBasis, cpu_attenuator_pair) + @test gpu_converted.basis isa QuadBlockBasis + @test device(gpu_converted) == :gpu + @test isapprox(Array(gpu_converted.disp), cpu_converted.disp, atol=1e-6) + @test isapprox(Array(gpu_converted.transform), cpu_converted.transform, atol=1e-6) + @test isapprox(Array(gpu_converted.noise), cpu_converted.noise, atol=1e-6) + gpu_converted2 = changebasis(QuadPairBasis, gpu_attenuator_block) + cpu_converted2 = changebasis(QuadPairBasis, cpu_attenuator_block) + @test gpu_converted2.basis isa QuadPairBasis + @test device(gpu_converted2) == :gpu + @test isapprox(Array(gpu_converted2.disp), cpu_converted2.disp, atol=1e-6) + @test isapprox(Array(gpu_converted2.transform), cpu_converted2.transform, atol=1e-6) + @test isapprox(Array(gpu_converted2.noise), cpu_converted2.noise, atol=1e-6) + gpu_amp_converted = changebasis(QuadBlockBasis, gpu_amplifier_pair) + cpu_amp_converted = changebasis(QuadBlockBasis, cpu_amplifier_pair) + @test device(gpu_amp_converted) == :gpu + @test isapprox(Array(gpu_amp_converted.disp), cpu_amp_converted.disp, atol=1e-6) + @test isapprox(Array(gpu_amp_converted.transform), cpu_amp_converted.transform, atol=1e-6) + @test isapprox(Array(gpu_amp_converted.noise), cpu_amp_converted.noise, atol=1e-6) + end + + @testset "Channel no-op conversions" begin + same_basis_pair = changebasis(QuadPairBasis, gpu_attenuator_pair) + same_basis_block = changebasis(QuadBlockBasis, gpu_attenuator_block) + @test device(same_basis_pair) == :gpu + @test device(same_basis_block) == :gpu + @test same_basis_pair.basis isa QuadPairBasis + @test same_basis_block.basis isa QuadBlockBasis + end + end + end + end + end + + @testset "CPU/GPU Mixed Operations with Basis Conversion" begin + basis_pair = QuadPairBasis(2) + basis_block = QuadBlockBasis(2) + cpu_state_pair = coherentstate(basis_pair, 1.0 + 0.5im) + gpu_state_pair = gpu(coherentstate(basis_pair, 0.5 - 0.3im), precision=Float32) + cpu_state_block = coherentstate(basis_block, 2.0 + 0.1im) + gpu_state_block = gpu(coherentstate(basis_block, -0.8 + 0.7im), precision=Float32) + + @testset "Mixed device basis conversions" begin + cpu_pair_to_block = changebasis(QuadBlockBasis, cpu_state_pair) + @test device(cpu_pair_to_block) == :cpu + @test cpu_pair_to_block.basis isa QuadBlockBasis + gpu_pair_to_block = changebasis(QuadBlockBasis, gpu_state_pair) + @test device(gpu_pair_to_block) == :gpu + @test gpu_pair_to_block.basis isa QuadBlockBasis + cpu_to_gpu_converted = gpu(changebasis(QuadBlockBasis, cpu_state_pair), precision=Float32) + @test device(cpu_to_gpu_converted) == :gpu + @test cpu_to_gpu_converted.basis isa QuadBlockBasis + gpu_to_cpu_converted = cpu(changebasis(QuadPairBasis, gpu_state_block)) + @test device(gpu_to_cpu_converted) == :cpu + @test gpu_to_cpu_converted.basis isa QuadPairBasis + end + + @testset "Operations across converted states" begin + cpu_op_pair = displace(basis_pair, 0.3) + gpu_op_block = gpu(displace(basis_block, 0.4), precision=Float32) + converted_state = changebasis(QuadBlockBasis, gpu_state_pair) + result = gpu_op_block * converted_state + @test device(result) == :gpu + @test result.basis isa QuadBlockBasis + cpu_state_converted = changebasis(QuadPairBasis, cpu_state_block) + result2 = cpu_op_pair * cpu_state_converted + @test device(result2) == :cpu + @test result2.basis isa QuadPairBasis + end + end + + @testset "GPU Basis Conversion Error Handling" begin + basis_pair = QuadPairBasis(2) + cpu_state = coherentstate(basis_pair, 1.0 + 0.5im) + converted_cpu = changebasis(QuadBlockBasis, cpu_state) + @test converted_cpu.basis isa QuadBlockBasis + @test device(converted_cpu) == :cpu + @test isgaussian(converted_cpu, atol=1e-4) + squeezed_cpu = squeezedstate(basis_pair, 0.5, π/4) + converted_squeezed = changebasis(QuadBlockBasis, squeezed_cpu) + @test isgaussian(converted_squeezed, atol=1e-4) + thermal_cpu = thermalstate(basis_pair, 2.0) + converted_thermal = changebasis(QuadBlockBasis, thermal_cpu) + @test isgaussian(converted_thermal, atol=1e-4) + end + + @testset "Memory Efficiency and Performance Validation" begin + basis_pair = QuadPairBasis(10) + large_state = gpu(squeezedstate(basis_pair, 0.5, π/6), precision=Float32) + @test device(large_state) == :gpu + @test eltype(large_state.mean) == Float32 + converted_state = changebasis(QuadBlockBasis, large_state) + @test device(converted_state) == :gpu + @test converted_state.basis isa QuadBlockBasis + @test eltype(converted_state.mean) == Float32 + @test eltype(converted_state.covar) == Float32 + round_trip = changebasis(QuadPairBasis, converted_state) + @test device(round_trip) == :gpu + @test isapprox(Array(round_trip.mean), Array(large_state.mean), atol=1e-6) + @test isapprox(Array(round_trip.covar), Array(large_state.covar), atol=1e-6) + end + + @testset "Integration with Linear Combinations" begin + basis_pair = QuadPairBasis(2) + state1 = gpu(coherentstate(basis_pair, 1.0), precision=Float32) + state2 = gpu(coherentstate(basis_pair, -1.0), precision=Float32) + lc = GaussianLinearCombination(basis_pair, [0.7f0, 0.3f0], [state1, state2]) + @test device(lc) == :gpu + @test lc.basis isa QuadPairBasis + converted_state1 = changebasis(QuadBlockBasis, state1) + converted_state2 = changebasis(QuadBlockBasis, state2) + @test device(converted_state1) == :gpu + @test device(converted_state2) == :gpu + @test converted_state1.basis isa QuadBlockBasis + @test converted_state2.basis isa QuadBlockBasis + lc_converted = GaussianLinearCombination(QuadBlockBasis(2), [0.7f0, 0.3f0], + [converted_state1, converted_state2]) + @test device(lc_converted) == :gpu + @test lc_converted.basis isa QuadBlockBasis + end +end + +@testitem "GPU Random Generation Suite" tags=[:cuda] begin + using Gabs + using Gabs: device, GaussianState, GaussianUnitary, GaussianChannel + using CUDA + using LinearAlgebra: adjoint, inv, eigvals, det + using Test + + if !CUDA.functional() + @warn "CUDA not functional. Skipping GPU random generation tests." + return + end + + @testset "GPU Random States - Physical Properties" begin + nmodes = rand(1:3) + qpairbasis = QuadPairBasis(nmodes) + qblockbasis = QuadBlockBasis(nmodes) + + @testset "Pure Random States - Convenience API" begin + rs_pair_gpu = randstate_gpu(qpairbasis; pure=true) + rs_block_gpu = randstate_gpu(qblockbasis; pure=true) + @test rs_pair_gpu isa GaussianState + @test rs_block_gpu isa GaussianState + @test device(rs_pair_gpu) == :gpu + @test device(rs_block_gpu) == :gpu + @test rs_pair_gpu.ħ == 2 + @test rs_block_gpu.ħ == 2 + rs_pair_cpu = cpu(rs_pair_gpu) + rs_block_cpu = cpu(rs_block_gpu) + @test isgaussian(rs_pair_cpu, atol = 1e-3) + @test isgaussian(rs_block_cpu, atol = 1e-3) + @test isapprox(purity(rs_pair_cpu), 1.0, atol = 1e-3) + @test isapprox(purity(rs_block_cpu), 1.0, atol = 1e-3) + end + + @testset "Mixed Random States - Convenience API" begin + rs_mixed_gpu = randstate_gpu(qpairbasis; pure=false) + @test rs_mixed_gpu isa GaussianState + @test device(rs_mixed_gpu) == :gpu + rs_mixed_cpu = cpu(rs_mixed_gpu) + @test isgaussian(rs_mixed_cpu, atol = 1e-3) + end + + @testset "Typed Random States - Your GPU Extensions" begin + rs_typed_gpu = randstate(CuVector{Float32}, CuMatrix{Float32}, qpairbasis; pure=true) + @test rs_typed_gpu isa GaussianState + @test device(rs_typed_gpu) == :gpu + @test eltype(rs_typed_gpu.mean) == Float32 + @test eltype(rs_typed_gpu.covar) == Float32 + rs_typed_cpu = cpu(rs_typed_gpu) + @test isgaussian(rs_typed_cpu, atol = 1e-3) + @test isapprox(purity(rs_typed_cpu), 1.0, atol = 1e-3) + end + + @testset "Batch States - Your Batch Functions" begin + batch_size = 3 + states_gpu = batch_randstate_gpu(qpairbasis, batch_size; pure=true) + @test length(states_gpu) == batch_size + @test all(s isa GaussianState for s in states_gpu) + @test all(device(s) == :gpu for s in states_gpu) + state_cpu = cpu(states_gpu[1]) + @test isgaussian(state_cpu, atol = 1e-3) + @test isapprox(purity(state_cpu), 1.0, atol = 1e-3) + end + end + + @testset "GPU Random Unitaries - Your GPU Functions" begin + nmodes = rand(1:3) + qpairbasis = QuadPairBasis(nmodes) + + @testset "Passive Unitaries - Convenience API" begin + ru_passive_gpu = randunitary_gpu(qpairbasis; passive=true) + @test ru_passive_gpu isa GaussianUnitary + @test device(ru_passive_gpu) == :gpu + @test ru_passive_gpu.ħ == 2 + S_cpu = Array(ru_passive_gpu.symplectic) + @test isapprox(S_cpu', inv(S_cpu), atol = 1e-3) + @test issymplectic(qpairbasis, S_cpu, atol = 1e-3) + end + + @testset "Active Unitaries - Convenience API" begin + ru_active_gpu = randunitary_gpu(qpairbasis; passive=false) + @test ru_active_gpu isa GaussianUnitary + @test device(ru_active_gpu) == :gpu + @test ru_active_gpu.ħ == 2 + S_cpu = Array(ru_active_gpu.symplectic) + @test issymplectic(qpairbasis, S_cpu, atol = 1e-3) + end + + @testset "Typed Unitaries - Your GPU Extensions" begin + ru_typed_gpu = randunitary(CuVector{Float32}, CuMatrix{Float32}, qpairbasis; passive=false) + @test ru_typed_gpu isa GaussianUnitary + @test device(ru_typed_gpu) == :gpu + @test eltype(ru_typed_gpu.disp) == Float32 + @test eltype(ru_typed_gpu.symplectic) == Float32 + S_cpu = Array(ru_typed_gpu.symplectic) + @test issymplectic(qpairbasis, S_cpu, atol = 1e-3) + end + + @testset "Batch Unitaries - Your Batch Functions" begin + batch_size = 3 + unitaries_gpu = batch_randunitary_gpu(qpairbasis, batch_size; passive=false) + @test length(unitaries_gpu) == batch_size + @test all(u isa GaussianUnitary for u in unitaries_gpu) + @test all(device(u) == :gpu for u in unitaries_gpu) + S_cpu = Array(unitaries_gpu[1].symplectic) + @test issymplectic(qpairbasis, S_cpu, atol = 1e-3) + end + end + + @testset "GPU Random Channels - Your GPU Functions" begin + nmodes = rand(1:2) + qpairbasis = QuadPairBasis(nmodes) + + @testset "Basic Channel Generation - Convenience API" begin + rc_gpu = randchannel_gpu(qpairbasis) + @test rc_gpu isa GaussianChannel + @test device(rc_gpu) == :gpu + @test rc_gpu.ħ == 2 + rc_cpu = cpu(rc_gpu) + @test isgaussian(rc_cpu, atol = 1e-3) + state_gpu = randstate_gpu(qpairbasis) + output_gpu = rc_gpu * state_gpu + @test output_gpu isa GaussianState + @test device(output_gpu) == :gpu + end + + @testset "Typed Channels - Your GPU Extensions" begin + rc_typed_gpu = randchannel(CuVector{Float32}, CuMatrix{Float32}, qpairbasis) + @test rc_typed_gpu isa GaussianChannel + @test device(rc_typed_gpu) == :gpu + @test eltype(rc_typed_gpu.disp) == Float32 + @test eltype(rc_typed_gpu.transform) == Float32 + @test eltype(rc_typed_gpu.noise) == Float32 + rc_cpu = cpu(rc_typed_gpu) + @test isgaussian(rc_cpu, atol = 1e-3) + end + end + + @testset "GPU Random Symplectic Matrices - Your GPU Functions" begin + nmodes = rand(1:3) + qpairbasis = QuadPairBasis(nmodes) + qblockbasis = QuadBlockBasis(nmodes) + + @testset "Passive Symplectic - Convenience API" begin + S_passive_gpu = randsymplectic_gpu(qpairbasis; passive=true) + @test S_passive_gpu isa CuMatrix + @test size(S_passive_gpu) == (2*nmodes, 2*nmodes) + S_passive_cpu = Array(S_passive_gpu) + @test isapprox(S_passive_cpu', inv(S_passive_cpu), atol = 1e-3) + @test issymplectic(qpairbasis, S_passive_cpu, atol = 1e-3) + end + + @testset "Active Symplectic - Convenience API" begin + S_active_gpu = randsymplectic_gpu(qblockbasis; passive=false) + @test S_active_gpu isa CuMatrix + @test size(S_active_gpu) == (2*nmodes, 2*nmodes) + S_active_cpu = Array(S_active_gpu) + @test issymplectic(qblockbasis, S_active_cpu, atol = 1e-3) + end + + @testset "Typed Symplectic - Your GPU Extensions" begin + S_typed_gpu = randsymplectic(CuMatrix{Float32}, qpairbasis; passive=true) + @test S_typed_gpu isa CuMatrix{Float32} + @test eltype(S_typed_gpu) == Float32 + @test size(S_typed_gpu) == (2*nmodes, 2*nmodes) + S_typed_cpu = Array(S_typed_gpu) + @test isapprox(S_typed_cpu', inv(S_typed_cpu), atol = 1e-3) + @test issymplectic(qpairbasis, S_typed_cpu, atol = 1e-3) + end + end + + @testset "GPU Random Integration - Your Functions Working Together" begin + nmodes = 2 + qpairbasis = QuadPairBasis(nmodes) + + @testset "State-Unitary Operations" begin + state_gpu = randstate_gpu(qpairbasis; pure=true) + unitary_gpu = randunitary_gpu(qpairbasis; passive=false) + transformed_gpu = unitary_gpu * state_gpu + @test transformed_gpu isa GaussianState + @test device(transformed_gpu) == :gpu + transformed_cpu = cpu(transformed_gpu) + @test isgaussian(transformed_cpu, atol = 1e-3) + end + + @testset "State-Channel Operations" begin + state_gpu = randstate_gpu(qpairbasis; pure=false) + channel_gpu = randchannel_gpu(qpairbasis) + output_gpu = channel_gpu * state_gpu + @test output_gpu isa GaussianState + @test device(output_gpu) == :gpu + output_cpu = cpu(output_gpu) + @test isgaussian(output_cpu, atol = 1e-3) + end + end + + @testset "GPU Random Error Handling - Your API" begin + qpairbasis = QuadPairBasis(1) + @test_nowarn randstate(CuVector{Float32}, CuMatrix{Float32}, qpairbasis) + @test_nowarn randunitary(CuVector{Float32}, CuMatrix{Float32}, qpairbasis) + @test_nowarn randchannel(CuVector{Float32}, CuMatrix{Float32}, qpairbasis) + @test_nowarn randsymplectic(CuMatrix{Float32}, qpairbasis) + @test_nowarn randstate_gpu(qpairbasis) + @test_nowarn randunitary_gpu(qpairbasis) + @test_nowarn randchannel_gpu(qpairbasis) + @test_nowarn randsymplectic_gpu(qpairbasis) + @test_nowarn batch_randstate_gpu(qpairbasis, 2) + @test_nowarn batch_randunitary_gpu(qpairbasis, 2) + end + + @testset "GPU Random Precision Control - Your Features" begin + qpairbasis = QuadPairBasis(1) + + @testset "Float32 Precision" begin + state_f32 = randstate_gpu(qpairbasis; precision=Float32) + unitary_f32 = randunitary_gpu(qpairbasis; precision=Float32) + symp_f32 = randsymplectic_gpu(qpairbasis; precision=Float32) + @test eltype(state_f32.mean) == Float32 + @test eltype(unitary_f32.disp) == Float32 + @test eltype(symp_f32) == Float32 + end + + @testset "Float64 Precision" begin + state_f64 = randstate_gpu(qpairbasis; precision=Float64) + unitary_f64 = randunitary_gpu(qpairbasis; precision=Float64) + symp_f64 = randsymplectic_gpu(qpairbasis; precision=Float64) + @test eltype(state_f64.mean) == Float64 + @test eltype(unitary_f64.disp) == Float64 + @test eltype(symp_f64) == Float64 + end + end + + @testset "GPU Random Performance - Your Batch Functions" begin + qpairbasis = QuadPairBasis(2) + + @testset "Batch vs Individual Performance" begin + batch_size = 5 + batch_states = batch_randstate_gpu(qpairbasis, batch_size; pure=true) + batch_unitaries = batch_randunitary_gpu(qpairbasis, batch_size; passive=false) + @test length(batch_states) == batch_size + @test length(batch_unitaries) == batch_size + @test all(device(s) == :gpu for s in batch_states) + @test all(device(u) == :gpu for u in batch_unitaries) + state_cpu = cpu(batch_states[1]) + unitary_cpu = cpu(batch_unitaries[1]) + @test isgaussian(state_cpu, atol = 1e-3) + @test issymplectic(qpairbasis, Array(batch_unitaries[1].symplectic), atol = 1e-3) + end + end +end + +@testitem "GPU Cross-Wigner Foundation - single thread, but accuracy tests" tags=[:cuda] begin + using Gabs + using Gabs: device, cross_wigner, cross_wignerchar + using LinearAlgebra + using CUDA + + @testset "GPU Cross-Wigner Function Correctness" begin + basis = QuadPairBasis(1) + state1_cpu = coherentstate(basis, 1.0 + 0.5im) + state2_cpu = coherentstate(basis, -0.8 + 0.3im) + state1_gpu = GaussianState(basis, CuArray{Float32}(state1_cpu.mean), CuArray{Float32}(state1_cpu.covar); ħ = state1_cpu.ħ) + state2_gpu = GaussianState(basis, CuArray{Float32}(state2_cpu.mean), CuArray{Float32}(state2_cpu.covar); ħ = state2_cpu.ħ) + test_points = [ + [0.0, 0.0], + [1.0, 0.5], + [-0.5, 1.2], + [0.3, -0.8], + [2.1, -1.4] + ] + + @testset "Cross-Wigner vs CPU Implementation" begin + for x in test_points + cpu_result = cross_wigner(state1_cpu, state2_cpu, x) + gpu_result = cross_wigner(state1_gpu, state2_gpu, x) + @test real(cpu_result) ≈ real(gpu_result) atol=1e-6 rtol=1e-5 + @test imag(cpu_result) ≈ imag(gpu_result) atol=1e-6 rtol=1e-5 + end + end + + @testset "Cross-Wigner Hermiticity Property" begin + for x in test_points + w12 = cross_wigner(state1_gpu, state2_gpu, x) + w21 = cross_wigner(state2_gpu, state1_gpu, x) + @test real(w12) ≈ real(w21) atol=1e-6 rtol=1e-5 + @test imag(w12) ≈ -imag(w21) atol=1e-6 rtol=1e-5 + @test w12 ≈ conj(w21) atol=1e-6 rtol=1e-5 + end + end + + @testset "Cross-Wigner Identity Property" begin + for x in test_points + cross_diag = cross_wigner(state1_gpu, state1_gpu, x) + regular_wigner = wigner(state1_gpu, x) + @test real(cross_diag) ≈ regular_wigner atol=1e-6 rtol=1e-5 + @test abs(imag(cross_diag)) < 1e-6 + end + end + end + + @testset "GPU Cross-Wigner Characteristic Function Correctness" begin + basis = QuadPairBasis(1) + state1_cpu = squeezedstate(basis, 0.3, π/4) + state2_cpu = coherentstate(basis, 0.5 - 0.2im) + state1_gpu = GaussianState(basis, CuArray{Float32}(state1_cpu.mean), CuArray{Float32}(state1_cpu.covar); ħ = state1_cpu.ħ) + state2_gpu = GaussianState(basis, CuArray{Float32}(state2_cpu.mean), CuArray{Float32}(state2_cpu.covar); ħ = state2_cpu.ħ) + test_xi = [ + [0.0, 0.0], + [0.1, -0.3], + [-0.7, 0.4], + [1.2, 0.8], + [-0.9, -1.1] + ] + + @testset "Cross-WignerChar vs CPU Implementation" begin + for xi in test_xi + cpu_result = cross_wignerchar(state1_cpu, state2_cpu, xi) + gpu_result = cross_wignerchar(state1_gpu, state2_gpu, xi) + @test real(cpu_result) ≈ real(gpu_result) atol=1e-6 rtol=1e-5 + @test imag(cpu_result) ≈ imag(gpu_result) atol=1e-6 rtol=1e-5 + end + end + + @testset "Cross-WignerChar Hermiticity Property" begin + for xi in test_xi + chi12 = cross_wignerchar(state1_gpu, state2_gpu, xi) + chi21 = cross_wignerchar(state2_gpu, state1_gpu, -xi) + @test real(chi12) ≈ real(chi21) atol=1e-6 rtol=1e-5 + @test imag(chi12) ≈ -imag(chi21) atol=1e-6 rtol=1e-5 + end + end + + @testset "Cross-WignerChar Identity Property" begin + for xi in test_xi + cross_diag = cross_wignerchar(state1_gpu, state1_gpu, xi) + regular_char = wignerchar(state1_gpu, xi) + @test abs(cross_diag - regular_char) < 1e-6 + end + end + end + + @testset "GPU Cross-Wigner Multi-mode Systems" begin + basis = QuadPairBasis(2) + state1_cpu = coherentstate(basis, [1.0 + 0.5im, -0.3 + 0.8im]) + state2_cpu = squeezedstate(basis, [0.2, 0.4], [π/6, π/3]) + state1_gpu = GaussianState(basis, CuArray{Float32}(state1_cpu.mean), CuArray{Float32}(state1_cpu.covar); ħ = state1_cpu.ħ) + state2_gpu = GaussianState(basis, CuArray{Float32}(state2_cpu.mean), CuArray{Float32}(state2_cpu.covar); ħ = state2_cpu.ħ) + test_points_2mode = [ + [0.0, 0.0, 0.0, 0.0], + [1.0, 0.5, -0.3, 0.8], + [-0.2, 1.1, 0.7, -0.4] + ] + + @testset "Multi-mode Cross-Wigner Correctness" begin + for x in test_points_2mode + cpu_result = cross_wigner(state1_cpu, state2_cpu, x) + gpu_result = cross_wigner(state1_gpu, state2_gpu, x) + @test real(cpu_result) ≈ real(gpu_result) atol=1e-5 rtol=1e-4 + @test imag(cpu_result) ≈ imag(gpu_result) atol=1e-5 rtol=1e-4 + end + end + + @testset "Multi-mode Cross-WignerChar Correctness" begin + for xi in test_points_2mode + cpu_result = cross_wignerchar(state1_cpu, state2_cpu, xi) + gpu_result = cross_wignerchar(state1_gpu, state2_gpu, xi) + @test real(cpu_result) ≈ real(gpu_result) atol=1e-5 rtol=1e-4 + @test imag(cpu_result) ≈ imag(gpu_result) atol=1e-5 rtol=1e-4 + end + end + end + + @testset "GPU Cross-Wigner Mixed Device Support" begin + basis = QuadPairBasis(1) + state1_gpu = coherentstate(CuVector{Float32}, CuMatrix{Float32}, basis, 0.8 + 0.6im) + state2_gpu = squeezedstate(CuVector{Float32}, CuMatrix{Float32}, basis, 0.4, π/8) + x_cpu = [0.5, -0.3] + xi_cpu = [0.2, 0.8] + + @testset "GPU States with CPU Points" begin + result_w = cross_wigner(state1_gpu, state2_gpu, x_cpu) + result_chi = cross_wignerchar(state1_gpu, state2_gpu, xi_cpu) + @test result_w isa ComplexF32 + @test result_chi isa ComplexF32 + @test isfinite(real(result_w)) && isfinite(imag(result_w)) + @test isfinite(real(result_chi)) && isfinite(imag(result_chi)) + end + end + + @testset "GPU Cross-Wigner Error Handling" begin + basis1 = QuadPairBasis(1) + basis2 = QuadBlockBasis(1) + state1 = coherentstate(CuVector{Float32}, CuMatrix{Float32}, basis1, 1.0) + state2 = coherentstate(CuVector{Float32}, CuMatrix{Float32}, basis2, 1.0) + state3 = GaussianState(basis1, CuArray{Float32}([1.0, 0.0]), CuArray{Float32}([1.0 0.0; 0.0 1.0]); ħ = 4) + x = [0.0, 0.0] + + @testset "Basis Mismatch Detection" begin + @test_throws ArgumentError cross_wigner(state1, state2, x) + @test_throws ArgumentError cross_wignerchar(state1, state2, x) + end + + @testset "ħ Mismatch Detection" begin + @test_throws ArgumentError cross_wigner(state1, state3, x) + @test_throws ArgumentError cross_wignerchar(state1, state3, x) + end + + @testset "Dimension Mismatch Detection" begin + wrong_x = [0.0] + @test_throws ArgumentError cross_wigner(state1, state1, wrong_x) + @test_throws ArgumentError cross_wignerchar(state1, state1, wrong_x) + end + end + + @testset "GPU Cross-Wigner Performance Validation" begin + basis = QuadPairBasis(2) + state1_gpu = coherentstate(CuVector{Float32}, CuMatrix{Float32}, basis, [1.0 + 0.5im, -0.3 + 0.8im]) + state2_gpu = eprstate(CuVector{Float32}, CuMatrix{Float32}, basis, 0.3, π/4) + x = [0.1, 0.2, 0.3, 0.4] + xi = [0.5, 0.6, 0.7, 0.8] + + @testset "GPU Functions Execute Successfully" begin + @test begin + result_w = cross_wigner(state1_gpu, state2_gpu, x) + result_chi = cross_wignerchar(state1_gpu, state2_gpu, xi) + isfinite(real(result_w)) && isfinite(imag(result_w)) && + isfinite(real(result_chi)) && isfinite(imag(result_chi)) + end + end + end +end + +@testitem "GPU Full Interference Wigner Functions" tags=[:cuda] begin + using Gabs + using Gabs: device + using LinearAlgebra + using CUDA + + if CUDA.functional() + @testset "Single-State Linear Combinations" begin + basis = QuadPairBasis(1) + alpha = 1.0f0 + 0.5f0*im + gpu_state = coherentstate(CuVector{Float32}, CuMatrix{Float32}, basis, alpha) + coeffs = [1.0f0] + lc_single = GaussianLinearCombination(basis, coeffs, [gpu_state]) + x_cpu = [0.1f0, 0.3f0] + xi_cpu = [0.2f0, -0.4f0] + x_gpu = CuArray(x_cpu) + xi_gpu = CuArray(xi_cpu) + w_single_direct = wigner(gpu_state, x_gpu) + w_single_interference = wigner(lc_single, x_gpu) + @test w_single_direct ≈ w_single_interference rtol=1e-6 + char_single_direct = wignerchar(gpu_state, xi_gpu) + char_single_interference = wignerchar(lc_single, xi_gpu) + @test char_single_direct ≈ char_single_interference rtol=1e-6 + w_mixed = wigner(lc_single, x_cpu) + @test w_single_direct ≈ w_mixed rtol=1e-6 + char_mixed = wignerchar(lc_single, xi_cpu) + @test char_single_direct ≈ char_mixed rtol=1e-6 + end + + @testset "Two-State Interference" begin + basis = QuadPairBasis(1) + alpha = 1.0f0 + state_plus = coherentstate(CuVector{Float32}, CuMatrix{Float32}, basis, alpha) + state_minus = coherentstate(CuVector{Float32}, CuMatrix{Float32}, basis, -alpha) + norm_factor = 1.0f0 / sqrt(2.0f0 * (1.0f0 + exp(-2.0f0 * abs2(alpha)))) + coeffs = [norm_factor, norm_factor] + cat_even = GaussianLinearCombination(basis, coeffs, [state_plus, state_minus]) + x_test = [0.0f0, 0.0f0] + xi_test = [0.1f0, 0.2f0] + w_interference = wigner(cat_even, x_test) + char_interference = wignerchar(cat_even, xi_test) + w_plus = wigner(state_plus, CuArray(x_test)) + w_minus = wigner(state_minus, CuArray(x_test)) + cross_w = cross_wigner(state_plus, state_minus, CuArray(x_test)) + w_manual = abs2(coeffs[1]) * w_plus + abs2(coeffs[2]) * w_minus + + 2 * real(conj(coeffs[1]) * coeffs[2] * cross_w) + @test w_interference ≈ w_manual rtol=1e-6 + @test isreal(w_interference) + char_plus = wignerchar(state_plus, CuArray(xi_test)) + char_minus = wignerchar(state_minus, CuArray(xi_test)) + cross_char = cross_wignerchar(state_plus, state_minus, CuArray(xi_test)) + char_manual = abs2(coeffs[1]) * char_plus + abs2(coeffs[2]) * char_minus + + 2 * real(conj(coeffs[1]) * coeffs[2] * cross_char) + @test char_interference ≈ char_manual rtol=1e-6 + end + + @testset "Multi-State Interference" begin + basis = QuadPairBasis(1) + alphas = [1.0f0, 0.0f0, -1.0f0] + states = [coherentstate(CuVector{Float32}, CuMatrix{Float32}, basis, α) for α in alphas] + coeffs = [0.5f0, 0.3f0, 0.2f0] + lc_multi = GaussianLinearCombination(basis, coeffs, states) + x_test = [0.5f0, -0.3f0] + xi_test = [0.2f0, 0.4f0] + w_gpu = wigner(lc_multi, x_test) + char_gpu = wignerchar(lc_multi, xi_test) + w_diagonal = sum(abs2(coeffs[i]) * wigner(states[i], CuArray(x_test)) for i in 1:3) + w_cross = 0.0f0 + for i in 1:2 + for j in (i+1):3 + cross_term = 2 * real(conj(coeffs[i]) * coeffs[j] * + cross_wigner(states[i], states[j], CuArray(x_test))) + w_cross += cross_term + end + end + w_manual = w_diagonal + w_cross + @test w_gpu ≈ w_manual rtol=1e-5 + char_diagonal = sum(abs2(coeffs[i]) * wignerchar(states[i], CuArray(xi_test)) for i in 1:3) + char_cross = complex(0.0f0) + for i in 1:2 + for j in (i+1):3 + cross_term = 2 * real(conj(coeffs[i]) * coeffs[j] * + cross_wignerchar(states[i], states[j], CuArray(xi_test))) + char_cross += cross_term + end + end + char_manual = char_diagonal + char_cross + @test char_gpu ≈ char_manual rtol=1e-5 + end + + @testset "Complex Coefficients" begin + basis = QuadPairBasis(1) + alpha1, alpha2 = 1.0f0, -0.5f0 + state1 = coherentstate(CuVector{Float32}, CuMatrix{Float32}, basis, alpha1) + state2 = coherentstate(CuVector{Float32}, CuMatrix{Float32}, basis, alpha2) + c1 = 0.6f0 + 0.2f0*im + c2 = 0.3f0 - 0.4f0*im + coeffs = ComplexF32[c1, c2] + lc_complex = GaussianLinearCombination(basis, coeffs, [state1, state2]) + x_test = [0.2f0, 0.1f0] + w_gpu = wigner(lc_complex, x_test) + w1 = wigner(state1, CuArray(x_test)) + w2 = wigner(state2, CuArray(x_test)) + cross_w = cross_wigner(state1, state2, CuArray(x_test)) + w_manual = abs2(c1) * w1 + abs2(c2) * w2 + 2 * real(conj(c1) * c2 * cross_w) + @test w_gpu ≈ w_manual rtol=1e-6 + @test isreal(w_gpu) + end + + @testset "Multi-Mode Systems" begin + basis = QuadPairBasis(2) + alpha1 = ComplexF32[1.0f0, 0.5f0] + alpha2 = ComplexF32[-0.5f0, 1.0f0] + state1 = coherentstate(CuVector{Float32}, CuMatrix{Float32}, basis, alpha1) + state2 = coherentstate(CuVector{Float32}, CuMatrix{Float32}, basis, alpha2) + coeffs = [0.7f0, 0.3f0] + lc_multimode = GaussianLinearCombination(basis, coeffs, [state1, state2]) + x_test = [0.1f0, 0.2f0, -0.1f0, 0.3f0] + xi_test = [0.05f0, -0.1f0, 0.15f0, -0.05f0] + w_gpu = wigner(lc_multimode, x_test) + char_gpu = wignerchar(lc_multimode, xi_test) + @test isfinite(w_gpu) + @test isreal(w_gpu) + @test isfinite(char_gpu) + w1 = wigner(state1, CuArray(x_test)) + w2 = wigner(state2, CuArray(x_test)) + cross_w = cross_wigner(state1, state2, CuArray(x_test)) + w_manual = abs2(coeffs[1]) * w1 + abs2(coeffs[2]) * w2 + + 2 * real(conj(coeffs[1]) * coeffs[2] * cross_w) + @test w_gpu ≈ w_manual rtol=1e-6 + end + + @testset "Error Handling" begin + basis = QuadPairBasis(1) + state = coherentstate(CuVector{Float32}, CuMatrix{Float32}, basis, 1.0f0) + lc = GaussianLinearCombination(basis, [1.0f0], [state]) + @test_throws ArgumentError wigner(lc, [0.0f0]) + @test_throws ArgumentError wignerchar(lc, [0.0f0, 0.0f0, 0.0f0]) + basis2 = QuadPairBasis(2) + state1_2mode = tensor(state, state) + lc2 = GaussianLinearCombination(basis2, [1.0f0], [state1_2mode]) + @test_throws ArgumentError wigner(lc2, [0.0f0, 0.0f0]) + @test_throws ArgumentError wignerchar(lc2, [0.0f0, 0.0f0, 0.0f0]) + end + + @testset "Performance and Memory Efficiency" begin + basis = QuadPairBasis(1) + n_states = 5 + states = [coherentstate(CuVector{Float32}, CuMatrix{Float32}, basis, + Float32(i) * (1.0f0 + 0.1f0*im)) for i in 1:n_states] + coeffs = normalize([Float32(i) for i in 1:n_states]) + lc_large = GaussianLinearCombination(basis, coeffs, states) + x_test = [0.0f0, 0.0f0] + w_result = wigner(lc_large, x_test) + elapsed_time = @elapsed begin + for _ in 1:100 + wigner(lc_large, x_test) + end + end + @test elapsed_time < 1.0 + @test isfinite(w_result) + @test isreal(w_result) + CUDA.reclaim() + initial_memory = CUDA.used_memory() + w_large = wigner(lc_large, x_test) + peak_memory = CUDA.used_memory() + memory_increase = peak_memory - initial_memory + @test memory_increase < 100_000_000 + @test isfinite(w_large) + end + + @testset "Numerical Stability" begin + basis = QuadPairBasis(1) + small_alpha = 1.0f-6 + large_alpha = 1.0f6 + small_state = coherentstate(CuVector{Float32}, CuMatrix{Float32}, basis, small_alpha) + large_state = coherentstate(CuVector{Float32}, CuMatrix{Float32}, basis, large_alpha) + coeffs = [1.0f0/sqrt(2.0f0), 1.0f0/sqrt(2.0f0)] + lc_extreme = GaussianLinearCombination(basis, coeffs, [small_state, large_state]) + x_test = [0.0f0, 0.0f0] + w_extreme = wigner(lc_extreme, x_test) + char_extreme = wignerchar(lc_extreme, [0.1f0, 0.1f0]) + @test isfinite(w_extreme) + @test isfinite(char_extreme) + @test !isnan(w_extreme) + @test !isnan(char_extreme) + end + else + @test_skip "CUDA not available, skipping GPU interference tests" + end +end + +@testitem "GPU Batched Interference Test" tags=[:cuda] begin + using Gabs + using Gabs: device + using LinearAlgebra + using CUDA + using Test + + @testset "GPU Batched Interference Wigner Functions" begin + basis = QuadPairBasis(1) + α1, α2 = 1.0f0 + 0.5f0*im, -1.0f0 + 0.3f0*im + state1 = coherentstate(CuVector{Float32}, CuMatrix{Float32}, basis, α1) + state2 = coherentstate(CuVector{Float32}, CuMatrix{Float32}, basis, α2) + coeffs = Float32[0.6, 0.8] + lc = GaussianLinearCombination(basis, coeffs, [state1, state2]) + @test device(lc) == :gpu + @test length(lc) == 2 + + @testset "Batched Wigner Evaluation" begin + num_points = 100 + x_points = CuArray(randn(Float32, 2, num_points)) + w_batch = wigner(lc, x_points) + @test w_batch isa CuArray{Float32} + @test size(w_batch) == (num_points,) + @test device(w_batch) == :gpu + @test all(isfinite, Array(w_batch)) + w_single = [wigner(lc, @view x_points[:, i]) for i in 1:min(10, num_points)] + w_batch_sample = Array(w_batch[1:length(w_single)]) + @test w_batch_sample ≈ w_single rtol=1e-5 + end + + @testset "Batched Wigner Characteristic Function" begin + num_points = 50 + xi_points = CuArray(randn(Float32, 2, num_points)) + char_batch = wignerchar(lc, xi_points) + @test char_batch isa CuArray{ComplexF32} + @test size(char_batch) == (num_points,) + @test device(char_batch) == :gpu + @test all(isfinite, Array(real.(char_batch))) + @test all(isfinite, Array(imag.(char_batch))) + char_single = [wignerchar(lc, @view xi_points[:, i]) for i in 1:min(5, num_points)] + char_batch_sample = Array(char_batch[1:length(char_single)]) + @test char_batch_sample ≈ char_single rtol=1e-5 + end + + @testset "Automatic CPU->GPU Promotion" begin + num_points = 20 + x_points_cpu = randn(Float32, 2, num_points) + xi_points_cpu = randn(Float32, 2, num_points) + w_auto = wigner(lc, x_points_cpu) + char_auto = wignerchar(lc, xi_points_cpu) + @test device(w_auto) == :gpu + @test device(char_auto) == :gpu + w_direct = wigner(lc, CuArray(x_points_cpu)) + char_direct = wignerchar(lc, CuArray(xi_points_cpu)) + @test Array(w_auto) ≈ Array(w_direct) rtol=1e-6 + @test Array(char_auto) ≈ Array(char_direct) rtol=1e-6 + end + + @testset "Performance Scaling" begin + large_num_points = 1000 + x_large = CuArray(randn(Float32, 2, large_num_points)) + @test_nowarn begin + w_large = wigner(lc, x_large) + @test length(w_large) == large_num_points + @test device(w_large) == :gpu + end + end + + @testset "Multi-State Interference" begin + state3 = squeezedstate(CuVector{Float32}, CuMatrix{Float32}, basis, 0.2f0, Float32(π/6)) + coeffs_multi = Float32[0.5, 0.3, 0.2] + lc_multi = GaussianLinearCombination(basis, coeffs_multi, [state1, state2, state3]) + num_points = 50 + x_points = CuArray(randn(Float32, 2, num_points)) + w_multi = wigner(lc_multi, x_points) + char_multi = wignerchar(lc_multi, x_points) + @test w_multi isa CuArray{Float32} + @test char_multi isa CuArray{ComplexF32} + @test size(w_multi) == (num_points,) + @test size(char_multi) == (num_points,) + @test all(isfinite, Array(w_multi)) + @test all(isfinite, Array(real.(char_multi))) + @test all(isfinite, Array(imag.(char_multi))) + end + + @testset "Error Handling and Edge Cases" begin + x_wrong_dim = CuArray(randn(Float32, 3, 10)) + @test_throws ArgumentError wigner(lc, x_wrong_dim) + @test_throws ArgumentError wignerchar(lc, x_wrong_dim) + @test_throws DimensionMismatch GaussianLinearCombination(basis, Float32[0.5, 0.3], [state1]) + basis_wrong = QuadBlockBasis(1) + state_wrong = coherentstate(CuVector{Float32}, CuMatrix{Float32}, basis_wrong, 1.0f0) + @test_throws ArgumentError GaussianLinearCombination(basis, Float32[0.5], [state_wrong]) + x_single = CuArray(randn(Float32, 2, 1)) + w_single_batch = wigner(lc, x_single) + @test length(w_single_batch) == 1 + @test isfinite(Array(w_single_batch)[1]) + end + end + + @testset "GPU Batched Interference - Float64 Precision" begin + basis = QuadPairBasis(1) + state1_f64 = coherentstate(CuVector{Float64}, CuMatrix{Float64}, basis, 1.0 + 0.5*im) + state2_f64 = coherentstate(CuVector{Float64}, CuMatrix{Float64}, basis, -1.0 + 0.3*im) + coeffs_f64 = [0.6, 0.8] + lc_f64 = GaussianLinearCombination(basis, coeffs_f64, [state1_f64, state2_f64]) + x_points_f64 = CuArray(randn(Float64, 2, 20)) + w_f64 = wigner(lc_f64, x_points_f64) + char_f64 = wignerchar(lc_f64, x_points_f64) + @test w_f64 isa CuArray{Float64} + @test char_f64 isa CuArray{ComplexF64} + @test all(isfinite, Array(w_f64)) + @test all(isfinite, Array(real.(char_f64))) + @test all(isfinite, Array(imag.(char_f64))) + end + + @testset "GPU Batched Interference - Multi-Mode" begin + basis_2mode = QuadPairBasis(2) + state1_2m = coherentstate(CuVector{Float32}, CuMatrix{Float32}, basis_2mode, 1.0f0) + state2_2m = squeezedstate(CuVector{Float32}, CuMatrix{Float32}, basis_2mode, 0.3f0, 0.0f0) + coeffs_2m = Float32[0.7, 0.3] + lc_2m = GaussianLinearCombination(basis_2mode, coeffs_2m, [state1_2m, state2_2m]) + num_points = 30 + x_points_2m = CuArray(randn(Float32, 4, num_points)) + w_2m = wigner(lc_2m, x_points_2m) + char_2m = wignerchar(lc_2m, x_points_2m) + @test w_2m isa CuArray{Float32} + @test char_2m isa CuArray{ComplexF32} + @test size(w_2m) == (num_points,) + @test size(char_2m) == (num_points,) + @test all(isfinite, Array(w_2m)) + @test all(isfinite, Array(real.(char_2m))) + @test all(isfinite, Array(imag.(char_2m))) + end + +end + +#Benchmark code +#= +@testitem "GPU Performance Benchmarks for Wigner and Tensor" tags=[:cuda] begin + using CUDA + using Gabs + using Gabs: device + using LinearAlgebra + using Statistics + + function gpu_warmup() + basis = QuadPairBasis(1) + try + for _ in 1:5 + vac = vacuumstate(basis) |> gpu + coh = coherentstate(basis, 1.0f0) |> gpu + end + vac = vacuumstate(basis) |> gpu + disp = displace(basis, 1.0f0) |> gpu + for _ in 1:5 + displaced = disp * vac + end + coh = coherentstate(basis, 0.5f0) |> gpu + test_points = CuArray(randn(Float32, 2, 100)) + for _ in 1:3 + w = wigner(coh, test_points) + end + CUDA.synchronize() + GC.gc() + catch e + @error "GPU warmup failed" exception=e + rethrow(e) + end + end + + function benchmark_operation(gpu_func, cpu_func, name::String; n_trials=5, min_time=1e-6) + gpu_times = Float64[] + for trial in 1:n_trials + CUDA.synchronize() + t_start = time_ns() + result_gpu = gpu_func() + CUDA.synchronize() + t_end = time_ns() + elapsed = max((t_end - t_start) / 1e9, min_time) + push!(gpu_times, elapsed) + end + cpu_times = Float64[] + for trial in 1:n_trials + GC.gc() + t_start = time_ns() + result_cpu = cpu_func() + t_end = time_ns() + elapsed = max((t_end - t_start) / 1e9, min_time) + push!(cpu_times, elapsed) + end + gpu_median = median(gpu_times) + cpu_median = median(cpu_times) + speedup = cpu_median / gpu_median + @info "Benchmark: $name" cpu_time=round(cpu_median, digits=4) gpu_time=round(gpu_median, digits=4) speedup=round(speedup, digits=2) + return speedup, gpu_median, cpu_median + end + + @testset "GPU Warmup" begin + gpu_warmup() + @test true + end + + @testset "API Usage" begin + basis = QuadPairBasis(1) + state = coherentstate(basis, 1.0) |> gpu + op = displace(basis, 0.5) |> gpu + result = op * state + @test device(state) == :gpu + @test device(op) == :gpu + @test device(result) == :gpu + α_gpu = CuArray([1.0f0 + 0.5f0im]) + auto_state = coherentstate(basis, α_gpu) + @test device(auto_state) == :gpu + end + + @testset "State Creation Benchmarks" begin + + @testset "Single-Mode State Creation" begin + basis = QuadPairBasis(1) + α = 1.0f0 + 0.5f0im + gpu_func = () -> coherentstate(basis, α) |> gpu + cpu_func = () -> coherentstate(basis, α) + speedup, _, _ = benchmark_operation(gpu_func, cpu_func, "Single-mode coherent state creation") + coh_gpu = gpu_func() + coh_cpu = cpu_func() + @test Array(coh_gpu.mean) ≈ Float32.(coh_cpu.mean) rtol=1e-6 + @test speedup > 0.01 + end + + @testset "Multi-Mode State Creation" begin + basis = QuadPairBasis(10) + α_vec = randn(ComplexF32, 10) + gpu_func = () -> coherentstate(basis, α_vec) |> gpu + cpu_func = () -> coherentstate(basis, α_vec) + speedup, _, _ = benchmark_operation(gpu_func, cpu_func, "10-mode coherent state creation") + @test speedup > 0.03 + end + + @testset "Very Large State Creation" begin + basis = QuadPairBasis(20) + α_vec = randn(ComplexF32, 20) + gpu_func = () -> coherentstate(basis, α_vec) |> gpu + cpu_func = () -> coherentstate(basis, α_vec) + speedup, _, _ = benchmark_operation(gpu_func, cpu_func, "20-mode coherent state creation") + @test speedup > 0.04 + if speedup > 1.0 + @info "GPU advantage emerging for very large state creation: $(round(speedup, digits=2))x" + end + end + end + + @testset "Operation Application Benchmarks" begin + + @testset "Single-Mode Operations" begin + basis = QuadPairBasis(1) + vac_gpu = vacuumstate(basis) |> gpu + vac_cpu = vacuumstate(basis) + disp_gpu = displace(basis, 1.0f0) |> gpu + disp_cpu = displace(basis, 1.0f0) + gpu_func = () -> disp_gpu * vac_gpu + cpu_func = () -> disp_cpu * vac_cpu + speedup, _, _ = benchmark_operation(gpu_func, cpu_func, "Single-mode operation application") + result_gpu = gpu_func() + result_cpu = cpu_func() + @test Array(result_gpu.mean) ≈ Float32.(result_cpu.mean) rtol=1e-6 + @test speedup > 0.01 + end + + @testset "Multi-Mode Operations" begin + basis = QuadPairBasis(5) + α_vec = randn(ComplexF32, 5) + coh_gpu = coherentstate(basis, α_vec) |> gpu + coh_cpu = coherentstate(basis, α_vec) + r_vec = randn(Float32, 5) * 0.3f0 + θ_vec = randn(Float32, 5) + squeeze_gpu = squeeze(basis, r_vec, θ_vec) |> gpu + squeeze_cpu = squeeze(basis, r_vec, θ_vec) + gpu_func = () -> squeeze_gpu * coh_gpu + cpu_func = () -> squeeze_cpu * coh_cpu + speedup, _, _ = benchmark_operation(gpu_func, cpu_func, "5-mode squeeze operation") + @test speedup > 0.02 + end + end + + @testset "Tensor Product Benchmarks" begin + + @testset "Small Tensor Products" begin + basis1 = QuadPairBasis(2) + basis2 = QuadPairBasis(2) + α1 = randn(ComplexF32, 2) + α2 = randn(ComplexF32, 2) + coh1_gpu = coherentstate(basis1, α1) |> gpu + coh2_gpu = coherentstate(basis2, α2) |> gpu + coh1_cpu = coherentstate(basis1, α1) + coh2_cpu = coherentstate(basis2, α2) + gpu_func = () -> tensor(coh1_gpu, coh2_gpu) + cpu_func = () -> tensor(coh1_cpu, coh2_cpu) + speedup, _, _ = benchmark_operation(gpu_func, cpu_func, "Small tensor product (2⊗2 modes)") + result_gpu = gpu_func() + result_cpu = cpu_func() + @test result_gpu.basis.nmodes == 4 + @test Array(result_gpu.mean) ≈ Float32.(result_cpu.mean) rtol=1e-6 + @test Array(result_gpu.covar) ≈ Float32.(result_cpu.covar) rtol=1e-6 + @test speedup > 0.05 + end + + @testset "Larger Tensor Products" begin + basis1 = QuadPairBasis(3) + basis2 = QuadPairBasis(4) + α1 = randn(ComplexF32, 3) + α2 = randn(ComplexF32, 4) + coh1_gpu = coherentstate(basis1, α1) |> gpu + coh2_gpu = coherentstate(basis2, α2) |> gpu + coh1_cpu = coherentstate(basis1, α1) + coh2_cpu = coherentstate(basis2, α2) + gpu_func = () -> tensor(coh1_gpu, coh2_gpu) + cpu_func = () -> tensor(coh1_cpu, coh2_cpu) + speedup, _, _ = benchmark_operation(gpu_func, cpu_func, "Large tensor product (3⊗4 modes)") + result_gpu = gpu_func() + result_cpu = cpu_func() + @test result_gpu.basis.nmodes == 7 + @test Array(result_gpu.mean) ≈ Float32.(result_cpu.mean) rtol=1e-6 + @test speedup > 0.05 + if speedup > 1.0 + @info "GPU tensor product advantage: $(round(speedup, digits=2))x" + end + end + end + + @testset "Wigner Function Benchmarks - The GPU Sweet Spot" begin + + @testset "Single-Point Wigner" begin + basis = QuadPairBasis(1) + coh_gpu = coherentstate(basis, 1.0f0) |> gpu + coh_cpu = coherentstate(basis, 1.0f0) + x_point = [0.5f0, 0.3f0] + gpu_func = () -> wigner(coh_gpu, x_point) + cpu_func = () -> wigner(coh_cpu, x_point) + speedup, _, _ = benchmark_operation(gpu_func, cpu_func, "Single-point Wigner evaluation") + @test gpu_func() ≈ cpu_func() rtol=1e-5 + @test speedup > 0.05 + end + + @testset "Batch Wigner Evaluation - Small" begin + basis = QuadPairBasis(1) + coh_gpu = coherentstate(basis, 0.8f0) |> gpu + coh_cpu = coherentstate(basis, 0.8f0) + n_points = 1000 + x_points_gpu = CuArray(randn(Float32, 2, n_points)) + x_points_cpu = Array(x_points_gpu) + gpu_func = () -> wigner(coh_gpu, x_points_gpu) + cpu_func = () -> [wigner(coh_cpu, x_points_cpu[:, i]) for i in 1:n_points] + speedup, _, _ = benchmark_operation(gpu_func, cpu_func, "Batch Wigner (1000 points)", n_trials=3) + @test speedup > 2.0 + w_gpu = Array(gpu_func()) + w_cpu = cpu_func() + @test w_gpu ≈ w_cpu rtol=1e-4 + @info "GPU batch Wigner showing advantage: $(round(speedup, digits=1))x speedup" + end + + @testset "Batch Wigner Evaluation - Large" begin + basis = QuadPairBasis(1) + coh_gpu = coherentstate(basis, 0.5f0) |> gpu + coh_cpu = coherentstate(basis, 0.5f0) + n_points = 25000 + x_points_gpu = CuArray(randn(Float32, 2, n_points)) + x_points_cpu = Array(x_points_gpu) + gpu_func = () -> wigner(coh_gpu, x_points_gpu) + cpu_func = () -> [wigner(coh_cpu, x_points_cpu[:, i]) for i in 1:n_points] + speedup, gpu_time, cpu_time = benchmark_operation(gpu_func, cpu_func, "Large batch Wigner (25,000 points)", n_trials=3) + @test speedup > 10.0 + @test gpu_time < 0.1 + w_gpu = Array(gpu_func()) + for i in 1:min(10, n_points) + w_single = wigner(coh_cpu, x_points_cpu[:, i]) + @test w_gpu[i] ≈ w_single rtol=1e-4 + end + @info "Large batch GPU advantage: $(round(speedup, digits=1))x speedup" + end + + @testset "Multi-Mode Wigner" begin + basis = QuadPairBasis(2) + α_vec = [0.5f0 + 0.3f0im, -0.2f0 + 0.8f0im] + coh_gpu = coherentstate(basis, α_vec) |> gpu + coh_cpu = coherentstate(basis, α_vec) + n_points = 5000 + x_points_gpu = CuArray(randn(Float32, 4, n_points)) + x_points_cpu = Array(x_points_gpu) + gpu_func = () -> wigner(coh_gpu, x_points_gpu) + cpu_func = () -> [wigner(coh_cpu, x_points_cpu[:, i]) for i in 1:n_points] + speedup, _, _ = benchmark_operation(gpu_func, cpu_func, "Multi-mode Wigner (2 modes, 5000 points)", n_trials=3) + @test speedup > 5.0 + @info "Multi-mode GPU advantage: $(round(speedup, digits=1))x speedup" + end + end + + @testset "Wigner Characteristic Function Benchmarks" begin + + @testset "Batch Wigner Characteristic - Professional API" begin + basis = QuadPairBasis(1) + sq_gpu = squeezedstate(basis, 0.3f0, Float32(π/4)) |> gpu + sq_cpu = squeezedstate(basis, 0.3f0, Float32(π/4)) + n_points = 5000 + xi_points_gpu = CuArray(randn(Float32, 2, n_points) * 0.5f0) + xi_points_cpu = Array(xi_points_gpu) + gpu_func = () -> wignerchar(sq_gpu, xi_points_gpu) + cpu_func = () -> [wignerchar(sq_cpu, xi_points_cpu[:, i]) for i in 1:n_points] + speedup, _, _ = benchmark_operation(gpu_func, cpu_func, "Batch Wigner characteristic (5,000 points)", n_trials=3) + @test speedup > 2.0 + chi_gpu = Array(gpu_func()) + for i in 1:min(5, n_points) + chi_single = wignerchar(sq_cpu, xi_points_cpu[:, i]) + @test real(chi_gpu[i]) ≈ real(chi_single) rtol=1e-4 + @test imag(chi_gpu[i]) ≈ imag(chi_single) rtol=1e-4 + end + @info "Wigner characteristic GPU advantage: $(round(speedup, digits=1))x speedup" + end + end + + @testset " Performance Tests" begin + basis = QuadPairBasis(1) + coh_gpu = coherentstate(basis, 1.0f0) |> gpu + coh_cpu = coherentstate(basis, 1.0f0) + n_points = 15000 + x_points_gpu = CuArray(randn(Float32, 2, n_points)) + x_points_cpu = Array(x_points_gpu) + gpu_func = () -> wigner(coh_gpu, x_points_gpu) + cpu_func = () -> [wigner(coh_cpu, x_points_cpu[:, i]) for i in 1:n_points] + speedup, gpu_time, cpu_time = benchmark_operation(gpu_func, cpu_func, " GPU validation") + @test speedup > 15.0 + @test gpu_time < cpu_time / 10 + w_gpu = Array(gpu_func()) + w_sample = [wigner(coh_cpu, x_points_cpu[:, i]) for i in 1:min(20, n_points)] + @test w_gpu[1:length(w_sample)] ≈ w_sample rtol=1e-4 + end + + @testset "Scaling Analysis" begin + basis = QuadPairBasis(1) + coh_gpu = coherentstate(basis, 1.0f0) |> gpu + coh_cpu = coherentstate(basis, 1.0f0) + point_counts = [100, 500, 1000, 5000, 10000, 25000] + speedups = Float64[] + for n_points in point_counts + x_points_gpu = CuArray(randn(Float32, 2, n_points)) + x_points_cpu = Array(x_points_gpu) + gpu_func = () -> wigner(coh_gpu, x_points_gpu) + cpu_func = () -> [wigner(coh_cpu, x_points_cpu[:, i]) for i in 1:n_points] + speedup, gpu_time, cpu_time = benchmark_operation(gpu_func, cpu_func, "Scaling test ($n_points points)", n_trials=3) + push!(speedups, speedup) + @info "Scaling point" n_points=n_points speedup=round(speedup, digits=2) gpu_time=round(gpu_time, digits=4) cpu_time=round(cpu_time, digits=4) + end + @test speedups[end] > speedups[1] + @test maximum(speedups) > 10.0 + @info "Peak GPU speedup: $(round(maximum(speedups), digits=2))x" + end +end +=# + +#= +@testitem "GPU Batched Performance Benchmarks" tags=[:cuda] begin + using Gabs + using Gabs: device + using LinearAlgebra + using CUDA + using Test + using BenchmarkTools + + function cpu_wigner_batch(lc, x_points) + results = Vector{Float32}(undef, size(x_points, 2)) + for i in 1:size(x_points, 2) + results[i] = wigner(lc, x_points[:, i]) + end + return results + end + + function cpu_wignerchar_batch(lc, xi_points) + results = Vector{ComplexF32}(undef, size(xi_points, 2)) + for i in 1:size(xi_points, 2) + results[i] = wignerchar(lc, xi_points[:, i]) + end + return results + end + + function cpu_lc_wigner_batch(lc, x_points) + results = Vector{Float32}(undef, size(x_points, 2)) + for i in 1:size(x_points, 2) + results[i] = wigner(lc, x_points[:, i]) + end + return results + end + + @testset "GPU vs CPU Speedup Benchmarks - Batched Interference" begin + println("GPU BATCHED INTERFERENCE PERFORMANCE BENCHMARKS") + test_configs = [ + (1, 100, "Small: 1-mode, 2-state, 100 points"), + (1, 1_000, "Medium: 1-mode, 2-state, 1K points"), + (1, 10_000, "Large: 1-mode, 2-state, 10K points"), + (2, 1_000, "Multi-mode: 2-mode, 2-state, 1K points"), + (1, 50_000, "Very Large: 1-mode, 2-state, 50K points"), + (3, 500, "High-D: 3-mode, 2-state, 500 points"), + ] + @testset "Wigner Function Benchmarks" begin + println("\nWIGNER FUNCTION BENCHMARKS:") + for (nmodes, npoints, description) in test_configs + println("\nTesting: $description") + basis = QuadPairBasis(nmodes) + phase_space_dim = 2 * nmodes + state1_cpu = coherentstate(basis, 1.0f0 + 0.5f0*im) + state2_cpu = coherentstate(basis, -1.0f0 + 0.3f0*im) + state1_gpu = coherentstate(CuVector{Float32}, CuMatrix{Float32}, basis, 1.0f0 + 0.5f0*im) + state2_gpu = coherentstate(CuVector{Float32}, CuMatrix{Float32}, basis, -1.0f0 + 0.3f0*im) + coeffs = Float32[0.6, 0.8] + coeffs ./= norm(coeffs) + cpu_states_array = [state1_cpu, state2_cpu] + gpu_states_array = [state1_gpu, state2_gpu] + cpu_lc = GaussianLinearCombination(basis, coeffs, cpu_states_array) + gpu_lc = GaussianLinearCombination(basis, coeffs, gpu_states_array) + @test device(gpu_lc) == :gpu + x_points_cpu = randn(Float32, phase_space_dim, npoints) + x_points_gpu = CuArray(x_points_cpu) + if npoints >= 100 + wigner(cpu_lc, x_points_cpu[:, 1]) + wigner(gpu_lc, x_points_gpu[:, 1:min(10, npoints)]) + CUDA.synchronize() + end + print(" CPU: ") + cpu_time = @belapsed cpu_wigner_batch($cpu_lc, $x_points_cpu) samples=3 evals=1 + cpu_result = cpu_wigner_batch(cpu_lc, x_points_cpu) + print(" GPU: ") + gpu_time = @belapsed begin + result = wigner($gpu_lc, $x_points_gpu) + CUDA.synchronize() + result + end samples=3 evals=1 + gpu_result = wigner(gpu_lc, x_points_gpu) + CUDA.synchronize() + sample_size = min(20, npoints) + cpu_sample = cpu_result[1:sample_size] + gpu_sample = Array(gpu_result[1:sample_size]) + @test cpu_sample ≈ gpu_sample rtol=1e-4 + speedup = cpu_time / gpu_time + println(" Results:") + println(" CPU Time: $(round(cpu_time*1000, digits=2)) ms") + println(" GPU Time: $(round(gpu_time*1000, digits=2)) ms") + println(" Speedup: $(round(speedup, digits=1))x") + end + end + + @testset "Wigner Characteristic Function Benchmarks" begin + println("\n\nWIGNER CHARACTERISTIC FUNCTION BENCHMARKS:") + char_configs = [ + (1, 500, "Small: 1-mode, 2-state, 500 points"), + (1, 5_000, "Medium: 1-mode, 2-state, 5K points"), + (2, 1_000, "Multi-mode: 2-mode, 2-state, 1K points"), + (1, 20_000,"Large: 1-mode, 2-state, 20K points"), + ] + + for (nmodes, npoints, description) in char_configs + println("\nTesting: $description") + basis = QuadPairBasis(nmodes) + phase_space_dim = 2 * nmodes + state1_cpu = coherentstate(basis, 1.0f0 + 0.5f0*im) + state2_cpu = coherentstate(basis, -1.0f0 + 0.3f0*im) + state1_gpu = coherentstate(CuVector{Float32}, CuMatrix{Float32}, basis, 1.0f0 + 0.5f0*im) + state2_gpu = coherentstate(CuVector{Float32}, CuMatrix{Float32}, basis, -1.0f0 + 0.3f0*im) + coeffs = Float32[0.6, 0.8] + coeffs ./= norm(coeffs) + cpu_states_array = [state1_cpu, state2_cpu] + gpu_states_array = [state1_gpu, state2_gpu] + cpu_lc = GaussianLinearCombination(basis, coeffs, cpu_states_array) + gpu_lc = GaussianLinearCombination(basis, coeffs, gpu_states_array) + xi_points_cpu = randn(Float32, phase_space_dim, npoints) + xi_points_gpu = CuArray(xi_points_cpu) + if npoints >= 100 + wignerchar(cpu_lc, xi_points_cpu[:, 1]) + wignerchar(gpu_lc, xi_points_gpu[:, 1:min(5, npoints)]) + CUDA.synchronize() + end + print(" CPU: ") + cpu_time = @belapsed cpu_wignerchar_batch($cpu_lc, $xi_points_cpu) samples=3 evals=1 + cpu_result = cpu_wignerchar_batch(cpu_lc, xi_points_cpu) + print(" GPU: ") + gpu_time = @belapsed begin + result = wignerchar($gpu_lc, $xi_points_gpu) + CUDA.synchronize() + result + end samples=3 evals=1 + gpu_result = wignerchar(gpu_lc, xi_points_gpu) + CUDA.synchronize() + sample_size = min(10, npoints) + cpu_sample = cpu_result[1:sample_size] + gpu_sample = Array(gpu_result[1:sample_size]) + @test cpu_sample ≈ gpu_sample rtol=1e-4 + speedup = cpu_time / gpu_time + println(" Results:") + println(" CPU Time: $(round(cpu_time*1000, digits=2)) ms") + println(" GPU Time: $(round(gpu_time*1000, digits=2)) ms") + println(" Speedup: $(round(speedup, digits=1))x") + end + end + + @testset "Cross-Wigner Batch Performance" begin + println("\n\nCROSS-WIGNER BATCH PERFORMANCE:") + basis = QuadPairBasis(1) + state1_cpu = coherentstate(basis, 1.0f0 + 0.5f0*im) + state2_cpu = squeezedstate(basis, 0.3f0, Float32(π/4)) + state1_gpu = coherentstate(CuVector{Float32}, CuMatrix{Float32}, basis, 1.0f0 + 0.5f0*im) + state2_gpu = squeezedstate(CuVector{Float32}, CuMatrix{Float32}, basis, 0.3f0, Float32(π/4)) + batch_sizes = [100, 1_000, 10_000, 50_000] + + for npoints in batch_sizes + println("\nCross-Wigner batch size: $npoints points") + x_points_cpu = randn(Float32, 2, npoints) + x_points_gpu = CuArray(x_points_cpu) + if npoints >= 100 + Gabs.cross_wigner(state1_cpu, state2_cpu, x_points_cpu[:, 1]) + end + coeffs = Float32[0.7, 0.3] + cpu_states_array = [state1_cpu, state2_cpu] + gpu_states_array = [state1_gpu, state2_gpu] + cpu_lc = GaussianLinearCombination(basis, coeffs, cpu_states_array) + gpu_lc = GaussianLinearCombination(basis, coeffs, gpu_states_array) + print(" CPU: ") + cpu_time = @belapsed cpu_lc_wigner_batch($cpu_lc, $x_points_cpu) samples=2 evals=1 + print(" GPU: ") + gpu_time = @belapsed begin + result = wigner($gpu_lc, $x_points_gpu) + CUDA.synchronize() + result + end samples=2 evals=1 + speedup = cpu_time / gpu_time + println(" Results:") + println(" CPU Time: $(round(cpu_time*1000, digits=2)) ms") + println(" GPU Time: $(round(gpu_time*1000, digits=2)) ms") + println(" Speedup: $(round(speedup, digits=1))x") + end + end + end +end +=# \ No newline at end of file