diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 6c1d4aa..b61dbe1 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -52,6 +52,42 @@ jobs:
       - uses: julia-actions/julia-buildpkg@v1
       - uses: julia-actions/julia-docdeploy@v1
         env:
-          GKSwstype: nul # Fix for Plots with GR backend.
+          GKSwstype: nul 
           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
           DOCUMENTER_KEY: ${{ secrets.DOCUMENTER_KEY }}
+  gpu-tests:
+    name: GPU Tests - Julia ${{ matrix.version }}
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        version: ['1.10']  
+    steps:
+      - uses: actions/checkout@v5
+      - uses: julia-actions/setup-julia@v1
+        with:
+          version: ${{ matrix.version }}
+          arch: x64
+      - uses: julia-actions/cache@v2
+      - uses: julia-actions/julia-buildpkg@v1
+      - uses: julia-actions/julia-runtest@v1
+        env:
+          CUDA_TEST: true
+          JULIA_NUM_THREADS: 2
+  static-analysis:
+    name: Static Analysis
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v5
+      - uses: julia-actions/setup-julia@v1
+        with:
+          version: '1.10'
+          arch: x64
+      - uses: julia-actions/cache@v2
+      - uses: julia-actions/julia-buildpkg@v1
+      - name: Add JET.jl
+        run: julia -e 'using Pkg; Pkg.add("JET")'
+      - uses: julia-actions/julia-runtest@v1
+        env:
+          JET_TEST: true
+          JULIA_NUM_THREADS: 2
\ No newline at end of file
diff --git a/Project.toml b/Project.toml
index 482cc75..e6c5426 100644
--- a/Project.toml
+++ b/Project.toml
@@ -12,12 +12,15 @@ SymplecticMatrices = "d07eab47-f98b-4620-8fe1-ee63e153d4ef"
 [weakdeps]
 Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 
 [extensions]
+CUDAExt = "CUDA"
 MakieExt = "Makie"
 StaticArraysExt = "StaticArrays"
 
 [compat]
+CUDA = "5"
 LinearAlgebra = "1.9"
 Makie = "0.21, 0.22, 0.23, 0.24"
 QuantumInterface = "0.3.7, 0.3.8, 0.4.1"
diff --git a/ext/CUDAExt/CUDAExt.jl b/ext/CUDAExt/CUDAExt.jl
new file mode 100644
index 0000000..3ab1191
--- /dev/null
+++ b/ext/CUDAExt/CUDAExt.jl
@@ -0,0 +1,234 @@
+module CUDAExt
+using CUDA
+using CUDA: CuArray, CuVector, CuMatrix, @cuda, threadIdx, blockIdx, blockDim, gridDim
+using Gabs
+using Gabs: SymplecticBasis, QuadPairBasis, QuadBlockBasis, GaussianState, GaussianUnitary, GaussianChannel, GaussianLinearCombination, _promote_output_vector, _promote_output_matrix, _vacuumstate, _coherentstate, _squeezedstate, _thermalstate, _eprstate, _displace, _squeeze, _twosqueeze, _phaseshift, _beamsplitter, _attenuator, _amplifier, symplecticform, WIGNER_ERROR, ACTION_ERROR, HBAR_ERROR, INDEX_ERROR, SYMPLECTIC_ERROR, cross_wigner, cross_wignerchar, vacuumstate, randstate, randunitary, randchannel, randsymplectic
+using LinearAlgebra
+using LinearAlgebra: I, det, mul!, eigvals, Diagonal, logdet, dot, inv, diag, cholesky, Symmetric, qr, Hermitian
+using Random
+using Random: randn!
+import Gabs: purity, entropy_vn, vacuumstate, changebasis, randstate, randunitary, randchannel, randsymplectic, ptrace, cross_wigner, cross_wignerchar, wigner, wignerchar, catstate_even, catstate_odd, catstate, gkpstate, _promote_output_vector, _promote_output_matrix, _is_gpu_array, _detect_device, coherentstate, squeezedstate, thermalstate, eprstate, displace, squeeze, twosqueeze, phaseshift, beamsplitter, attenuator, amplifier
+
+
+const CUDA_AVAILABLE = CUDA.functional()
+
+function __init__()
+    if CUDA_AVAILABLE
+        @info "CUDA.jl extension loaded successfully. GPU acceleration enabled."
+    else
+        @warn "CUDA not available. GPU operations will fall back to CPU."
+    end
+end
+
+function Gabs._gpu_impl(state::GaussianState, precision)
+    gpu_mean = CuArray{precision}(state.mean)
+    gpu_covar = CuArray{precision}(state.covar)
+    return GaussianState(state.basis, gpu_mean, gpu_covar; ħ = state.ħ)
+end
+
+function Gabs._gpu_impl(op::GaussianUnitary, precision)
+    gpu_disp = CuArray{precision}(op.disp)
+    gpu_symplectic = CuArray{precision}(op.symplectic)
+    return GaussianUnitary(op.basis, gpu_disp, gpu_symplectic; ħ = op.ħ)
+end
+
+function Gabs._gpu_impl(op::GaussianChannel, precision)
+    gpu_disp = CuArray{precision}(op.disp)
+    gpu_transform = CuArray{precision}(op.transform)
+    gpu_noise = CuArray{precision}(op.noise)
+    return GaussianChannel(op.basis, gpu_disp, gpu_transform, gpu_noise; ħ = op.ħ)
+end
+
+function Gabs._gpu_impl(lc::GaussianLinearCombination, precision)
+    cpu_coeffs = Vector{precision}(Array(lc.coeffs))
+    gpu_states = [Gabs._gpu_impl(state, precision) for state in lc.states]
+    return GaussianLinearCombination(lc.basis, cpu_coeffs, gpu_states)
+end
+
+function Gabs._gpu_impl(x::AbstractArray, precision)
+    return CuArray{precision}(x)
+end
+
+function Gabs._detect_device(x::CuArray)
+    return :gpu
+end
+
+function Gabs._randstate_gpu_impl(basis::SymplecticBasis, precision; pure=false, ħ=2)
+    cpu_state = randstate(basis; pure=pure, ħ=ħ)
+    return Gabs._gpu_impl(cpu_state, precision)
+end
+
+function Gabs._randunitary_gpu_impl(basis::SymplecticBasis, precision; passive=false, ħ=2)
+    cpu_unitary = randunitary(basis; passive=passive, ħ=ħ)
+    return Gabs._gpu_impl(cpu_unitary, precision)
+end
+
+function Gabs._randchannel_gpu_impl(basis::SymplecticBasis, precision; ħ=2)
+    cpu_channel = randchannel(basis; ħ=ħ)
+    return Gabs._gpu_impl(cpu_channel, precision)
+end
+
+function Gabs._randsymplectic_gpu_impl(basis::SymplecticBasis, precision; passive=false)
+    cpu_symplectic = randsymplectic(basis; passive=passive)
+    return CuArray{precision}(cpu_symplectic)
+end
+
+function Gabs._batch_randstate_gpu_impl(basis::SymplecticBasis, n::Int, precision; pure=false, ħ=2)
+    cpu_states = [randstate(basis; pure=pure, ħ=ħ) for _ in 1:n]
+    return [Gabs._gpu_impl(state, precision) for state in cpu_states]
+end
+
+function Gabs._batch_randunitary_gpu_impl(basis::SymplecticBasis, n::Int, precision; passive=false, ħ=2)
+    cpu_unitaries = [randunitary(basis; passive=passive, ħ=ħ) for _ in 1:n]
+    return [Gabs._gpu_impl(unitary, precision) for unitary in cpu_unitaries]
+end
+
+function _extract_gpu_scalar(x::CuArray{T}) where T
+    if ndims(x) == 0
+        return T(Array(x)[])
+    elseif length(x) == 1
+        return T(Array(x)[1])
+    else
+        return Vector{T}(Array(x))
+    end
+end
+
+function Gabs.coherentstate(basis::SymplecticBasis{N}, alpha::CuArray; ħ=2) where {N<:Int}
+    T = real(eltype(alpha))
+    alpha_val = _extract_gpu_scalar(alpha)
+    return coherentstate(CuVector{T}, CuMatrix{T}, basis, alpha_val; ħ=ħ)
+end
+
+function Gabs.squeezedstate(basis::SymplecticBasis{N}, r::CuArray, theta; ħ=2) where {N<:Int}
+    T = eltype(r)
+    r_val = _extract_gpu_scalar(r)
+    return squeezedstate(CuVector{T}, CuMatrix{T}, basis, r_val, theta; ħ=ħ)
+end
+
+function Gabs.thermalstate(basis::SymplecticBasis{N}, photons::CuArray; ħ=2) where {N<:Int}
+    T = eltype(photons)
+    photons_val = _extract_gpu_scalar(photons)
+    return thermalstate(CuVector{T}, CuMatrix{T}, basis, photons_val; ħ=ħ)
+end
+
+function Gabs.eprstate(basis::SymplecticBasis{N}, r::CuArray, theta; ħ=2) where {N<:Int}
+    T = eltype(r)
+    r_val = _extract_gpu_scalar(r)
+    return eprstate(CuVector{T}, CuMatrix{T}, basis, r_val, theta; ħ=ħ)
+end
+
+function Gabs.displace(basis::SymplecticBasis{N}, alpha::CuArray; ħ=2) where {N<:Int}
+    T = real(eltype(alpha))
+    alpha_val = _extract_gpu_scalar(alpha)
+    return displace(CuVector{T}, CuMatrix{T}, basis, alpha_val; ħ=ħ)
+end
+
+function Gabs.squeeze(basis::SymplecticBasis{N}, r::CuArray, theta; ħ=2) where {N<:Int}
+    T = eltype(r)
+    r_val = _extract_gpu_scalar(r)
+    return squeeze(CuVector{T}, CuMatrix{T}, basis, r_val, theta; ħ=ħ)
+end
+
+function Gabs.twosqueeze(basis::SymplecticBasis{N}, r::CuArray, theta; ħ=2) where {N<:Int}
+    T = eltype(r)
+    r_val = _extract_gpu_scalar(r)
+    return twosqueeze(CuVector{T}, CuMatrix{T}, basis, r_val, theta; ħ=ħ)
+end
+
+function Gabs.phaseshift(basis::SymplecticBasis{N}, theta::CuArray; ħ=2) where {N<:Int}
+    T = eltype(theta)
+    theta_val = _extract_gpu_scalar(theta)
+    return phaseshift(CuVector{T}, CuMatrix{T}, basis, theta_val; ħ=ħ)
+end
+
+function Gabs.beamsplitter(basis::SymplecticBasis{N}, transmit::CuArray; ħ=2) where {N<:Int}
+    T = eltype(transmit)
+    transmit_val = _extract_gpu_scalar(transmit)
+    return beamsplitter(CuVector{T}, CuMatrix{T}, basis, transmit_val; ħ=ħ)
+end
+
+function Gabs.attenuator(basis::SymplecticBasis{N}, theta::CuArray, n; ħ=2) where {N<:Int}
+    T = eltype(theta)
+    theta_val = _extract_gpu_scalar(theta)
+    return attenuator(CuVector{T}, CuMatrix{T}, basis, theta_val, n; ħ=ħ)
+end
+
+function Gabs.amplifier(basis::SymplecticBasis{N}, r::CuArray, n; ħ=2) where {N<:Int}
+    T = eltype(r)
+    r_val = _extract_gpu_scalar(r)
+    return amplifier(CuVector{T}, CuMatrix{T}, basis, r_val, n; ħ=ħ)
+end
+
+function Gabs.tensor(state1::GaussianState{B,M1,V1}, state2::GaussianState{B,M2,V2}) where {B<:SymplecticBasis, M1<:CuArray, V1<:CuArray, M2<:CuArray, V2<:CuArray}
+    T = promote_type(eltype(M1), eltype(M2))
+    return tensor(CuVector{T}, CuMatrix{T}, state1, state2)
+end
+
+function Gabs.tensor(::Type{T}, op1::GaussianUnitary{B,D1,S1}, op2::GaussianUnitary{B,D2,S2}) where {
+    T<:CuVector, B<:SymplecticBasis, D1<:CuArray, S1<:CuArray, D2<:CuArray, S2<:CuArray}
+    return tensor(T, CuMatrix{eltype(T)}, op1, op2)
+end
+
+function Gabs.tensor(op1::GaussianUnitary{B,D1,S1}, op2::GaussianUnitary{B,D2,S2}) where {
+    B<:SymplecticBasis, D1<:CuArray, S1<:CuArray, D2<:CuArray, S2<:CuArray}
+    T = promote_type(eltype(D1), eltype(D2))
+    return tensor(CuVector{T}, CuMatrix{T}, op1, op2)
+end
+
+function Gabs.tensor(::Type{T}, op1::GaussianChannel{B,D1,T1}, op2::GaussianChannel{B,D2,T2}) where {
+    T<:CuVector, B<:SymplecticBasis, D1<:CuArray, T1<:CuArray, D2<:CuArray, T2<:CuArray}
+    return tensor(T, CuMatrix{eltype(T)}, op1, op2)
+end
+
+function Gabs.tensor(op1::GaussianChannel{B,D1,T1}, op2::GaussianChannel{B,D2,T2}) where {
+    B<:SymplecticBasis, D1<:CuArray, T1<:CuArray, D2<:CuArray, T2<:CuArray}
+    T = promote_type(eltype(D1), eltype(D2))
+    return tensor(CuVector{T}, CuMatrix{T}, op1, op2)
+end
+
+function Gabs._adapt_device_gpu(target_constructor, source_obj, args...)
+    if source_obj isa GaussianState
+        T = real(eltype(source_obj.mean))
+    elseif source_obj isa GaussianUnitary
+        T = real(eltype(source_obj.disp))
+    elseif source_obj isa GaussianChannel
+        T = real(eltype(source_obj.disp))
+    elseif source_obj isa GaussianLinearCombination && !isempty(source_obj.states)
+        T = real(eltype(source_obj.states[1].mean))
+    elseif source_obj isa AbstractArray
+        T = real(eltype(source_obj))
+    else
+        T = Float32 
+    end
+    cpu_obj = target_constructor(args...)
+    return Gabs._gpu_impl(cpu_obj, T)
+end
+
+function Gabs.isgaussian(state::GaussianState{B,M,V}; atol::R1 = 0, rtol::R2 = atol) where {B<:SymplecticBasis, M<:CuArray, V<:CuArray, R1<:Real, R2<:Real}
+    cpu_state = GaussianState(state.basis, Array(state.mean), Array(state.covar); ħ = state.ħ)
+    return isgaussian(cpu_state; atol = atol, rtol = rtol)
+end
+
+function Gabs.isgaussian(op::GaussianUnitary{B,D,S}; atol::R1 = 0, rtol::R2 = atol) where {B<:SymplecticBasis, D<:CuArray, S<:CuArray, R1<:Real, R2<:Real}
+    return issymplectic(op.basis, Array(op.symplectic); atol = atol, rtol = rtol)
+end
+
+function Gabs.isgaussian(op::GaussianChannel{B,D,T}; atol::R1 = 0, rtol::R2 = atol) where {B<:SymplecticBasis, D<:CuArray, T<:CuArray, R1<:Real, R2<:Real}
+    cpu_op = GaussianChannel(op.basis, Array(op.disp), Array(op.transform), Array(op.noise); ħ = op.ħ)
+    return isgaussian(cpu_op; atol = atol, rtol = rtol)
+end
+
+function Gabs.issymplectic(basis::SymplecticBasis, x::CuMatrix; atol::R1 = 0, rtol::R2 = atol) where {R1<:Real, R2<:Real}
+    return issymplectic(basis, Array(x); atol = atol, rtol = rtol)
+end
+
+include("utils.jl")
+include("state_operations.jl") 
+include("unitary_operations.jl")
+include("wigner_kernels.jl")
+include("cross_wigner_kernels.jl")
+include("interference_wigner.jl")
+include("linear_combinations.jl")
+include("basis_operations.jl")
+include("ptrace_operations.jl")
+end
\ No newline at end of file
diff --git a/ext/CUDAExt/basis_operations.jl b/ext/CUDAExt/basis_operations.jl
new file mode 100644
index 0000000..7efe64e
--- /dev/null
+++ b/ext/CUDAExt/basis_operations.jl
@@ -0,0 +1,209 @@
+"""
+    _create_pair_to_block_permutation(nmodes::Int, ::Type{T}) where T
+
+Create GPU permutation indices for QuadPairBasis -> QuadBlockBasis conversion.
+QuadPair: [x1,p1,x2,p2,...] -> QuadBlock: [x1,x2,...,p1,p2,...]
+"""
+function _create_pair_to_block_permutation(nmodes::Int, ::Type{T}) where T
+    x_indices = collect(1:2:2*nmodes)  
+    p_indices = collect(2:2:2*nmodes) 
+    perm_indices = vcat(x_indices, p_indices)  
+    return CuArray{Int}(perm_indices)
+end
+
+"""
+    _create_block_to_pair_permutation(nmodes::Int, ::Type{T}) where T
+
+Create GPU permutation indices for QuadBlockBasis -> QuadPairBasis conversion.
+QuadBlock: [x1,x2,...,p1,p2,...] -> QuadPair: [x1,p1,x2,p2,...]
+"""
+function _create_block_to_pair_permutation(nmodes::Int, ::Type{T}) where T
+    perm_indices = Vector{Int}(undef, 2*nmodes)
+    for i in 1:nmodes
+        perm_indices[2*i-1] = i       
+        perm_indices[2*i] = i + nmodes 
+    end
+    return CuArray{Int}(perm_indices)
+end
+
+"""
+    _convert_mean_pair_to_block!(mean_out::CuVector{T}, mean_in::CuVector{T}, nmodes::Int) where T
+
+Convert mean vector from QuadPairBasis to QuadBlockBasis using vectorized GPU operations.
+"""
+function _convert_mean_pair_to_block!(mean_out::CuVector{T}, mean_in::CuVector{T}, nmodes::Int) where T
+    x_view = @view mean_in[1:2:2*nmodes]  
+    p_view = @view mean_in[2:2:2*nmodes]  
+    @views mean_out[1:nmodes] .= x_view
+    @views mean_out[nmodes+1:2*nmodes] .= p_view
+    return mean_out
+end
+
+"""
+    _convert_mean_block_to_pair!(mean_out::CuVector{T}, mean_in::CuVector{T}, nmodes::Int) where T
+
+Convert mean vector from QuadBlockBasis to QuadPairBasis using vectorized GPU operations.
+"""
+function _convert_mean_block_to_pair!(mean_out::CuVector{T}, mean_in::CuVector{T}, nmodes::Int) where T
+    x_view = @view mean_in[1:nmodes]           
+    p_view = @view mean_in[nmodes+1:2*nmodes]   
+    @views mean_out[1:2:2*nmodes] .= x_view     
+    @views mean_out[2:2:2*nmodes] .= p_view    
+    return mean_out
+end
+
+"""
+    _convert_matrix_with_permutation(matrix::CuMatrix{T}, perm_indices::CuVector{Int}) where T
+
+Apply permutation to both rows and columns of a matrix using GPU operations.
+"""
+function _convert_matrix_with_permutation(matrix::CuMatrix{T}, perm_indices::CuVector{Int}) where T
+    return matrix[perm_indices, perm_indices]
+end
+
+"""
+    changebasis(::Type{QuadBlockBasis}, state::GaussianState{QuadPairBasis, CuArray, CuArray})
+
+Convert GaussianState from QuadPairBasis to QuadBlockBasis on GPU.
+"""
+function changebasis(::Type{B1}, state::GaussianState{B2,M,V}) where {B1<:QuadBlockBasis,B2<:QuadPairBasis,M<:CuArray,V<:CuArray}
+    if !CUDA_AVAILABLE
+        gpu_fallback_warning()
+        cpu_state = GaussianState(state.basis, Array(state.mean), Array(state.covar); ħ=state.ħ)
+        cpu_result = changebasis(B1, cpu_state)
+        return GaussianState(cpu_result.basis, CuArray(cpu_result.mean), CuArray(cpu_result.covar); ħ=cpu_result.ħ)
+    end
+    nmodes = state.basis.nmodes
+    T = eltype(M)
+    mean_new = CUDA.zeros(T, 2*nmodes)
+    covar_new = CUDA.zeros(T, 2*nmodes, 2*nmodes)
+    _convert_mean_pair_to_block!(mean_new, state.mean, nmodes)
+    perm_indices = _create_pair_to_block_permutation(nmodes, T)
+    covar_new = _convert_matrix_with_permutation(state.covar, perm_indices)
+    return GaussianState(B1(nmodes), mean_new, covar_new; ħ = state.ħ)
+end
+
+"""
+    changebasis(::Type{QuadPairBasis}, state::GaussianState{QuadBlockBasis, CuArray, CuArray})
+
+Convert GaussianState from QuadBlockBasis to QuadPairBasis on GPU.
+"""
+function changebasis(::Type{B1}, state::GaussianState{B2,M,V}) where {B1<:QuadPairBasis,B2<:QuadBlockBasis,M<:CuArray,V<:CuArray}
+    if !CUDA_AVAILABLE
+        gpu_fallback_warning()
+        cpu_state = GaussianState(state.basis, Array(state.mean), Array(state.covar); ħ=state.ħ)
+        cpu_result = changebasis(B1, cpu_state)
+        return GaussianState(cpu_result.basis, CuArray(cpu_result.mean), CuArray(cpu_result.covar); ħ=cpu_result.ħ)
+    end
+    nmodes = state.basis.nmodes
+    T = eltype(M)
+    mean_new = CUDA.zeros(T, 2*nmodes)
+    covar_new = CUDA.zeros(T, 2*nmodes, 2*nmodes)
+    _convert_mean_block_to_pair!(mean_new, state.mean, nmodes)
+    perm_indices = _create_block_to_pair_permutation(nmodes, T)
+    covar_new = _convert_matrix_with_permutation(state.covar, perm_indices)
+    return GaussianState(B1(nmodes), mean_new, covar_new; ħ = state.ħ)
+end
+
+changebasis(::Type{<:QuadBlockBasis}, state::GaussianState{<:QuadBlockBasis,<:CuArray,<:CuArray}) = state
+changebasis(::Type{<:QuadPairBasis}, state::GaussianState{<:QuadPairBasis,<:CuArray,<:CuArray}) = state
+
+"""
+    changebasis(::Type{QuadBlockBasis}, op::GaussianUnitary{QuadPairBasis, CuArray, CuArray})
+
+Convert GaussianUnitary from QuadPairBasis to QuadBlockBasis on GPU.
+"""
+function changebasis(::Type{B1}, op::GaussianUnitary{B2,D,S}) where {B1<:QuadBlockBasis,B2<:QuadPairBasis,D<:CuArray,S<:CuArray}
+    if !CUDA_AVAILABLE
+        gpu_fallback_warning()
+        cpu_op = GaussianUnitary(op.basis, Array(op.disp), Array(op.symplectic); ħ=op.ħ)
+        cpu_result = changebasis(B1, cpu_op)
+        return GaussianUnitary(cpu_result.basis, CuArray(cpu_result.disp), CuArray(cpu_result.symplectic); ħ=cpu_result.ħ)
+    end
+    
+    nmodes = op.basis.nmodes
+    T = eltype(D)
+    disp_new = CUDA.zeros(T, 2*nmodes)
+    symp_new = CUDA.zeros(T, 2*nmodes, 2*nmodes)
+    _convert_mean_pair_to_block!(disp_new, op.disp, nmodes)
+    perm_indices = _create_pair_to_block_permutation(nmodes, T)
+    symp_new = _convert_matrix_with_permutation(op.symplectic, perm_indices)
+    return GaussianUnitary(B1(nmodes), disp_new, symp_new; ħ = op.ħ)
+end
+
+"""
+    changebasis(::Type{QuadPairBasis}, op::GaussianUnitary{QuadBlockBasis, CuArray, CuArray})
+
+Convert GaussianUnitary from QuadBlockBasis to QuadPairBasis on GPU.
+"""
+function changebasis(::Type{B1}, op::GaussianUnitary{B2,D,S}) where {B1<:QuadPairBasis,B2<:QuadBlockBasis,D<:CuArray,S<:CuArray}
+    if !CUDA_AVAILABLE
+        gpu_fallback_warning()
+        cpu_op = GaussianUnitary(op.basis, Array(op.disp), Array(op.symplectic); ħ=op.ħ)
+        cpu_result = changebasis(B1, cpu_op)
+        return GaussianUnitary(cpu_result.basis, CuArray(cpu_result.disp), CuArray(cpu_result.symplectic); ħ=cpu_result.ħ)
+    end
+    nmodes = op.basis.nmodes
+    T = eltype(D)
+    disp_new = CUDA.zeros(T, 2*nmodes)
+    symp_new = CUDA.zeros(T, 2*nmodes, 2*nmodes)
+    _convert_mean_block_to_pair!(disp_new, op.disp, nmodes)
+    perm_indices = _create_block_to_pair_permutation(nmodes, T)
+    symp_new = _convert_matrix_with_permutation(op.symplectic, perm_indices)
+    return GaussianUnitary(B1(nmodes), disp_new, symp_new; ħ = op.ħ)
+end
+
+changebasis(::Type{<:QuadBlockBasis}, op::GaussianUnitary{<:QuadBlockBasis,<:CuArray,<:CuArray}) = op
+changebasis(::Type{<:QuadPairBasis}, op::GaussianUnitary{<:QuadPairBasis,<:CuArray,<:CuArray}) = op
+
+"""
+    changebasis(::Type{QuadBlockBasis}, op::GaussianChannel{QuadPairBasis, CuArray, CuArray})
+
+Convert GaussianChannel from QuadPairBasis to QuadBlockBasis on GPU.
+"""
+function changebasis(::Type{B1}, op::GaussianChannel{B2,D,T}) where {B1<:QuadBlockBasis,B2<:QuadPairBasis,D<:CuArray,T<:CuArray}
+    if !CUDA_AVAILABLE
+        gpu_fallback_warning()
+        cpu_op = GaussianChannel(op.basis, Array(op.disp), Array(op.transform), Array(op.noise); ħ=op.ħ)
+        cpu_result = changebasis(B1, cpu_op)
+        return GaussianChannel(cpu_result.basis, CuArray(cpu_result.disp), 
+                              CuArray(cpu_result.transform), CuArray(cpu_result.noise); ħ=cpu_result.ħ)
+    end
+    nmodes = op.basis.nmodes
+    Td = eltype(D)
+    disp_new = CUDA.zeros(Td, 2*nmodes)
+    transform_new = CUDA.zeros(Td, 2*nmodes, 2*nmodes)
+    noise_new = CUDA.zeros(Td, 2*nmodes, 2*nmodes)
+    _convert_mean_pair_to_block!(disp_new, op.disp, nmodes)
+    perm_indices = _create_pair_to_block_permutation(nmodes, Td)
+    transform_new = _convert_matrix_with_permutation(op.transform, perm_indices)
+    noise_new = _convert_matrix_with_permutation(op.noise, perm_indices)
+    return GaussianChannel(B1(nmodes), disp_new, transform_new, noise_new; ħ = op.ħ)
+end
+
+"""
+    changebasis(::Type{QuadPairBasis}, op::GaussianChannel{QuadBlockBasis, CuArray, CuArray})
+
+Convert GaussianChannel from QuadBlockBasis to QuadPairBasis on GPU.
+"""
+function changebasis(::Type{B1}, op::GaussianChannel{B2,D,T}) where {B1<:QuadPairBasis,B2<:QuadBlockBasis,D<:CuArray,T<:CuArray}
+    if !CUDA_AVAILABLE
+        gpu_fallback_warning()
+        cpu_op = GaussianChannel(op.basis, Array(op.disp), Array(op.transform), Array(op.noise); ħ=op.ħ)
+        cpu_result = changebasis(B1, cpu_op)
+        return GaussianChannel(cpu_result.basis, CuArray(cpu_result.disp),
+                              CuArray(cpu_result.transform), CuArray(cpu_result.noise); ħ=cpu_result.ħ)
+    end
+    nmodes = op.basis.nmodes
+    Td = eltype(D)
+    disp_new = CUDA.zeros(Td, 2*nmodes)
+    transform_new = CUDA.zeros(Td, 2*nmodes, 2*nmodes)
+    noise_new = CUDA.zeros(Td, 2*nmodes, 2*nmodes)
+    _convert_mean_block_to_pair!(disp_new, op.disp, nmodes)
+    perm_indices = _create_block_to_pair_permutation(nmodes, Td)
+    transform_new = _convert_matrix_with_permutation(op.transform, perm_indices)
+    noise_new = _convert_matrix_with_permutation(op.noise, perm_indices)
+    return GaussianChannel(B1(nmodes), disp_new, transform_new, noise_new; ħ = op.ħ)
+end
+changebasis(::Type{<:QuadBlockBasis}, op::GaussianChannel{<:QuadBlockBasis,<:CuArray,<:CuArray}) = op
+changebasis(::Type{<:QuadPairBasis}, op::GaussianChannel{<:QuadPairBasis,<:CuArray,<:CuArray}) = op
diff --git a/ext/CUDAExt/cross_wigner_kernels.jl b/ext/CUDAExt/cross_wigner_kernels.jl
new file mode 100644
index 0000000..93c9c30
--- /dev/null
+++ b/ext/CUDAExt/cross_wigner_kernels.jl
@@ -0,0 +1,200 @@
+"""
+    cross_wigner_kernel!(result, x_point, mean1, mean2, avg_covar_inv, 
+                         omega, log_det_avg_covar, nmodes, hbar)
+
+GPU kernel for computing cross-Wigner function between two Gaussian states.
+Implements: W₁₂(x) = norm × exp[-½(x-μ̄)ᵀV̄⁻¹(x-μ̄)] × exp[i(μ₁-μ₂)ᵀΩ(x-μ̄)/ħ]
+where μ̄ = (μ₁+μ₂)/2, V̄ = (V₁+V₂)/2
+"""
+function cross_wigner_kernel!(result::CuDeviceArray{Complex{T}}, 
+                              x_point::CuDeviceVector{T},
+                              mean1::CuDeviceVector{T}, 
+                              mean2::CuDeviceVector{T},
+                              avg_covar_inv::CuDeviceMatrix{T},
+                              omega::CuDeviceMatrix{T}, 
+                              log_det_avg_covar::T, 
+                              nmodes::Int, hbar::T) where T
+
+    if threadIdx().x == 1 && blockIdx().x == 1
+        quadratic_form = zero(T)
+        for i in 1:length(mean1)
+            avg_mean_i = (mean1[i] + mean2[i]) * T(0.5)
+            dx_i = x_point[i] - avg_mean_i
+            for j in 1:length(mean1)
+                avg_mean_j = (mean1[j] + mean2[j]) * T(0.5)
+                dx_j = x_point[j] - avg_mean_j
+                quadratic_form += dx_i * avg_covar_inv[i, j] * dx_j
+            end
+        end
+        phase_arg = zero(T)
+        for i in 1:length(mean1)
+            mu_diff_i = mean1[i] - mean2[i]
+            avg_mean_i = (mean1[i] + mean2[i]) * T(0.5)
+            dx_i = x_point[i] - avg_mean_i
+            omega_dx_i = zero(T)
+            for j in 1:length(mean1)
+                avg_mean_j = (mean1[j] + mean2[j]) * T(0.5)
+                dx_j = x_point[j] - avg_mean_j
+                omega_dx_i += omega[i, j] * dx_j
+            end
+            phase_arg += mu_diff_i * omega_dx_i
+        end
+        log_normalization = -T(nmodes) * log(T(2π)) - T(0.5) * log_det_avg_covar
+        normalization = exp(log_normalization)
+        gaussian_part = exp(-T(0.5) * quadratic_form)
+        phase_factor = Complex{T}(cos(phase_arg / hbar), sin(phase_arg / hbar))
+        result[1] = normalization * gaussian_part * phase_factor
+    end
+    return nothing
+end
+
+"""
+    cross_wignerchar_kernel!(result, xi_point, quadratic_matrix, linear_vector)
+
+GPU kernel for computing cross-Wigner characteristic function between two Gaussian states.
+Implements: χ₁₂(ξ) = exp(-½ξᵀΩV̄Ωᵀξ - iΩμ̄·ξ)
+where μ̄ = (μ₁+μ₂)/2, V̄ = (V₁+V₂)/2
+"""
+function cross_wignerchar_kernel!(result::CuDeviceArray{Complex{T}},
+                                  xi_point::CuDeviceVector{T},
+                                  quadratic_matrix::CuDeviceMatrix{T},
+                                  linear_vector::CuDeviceVector{T}) where T
+    
+    if threadIdx().x == 1 && blockIdx().x == 1
+        quadratic_term = zero(T)
+        for i in 1:length(xi_point)
+            for j in 1:length(xi_point)
+                quadratic_term += xi_point[i] * quadratic_matrix[i, j] * xi_point[j]
+            end
+        end
+        linear_term = zero(T)
+        for i in 1:length(xi_point)
+            linear_term += linear_vector[i] * xi_point[i]
+        end
+        real_part = -T(0.5) * quadratic_term
+        imag_part = -linear_term
+        result[1] = Complex{T}(cos(imag_part), sin(imag_part)) * exp(real_part)
+    end
+    return nothing
+end
+
+"""
+    gpu_safe_logdet(A::CuMatrix{T}) where T
+
+Compute log determinant of positive definite matrix using Cholesky decomposition.
+"""
+function gpu_safe_logdet(A::CuMatrix{T}) where T
+    try
+        L = cholesky(Symmetric(A)).L
+        diag_L = diag(L)  
+        log_det_L = sum(log.(diag_L))  
+        return T(2) * log_det_L
+    catch e
+        A_cpu = Array(A)
+        return T(logdet(A_cpu))
+    end
+end
+
+"""
+    gpu_safe_inv(A::CuMatrix{T}) where T
+
+Compute matrix inverse using GPU-safe operations.
+"""
+function gpu_safe_inv(A::CuMatrix{T}) where T
+    try
+        return inv(A)  
+    catch e
+        A_cpu = Array(A)
+        return CuArray{T}(inv(A_cpu))
+    end
+end
+
+"""
+    gpu_safe_length_check(x::AbstractVector, expected_len::Int)
+"""
+function gpu_safe_length_check(x::AbstractVector, expected_len::Int)
+    actual_len = size(x, 1)  
+    return actual_len == expected_len
+end
+
+function Gabs.cross_wigner(state1::GaussianState{<:SymplecticBasis,<:CuArray,<:CuArray}, 
+                           state2::GaussianState{<:SymplecticBasis,<:CuArray,<:CuArray}, 
+                           x::AbstractVector)
+    
+    if !CUDA_AVAILABLE
+        @warn "CUDA not available for cross_wigner. This should not happen in CUDAExt."
+        error("GPU cross_wigner called but CUDA not available")
+    end
+    typeof(state1.basis) == typeof(state2.basis) || throw(ArgumentError(SYMPLECTIC_ERROR))
+    state1.basis == state2.basis || throw(ArgumentError(SYMPLECTIC_ERROR))
+    state1.ħ == state2.ħ || throw(ArgumentError(HBAR_ERROR))
+    expected_len = size(state1.mean, 1)
+    actual_len = size(x, 1)
+    actual_len == expected_len || throw(ArgumentError(WIGNER_ERROR))
+    if state1 === state2
+        wigner_result = wigner(state1, x)
+        T = real(eltype(state1.mean))
+        return Complex{T}(wigner_result, zero(T))
+    end
+    T = real(eltype(state1.mean))
+    nmodes = state1.basis.nmodes
+    x_gpu = CuArray{T}(x)
+    avg_covar = (state1.covar .+ state2.covar) ./ T(2)
+    avg_covar_inv = gpu_safe_inv(avg_covar)
+    log_det_avg_covar = gpu_safe_logdet(avg_covar)
+    omega_cpu = symplecticform(state1.basis)
+    omega_gpu = CuArray{T}(omega_cpu)
+    result = CuArray{Complex{T}}(undef, 1)
+    @cuda threads=1 blocks=1 cross_wigner_kernel!(
+        result, x_gpu, state1.mean, state2.mean, avg_covar_inv, 
+        omega_gpu, log_det_avg_covar, nmodes, T(state1.ħ)
+    )
+    CUDA.synchronize()  
+    return Array(result)[1]  
+end
+
+function Gabs.cross_wignerchar(state1::GaussianState{<:SymplecticBasis,<:CuArray,<:CuArray}, 
+                               state2::GaussianState{<:SymplecticBasis,<:CuArray,<:CuArray}, 
+                               xi::AbstractVector)
+    
+    if !CUDA_AVAILABLE
+        @warn "CUDA not available for cross_wignerchar. This should not happen in CUDAExt."
+        error("GPU cross_wignerchar called but CUDA not available")
+    end
+    typeof(state1.basis) == typeof(state2.basis) || throw(ArgumentError(SYMPLECTIC_ERROR))
+    state1.basis == state2.basis || throw(ArgumentError(SYMPLECTIC_ERROR))
+    state1.ħ == state2.ħ || throw(ArgumentError(HBAR_ERROR))
+    expected_len = size(state1.mean, 1)
+    actual_len = size(xi, 1)
+    actual_len == expected_len || throw(ArgumentError(WIGNER_ERROR))
+    if state1 === state2
+        return wignerchar(state1, xi)
+    end
+    T = real(eltype(state1.mean))
+    xi_gpu = CuArray{T}(xi)
+    avg_mean = (state1.mean .+ state2.mean) ./ T(2)
+    avg_covar = (state1.covar .+ state2.covar) ./ T(2)
+    omega_cpu = symplecticform(state1.basis)
+    omega_gpu = CuArray{T}(omega_cpu)
+    temp_matrix = omega_gpu * avg_covar
+    quadratic_matrix = temp_matrix * transpose(omega_gpu)
+    linear_vector = omega_gpu * avg_mean
+    result = CuArray{Complex{T}}(undef, 1)
+    @cuda threads=1 blocks=1 cross_wignerchar_kernel!(
+        result, xi_gpu, quadratic_matrix, linear_vector
+    )
+    CUDA.synchronize()  
+    return Array(result)[1]  
+end
+
+function Gabs.cross_wigner(state1::GaussianState{<:SymplecticBasis,<:CuArray,<:CuArray}, 
+                           state2::GaussianState{<:SymplecticBasis,<:CuArray,<:CuArray}, 
+                           x::Vector{T}) where T<:Real
+    return cross_wigner(state1, state2, CuArray{real(eltype(state1.mean))}(x))
+end
+
+function Gabs.cross_wignerchar(state1::GaussianState{<:SymplecticBasis,<:CuArray,<:CuArray}, 
+                               state2::GaussianState{<:SymplecticBasis,<:CuArray,<:CuArray}, 
+                               xi::Vector{T}) where T<:Real
+    return cross_wignerchar(state1, state2, CuArray{real(eltype(state1.mean))}(xi))
+end
\ No newline at end of file
diff --git a/ext/CUDAExt/interference_wigner.jl b/ext/CUDAExt/interference_wigner.jl
new file mode 100644
index 0000000..b6157e1
--- /dev/null
+++ b/ext/CUDAExt/interference_wigner.jl
@@ -0,0 +1,213 @@
+"""
+    wigner(lc::GaussianLinearCombination{GPU states}, x::AbstractVector)
+
+Implements: W(x) = Σᵢ |cᵢ|² Wᵢ(x) + 2 Σᵢ<ⱼ Re(cᵢ*cⱼ W_cross(ψᵢ,ψⱼ,x))
+"""
+function wigner(lc::GaussianLinearCombination{B,C,S}, x::AbstractVector) where {
+    B<:SymplecticBasis, C, S<:GaussianState{B,<:CuArray,<:CuArray}}
+    if !CUDA_AVAILABLE
+        error("GPU interference wigner called but CUDA not available")
+    end
+    if !isempty(lc.states)
+        expected_len = size(lc.states[1].mean, 1) 
+        actual_len = length(x)
+        actual_len == expected_len || throw(ArgumentError(WIGNER_ERROR))
+    else
+        throw(ArgumentError("Cannot compute Wigner function of empty linear combination"))
+    end
+    T = real(eltype(lc.states[1].mean))
+    result = zero(T)    
+    x_gpu = CuArray{T}(x)
+    n_states = length(lc.states)
+    @inbounds for i in 1:n_states
+        ci = lc.coeffs[i]  
+        state_i = lc.states[i]  
+        wigner_i = wigner(state_i, x_gpu)  
+        result += abs2(ci) * wigner_i
+    end
+    @inbounds for i in 1:(n_states-1)
+        ci = lc.coeffs[i]  
+        state_i = lc.states[i] 
+        @inbounds for j in (i+1):n_states
+            cj = lc.coeffs[j]  
+            state_j = lc.states[j]  
+            cross_w = cross_wigner(state_i, state_j, x_gpu) 
+            interference_coeff = conj(ci) * cj
+            cross_term = 2 * real(interference_coeff * cross_w)
+            result += cross_term
+        end
+    end
+    return result
+end
+
+"""
+    wignerchar(lc::GaussianLinearCombination{GPU states}, xi::AbstractVector)
+
+Implements: χ(ξ) = Σᵢ |cᵢ|² χᵢ(ξ) + 2 Σᵢ<ⱼ Re(cᵢ*cⱼ χ_cross(ψᵢ,ψⱼ,ξ))
+"""
+function wignerchar(lc::GaussianLinearCombination{B,C,S}, xi::AbstractVector) where {
+    B<:SymplecticBasis, C, S<:GaussianState{B,<:CuArray,<:CuArray}}
+    if !CUDA_AVAILABLE
+        error("GPU interference wignerchar called but CUDA not available")
+    end
+    if isempty(lc.states)
+        throw(ArgumentError("Cannot compute Wigner characteristic function of empty linear combination"))
+    end
+    
+    expected_len = size(lc.states[1].mean, 1)  
+    actual_len = length(xi)
+    actual_len == expected_len || throw(ArgumentError("Evaluation point dimension ($actual_len) does not match state dimension ($expected_len)"))
+    T = real(eltype(lc.states[1].mean))
+    result = complex(zero(T), zero(T))
+    xi_gpu = CuArray{T}(xi)
+    n_states = length(lc.states)
+    @inbounds for i in 1:n_states
+        ci = lc.coeffs[i] 
+        state_i = lc.states[i] 
+        wignerchar_i = wignerchar(state_i, xi_gpu) 
+        result += abs2(ci) * wignerchar_i
+    end
+    @inbounds for i in 1:(n_states-1)
+        ci = lc.coeffs[i] 
+        state_i = lc.states[i] 
+        @inbounds for j in (i+1):n_states
+            cj = lc.coeffs[j]  
+            state_j = lc.states[j]  
+            cross_char = cross_wignerchar(state_i, state_j, xi_gpu) 
+            interference_coeff = conj(ci) * cj
+            cross_term = 2 * real(interference_coeff * cross_char)
+            result += cross_term
+        end
+    end
+    return result
+end
+
+"""
+    wigner(lc::GaussianLinearCombination{GPU states}, x::Vector{T}) where T<:Real
+"""
+function wigner(lc::GaussianLinearCombination{B,C,S}, x::Vector{T}) where {
+    B<:SymplecticBasis, C, S<:GaussianState{B,<:CuArray,<:CuArray}, T<:Real}
+    target_precision = real(eltype(lc.states[1].mean))
+    return wigner(lc, CuArray{target_precision}(x))
+end
+
+"""
+    wignerchar(lc::GaussianLinearCombination{GPU states}, xi::Vector{T}) where T<:Real
+"""
+function wignerchar(lc::GaussianLinearCombination{B,C,S}, xi::Vector{T}) where {
+    B<:SymplecticBasis, C, S<:GaussianState{B,<:CuArray,<:CuArray}, T<:Real}
+    target_precision = real(eltype(lc.states[1].mean))
+    return wignerchar(lc, CuArray{target_precision}(xi))
+end
+
+"""
+    wigner(lc::GaussianLinearCombination{GPU states}, x_points::CuMatrix{T})
+
+Implements batched version of: W(x) = Σᵢ |cᵢ|² Wᵢ(x) + 2 Σᵢ<ⱼ Re(cᵢ*cⱼ W_cross(ψᵢ,ψⱼ,x))
+
+# Arguments
+- `lc::GaussianLinearCombination`: Linear combination with GPU states
+- `x_points::CuMatrix{T}`: Phase space points (dimension × num_points)
+
+# Returns  
+- `CuArray{T}`: Wigner function values at all points
+"""
+function wigner(lc::GaussianLinearCombination{B,C,S}, x_points::CuMatrix{T}) where {
+    B<:SymplecticBasis, C, S<:GaussianState{B,<:CuArray,<:CuArray}, T}
+    if !CUDA_AVAILABLE
+        error("GPU batched interference wigner called but CUDA not available")
+    end    
+    if isempty(lc.states)
+        throw(ArgumentError("Cannot compute Wigner function of empty linear combination"))
+    end
+    expected_len = size(lc.states[1].mean, 1)
+    actual_len = size(x_points, 1)
+    actual_len == expected_len || throw(ArgumentError("Phase space point dimension ($actual_len) does not match state dimension ($expected_len)"))
+    num_points = size(x_points, 2)
+    n_states = length(lc.states)
+    results = CUDA.zeros(T, num_points)
+    for i in 1:n_states
+        ci = lc.coeffs[i]  
+        state_i = lc.states[i]  
+        wigner_values = wigner(state_i, x_points) 
+        results .+= abs2(ci) .* wigner_values
+    end
+    for i in 1:(n_states-1)
+        ci = lc.coeffs[i]
+        state_i = lc.states[i]
+        for j in (i+1):n_states
+            cj = lc.coeffs[j]
+            state_j = lc.states[j]
+            cross_values = cross_wigner_batch(state_i, state_j, x_points)
+            interference_coeff = conj(ci) * cj
+            cross_contributions = 2 .* real.(interference_coeff .* cross_values)
+            results .+= cross_contributions
+        end
+    end
+    return results
+end
+
+"""
+    wignerchar(lc::GaussianLinearCombination{GPU states}, xi_points::CuMatrix{T})
+
+Implements batched version of: χ(ξ) = Σᵢ |cᵢ|² χᵢ(ξ) + 2 Σᵢ<ⱼ Re(cᵢ*cⱼ χ_cross(ψᵢ,ψⱼ,ξ))
+
+# Arguments
+- `lc::GaussianLinearCombination`: Linear combination with GPU states  
+- `xi_points::CuMatrix{T}`: Evaluation points in phase space (dimension × num_points)
+
+# Returns
+- `CuArray{Complex{T}}`: Characteristic function values at all points
+"""
+function wignerchar(lc::GaussianLinearCombination{B,C,S}, xi_points::CuMatrix{T}) where {
+    B<:SymplecticBasis, C, S<:GaussianState{B,<:CuArray,<:CuArray}, T}
+    if !CUDA_AVAILABLE
+        error("GPU batched interference wignerchar called but CUDA not available")
+    end
+    if isempty(lc.states)
+        throw(ArgumentError("Cannot compute Wigner characteristic function of empty linear combination"))
+    end
+    expected_len = size(lc.states[1].mean, 1)
+    actual_len = size(xi_points, 1)
+    actual_len == expected_len || throw(ArgumentError("Evaluation point dimension ($actual_len) does not match state dimension ($expected_len)"))
+    num_points = size(xi_points, 2)
+    n_states = length(lc.states)
+    results = CUDA.zeros(Complex{T}, num_points)
+    for i in 1:n_states
+        ci = lc.coeffs[i]
+        state_i = lc.states[i]
+        wignerchar_values = wignerchar(state_i, xi_points)
+        results .+= abs2(ci) .* wignerchar_values
+    end
+    for i in 1:(n_states-1)
+        ci = lc.coeffs[i]
+        state_i = lc.states[i]
+        for j in (i+1):n_states
+            cj = lc.coeffs[j]
+            state_j = lc.states[j]
+            cross_values = cross_wignerchar_batch(state_i, state_j, xi_points)
+            interference_coeff = conj(ci) * cj
+            cross_contributions = 2 .* real.(interference_coeff .* cross_values)
+            results .+= cross_contributions
+        end
+    end
+    return results
+end
+
+"""
+    wigner(lc::GaussianLinearCombination{GPU states}, x_points::Matrix{T}) where T<:Real
+"""
+function wigner(lc::GaussianLinearCombination{B,C,S}, x_points::Matrix{T}) where {
+    B<:SymplecticBasis, C, S<:GaussianState{B,<:CuArray,<:CuArray}, T<:Real}
+    target_precision = real(eltype(lc.states[1].mean))
+    return wigner(lc, CuArray{target_precision}(x_points))
+end
+
+"""
+    wignerchar(lc::GaussianLinearCombination{GPU states}, xi_points::Matrix{T}) where T<:Real
+"""
+function wignerchar(lc::GaussianLinearCombination{B,C,S}, xi_points::Matrix{T}) where {
+    B<:SymplecticBasis, C, S<:GaussianState{B,<:CuArray,<:CuArray}, T<:Real}
+    target_precision = real(eltype(lc.states[1].mean))
+    return wignerchar(lc, CuArray{target_precision}(xi_points))
+end
\ No newline at end of file
diff --git a/ext/CUDAExt/linear_combinations.jl b/ext/CUDAExt/linear_combinations.jl
new file mode 100644
index 0000000..0e4697f
--- /dev/null
+++ b/ext/CUDAExt/linear_combinations.jl
@@ -0,0 +1,58 @@
+Gabs._is_gpu_array(x::CuArray) = true
+
+function Gabs._detect_device(lc::GaussianLinearCombination)
+    if isempty(lc.states)
+        return :cpu
+    end
+    return Gabs._detect_device(lc.states[1].mean)
+end
+
+function Gabs._gpu_impl(lc::GaussianLinearCombination, precision::Type{T}) where T
+    if !CUDA_AVAILABLE
+        @warn "CUDA not available. Cannot transfer to GPU."
+        return lc
+    end
+    gpu_states = [Gabs._gpu_impl(state, precision) for state in lc.states]
+    cpu_coeffs = Vector{T}(Array(lc.coeffs))  
+    return GaussianLinearCombination(lc.basis, cpu_coeffs, gpu_states)
+end
+
+function Base.:(*)(op::GaussianUnitary{B,<:CuArray,<:CuArray}, lc::GaussianLinearCombination{B,C,S}) where {B<:SymplecticBasis,C,S<:GaussianState{B,<:CuArray,<:CuArray}}
+    op.basis == lc.basis || throw(ArgumentError(ACTION_ERROR))
+    op.ħ == lc.ħ || throw(ArgumentError(HBAR_ERROR))
+    new_states = [op * state for state in lc.states]
+    return GaussianLinearCombination(lc.basis, Vector(lc.coeffs), new_states)
+end
+
+function Base.:(*)(op::GaussianChannel{B,<:CuArray,<:CuArray}, lc::GaussianLinearCombination{B,C,S}) where {B<:SymplecticBasis,C,S<:GaussianState{B,<:CuArray,<:CuArray}}
+    op.basis == lc.basis || throw(ArgumentError(ACTION_ERROR))
+    op.ħ == lc.ħ || throw(ArgumentError(HBAR_ERROR))
+    new_states = [op * state for state in lc.states]
+    return GaussianLinearCombination(lc.basis, Vector(lc.coeffs), new_states)
+end
+
+function Base.:(*)(op::GaussianUnitary{B,<:Array,<:Array}, lc::GaussianLinearCombination{B,C,S}) where {B<:SymplecticBasis,C,S<:GaussianState{B,<:CuArray,<:CuArray}}
+    op.basis == lc.basis || throw(ArgumentError(ACTION_ERROR))
+    op.ħ == lc.ħ || throw(ArgumentError(HBAR_ERROR))
+    T = real(eltype(lc.states[1].mean))
+    gpu_op = Gabs._gpu_impl(op, T)
+    new_states = [gpu_op * state for state in lc.states]
+    return GaussianLinearCombination(lc.basis, Vector(lc.coeffs), new_states)
+end
+
+function Base.:(*)(op::GaussianChannel{B,<:Array,<:Array}, lc::GaussianLinearCombination{B,C,S}) where {B<:SymplecticBasis,C,S<:GaussianState{B,<:CuArray,<:CuArray}}
+    op.basis == lc.basis || throw(ArgumentError(ACTION_ERROR))
+    op.ħ == lc.ħ || throw(ArgumentError(HBAR_ERROR))
+    T = real(eltype(lc.states[1].mean))
+    gpu_op = Gabs._gpu_impl(op, T)
+    new_states = [gpu_op * state for state in lc.states]
+    return GaussianLinearCombination(lc.basis, Vector(lc.coeffs), new_states)
+end
+
+function purity(lc::GaussianLinearCombination{B,C,S}) where {B<:SymplecticBasis,C,S<:GaussianState{B,<:CuArray,<:CuArray}}
+    return 1.0
+end
+
+function entropy_vn(lc::GaussianLinearCombination{B,C,S}) where {B<:SymplecticBasis,C,S<:GaussianState{B,<:CuArray,<:CuArray}}
+    return 0.0
+end
\ No newline at end of file
diff --git a/ext/CUDAExt/ptrace_operations.jl b/ext/CUDAExt/ptrace_operations.jl
new file mode 100644
index 0000000..350899c
--- /dev/null
+++ b/ext/CUDAExt/ptrace_operations.jl
@@ -0,0 +1,60 @@
+"""
+    ptrace(state::GaussianState{B,M,V}, indices::Union{Int, AbstractVector{<:Int}}) where {B<:SymplecticBasis, M<:CuArray, V<:CuArray}
+"""
+function ptrace(state::GaussianState{B,M,V}, indices::Union{Int, AbstractVector{<:Int}}) where {
+    B<:SymplecticBasis, M<:CuArray, V<:CuArray}
+    if !CUDA_AVAILABLE
+        gpu_fallback_warning()
+        cpu_state = GaussianState(state.basis, Array(state.mean), Array(state.covar); ħ=state.ħ)
+        cpu_result = ptrace(cpu_state, indices)
+        return GaussianState(cpu_result.basis, CuArray(cpu_result.mean), CuArray(cpu_result.covar); ħ=cpu_result.ħ)
+    end
+    indices_vec = isa(indices, Int) ? [indices] : collect(indices)
+    basis = state.basis
+    nmodes = basis.nmodes
+    length(indices_vec) < nmodes || throw(ArgumentError(Gabs.INDEX_ERROR))
+    all_indices = collect(1:nmodes)
+    keep_indices = setdiff(all_indices, indices_vec)
+    n_keep = length(keep_indices)
+    T = eltype(M)
+    if basis isa QuadPairBasis
+        mean_indices = Vector{Int}()
+        for mode_idx in keep_indices
+            push!(mean_indices, 2*mode_idx - 1)  
+            push!(mean_indices, 2*mode_idx)     
+        end
+        new_mean = state.mean[CuArray(mean_indices)]
+        covar_indices = Vector{Int}()
+        for mode_idx in keep_indices
+            push!(covar_indices, 2*mode_idx - 1) 
+            push!(covar_indices, 2*mode_idx)     
+        end
+        covar_indices_gpu = CuArray(covar_indices)
+        new_covar = state.covar[covar_indices_gpu, covar_indices_gpu]
+    elseif basis isa QuadBlockBasis
+        x_indices = keep_indices
+        p_indices = keep_indices .+ nmodes
+        mean_indices = vcat(x_indices, p_indices)
+        new_mean = state.mean[CuArray(mean_indices)]
+        all_covar_indices = vcat(x_indices, p_indices)
+        covar_indices_gpu = CuArray(all_covar_indices)
+        new_covar = state.covar[covar_indices_gpu, covar_indices_gpu]
+    else
+        error("Unknown basis type: $(typeof(basis))")
+    end
+    new_basis = typeof(basis)(n_keep)
+    return GaussianState(new_basis, new_mean, new_covar; ħ = state.ħ)
+end
+
+function ptrace(::Type{Tm}, ::Type{Tc}, state::GaussianState{B,<:CuArray,<:CuArray}, indices::Union{Int, AbstractVector{<:Int}}) where {Tm<:CuArray, Tc<:CuArray, B<:SymplecticBasis}
+    result = ptrace(state, indices)
+    return GaussianState(result.basis, Tm(result.mean), Tc(result.covar); ħ = result.ħ)
+end
+
+function ptrace(::Type{T}, state::GaussianState{B,<:CuArray,<:CuArray}, indices::Union{Int, AbstractVector{<:Int}}) where {T<:CuArray, B<:SymplecticBasis}
+    if T <: AbstractMatrix
+        return ptrace(CuVector{eltype(T)}, T, state, indices)
+    else
+        return ptrace(T, T, state, indices)
+    end
+end
\ No newline at end of file
diff --git a/ext/CUDAExt/state_operations.jl b/ext/CUDAExt/state_operations.jl
new file mode 100644
index 0000000..70c9466
--- /dev/null
+++ b/ext/CUDAExt/state_operations.jl
@@ -0,0 +1,126 @@
+function vacuumstate(::Type{Tm}, ::Type{Tc}, basis::SymplecticBasis{N}; ħ = 2) where {Tm<:CuVector, Tc<:CuMatrix, N<:Int}
+    if !CUDA_AVAILABLE
+        gpu_fallback_warning()
+        return vacuumstate(Vector{eltype(Tm)}, Matrix{eltype(Tc)}, basis; ħ = ħ)
+    end
+    nmodes = basis.nmodes
+    T = eltype(Tm)
+    mean = CUDA.zeros(T, 2*nmodes)
+    covar = CuMatrix{T}((T(ħ)/2) * I(2*nmodes))
+    return GaussianState(basis, mean, covar; ħ = ħ)
+end
+
+function thermalstate(::Type{Tm}, ::Type{Tc}, basis::SymplecticBasis{N}, photons::P; ħ = 2) where {Tm<:CuVector, Tc<:CuMatrix, N<:Int, P}
+    if !CUDA_AVAILABLE
+        gpu_fallback_warning()
+        return thermalstate(Vector{eltype(Tm)}, Matrix{eltype(Tc)}, basis, photons; ħ = ħ)
+    end
+    T = eltype(Tm)
+    mean, covar = _thermalstate(basis, photons; ħ = ħ)
+    gpu_mean = CuArray{T}(mean)
+    gpu_covar = CuArray{T}(covar)
+    return GaussianState(basis, gpu_mean, gpu_covar; ħ = ħ)
+end
+
+function coherentstate(::Type{Tm}, ::Type{Tc}, basis::SymplecticBasis{N}, alpha::A; ħ = 2) where {Tm<:CuVector, Tc<:CuMatrix, N<:Int, A}
+    if !CUDA_AVAILABLE
+        gpu_fallback_warning()
+        return coherentstate(Vector{eltype(Tm)}, Matrix{eltype(Tc)}, basis, alpha; ħ = ħ)
+    end
+    T = eltype(Tm)
+    mean, covar = _coherentstate(basis, alpha; ħ = ħ)
+    gpu_mean = CuArray{T}(mean)
+    gpu_covar = CuArray{T}(covar)
+    return GaussianState(basis, gpu_mean, gpu_covar; ħ = ħ)
+end
+
+function squeezedstate(::Type{Tm}, ::Type{Tc}, basis::SymplecticBasis{N}, r::R, theta::R; ħ = 2) where {Tm<:CuVector, Tc<:CuMatrix, N<:Int, R}
+    if !CUDA_AVAILABLE
+        gpu_fallback_warning()
+        return squeezedstate(Vector{eltype(Tm)}, Matrix{eltype(Tc)}, basis, r, theta; ħ = ħ)
+    end
+    T = eltype(Tm)
+    mean, covar = _squeezedstate(basis, r, theta; ħ = ħ)
+    gpu_mean = CuArray{T}(mean)
+    gpu_covar = CuArray{T}(covar)
+    return GaussianState(basis, gpu_mean, gpu_covar; ħ = ħ)
+end
+
+function eprstate(::Type{Tm}, ::Type{Tc}, basis::SymplecticBasis{N}, r::R, theta::R; ħ = 2) where {Tm<:CuVector, Tc<:CuMatrix, N<:Int, R}
+    if !CUDA_AVAILABLE
+        gpu_fallback_warning()
+        return eprstate(Vector{eltype(Tm)}, Matrix{eltype(Tc)}, basis, r, theta; ħ = ħ)
+    end
+    T = eltype(Tm)
+    mean, covar = _eprstate(basis, r, theta; ħ = ħ)
+    gpu_mean = CuArray{T}(mean)
+    gpu_covar = CuArray{T}(covar)
+    
+    return GaussianState(basis, gpu_mean, gpu_covar; ħ = ħ)
+end
+
+function vacuumstate(::Type{T}, basis::SymplecticBasis{N}; ħ = 2) where {T<:CuVector, N<:Int}
+    return vacuumstate(T, CuMatrix{eltype(T)}, basis; ħ = ħ)
+end
+
+function coherentstate(::Type{T}, basis::SymplecticBasis{N}, alpha::A; ħ = 2) where {T<:CuVector, N<:Int, A}
+    return coherentstate(T, CuMatrix{eltype(T)}, basis, alpha; ħ = ħ)
+end
+
+function squeezedstate(::Type{T}, basis::SymplecticBasis{N}, r::R, theta::R; ħ = 2) where {T<:CuVector, N<:Int, R}
+    return squeezedstate(T, CuMatrix{eltype(T)}, basis, r, theta; ħ = ħ)
+end
+
+function thermalstate(::Type{T}, basis::SymplecticBasis{N}, photons::P; ħ = 2) where {T<:CuVector, N<:Int, P}
+    return thermalstate(T, CuMatrix{eltype(T)}, basis, photons; ħ = ħ)
+end
+
+function eprstate(::Type{T}, basis::SymplecticBasis{N}, r::R, theta::R; ħ = 2) where {T<:CuVector, N<:Int, R}
+    return eprstate(T, CuMatrix{eltype(T)}, basis, r, theta; ħ = ħ)
+end
+function Gabs.tensor(::Type{Tm}, ::Type{Tc}, state1::GaussianState{B,M1,V1}, state2::GaussianState{B,M2,V2}) where {
+    Tm<:CuVector, Tc<:CuMatrix, B<:SymplecticBasis, M1<:CuArray, V1<:CuArray, M2<:CuArray, V2<:CuArray}
+    if !CUDA_AVAILABLE
+        gpu_fallback_warning()
+        cpu_state1 = GaussianState(state1.basis, Array(state1.mean), Array(state1.covar); ħ=state1.ħ)
+        cpu_state2 = GaussianState(state2.basis, Array(state2.mean), Array(state2.covar); ħ=state2.ħ)
+        result_cpu = tensor(Vector{eltype(Tm)}, Matrix{eltype(Tc)}, cpu_state1, cpu_state2)
+        return GaussianState(result_cpu.basis, CuArray(result_cpu.mean), CuArray(result_cpu.covar); ħ=result_cpu.ħ)
+    end
+    typeof(state1.basis) == typeof(state2.basis) || throw(ArgumentError(Gabs.SYMPLECTIC_ERROR))
+    state1.ħ == state2.ħ || throw(ArgumentError(Gabs.HBAR_ERROR))
+    combined_basis = state1.basis ⊕ state2.basis
+    T = eltype(Tm)
+    mean_combined = vcat(state1.mean, state2.mean)
+    n1 = length(state1.mean)
+    n2 = length(state2.mean)
+    total_dim = n1 + n2
+    covar_combined = CUDA.zeros(T, total_dim, total_dim)
+    covar_combined[1:n1, 1:n1] .= state1.covar
+    covar_combined[n1+1:end, n1+1:end] .= state2.covar
+    return GaussianState(combined_basis, mean_combined, covar_combined; ħ = state1.ħ)
+end
+
+function Gabs.tensor(state1::GaussianState{B,<:Array,<:Array}, state2::GaussianState{B,<:CuArray,<:CuArray}) where {B<:SymplecticBasis}
+    if !CUDA_AVAILABLE
+        gpu_fallback_warning()
+        return tensor(state1, GaussianState(state2.basis, Array(state2.mean), Array(state2.covar); ħ=state2.ħ))
+    end
+    typeof(state1.basis) == typeof(state2.basis) || throw(ArgumentError(Gabs.SYMPLECTIC_ERROR))
+    state1.ħ == state2.ħ || throw(ArgumentError(Gabs.HBAR_ERROR))
+    T = real(eltype(state2.mean))
+    gpu_state1 = GaussianState(state1.basis, CuArray{T}(state1.mean), CuArray{T}(state1.covar); ħ = state1.ħ)
+    return tensor(gpu_state1, state2)
+end
+
+function Gabs.tensor(state1::GaussianState{B,<:CuArray,<:CuArray}, state2::GaussianState{B,<:Array,<:Array}) where {B<:SymplecticBasis}
+    if !CUDA_AVAILABLE
+        gpu_fallback_warning()
+        return tensor(GaussianState(state1.basis, Array(state1.mean), Array(state1.covar); ħ=state1.ħ), state2)
+    end
+    typeof(state1.basis) == typeof(state2.basis) || throw(ArgumentError(Gabs.SYMPLECTIC_ERROR))
+    state1.ħ == state2.ħ || throw(ArgumentError(Gabs.HBAR_ERROR))
+    T = real(eltype(state1.mean))
+    gpu_state2 = GaussianState(state2.basis, CuArray{T}(state2.mean), CuArray{T}(state2.covar); ħ = state2.ħ)
+    return tensor(state1, gpu_state2)
+end
\ No newline at end of file
diff --git a/ext/CUDAExt/unitary_operations.jl b/ext/CUDAExt/unitary_operations.jl
new file mode 100644
index 0000000..1227915
--- /dev/null
+++ b/ext/CUDAExt/unitary_operations.jl
@@ -0,0 +1,213 @@
+function displace(::Type{Td}, ::Type{Ts}, basis::SymplecticBasis{N}, alpha::A; ħ = 2) where {Td<:CuVector, Ts<:CuMatrix, N<:Int, A}
+    if !CUDA_AVAILABLE
+        gpu_fallback_warning()
+        return displace(Vector{eltype(Td)}, Matrix{eltype(Ts)}, basis, alpha; ħ = ħ)
+    end
+    T = eltype(Td)    
+    disp, symplectic = _displace(basis, alpha; ħ = ħ)
+    gpu_disp = CuArray{T}(disp)
+    gpu_symplectic = CuArray{T}(symplectic)
+    return GaussianUnitary(basis, gpu_disp, gpu_symplectic; ħ = ħ)
+end
+
+function squeeze(::Type{Td}, ::Type{Ts}, basis::SymplecticBasis{N}, r::R, theta::R; ħ = 2) where {Td<:CuVector, Ts<:CuMatrix, N<:Int, R}
+    if !CUDA_AVAILABLE
+        gpu_fallback_warning()
+        return squeeze(Vector{eltype(Td)}, Matrix{eltype(Ts)}, basis, r, theta; ħ = ħ)
+    end
+    T = eltype(Td)
+    disp, symplectic = _squeeze(basis, r, theta)
+    gpu_disp = CuArray{T}(disp)
+    gpu_symplectic = CuArray{T}(symplectic)
+    return GaussianUnitary(basis, gpu_disp, gpu_symplectic; ħ = ħ)
+end
+
+function twosqueeze(::Type{Td}, ::Type{Ts}, basis::SymplecticBasis{N}, r::R, theta::R; ħ = 2) where {Td<:CuVector, Ts<:CuMatrix, N<:Int, R}
+    if !CUDA_AVAILABLE
+        gpu_fallback_warning()
+        return twosqueeze(Vector{eltype(Td)}, Matrix{eltype(Ts)}, basis, r, theta; ħ = ħ)
+    end
+    T = eltype(Td)
+    disp, symplectic = _twosqueeze(basis, r, theta)
+    gpu_disp = CuArray{T}(disp)
+    gpu_symplectic = CuArray{T}(symplectic)
+    return GaussianUnitary(basis, gpu_disp, gpu_symplectic; ħ = ħ)
+end
+
+function phaseshift(::Type{Td}, ::Type{Ts}, basis::SymplecticBasis{N}, theta::R; ħ = 2) where {Td<:CuVector, Ts<:CuMatrix, N<:Int, R}
+    if !CUDA_AVAILABLE
+        gpu_fallback_warning()
+        return phaseshift(Vector{eltype(Td)}, Matrix{eltype(Ts)}, basis, theta; ħ = ħ)
+    end
+    T = eltype(Td)
+    disp, symplectic = _phaseshift(basis, theta)
+    gpu_disp = CuArray{T}(disp)
+    gpu_symplectic = CuArray{T}(symplectic)
+    return GaussianUnitary(basis, gpu_disp, gpu_symplectic; ħ = ħ)
+end
+
+function beamsplitter(::Type{Td}, ::Type{Ts}, basis::SymplecticBasis{N}, transmit::R; ħ = 2) where {Td<:CuVector, Ts<:CuMatrix, N<:Int, R}
+    if !CUDA_AVAILABLE
+        gpu_fallback_warning()
+        return beamsplitter(Vector{eltype(Td)}, Matrix{eltype(Ts)}, basis, transmit; ħ = ħ)
+    end
+    T = eltype(Td)
+    disp, symplectic = _beamsplitter(basis, transmit)
+    gpu_disp = CuArray{T}(disp)
+    gpu_symplectic = CuArray{T}(symplectic)
+    return GaussianUnitary(basis, gpu_disp, gpu_symplectic; ħ = ħ)
+end
+
+function attenuator(::Type{Td}, ::Type{Tt}, basis::SymplecticBasis{N}, theta::R, n::M; ħ = 2) where {Td<:CuVector, Tt<:CuMatrix, N<:Int, R, M}
+    if !CUDA_AVAILABLE
+        gpu_fallback_warning()
+        return attenuator(Vector{eltype(Td)}, Matrix{eltype(Tt)}, basis, theta, n; ħ = ħ)
+    end
+    T = eltype(Td)
+    disp, transform, noise = _attenuator(basis, theta, n)
+    gpu_disp = CuArray{T}(disp)
+    gpu_transform = CuArray{T}(transform)
+    gpu_noise = CuArray{T}(noise)
+    return GaussianChannel(basis, gpu_disp, gpu_transform, gpu_noise; ħ = ħ)
+end
+
+function amplifier(::Type{Td}, ::Type{Tt}, basis::SymplecticBasis{N}, r::R, n::M; ħ = 2) where {Td<:CuVector, Tt<:CuMatrix, N<:Int, R, M}
+    if !CUDA_AVAILABLE
+        gpu_fallback_warning()
+        return amplifier(Vector{eltype(Td)}, Matrix{eltype(Tt)}, basis, r, n; ħ = ħ)
+    end
+    T = eltype(Td)
+    disp, transform, noise = _amplifier(basis, r, n)
+    gpu_disp = CuArray{T}(disp)
+    gpu_transform = CuArray{T}(transform)
+    gpu_noise = CuArray{T}(noise)
+    return GaussianChannel(basis, gpu_disp, gpu_transform, gpu_noise; ħ = ħ)
+end
+
+function displace(::Type{T}, basis::SymplecticBasis{N}, alpha::A; ħ = 2) where {T<:CuVector, N<:Int, A}
+    return displace(T, CuMatrix{eltype(T)}, basis, alpha; ħ = ħ)
+end
+
+function squeeze(::Type{T}, basis::SymplecticBasis{N}, r::R, theta::R; ħ = 2) where {T<:CuVector, N<:Int, R}
+    return squeeze(T, CuMatrix{eltype(T)}, basis, r, theta; ħ = ħ)
+end
+
+function twosqueeze(::Type{T}, basis::SymplecticBasis{N}, r::R, theta::R; ħ = 2) where {T<:CuVector, N<:Int, R}
+    return twosqueeze(T, CuMatrix{eltype(T)}, basis, r, theta; ħ = ħ)
+end
+
+function phaseshift(::Type{T}, basis::SymplecticBasis{N}, theta::R; ħ = 2) where {T<:CuVector, N<:Int, R}
+    return phaseshift(T, CuMatrix{eltype(T)}, basis, theta; ħ = ħ)
+end
+
+function beamsplitter(::Type{T}, basis::SymplecticBasis{N}, transmit::R; ħ = 2) where {T<:CuVector, N<:Int, R}
+    return beamsplitter(T, CuMatrix{eltype(T)}, basis, transmit; ħ = ħ)
+end
+
+function attenuator(::Type{T}, basis::SymplecticBasis{N}, theta::R, n::M; ħ = 2) where {T<:CuVector, N<:Int, R, M}
+    return attenuator(T, CuMatrix{eltype(T)}, basis, theta, n; ħ = ħ)
+end
+
+function amplifier(::Type{T}, basis::SymplecticBasis{N}, r::R, n::M; ħ = 2) where {T<:CuVector, N<:Int, R, M}
+    return amplifier(T, CuMatrix{eltype(T)}, basis, r, n; ħ = ħ)
+end
+function Gabs.tensor(::Type{Td}, ::Type{Ts}, op1::GaussianUnitary{B,D1,S1}, op2::GaussianUnitary{B,D2,S2}) where {
+    Td<:CuVector, Ts<:CuMatrix, B<:SymplecticBasis, D1<:CuArray, S1<:CuArray, D2<:CuArray, S2<:CuArray}
+    if !CUDA_AVAILABLE
+        gpu_fallback_warning()
+        cpu_op1 = GaussianUnitary(op1.basis, Array(op1.disp), Array(op1.symplectic); ħ=op1.ħ)
+        cpu_op2 = GaussianUnitary(op2.basis, Array(op2.disp), Array(op2.symplectic); ħ=op2.ħ)
+        result_cpu = tensor(Vector{eltype(Td)}, Matrix{eltype(Ts)}, cpu_op1, cpu_op2)
+        return GaussianUnitary(result_cpu.basis, CuArray{eltype(Td)}(result_cpu.disp), CuArray{eltype(Ts)}(result_cpu.symplectic); ħ=result_cpu.ħ)
+    end
+    typeof(op1.basis) == typeof(op2.basis) || throw(ArgumentError(Gabs.SYMPLECTIC_ERROR))
+    op1.ħ == op2.ħ || throw(ArgumentError(Gabs.HBAR_ERROR))
+    combined_basis = op1.basis ⊕ op2.basis
+    T = eltype(Td)
+    n1 = length(op1.disp)
+    n2 = length(op2.disp)
+    total_dim = n1 + n2
+    disp_combined = vcat(CuArray{T}(op1.disp), CuArray{T}(op2.disp))
+    symplectic_combined = CUDA.zeros(T, total_dim, total_dim)
+    symplectic_combined[1:n1, 1:n1] .= CuArray{T}(op1.symplectic)
+    symplectic_combined[n1+1:end, n1+1:end] .= CuArray{T}(op2.symplectic)
+    return GaussianUnitary(combined_basis, disp_combined, symplectic_combined; ħ = op1.ħ)
+end
+
+function Gabs.tensor(::Type{Td}, ::Type{Tt}, op1::GaussianChannel{B,D1,T1}, op2::GaussianChannel{B,D2,T2}) where {
+    Td<:CuVector, Tt<:CuMatrix, B<:SymplecticBasis, D1<:CuArray, T1<:CuArray, D2<:CuArray, T2<:CuArray}
+    if !CUDA_AVAILABLE
+        gpu_fallback_warning()
+        cpu_op1 = GaussianChannel(op1.basis, Array(op1.disp), Array(op1.transform), Array(op1.noise); ħ=op1.ħ)
+        cpu_op2 = GaussianChannel(op2.basis, Array(op2.disp), Array(op2.transform), Array(op2.noise); ħ=op2.ħ)
+        result_cpu = tensor(Vector{eltype(Td)}, Matrix{eltype(Tt)}, cpu_op1, cpu_op2)
+        return GaussianChannel(result_cpu.basis, CuArray{eltype(Td)}(result_cpu.disp), CuArray{eltype(Tt)}(result_cpu.transform), CuArray{eltype(Tt)}(result_cpu.noise); ħ=result_cpu.ħ)
+    end
+    typeof(op1.basis) == typeof(op2.basis) || throw(ArgumentError(Gabs.SYMPLECTIC_ERROR))
+    op1.ħ == op2.ħ || throw(ArgumentError(Gabs.HBAR_ERROR))
+    combined_basis = op1.basis ⊕ op2.basis  
+    T = eltype(Td)
+    n1 = length(op1.disp)
+    n2 = length(op2.disp) 
+    total_dim = n1 + n2
+    disp_combined = vcat(CuArray{T}(op1.disp), CuArray{T}(op2.disp))
+    transform_combined = CUDA.zeros(T, total_dim, total_dim)
+    transform_combined[1:n1, 1:n1] .= CuArray{T}(op1.transform)
+    transform_combined[n1+1:end, n1+1:end] .= CuArray{T}(op2.transform)
+    noise_combined = CUDA.zeros(T, total_dim, total_dim)
+    noise_combined[1:n1, 1:n1] .= CuArray{T}(op1.noise)
+    noise_combined[n1+1:end, n1+1:end] .= CuArray{T}(op2.noise)
+    return GaussianChannel(combined_basis, disp_combined, transform_combined, noise_combined; ħ = op1.ħ)
+end
+
+function Base.:(*)(op::GaussianUnitary{B,<:CuArray,<:CuArray}, state::GaussianState{B,<:Array,<:Array}) where {B<:SymplecticBasis}
+    if !CUDA_AVAILABLE
+        gpu_fallback_warning()
+        cpu_op = GaussianUnitary(op.basis, Array(op.disp), Array(op.symplectic); ħ=op.ħ)
+        return cpu_op * state
+    end
+    op.basis == state.basis || throw(ArgumentError(ACTION_ERROR))
+    op.ħ == state.ħ || throw(ArgumentError(HBAR_ERROR))
+    T = real(eltype(op.disp))
+    gpu_state = GaussianState(state.basis, CuArray{T}(state.mean), CuArray{T}(state.covar); ħ = state.ħ)
+    return op * gpu_state
+end
+
+function Base.:(*)(op::GaussianUnitary{B,<:Array,<:Array}, state::GaussianState{B,<:CuArray,<:CuArray}) where {B<:SymplecticBasis}
+    if !CUDA_AVAILABLE
+        gpu_fallback_warning()
+        cpu_state = GaussianState(state.basis, Array(state.mean), Array(state.covar); ħ=state.ħ)
+        return op * cpu_state
+    end
+    op.basis == state.basis || throw(ArgumentError(ACTION_ERROR))
+    op.ħ == state.ħ || throw(ArgumentError(HBAR_ERROR))
+    T = real(eltype(state.mean))
+    gpu_op = GaussianUnitary(op.basis, CuArray{T}(op.disp), CuArray{T}(op.symplectic); ħ = op.ħ)
+    return gpu_op * state
+end
+
+function Base.:(*)(op::GaussianChannel{B,<:CuArray,<:CuArray}, state::GaussianState{B,<:Array,<:Array}) where {B<:SymplecticBasis}
+    if !CUDA_AVAILABLE
+        gpu_fallback_warning()
+        cpu_op = GaussianChannel(op.basis, Array(op.disp), Array(op.transform), Array(op.noise); ħ=op.ħ)
+        return cpu_op * state
+    end
+    op.basis == state.basis || throw(ArgumentError(ACTION_ERROR))
+    op.ħ == state.ħ || throw(ArgumentError(HBAR_ERROR))
+    T = real(eltype(op.disp))
+    gpu_state = GaussianState(state.basis, CuArray{T}(state.mean), CuArray{T}(state.covar); ħ = state.ħ)
+    return op * gpu_state
+end
+
+function Base.:(*)(op::GaussianChannel{B,<:Array,<:Array}, state::GaussianState{B,<:CuArray,<:CuArray}) where {B<:SymplecticBasis}
+    if !CUDA_AVAILABLE
+        gpu_fallback_warning()
+        cpu_state = GaussianState(state.basis, Array(state.mean), Array(state.covar); ħ=state.ħ)
+        return op * cpu_state
+    end
+    op.basis == state.basis || throw(ArgumentError(ACTION_ERROR))
+    op.ħ == state.ħ || throw(ArgumentError(HBAR_ERROR))
+    T = real(eltype(state.mean))
+    gpu_op = GaussianChannel(op.basis, CuArray{T}(op.disp), CuArray{T}(op.transform), CuArray{T}(op.noise); ħ = op.ħ)
+    return gpu_op * state
+end
\ No newline at end of file
diff --git a/ext/CUDAExt/utils.jl b/ext/CUDAExt/utils.jl
new file mode 100644
index 0000000..d2ffd08
--- /dev/null
+++ b/ext/CUDAExt/utils.jl
@@ -0,0 +1,63 @@
+Base.@propagate_inbounds function _promote_output_vector(::Type{T1}, ::Type{T2}, vec_out) where {T1<:CuVector, T2<:CuVector}
+    return CuArray(vec_out)
+end
+
+Base.@propagate_inbounds function _promote_output_vector(::Type{T1}, ::Type{T2}, vec_out) where {T1<:CuVector, T2<:Vector}
+    return CuArray(vec_out)
+end
+
+Base.@propagate_inbounds function _promote_output_vector(::Type{T1}, ::Type{T2}, vec_out) where {T1<:Vector, T2<:CuVector}
+    return CuArray(vec_out)
+end
+
+Base.@propagate_inbounds function _promote_output_vector(::Type{<:CuVector}, vec_out, vec_length::Int)
+    return CuArray(vec_out)
+end
+
+Base.@propagate_inbounds function _promote_output_matrix(::Type{T1}, ::Type{T2}, mat_out) where {T1<:CuMatrix, T2<:CuMatrix}
+    return CuArray(mat_out)
+end
+
+Base.@propagate_inbounds function _promote_output_matrix(::Type{T1}, ::Type{T2}, mat_out) where {T1<:CuMatrix, T2<:Matrix}
+    return CuArray(mat_out)
+end
+
+Base.@propagate_inbounds function _promote_output_matrix(::Type{T1}, ::Type{T2}, mat_out) where {T1<:Matrix, T2<:CuMatrix}
+    return CuArray(mat_out)
+end
+
+Base.@propagate_inbounds function _promote_output_matrix(::Type{<:CuMatrix}, mat_out, out_dim::Int)
+    return CuArray(mat_out)
+end
+
+Base.@propagate_inbounds function _promote_output_matrix(::Type{<:CuMatrix}, mat_out, out_dim::Tuple)
+    return CuArray(mat_out)
+end
+
+"""
+    gpu_fallback_warning()
+
+Issue warning when falling back to CPU due to CUDA unavailability.
+"""
+function gpu_fallback_warning()
+    if !CUDA_AVAILABLE
+        @warn "CUDA not available. Falling back to CPU computation. Install CUDA.jl and ensure GPU drivers are properly configured for GPU acceleration."
+    end
+end
+
+"""
+    detect_array_device_type(x::AbstractArray)
+
+Detect device type of array using type introspection.
+"""
+function detect_array_device_type(x::CuArray)
+    return :gpu, eltype(x), size(x)
+end
+
+function detect_array_device_type(x::AbstractArray) 
+    return :cpu, eltype(x), size(x)
+end
+
+function device(x::CuArray)
+    return :gpu
+end
\ No newline at end of file
diff --git a/ext/CUDAExt/wigner_kernels.jl b/ext/CUDAExt/wigner_kernels.jl
new file mode 100644
index 0000000..da77fe3
--- /dev/null
+++ b/ext/CUDAExt/wigner_kernels.jl
@@ -0,0 +1,383 @@
+"""
+    wigner_kernel!(output, x_points, mean, covar_inv, det_covar, nmodes)
+
+CUDA kernel for computing Wigner function values at multiple phase space points.
+"""
+function wigner_kernel!(output::CuDeviceVector{T}, x_points::CuDeviceMatrix{T}, 
+                       mean::CuDeviceVector{T}, covar_inv::CuDeviceMatrix{T}, 
+                       det_covar::T, nmodes::Int) where T
+    
+    idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    
+    if idx <= size(x_points, 2)
+        diff_sum = zero(T)
+        
+        for i in 1:length(mean)
+            diff_i = x_points[i, idx] - mean[i]
+            temp_sum = zero(T)
+            
+            for j in 1:length(mean)
+                diff_j = x_points[j, idx] - mean[j]
+                temp_sum += covar_inv[i, j] * diff_j
+            end
+            
+            diff_sum += diff_i * temp_sum
+        end
+        
+
+        normalization = (2 * T(π))^nmodes * sqrt(det_covar)
+        exponent = -0.5 * diff_sum
+        
+        output[idx] = exp(exponent) / normalization
+    end
+    
+    return nothing
+end
+
+"""
+    wignerchar_kernel!(output, xi_points, mean, covar, omega, nmodes, hbar)
+
+CUDA kernel for computing Wigner characteristic function values at multiple points.
+"""
+function wignerchar_kernel!(output::CuDeviceVector{Complex{T}}, xi_points::CuDeviceMatrix{T},
+                                 precomputed_quadratic::CuDeviceMatrix{T}, precomputed_linear::CuDeviceVector{T}) where T
+    
+    idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    
+    if idx <= size(xi_points, 2)
+        quadratic_term = zero(T)
+        for i in 1:size(xi_points, 1)
+            for j in 1:size(xi_points, 1)
+                quadratic_term += xi_points[i, idx] * precomputed_quadratic[i, j] * xi_points[j, idx]
+            end
+        end
+        
+        linear_term = zero(T)
+        for i in 1:size(xi_points, 1)
+            linear_term += precomputed_linear[i] * xi_points[i, idx]
+        end
+        
+
+        real_part = -0.5 * quadratic_term
+        imag_part = -linear_term
+        
+        output[idx] = Complex{T}(cos(imag_part), sin(imag_part)) * exp(real_part)
+    end
+    
+    return nothing
+end
+
+function Gabs.wigner(state::GaussianState{B,M,V}, x::AbstractVector) where {B<:SymplecticBasis, M<:CuArray, V<:CuArray}
+    if !CUDA_AVAILABLE
+        gpu_fallback_warning()
+        cpu_state = GaussianState(state.basis, Array(state.mean), Array(state.covar); ħ = state.ħ)
+        return Gabs.wigner(cpu_state, x)
+    end
+    
+    basis = state.basis
+    nmodes = basis.nmodes
+    mean = state.mean
+    covar = state.covar
+    
+    length(mean) == length(x) || throw(ArgumentError(WIGNER_ERROR))
+    
+    T = eltype(mean)
+    x_gpu = CuArray(T.(x))
+    
+    diff = x_gpu .- mean
+    covar_inv = inv(covar)
+    
+    quad_form = dot(diff, covar_inv * diff)
+    
+    det_covar = det(covar)
+    normalization = (2 * T(π))^nmodes * sqrt(det_covar)
+    
+    result = exp(-0.5 * quad_form) / normalization
+    
+    return result  
+end
+
+function Gabs.wigner(state::GaussianState{B,M,V}, x_points::CuMatrix{T}) where {B<:SymplecticBasis, M<:CuArray, V<:CuArray, T}
+    if !CUDA_AVAILABLE
+        gpu_fallback_warning()
+        cpu_state = GaussianState(state.basis, Array(state.mean), Array(state.covar); ħ = state.ħ)
+        return [Gabs.wigner(cpu_state, Array(x_points[:, i])) for i in 1:size(x_points, 2)]
+    end
+    basis = state.basis
+    nmodes = basis.nmodes
+    size(x_points, 1) == size(state.mean, 1) || throw(ArgumentError(WIGNER_ERROR))
+    num_points = size(x_points, 2)
+    mean_cpu = Array(state.mean)
+    covar_cpu = Array(state.covar)
+    covar_inv_cpu = inv(covar_cpu)
+    det_covar = det(covar_cpu)
+    mean_gpu = CuArray{T}(mean_cpu)
+    covar_inv_gpu = CuArray{T}(covar_inv_cpu)
+    x_points_gpu = CuArray{T}(x_points)
+    output = CUDA.zeros(T, num_points)
+    threads_per_block = min(256, num_points)
+    num_blocks = cld(num_points, threads_per_block)
+    @cuda threads=threads_per_block blocks=num_blocks wigner_kernel!(
+        output, x_points_gpu, mean_gpu, covar_inv_gpu, T(det_covar), nmodes
+    )
+    return output
+end
+
+function Gabs.wignerchar(state::GaussianState{B,M,V}, xi::AbstractVector) where {B<:SymplecticBasis, M<:CuArray, V<:CuArray}
+    if !CUDA_AVAILABLE
+        gpu_fallback_warning()
+        cpu_state = GaussianState(state.basis, Array(state.mean), Array(state.covar); ħ = state.ħ)
+        return Gabs.wignerchar(cpu_state, xi)
+    end
+    basis = state.basis
+    nmodes = basis.nmodes
+    mean = state.mean
+    covar = state.covar
+    length(mean) == length(xi) || throw(ArgumentError(WIGNER_ERROR))
+    T = eltype(mean)
+    xi_gpu = CuArray(T.(xi))
+    omega = CuArray(symplecticform(basis))
+    temp1 = omega * covar
+    temp2 = temp1 * transpose(omega)
+    quadratic_term = dot(xi_gpu, temp2 * xi_gpu)
+    omega_mean = omega * mean
+    linear_term = dot(omega_mean, xi_gpu)
+    real_part = -0.5 * quadratic_term  
+    imag_part = -linear_term
+    result = Complex{T}(cos(imag_part), sin(imag_part)) * exp(real_part)
+    return result  
+end
+
+function Gabs.wignerchar(state::GaussianState{B,M,V}, xi_points::CuMatrix{T}) where {B<:SymplecticBasis, M<:CuArray, V<:CuArray, T}
+    if !CUDA_AVAILABLE
+        gpu_fallback_warning()
+        cpu_state = GaussianState(state.basis, Array(state.mean), Array(state.covar); ħ = state.ħ)
+        return [Gabs.wignerchar(cpu_state, Array(xi_points[:, i])) for i in 1:size(xi_points, 2)]
+    end
+    basis = state.basis
+    nmodes = basis.nmodes
+    size(xi_points, 1) == size(state.mean, 1) || throw(ArgumentError(WIGNER_ERROR))
+    num_points = size(xi_points, 2)
+    mean_cpu = Array(state.mean)
+    covar_cpu = Array(state.covar)
+    omega_cpu = symplecticform(basis)
+    temp_matrix = omega_cpu * covar_cpu * transpose(omega_cpu)
+    precomputed_linear = omega_cpu * mean_cpu
+    precomputed_quadratic_gpu = CuArray{T}(temp_matrix)
+    precomputed_linear_gpu = CuArray{T}(precomputed_linear)
+    xi_points_gpu = CuArray{T}(xi_points)
+    output = CUDA.zeros(Complex{T}, num_points)
+    threads_per_block = min(256, num_points)
+    num_blocks = cld(num_points, threads_per_block)
+    @cuda threads=threads_per_block blocks=num_blocks wignerchar_kernel!(
+        output, xi_points_gpu, precomputed_quadratic_gpu, precomputed_linear_gpu
+    )
+    return output
+end
+
+"""
+    wigner_grid(state::GaussianState, x_range, p_range, nx::Int, np::Int)
+
+Create a phase space grid for Wigner function evaluation.
+Returns points as a 2×(nx*np) CuMatrix for efficient GPU evaluation.
+"""
+function wigner_grid(state::GaussianState{B,M,V}, x_range, p_range, nx::Int, np::Int) where {B<:SymplecticBasis, M<:CuArray, V<:CuArray}
+    T = eltype(M)
+    x_vals = range(x_range[1], x_range[2], length=nx)
+    p_vals = range(p_range[1], p_range[2], length=np)
+    total_points = nx * np
+    points = CUDA.zeros(T, 2, total_points)
+    for i in 1:nx
+        for j in 1:np
+            idx = (i-1) * np + j
+            points[1, idx] = T(x_vals[i])
+            points[2, idx] = T(p_vals[j])
+        end
+    end
+    return points
+end
+
+"""
+    cross_wigner_batch(state1::GaussianState{CuArray}, state2::GaussianState{CuArray}, 
+                       x_points::CuMatrix)
+
+ Batch evaluation of cross-Wigner function using existing GPU kernels.
+"""
+function cross_wigner_batch(state1::GaussianState{B,M,V}, state2::GaussianState{B,M,V}, 
+                           x_points::CuMatrix{T}) where {B<:SymplecticBasis, M<:CuArray, V<:CuArray, T}
+    if !CUDA_AVAILABLE
+        error("GPU cross_wigner_batch called but CUDA not available")
+    end
+    typeof(state1.basis) == typeof(state2.basis) || throw(ArgumentError(SYMPLECTIC_ERROR))
+    state1.basis == state2.basis || throw(ArgumentError(SYMPLECTIC_ERROR))
+    state1.ħ == state2.ħ || throw(ArgumentError(HBAR_ERROR))
+    
+    num_points = size(x_points, 2)
+    results = CUDA.zeros(Complex{T}, num_points)
+    _launch_cross_wigner_batch_kernel!(results, x_points, state1, state2)
+    return results
+end
+
+"""
+    cross_wignerchar_batch(state1::GaussianState{CuArray}, state2::GaussianState{CuArray}, 
+                          xi_points::CuMatrix)
+
+Batch evaluation of cross-Wigner characteristic function using GPU kernels.
+"""
+function cross_wignerchar_batch(state1::GaussianState{B,M,V}, state2::GaussianState{B,M,V}, 
+                               xi_points::CuMatrix{T}) where {B<:SymplecticBasis, M<:CuArray, V<:CuArray, T}
+    if !CUDA_AVAILABLE
+        error("GPU cross_wignerchar_batch called but CUDA not available")
+    end
+    typeof(state1.basis) == typeof(state2.basis) || throw(ArgumentError(SYMPLECTIC_ERROR))
+    state1.basis == state2.basis || throw(ArgumentError(SYMPLECTIC_ERROR))
+    state1.ħ == state2.ħ || throw(ArgumentError(HBAR_ERROR))
+    num_points = size(xi_points, 2)
+    results = CUDA.zeros(Complex{T}, num_points)
+    _launch_cross_wignerchar_batch_kernel!(results, xi_points, state1, state2)
+    return results
+end
+
+"""
+    _launch_cross_wigner_batch_kernel!(results, x_points, state1, state2)
+
+Launch GPU kernel for batch cross-Wigner evaluation.
+"""
+function _launch_cross_wigner_batch_kernel!(results::CuArray{Complex{T}}, 
+                                           x_points::CuMatrix{T},
+                                           state1::GaussianState, 
+                                           state2::GaussianState) where T
+    num_points = size(x_points, 2)
+    nmodes = state1.basis.nmodes
+    mean1_cpu = Array(state1.mean)
+    mean2_cpu = Array(state2.mean)
+    covar1_cpu = Array(state1.covar)
+    covar2_cpu = Array(state2.covar)
+    omega_cpu = symplecticform(state1.basis)
+    avg_covar_cpu = (covar1_cpu + covar2_cpu) / 2
+    avg_covar_inv_cpu = inv(avg_covar_cpu)
+    log_det_avg_covar = logdet(avg_covar_cpu)
+    mean1_gpu = CuArray{T}(mean1_cpu)
+    mean2_gpu = CuArray{T}(mean2_cpu)
+    avg_covar_inv_gpu = CuArray{T}(avg_covar_inv_cpu)
+    omega_gpu = CuArray{T}(omega_cpu)
+    threads_per_block = min(256, num_points)
+    num_blocks = cld(num_points, threads_per_block)
+    @cuda threads=threads_per_block blocks=num_blocks _cross_wigner_batch_kernel!(
+        results, x_points, mean1_gpu, mean2_gpu, avg_covar_inv_gpu, 
+        omega_gpu, T(log_det_avg_covar), nmodes, T(state1.ħ)
+    )
+    CUDA.synchronize()
+end
+
+"""
+    _launch_cross_wignerchar_batch_kernel!(results, xi_points, state1, state2)
+
+Launch optimized GPU kernel for batch cross-Wigner characteristic function evaluation.
+"""
+function _launch_cross_wignerchar_batch_kernel!(results::CuArray{Complex{T}}, 
+                                               xi_points::CuMatrix{T},
+                                               state1::GaussianState, 
+                                               state2::GaussianState) where T
+    
+    num_points = size(xi_points, 2)
+    mean1_cpu = Array(state1.mean)
+    mean2_cpu = Array(state2.mean)
+    covar1_cpu = Array(state1.covar)
+    covar2_cpu = Array(state2.covar)
+    omega_cpu = symplecticform(state1.basis)
+    avg_mean_cpu = (mean1_cpu + mean2_cpu) / 2
+    avg_covar_cpu = (covar1_cpu + covar2_cpu) / 2
+    temp_matrix = omega_cpu * avg_covar_cpu * transpose(omega_cpu)
+    linear_vector = omega_cpu * avg_mean_cpu
+    quadratic_matrix_gpu = CuArray{T}(temp_matrix)
+    linear_vector_gpu = CuArray{T}(linear_vector)
+    threads_per_block = min(256, num_points)
+    num_blocks = cld(num_points, threads_per_block)
+    @cuda threads=threads_per_block blocks=num_blocks _cross_wignerchar_batch_kernel!(
+        results, xi_points, quadratic_matrix_gpu, linear_vector_gpu
+    )
+    CUDA.synchronize()
+end
+
+"""
+    _cross_wigner_batch_kernel!(results, x_points, mean1, mean2, avg_covar_inv, 
+                               omega, log_det_avg_covar, nmodes, hbar)
+
+ GPU kernel for batch cross-Wigner evaluation.
+"""
+function _cross_wigner_batch_kernel!(results::CuDeviceArray{Complex{T}}, 
+                                    x_points::CuDeviceMatrix{T},
+                                    mean1::CuDeviceVector{T}, 
+                                    mean2::CuDeviceVector{T},
+                                    avg_covar_inv::CuDeviceMatrix{T},
+                                    omega::CuDeviceMatrix{T}, 
+                                    log_det_avg_covar::T, 
+                                    nmodes::Int, hbar::T) where T
+
+    idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    if idx <= size(x_points, 2)
+        dim = size(x_points, 1)
+        quadratic_form = zero(T)
+        for i in 1:dim
+            avg_mean_i = (mean1[i] + mean2[i]) * T(0.5)
+            dx_i = x_points[i, idx] - avg_mean_i
+            for j in 1:dim
+                avg_mean_j = (mean1[j] + mean2[j]) * T(0.5)
+                dx_j = x_points[j, idx] - avg_mean_j
+                quadratic_form += dx_i * avg_covar_inv[i, j] * dx_j
+            end
+        end
+        phase_arg = zero(T)
+        for i in 1:dim
+            mu_diff_i = mean1[i] - mean2[i]
+            avg_mean_i = (mean1[i] + mean2[i]) * T(0.5)
+            dx_i = x_points[i, idx] - avg_mean_i
+            omega_dx_i = zero(T)
+            for j in 1:dim
+                avg_mean_j = (mean1[j] + mean2[j]) * T(0.5)
+                dx_j = x_points[j, idx] - avg_mean_j
+                omega_dx_i += omega[i, j] * dx_j
+            end
+            phase_arg += mu_diff_i * omega_dx_i
+        end
+        log_normalization = -T(nmodes) * log(T(2π)) - T(0.5) * log_det_avg_covar
+        normalization = exp(log_normalization)
+        gaussian_part = exp(-T(0.5) * quadratic_form)
+        sin_val, cos_val = sincos(phase_arg / hbar)
+        phase_factor = Complex{T}(cos_val, sin_val)
+        results[idx] = normalization * gaussian_part * phase_factor
+    end
+    return nothing
+end
+
+"""
+    _cross_wignerchar_batch_kernel!(results, xi_points, quadratic_matrix, linear_vector)
+
+ GPU kernel for batch cross-Wigner characteristic function evaluation.
+"""
+function _cross_wignerchar_batch_kernel!(results::CuDeviceArray{Complex{T}},
+                                        xi_points::CuDeviceMatrix{T},
+                                        quadratic_matrix::CuDeviceMatrix{T},
+                                        linear_vector::CuDeviceVector{T}) where T
+    
+    idx = (blockIdx().x - 1) * blockDim().x + threadIdx().x
+    if idx <= size(xi_points, 2)
+        dim = size(xi_points, 1)
+        quadratic_term = zero(T)
+        for i in 1:dim
+            for j in 1:dim
+                quadratic_term += xi_points[i, idx] * quadratic_matrix[i, j] * xi_points[j, idx]
+            end
+        end
+        linear_term = zero(T)
+        for i in 1:dim
+            linear_term += linear_vector[i] * xi_points[i, idx]
+        end
+        real_part = -T(0.5) * quadratic_term
+        imag_part = -linear_term
+        sin_val, cos_val = sincos(imag_part)
+        results[idx] = Complex{T}(cos_val, sin_val) * exp(real_part)
+    end
+    return nothing
+end
\ No newline at end of file
diff --git a/src/Gabs.jl b/src/Gabs.jl
index d627996..3ff895d 100644
--- a/src/Gabs.jl
+++ b/src/Gabs.jl
@@ -38,7 +38,14 @@ export
     williamson, Williamson, polar, Polar, blochmessiah, BlochMessiah,
     # metrics
     purity, entropy_vn, fidelity, logarithmic_negativity,
-    cross_wigner, cross_wignerchar
+    cross_wigner, cross_wignerchar,
+    # GPU device management
+    gpu, cpu, device, adapt_device,
+    # optimized hybrid GPU random functions (industry standard approach)
+    randstate_gpu, randunitary_gpu, randchannel_gpu, randsymplectic_gpu,
+    batch_randstate_gpu, batch_randunitary_gpu,
+    #GPU simulation setup
+    random_ensemble_gpu, random_simulation_setup_gpu
 
 include("errors.jl")
 
@@ -72,4 +79,5 @@ include("linearcombinations.jl")
 
 include("nongaussian_states.jl")
 
-end
+include("gpu_convenience.jl")
+end
\ No newline at end of file
diff --git a/src/gpu_convenience.jl b/src/gpu_convenience.jl
new file mode 100644
index 0000000..aa3bad3
--- /dev/null
+++ b/src/gpu_convenience.jl
@@ -0,0 +1,243 @@
+"""
+    gpu(x; precision=Float32)
+
+Transfer data to GPU. Works with states, operators, and arrays.
+Requires CUDA.jl to be loaded.
+"""
+function gpu(state::GaussianState; precision=Float32)
+    if !isdefined(Base, :get_extension) || isnothing(Base.get_extension(@__MODULE__, :CUDAExt))
+        error("GPU functionality requires CUDA.jl. Please run: using CUDA")
+    end
+    return _gpu_impl(state, precision)
+end
+
+function gpu(op::GaussianUnitary; precision=Float32)
+    if !isdefined(Base, :get_extension) || isnothing(Base.get_extension(@__MODULE__, :CUDAExt))
+        error("GPU functionality requires CUDA.jl. Please run: using CUDA")
+    end
+    return _gpu_impl(op, precision)
+end
+
+function gpu(op::GaussianChannel; precision=Float32)
+    if !isdefined(Base, :get_extension) || isnothing(Base.get_extension(@__MODULE__, :CUDAExt))
+        error("GPU functionality requires CUDA.jl. Please run: using CUDA")
+    end
+    return _gpu_impl(op, precision)
+end
+
+function gpu(lc::GaussianLinearCombination; precision=Float32)
+    if !isdefined(Base, :get_extension) || isnothing(Base.get_extension(@__MODULE__, :CUDAExt))
+        error("GPU functionality requires CUDA.jl. Please run: using CUDA")
+    end
+    return _gpu_impl(lc, precision)
+end
+
+function gpu(basis::SymplecticBasis)
+    return basis  
+end
+
+function gpu(x::AbstractArray; precision=Float32)
+    if !isdefined(Base, :get_extension) || isnothing(Base.get_extension(@__MODULE__, :CUDAExt))
+        error("GPU functionality requires CUDA.jl. Please run: using CUDA")
+    end
+    return _gpu_impl(x, precision)
+end
+
+"""
+    cpu(x)
+
+Transfer data to CPU. Works with states, operators, and arrays.
+"""
+function cpu(state::GaussianState)
+    cpu_mean = isa(state.mean, AbstractArray) ? Array(state.mean) : state.mean
+    cpu_covar = isa(state.covar, AbstractArray) ? Array(state.covar) : state.covar
+    return GaussianState(state.basis, cpu_mean, cpu_covar; ħ = state.ħ)
+end
+
+function cpu(op::GaussianUnitary)
+    cpu_disp = isa(op.disp, AbstractArray) ? Array(op.disp) : op.disp
+    cpu_symplectic = isa(op.symplectic, AbstractArray) ? Array(op.symplectic) : op.symplectic
+    return GaussianUnitary(op.basis, cpu_disp, cpu_symplectic; ħ = op.ħ)
+end
+
+function cpu(op::GaussianChannel)
+    cpu_disp = isa(op.disp, AbstractArray) ? Array(op.disp) : op.disp
+    cpu_transform = isa(op.transform, AbstractArray) ? Array(op.transform) : op.transform
+    cpu_noise = isa(op.noise, AbstractArray) ? Array(op.noise) : op.noise
+    return GaussianChannel(op.basis, cpu_disp, cpu_transform, cpu_noise; ħ = op.ħ)
+end
+
+function cpu(lc::GaussianLinearCombination)
+    cpu_coeffs = isa(lc.coeffs, AbstractArray) ? Array(lc.coeffs) : lc.coeffs
+    cpu_states = [cpu(state) for state in lc.states]
+    return GaussianLinearCombination(lc.basis, cpu_coeffs, cpu_states)
+end
+
+function cpu(x::AbstractArray)
+    return Array(x)
+end
+
+"""
+    device(x)
+
+Detect device of arrays/objects.
+"""
+device(x::AbstractArray) = _detect_device(x)
+device(state::GaussianState) = _detect_device(state.mean)
+device(op::GaussianUnitary) = _detect_device(op.disp)
+device(op::GaussianChannel) = _detect_device(op.disp)
+device(lc::GaussianLinearCombination) = _detect_device(lc)
+function _detect_device(x)
+    return :cpu 
+end
+
+"""
+    adapt_device(target_constructor, source_obj, args...)
+
+Create target object on same device as source.
+"""
+function adapt_device(target_constructor, source_obj, args...)
+    source_device = device(source_obj)
+    if source_device == :gpu
+        if !isdefined(Base, :get_extension) || isnothing(Base.get_extension(@__MODULE__, :CUDAExt))
+            @warn "GPU source object detected but CUDA extension not available. Creating CPU object instead."
+            return target_constructor(args...)
+        end
+        return _adapt_device_gpu(target_constructor, source_obj, args...)
+    else
+        return target_constructor(args...)
+    end
+end
+
+"""
+    randstate_gpu(basis; pure=false, ħ=2, precision=Float32)
+
+Generate random Gaussian state optimized for GPU. 
+"""
+function randstate_gpu(basis::SymplecticBasis; pure=false, ħ=2, precision=Float32)
+    if !isdefined(Base, :get_extension) || isnothing(Base.get_extension(@__MODULE__, :CUDAExt))
+        error("GPU functionality requires CUDA.jl. Please run: using CUDA")
+    end
+    return _randstate_gpu_impl(basis, precision; pure=pure, ħ=ħ)
+end
+
+"""
+    randunitary_gpu(basis; passive=false, ħ=2, precision=Float32)
+
+Generate random Gaussian unitary optimized for GPU. Uses hybrid CPU generation + GPU transfer.
+"""
+function randunitary_gpu(basis::SymplecticBasis; passive=false, ħ=2, precision=Float32)
+    if !isdefined(Base, :get_extension) || isnothing(Base.get_extension(@__MODULE__, :CUDAExt))
+        error("GPU functionality requires CUDA.jl. Please run: using CUDA")
+    end
+    return _randunitary_gpu_impl(basis, precision; passive=passive, ħ=ħ)
+end
+
+"""
+    randchannel_gpu(basis; ħ=2, precision=Float32)
+
+Generate random Gaussian channel optimized for GPU. Uses hybrid CPU generation + GPU transfer.
+"""
+function randchannel_gpu(basis::SymplecticBasis; ħ=2, precision=Float32)
+    if !isdefined(Base, :get_extension) || isnothing(Base.get_extension(@__MODULE__, :CUDAExt))
+        error("GPU functionality requires CUDA.jl. Please run: using CUDA")
+    end
+    return _randchannel_gpu_impl(basis, precision; ħ=ħ)
+end
+
+"""
+    randsymplectic_gpu(basis; passive=false, precision=Float32)
+
+Generate random symplectic matrix optimized for GPU. Uses hybrid CPU generation + GPU transfer.
+"""
+function randsymplectic_gpu(basis::SymplecticBasis; passive=false, precision=Float32)
+    if !isdefined(Base, :get_extension) || isnothing(Base.get_extension(@__MODULE__, :CUDAExt))
+        error("GPU functionality requires CUDA.jl. Please run: using CUDA")
+    end
+    return _randsymplectic_gpu_impl(basis, precision; passive=passive)
+end
+
+"""
+    batch_randstate_gpu(basis, n::Int; pure=false, ħ=2, precision=Float32)
+
+Generate n random states optimized for GPU in batch. 
+"""
+function batch_randstate_gpu(basis::SymplecticBasis, n::Int; pure=false, ħ=2, precision=Float32)
+    if !isdefined(Base, :get_extension) || isnothing(Base.get_extension(@__MODULE__, :CUDAExt))
+        error("GPU functionality requires CUDA.jl. Please run: using CUDA")
+    end
+    return _batch_randstate_gpu_impl(basis, n, precision; pure=pure, ħ=ħ)
+end
+
+"""
+    batch_randunitary_gpu(basis, n::Int; passive=false, ħ=2, precision=Float32)
+
+Generate n random unitaries optimized for GPU in batch.
+"""
+function batch_randunitary_gpu(basis::SymplecticBasis, n::Int; passive=false, ħ=2, precision=Float32)
+    if !isdefined(Base, :get_extension) || isnothing(Base.get_extension(@__MODULE__, :CUDAExt))
+        error("GPU functionality requires CUDA.jl. Please run: using CUDA")
+    end
+    return _batch_randunitary_gpu_impl(basis, n, precision; passive=passive, ħ=ħ)
+end
+
+"""
+    random_ensemble_gpu(basis, n_states::Int, n_unitaries::Int; precision=Float32)
+
+Create a complete random ensemble for GPU quantum simulations. Generates:
+- n_states random pure states on GPU
+- n_unitaries random unitaries on GPU
+- Returns tuple (states, unitaries) ready for batched GPU operations
+"""
+function random_ensemble_gpu(basis::SymplecticBasis, n_states::Int, n_unitaries::Int; precision=Float32)
+    if !isdefined(Base, :get_extension) || isnothing(Base.get_extension(@__MODULE__, :CUDAExt))
+        error("GPU functionality requires CUDA.jl. Please run: using CUDA")
+    end
+    states = batch_randstate_gpu(basis, n_states; pure=true, precision=precision)
+    unitaries = batch_randunitary_gpu(basis, n_unitaries; passive=false, precision=precision)
+    return (states, unitaries)
+end
+
+"""
+    random_simulation_setup_gpu(basis, n_samples::Int; 
+                                mixed_fraction=0.3, precision=Float32)
+
+Setup complete random simulation environment on GPU:
+- Pure states (70% by default)
+- Mixed states (30% by default)  
+- Random channels for decoherence
+- Random unitaries for dynamics
+
+Returns NamedTuple with all components ready for GPU simulation.
+"""
+function random_simulation_setup_gpu(basis::SymplecticBasis, n_samples::Int; 
+                                     mixed_fraction=0.3, precision=Float32)
+    if !isdefined(Base, :get_extension) || isnothing(Base.get_extension(@__MODULE__, :CUDAExt))
+        error("GPU functionality requires CUDA.jl. Please run: using CUDA")
+    end
+    n_pure = round(Int, n_samples * (1 - mixed_fraction))
+    n_mixed = n_samples - n_pure
+    pure_states = batch_randstate_gpu(basis, n_pure; pure=true, precision=precision)
+    mixed_states = n_mixed > 0 ? batch_randstate_gpu(basis, n_mixed; pure=false, precision=precision) : typeof(pure_states[1])[]
+    unitaries = batch_randunitary_gpu(basis, n_samples; passive=false, precision=precision)
+    n_channels = max(1, n_samples ÷ 10)
+    channels_cpu = [randchannel(basis) for _ in 1:n_channels]
+    channels = [gpu(ch; precision=precision) for ch in channels_cpu]
+    return (
+        pure_states = pure_states,
+        mixed_states = mixed_states,
+        unitaries = unitaries,
+        channels = channels,
+        basis = basis,
+        precision = precision
+    )
+end
+
+function _randstate_gpu_impl end
+function _randunitary_gpu_impl end  
+function _randchannel_gpu_impl end
+function _randsymplectic_gpu_impl end
+function _batch_randstate_gpu_impl end
+function _batch_randunitary_gpu_impl end
+function _gpu_impl end
+function _adapt_device_gpu end
\ No newline at end of file
diff --git a/src/linearcombinations.jl b/src/linearcombinations.jl
index c5c21a2..eda40a0 100644
--- a/src/linearcombinations.jl
+++ b/src/linearcombinations.jl
@@ -51,10 +51,10 @@ GaussianLinearCombination with 2 terms for 1 mode.
 """
 mutable struct GaussianLinearCombination{B<:SymplecticBasis,C,S}
     basis::B
-    coeffs::Vector{C}
+    coeffs::C 
     states::Vector{S}
     ħ::Number
-    function GaussianLinearCombination(basis::B, coeffs::Vector{C}, states::Vector{S}) where {B<:SymplecticBasis,C,S}
+    function GaussianLinearCombination(basis::B, coeffs::AbstractVector{C}, states::Vector{S}) where {B<:SymplecticBasis,C<:Number,S}                                                   
         length(coeffs) == length(states) || throw(DimensionMismatch("Number of coefficients ($(length(coeffs))) must match number of states ($(length(states)))"))
         isempty(states) && throw(ArgumentError("Cannot create an empty linear combination"))
         ħ = first(states).ħ
@@ -67,7 +67,8 @@ mutable struct GaussianLinearCombination{B<:SymplecticBasis,C,S}
                 throw(ArgumentError("State $i has different ħ: expected $ħ, got $(state.ħ)"))
             end
         end
-        return new{B,C,S}(basis, coeffs, states, ħ)
+        return new{B,typeof(coeffs),S}(basis, coeffs, states, ħ)
+                         
     end
 end
 """
@@ -99,7 +100,7 @@ end
 
 Create a linear combination from separate vectors of coefficients and states.
 """
-function GaussianLinearCombination(coeffs::Vector{<:Number}, states::Vector{<:GaussianState})
+function GaussianLinearCombination(coeffs::AbstractVector{<:Number}, states::Vector{<:GaussianState})
     isempty(states) && throw(ArgumentError("Cannot create an empty linear combination"))
     basis = first(states).basis
     return GaussianLinearCombination(basis, coeffs, states)
@@ -125,10 +126,37 @@ Add two linear combinations of Gaussian states. Both must have the same symplect
 function Base.:+(lc1::GaussianLinearCombination{B}, lc2::GaussianLinearCombination{B}) where {B<:SymplecticBasis}
     lc1.basis == lc2.basis || throw(ArgumentError(SYMPLECTIC_ERROR))
     lc1.ħ == lc2.ħ || throw(ArgumentError(HBAR_ERROR))
-    coeffs = vcat(lc1.coeffs, lc2.coeffs)
-    states = vcat(lc1.states, lc2.states)
+    lc1_has_gpu_states = !isempty(lc1.states) && _is_gpu_array(lc1.states[1].mean)
+    lc2_has_gpu_states = !isempty(lc2.states) && _is_gpu_array(lc2.states[1].mean)
+    promote_to_gpu = lc1_has_gpu_states || lc2_has_gpu_states
+    if promote_to_gpu
+        T = promote_type(eltype(lc1.coeffs), eltype(lc2.coeffs))
+        coeffs1 = Vector{T}(Array(lc1.coeffs))  
+        coeffs2 = Vector{T}(Array(lc2.coeffs))  
+        if lc1_has_gpu_states
+            states1 = lc1.states
+        else
+            precision = T <: Complex ? real(T) : T
+            states1 = [gpu(s, precision = precision) for s in lc1.states]
+        end
+        if lc2_has_gpu_states
+            states2 = lc2.states
+        else
+            precision = T <: Complex ? real(T) : T
+            states2 = [gpu(s, precision = precision) for s in lc2.states]
+        end
+        coeffs = vcat(coeffs1, coeffs2)
+        states = vcat(states1, states2)
+    else
+        T = promote_type(eltype(lc1.coeffs), eltype(lc2.coeffs))
+        coeffs1 = Vector{T}(lc1.coeffs)
+        coeffs2 = Vector{T}(lc2.coeffs)
+        coeffs = vcat(coeffs1, coeffs2)
+        states = vcat(lc1.states, lc2.states)
+    end
     return GaussianLinearCombination(lc1.basis, coeffs, states)
 end
+
 function Base.:+(lc1::GaussianLinearCombination{B1}, lc2::GaussianLinearCombination{B2}) where {B1<:SymplecticBasis,B2<:SymplecticBasis}
     throw(ArgumentError(SYMPLECTIC_ERROR))
 end
@@ -156,7 +184,8 @@ Base.:-(lc::GaussianLinearCombination) = (-1) * lc
 Multiply a linear combination by a scalar from the left.
 """
 function Base.:*(α::Number, lc::GaussianLinearCombination)
-    new_coeffs = α .* lc.coeffs
+    T = promote_type(typeof(α), eltype(lc.coeffs))
+    new_coeffs = Vector{T}(α .* Array(lc.coeffs)) 
     return GaussianLinearCombination(lc.basis, new_coeffs, copy(lc.states))
 end
 """
@@ -171,8 +200,9 @@ Base.:*(lc::GaussianLinearCombination, α::Number) = α * lc
 Multiply a Gaussian state by a scalar to create a linear combination.
 """
 function Base.:*(α::Number, state::GaussianState)
-    coeff_type = promote_type(typeof(α), eltype(state.mean), eltype(state.covar))
-    return GaussianLinearCombination(state.basis, [convert(coeff_type, α)], [state])
+    T = promote_type(typeof(α), eltype(state.mean), eltype(state.covar))
+    coeff_array = Vector{T}([convert(T, α)])  
+    return GaussianLinearCombination(state.basis, coeff_array, [state])
 end
 """
     *(state::GaussianState, α::Number)
@@ -188,9 +218,22 @@ Add two Gaussian states to create a linear combination.
 function Base.:+(state1::GaussianState, state2::GaussianState)
     state1.basis == state2.basis || throw(ArgumentError(SYMPLECTIC_ERROR))
     state1.ħ == state2.ħ || throw(ArgumentError(HBAR_ERROR))
-    coeff_type = promote_type(eltype(state1.mean), eltype(state1.covar), 
-                              eltype(state2.mean), eltype(state2.covar))
-    return GaussianLinearCombination(state1.basis, [one(coeff_type), one(coeff_type)], [state1, state2])
+    state1_gpu = _is_gpu_array(state1.mean)
+    state2_gpu = _is_gpu_array(state2.mean)
+    T = promote_type(eltype(state1.mean), eltype(state1.covar), 
+                     eltype(state2.mean), eltype(state2.covar))
+    coeffs = Vector{T}([one(T), one(T)])
+    if state1_gpu || state2_gpu
+        precision = T <: Complex ? real(T) : T
+        states = [
+            state1_gpu ? state1 : gpu(state1, precision = precision),
+            state2_gpu ? state2 : gpu(state2, precision = precision)
+        ]
+    else
+        states = [state1, state2]
+    end
+    
+    return GaussianLinearCombination(state1.basis, coeffs, states)
 end
 """
     +(state::GaussianState, lc::GaussianLinearCombination)
@@ -200,9 +243,19 @@ Add a Gaussian state to a linear combination.
 function Base.:+(state::GaussianState, lc::GaussianLinearCombination)
     state.basis == lc.basis || throw(ArgumentError(SYMPLECTIC_ERROR))
     state.ħ == lc.ħ || throw(ArgumentError(HBAR_ERROR))
-    coeff_type = promote_type(eltype(state.mean), eltype(state.covar), eltype(lc.coeffs))
-    new_coeffs = vcat(one(coeff_type), convert(Vector{coeff_type}, lc.coeffs))
-    new_states = vcat(state, lc.states)
+    state_gpu = _is_gpu_array(state.mean)
+    lc_has_gpu = !isempty(lc.states) && _is_gpu_array(lc.states[1].mean)
+    T = promote_type(eltype(state.mean), eltype(state.covar), eltype(lc.coeffs))
+    new_coeffs = Vector{T}(vcat([one(T)], Array(lc.coeffs)))
+    if state_gpu || lc_has_gpu
+        precision = T <: Complex ? real(T) : T
+        promoted_state = state_gpu ? state : gpu(state, precision = precision)
+        promoted_states = lc_has_gpu ? lc.states : [gpu(s, precision = precision) for s in lc.states]
+        new_states = vcat([promoted_state], promoted_states)
+    else
+        new_states = vcat([state], lc.states)
+    end
+    
     return GaussianLinearCombination(lc.basis, new_coeffs, new_states)
 end
 """
@@ -213,9 +266,18 @@ Add a linear combination to a Gaussian state.
 function Base.:+(lc::GaussianLinearCombination, state::GaussianState)
     lc.basis == state.basis || throw(ArgumentError(SYMPLECTIC_ERROR))
     lc.ħ == state.ħ || throw(ArgumentError(HBAR_ERROR))
-    coeff_type = promote_type(eltype(state.mean), eltype(state.covar), eltype(lc.coeffs))
-    new_coeffs = vcat(convert(Vector{coeff_type}, lc.coeffs), one(coeff_type))
-    new_states = vcat(lc.states, state)
+    state_gpu = _is_gpu_array(state.mean)
+    lc_has_gpu = !isempty(lc.states) && _is_gpu_array(lc.states[1].mean)
+    T = promote_type(eltype(state.mean), eltype(state.covar), eltype(lc.coeffs))
+    new_coeffs = Vector{T}(vcat(Array(lc.coeffs), [one(T)]))
+    if state_gpu || lc_has_gpu
+        precision = T <: Complex ? real(T) : T
+        promoted_state = state_gpu ? state : gpu(state, precision = precision)
+        promoted_states = lc_has_gpu ? lc.states : [gpu(s, precision = precision) for s in lc.states]
+        new_states = vcat(promoted_states, [promoted_state])
+    else
+        new_states = vcat(lc.states, [state])
+    end
     return GaussianLinearCombination(lc.basis, new_coeffs, new_states)
 end
 """
@@ -313,44 +375,63 @@ function simplify!(lc::GaussianLinearCombination; atol::Real=1e-14)
     if isempty(lc.coeffs)
         return lc
     end
-    keep_mask = abs.(lc.coeffs) .> atol
+    cpu_coeffs = Vector(Array(lc.coeffs))  
+    lc_has_gpu_states = !isempty(lc.states) && _is_gpu_array(lc.states[1].mean)
+    keep_mask = abs.(cpu_coeffs) .> atol
     if !any(keep_mask)
-        vac = vacuumstate(lc.basis, ħ = lc.ħ)
-        coeff_type = eltype(lc.coeffs)
-        lc.coeffs = [coeff_type(atol)] 
+        vac = _device_aware_vacuum(lc)
+        coeff_type = eltype(cpu_coeffs)
+        lc.coeffs = Vector{coeff_type}([coeff_type(atol)]) 
         lc.states = [vac]
         return lc
     end
-    coeffs = lc.coeffs[keep_mask]
-    states = lc.states[keep_mask]
-    unique_states = Vector{typeof(states[1])}(undef, length(states))
-    combined_coeffs = Vector{eltype(coeffs)}(undef, length(states))
-    n_unique = 0  
-    for (coeff, state) in zip(coeffs, states)
-        existing_idx = findfirst(s -> isapprox(s, state, atol=1e-12), @view(unique_states[1:n_unique]))
+    surviving_coeffs = cpu_coeffs[keep_mask]
+    surviving_states = lc.states[keep_mask] 
+    if isempty(surviving_states)
+        return lc
+    end
+    unique_states = Vector{typeof(surviving_states[1])}()
+    combined_coeffs = Vector{eltype(surviving_coeffs)}()
+    
+    for (coeff, state) in zip(surviving_coeffs, surviving_states)
+        existing_idx = findfirst(unique_states) do existing_state
+            return states_approximately_equal(existing_state, state, 1e-12)
+        end
         if existing_idx === nothing
-            n_unique += 1
-            unique_states[n_unique] = state
-            combined_coeffs[n_unique] = coeff
+            push!(unique_states, state)
+            push!(combined_coeffs, coeff)
         else
             combined_coeffs[existing_idx] += coeff
         end
     end
-    resize!(unique_states, n_unique)
-    resize!(combined_coeffs, n_unique)
     final_mask = abs.(combined_coeffs) .> atol
     if !any(final_mask)
-        vac = vacuumstate(lc.basis, ħ = lc.ħ)
+        vac = _device_aware_vacuum(lc)
         coeff_type = eltype(combined_coeffs)
-        lc.coeffs = [coeff_type(atol)]
+        lc.coeffs = Vector{coeff_type}([coeff_type(atol)]) 
         lc.states = [vac]
     else
-        lc.coeffs = combined_coeffs[final_mask]
+        final_coeffs = combined_coeffs[final_mask]
+        lc.coeffs = Vector{eltype(final_coeffs)}(final_coeffs) 
         lc.states = unique_states[final_mask]
     end
     return lc
 end
 
+"""
+    states_approximately_equal(state1::GaussianState, state2::GaussianState, atol::Real)
+"""
+function states_approximately_equal(state1::GaussianState, state2::GaussianState, atol::Real)
+    state1.basis != state2.basis && return false
+    state1.ħ != state2.ħ && return false
+    try
+        mean_diff = maximum(abs.(Array(state1.mean) .- Array(state2.mean)))
+        covar_diff = maximum(abs.(Array(state1.covar) .- Array(state2.covar)))
+        return mean_diff <= atol && covar_diff <= atol
+    catch e
+        return false
+    end
+end
 function Base.show(io::IO, mime::MIME"text/plain", lc::GaussianLinearCombination)
     basis_name = nameof(typeof(lc.basis))
     nmodes = lc.basis.nmodes
@@ -398,8 +479,22 @@ The unitary is applied to each component state while preserving coefficients.
 function Base.:(*)(op::GaussianUnitary, lc::GaussianLinearCombination)
     op.basis == lc.basis || throw(ArgumentError(ACTION_ERROR))
     op.ħ == lc.ħ || throw(ArgumentError(HBAR_ERROR))
-    new_states = [op * state for state in lc.states]
-    return GaussianLinearCombination(lc.basis, copy(lc.coeffs), new_states)
+    op_gpu = _is_gpu_array(op.disp)
+    lc_gpu = (device(lc) == :gpu)
+    if op_gpu && !lc_gpu
+        T = real(eltype(op.disp))
+        gpu_states = [gpu(s, precision = T) for s in lc.states]
+        new_states = [op * state for state in gpu_states]
+        return GaussianLinearCombination(lc.basis, copy(lc.coeffs), new_states)
+    elseif !op_gpu && lc_gpu
+        T = real(eltype(lc.coeffs))
+        gpu_op = gpu(op, precision = T)
+        new_states = [gpu_op * state for state in lc.states]
+        return GaussianLinearCombination(lc.basis, copy(lc.coeffs), new_states)
+    else
+        new_states = [op * state for state in lc.states]
+        return GaussianLinearCombination(lc.basis, copy(lc.coeffs), new_states)
+    end
 end
 
 """
@@ -411,10 +506,22 @@ The channel is applied to each component state while preserving coefficients.
 function Base.:(*)(op::GaussianChannel, lc::GaussianLinearCombination)
     op.basis == lc.basis || throw(ArgumentError(ACTION_ERROR))
     op.ħ == lc.ħ || throw(ArgumentError(HBAR_ERROR))
-    new_states = [op * state for state in lc.states]
-    return GaussianLinearCombination(lc.basis, copy(lc.coeffs), new_states)
+    op_gpu = _is_gpu_array(op.disp)
+    lc_has_gpu = !isempty(lc.states) && _is_gpu_array(lc.states[1].mean)
+    cpu_coeffs = Vector(Array(lc.coeffs))
+    if op_gpu && !lc_has_gpu
+        T = real(eltype(op.disp))
+        gpu_states = [gpu(s, precision = T) for s in lc.states]
+        new_states = [op * state for state in gpu_states]
+    elseif !op_gpu && lc_has_gpu
+        T = real(eltype(lc.states[1].mean))
+        gpu_op = gpu(op, precision = T)
+        new_states = [gpu_op * state for state in lc.states]
+    else
+        new_states = [op * state for state in lc.states]
+    end
+    return GaussianLinearCombination(lc.basis, cpu_coeffs, new_states)
 end
-
 """
     tensor(::Type{Tc}, ::Type{Ts}, lc1::GaussianLinearCombination, lc2::GaussianLinearCombination)
 
@@ -684,4 +791,38 @@ function entropy_vn(lc::GaussianLinearCombination)
     # A GaussianLinearCombination represents a pure superposition state
     # |ψ⟩ = Σᵢ cᵢ|ψᵢ⟩, so S(ρ) = -Tr(ρ log ρ) = 0
     return 0.0
+end
+
+"""
+    _is_gpu_array(x::AbstractArray)
+
+Check if an array is a GPU array. Extended by CUDA extension.
+"""
+function _is_gpu_array(x::AbstractArray)
+    return false  
+end
+
+"""
+    _device_aware_vacuum(lc::GaussianLinearCombination)
+
+Create a vacuum state on the same device as the linear combination's states.
+"""
+function _device_aware_vacuum(lc::GaussianLinearCombination)
+    vac = vacuumstate(lc.basis, ħ = lc.ħ)
+    if !isempty(lc.states) && _is_gpu_array(lc.states[1].mean)
+        T = real(eltype(lc.states[1].mean))
+        return gpu(vac, precision = T)
+    else
+        return vac
+    end
+end
+
+"""
+    _create_device_array(data, reference_array)
+
+Create array with proper device placement for coefficients.
+"""
+function _create_device_array(data, reference_array)
+    T = eltype(data)
+    return Vector{T}(Array(data))  
 end
\ No newline at end of file
diff --git a/src/nongaussian_states.jl b/src/nongaussian_states.jl
index 3a17a16..5057ceb 100644
--- a/src/nongaussian_states.jl
+++ b/src/nongaussian_states.jl
@@ -20,6 +20,9 @@ function catstate_even(basis::SymplecticBasis, α::Number; squeeze_params=nothin
     if squeeze_params === nothing
         state_plus = coherentstate(basis, α, ħ=ħ)
         state_minus = coherentstate(basis, -α, ħ=ħ)
+        overlap = exp(-2 * abs2(α))
+        norm_factor = 1 / sqrt(2 * (1 + overlap))
+        
     else
         r, θ = squeeze_params
         squeezed_vac = squeezedstate(basis, r, θ, ħ=ħ) 
@@ -27,9 +30,11 @@ function catstate_even(basis::SymplecticBasis, α::Number; squeeze_params=nothin
         displace_minus = displace(basis, -α, ħ=ħ)
         state_plus = displace_plus * squeezed_vac
         state_minus = displace_minus * squeezed_vac
+        overlap_val = _overlap(state_plus, state_minus)
+        overlap = real(overlap_val) 
+        norm_factor = 1 / sqrt(2 * (1 + overlap))
     end
-    overlap = exp(-2 * abs2(α))
-    norm_factor = 1 / sqrt(2 * (1 + overlap))
+    
     coeffs = [norm_factor, norm_factor]
     states = [state_plus, state_minus]
     return GaussianLinearCombination(basis, coeffs, states)
@@ -56,6 +61,8 @@ function catstate_odd(basis::SymplecticBasis, α::Number; squeeze_params=nothing
     if squeeze_params === nothing
         state_plus = coherentstate(basis, α, ħ=ħ)
         state_minus = coherentstate(basis, -α, ħ=ħ)
+        overlap = exp(-2 * abs2(α))
+        norm_factor = 1 / sqrt(2 * (1 - overlap))
     else
         r, θ = squeeze_params
         squeezed_vac = squeezedstate(basis, r, θ, ħ=ħ)  
@@ -63,14 +70,15 @@ function catstate_odd(basis::SymplecticBasis, α::Number; squeeze_params=nothing
         displace_minus = displace(basis, -α, ħ=ħ)
         state_plus = displace_plus * squeezed_vac
         state_minus = displace_minus * squeezed_vac
+        overlap_val = _overlap(state_plus, state_minus)
+        overlap = real(overlap_val)  
+        norm_factor = 1 / sqrt(2 * (1 - overlap))
     end
-    overlap = exp(-2 * abs2(α))
-    norm_factor = 1 / sqrt(2 * (1 - overlap))
+    
     coeffs = [norm_factor, -norm_factor]
     states = [state_plus, state_minus]
     return GaussianLinearCombination(basis, coeffs, states)
 end
-
 """
     catstate(basis::SymplecticBasis, α::Number, phase::Real=0; squeeze_params=nothing, ħ=2)
 
@@ -93,6 +101,9 @@ function catstate(basis::SymplecticBasis, α::Number, phase::Real=0; squeeze_par
     if squeeze_params === nothing
         state_plus = coherentstate(basis, α, ħ=ħ)
         state_minus = coherentstate(basis, -α, ħ=ħ)
+        overlap = exp(-2 * abs2(α))
+        phase_factor = exp(1im * phase)
+        norm_factor = 1 / sqrt(2 * (1 + real(phase_factor * overlap)))
     else
         r, θ = squeeze_params
         squeezed_vac = squeezedstate(basis, r, θ, ħ=ħ)  
@@ -100,18 +111,18 @@ function catstate(basis::SymplecticBasis, α::Number, phase::Real=0; squeeze_par
         displace_minus = displace(basis, -α, ħ=ħ)
         state_plus = displace_plus * squeezed_vac
         state_minus = displace_minus * squeezed_vac
+        overlap_val = _overlap(state_plus, state_minus)
+        phase_factor = exp(1im * phase)
+        norm_factor = 1 / sqrt(2 * (1 + real(phase_factor * overlap_val)))
     end
-    overlap = exp(-2 * abs2(α))
-    phase_factor = exp(1im * phase)
-    norm_factor = 1 / sqrt(2 * (1 + real(phase_factor * overlap)))
-    coeffs = [norm_factor, norm_factor * phase_factor]
+    coeffs = [norm_factor, norm_factor * exp(1im * phase)]
     if abs(phase - π) < 1e-14 || abs(phase) < 1e-14
         coeffs = real.(coeffs)
     end
+    
     states = [state_plus, state_minus]
     return GaussianLinearCombination(basis, coeffs, states)
 end
-
 """
     gkpstate(basis::SymplecticBasis; lattice=:square, delta=0.1, nmax=5, ħ=2)
 
@@ -322,7 +333,7 @@ function catstate_odd(basis::SymplecticBasis, αs::AbstractVector; squeeze_param
     @inbounds for i in 1:nmodes  
         α = αs[i]
         squeeze_param = squeeze_params === nothing ? nothing : squeeze_params[i]
-        cat = catstate_even(single_mode_basis, α, squeeze_params=squeeze_param, ħ=ħ)
+        cat = catstate_odd(single_mode_basis, α, squeeze_params=squeeze_param, ħ=ħ)
         cat_states[i] = cat  
     end
     result = cat_states[1]
@@ -331,7 +342,6 @@ function catstate_odd(basis::SymplecticBasis, αs::AbstractVector; squeeze_param
     end
     return result
 end
-
 """
     catstate(basis::SymplecticBasis, αs::AbstractVector, phases::AbstractVector=zeros(length(αs)); squeeze_params=nothing, ħ=2)
 
@@ -348,8 +358,9 @@ function catstate(basis::SymplecticBasis, αs::AbstractVector, phases::AbstractV
     cat_states = Vector{GaussianLinearCombination}(undef, nmodes)  
     @inbounds for i in 1:nmodes  
         α = αs[i]
+        phase = phases[i]
         squeeze_param = squeeze_params === nothing ? nothing : squeeze_params[i]
-        cat = catstate_even(single_mode_basis, α, squeeze_params=squeeze_param, ħ=ħ)
+        cat = catstate(single_mode_basis, α, phase, squeeze_params=squeeze_param, ħ=ħ)
         cat_states[i] = cat  
     end
     result = cat_states[1]
diff --git a/test/Project.toml b/test/Project.toml
index 8c69541..5d042c6 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -10,3 +10,5 @@ StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 Symbolics = "0c5d862f-8b57-4792-8d23-62f2024744c7"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 TestItemRunner = "f8b46487-2199-4994-9208-9a1283c18c0a"
+Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
+Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
\ No newline at end of file
diff --git a/test/runtests.jl b/test/runtests.jl
index 3c2b989..15ee87c 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,21 +1,40 @@
+CUDA_flag = false
+
+if Sys.iswindows()
+    @info "Skipping GPU tests -- only executed on *NIX platforms."
+else
+    CUDA_flag = get(ENV, "CUDA_TEST", "") == "true"
+    CUDA_flag && @info "Running with CUDA tests."
+    if !CUDA_flag
+        @info "Skipping GPU tests -- must be explicitly enabled."
+        @info "Environment must set CUDA_TEST=true."
+    end
+end
+
+using Pkg
+CUDA_flag && Pkg.add("CUDA")
+
 using TestItemRunner
 using Gabs
-
-# filter for the test
 testfilter = ti -> begin
-  exclude = Symbol[:jet]
-  if !(VERSION >= v"1.10")
-    push!(exclude, :doctests)
-    push!(exclude, :aqua)                                 
-  end
-
-  return all(!in(exclude), ti.tags)
+    exclude = Symbol[:jet]
+    if get(ENV, "JET_TEST", "") == "true"
+        return :jet in ti.tags
+    else
+        push!(exclude, :jet)
+    end
+    if CUDA_flag
+        return :cuda in ti.tags
+    else
+        push!(exclude, :cuda)
+    end
+    if !(VERSION >= v"1.10")
+        push!(exclude, :doctests)
+        push!(exclude, :aqua)                                 
+    end
+    return all(!in(exclude), ti.tags)
 end
 
 println("Starting tests with $(Threads.nthreads()) threads out of `Sys.CPU_THREADS = $(Sys.CPU_THREADS)`...")
 
 @run_package_tests filter=testfilter
-
-if get(ENV,"JET_TEST","")=="true"
-    @run_package_tests filter=(ti -> :jet in ti.tags)
-end
diff --git a/test/test_CUDAExt.jl b/test/test_CUDAExt.jl
new file mode 100644
index 0000000..f43950e
--- /dev/null
+++ b/test/test_CUDAExt.jl
@@ -0,0 +1,2723 @@
+@testitem "GPU Foundation - State Creation" tags=[:cuda] begin
+    using CUDA
+    using Gabs
+    using Gabs: device
+    using LinearAlgebra
+    using Statistics
+
+    @testset "GPU Vacuum State" begin
+        basis = QuadPairBasis(1)
+        vac_cpu = vacuumstate(basis)
+        vac_gpu = vacuumstate(basis) |> gpu
+        @test vac_gpu.mean isa CuVector{Float32}
+        @test vac_gpu.covar isa CuMatrix{Float32}
+        @test vac_gpu.basis == basis
+        @test vac_gpu.ħ == 2
+        @test device(vac_gpu) == :gpu
+        @test device(vac_cpu) == :cpu
+        @test Array(vac_gpu.mean) ≈ vac_cpu.mean
+        @test Array(vac_gpu.covar) ≈ vac_cpu.covar
+        vac_gpu_f64 = gpu(vacuumstate(basis), precision=Float64)
+        @test vac_gpu_f64.mean isa CuVector{Float64}
+        @test vac_gpu_f64.covar isa CuMatrix{Float64}
+        vac_back_to_cpu = vac_gpu |> cpu
+        @test device(vac_back_to_cpu) == :cpu
+        @test vac_back_to_cpu.mean ≈ vac_cpu.mean
+        @test vac_back_to_cpu.covar ≈ vac_cpu.covar
+    end
+    
+    @testset "GPU Coherent State" begin
+        basis = QuadPairBasis(1)
+        α = 1.0f0 + 0.5f0im
+        coh_cpu = coherentstate(basis, α)
+        coh_gpu = coherentstate(basis, α) |> gpu
+        @test coh_gpu.mean isa CuVector{Float32}
+        @test coh_gpu.covar isa CuMatrix{Float32}
+        @test device(coh_gpu) == :gpu
+        @test Array(coh_gpu.mean) ≈ Float32.(coh_cpu.mean)
+        @test Array(coh_gpu.covar) ≈ Float32.(coh_cpu.covar)
+        α_gpu = CuArray([α])
+        coh_auto = coherentstate(basis, α_gpu)
+        @test device(coh_auto) == :gpu
+        @test Array(coh_auto.mean) ≈ Float32.(coh_cpu.mean)
+        basis_multi = QuadPairBasis(2)
+        α_multi = [1.0f0 + 0.5f0im, -0.3f0 + 0.8f0im]
+        coh_cpu_multi = coherentstate(basis_multi, α_multi)
+        coh_gpu_multi = coherentstate(basis_multi, α_multi) |> gpu
+        @test Array(coh_gpu_multi.mean) ≈ Float32.(coh_cpu_multi.mean)
+        @test Array(coh_gpu_multi.covar) ≈ Float32.(coh_cpu_multi.covar)
+        α_multi_gpu = CuArray(α_multi)
+        coh_multi_auto = coherentstate(basis_multi, α_multi_gpu)
+        @test device(coh_multi_auto) == :gpu
+        @test Array(coh_multi_auto.mean) ≈ Float32.(coh_cpu_multi.mean)
+    end
+    
+    @testset "GPU Squeezed State" begin
+        basis = QuadPairBasis(1)
+        r, θ = 0.3f0, Float32(π/4)
+        sq_cpu = squeezedstate(basis, r, θ)
+        sq_gpu = squeezedstate(basis, r, θ) |> gpu
+        @test sq_gpu.mean isa CuVector{Float32}
+        @test sq_gpu.covar isa CuMatrix{Float32}
+        @test device(sq_gpu) == :gpu
+        @test Array(sq_gpu.mean) ≈ Float32.(sq_cpu.mean)
+        @test Array(sq_gpu.covar) ≈ Float32.(sq_cpu.covar) rtol=1e-6
+        r_gpu = CuArray([r])
+        sq_auto = squeezedstate(basis, r_gpu, θ)
+        @test device(sq_auto) == :gpu
+        @test Array(sq_auto.mean) ≈ Float32.(sq_cpu.mean)
+        basis_multi = QuadPairBasis(2)
+        r_vec = [0.3f0, 0.5f0]
+        θ_vec = [Float32(π/4), Float32(π/6)]
+        sq_cpu_multi = squeezedstate(basis_multi, r_vec, θ_vec)
+        sq_gpu_multi = squeezedstate(basis_multi, r_vec, θ_vec) |> gpu
+        @test Array(sq_gpu_multi.mean) ≈ Float32.(sq_cpu_multi.mean)
+        @test Array(sq_gpu_multi.covar) ≈ Float32.(sq_cpu_multi.covar) rtol=1e-6
+    end
+    
+    @testset "GPU Thermal State" begin
+        basis = QuadPairBasis(1)
+        n = 2.0f0
+        thermal_cpu = thermalstate(basis, n)
+        thermal_gpu = thermalstate(basis, n) |> gpu
+        @test thermal_gpu.mean isa CuVector{Float32}
+        @test thermal_gpu.covar isa CuMatrix{Float32}
+        @test device(thermal_gpu) == :gpu
+        @test Array(thermal_gpu.mean) ≈ Float32.(thermal_cpu.mean)
+        @test Array(thermal_gpu.covar) ≈ Float32.(thermal_cpu.covar)
+        n_gpu = CuArray([n])
+        thermal_auto = thermalstate(basis, n_gpu)
+        @test device(thermal_auto) == :gpu
+        @test Array(thermal_auto.mean) ≈ Float32.(thermal_cpu.mean)
+        basis_multi = QuadPairBasis(2)
+        n_vec = [2.0f0, 3.0f0]
+        thermal_cpu_multi = thermalstate(basis_multi, n_vec)
+        thermal_gpu_multi = thermalstate(basis_multi, n_vec) |> gpu
+        @test Array(thermal_gpu_multi.mean) ≈ Float32.(thermal_cpu_multi.mean)
+        @test Array(thermal_gpu_multi.covar) ≈ Float32.(thermal_cpu_multi.covar)
+    end
+    
+    @testset "GPU EPR State" begin
+        basis = QuadPairBasis(2)
+        r, θ = 0.5f0, Float32(π/3)
+        epr_cpu = eprstate(basis, r, θ)
+        epr_gpu = eprstate(basis, r, θ) |> gpu
+        @test epr_gpu.mean isa CuVector{Float32}
+        @test epr_gpu.covar isa CuMatrix{Float32}
+        @test device(epr_gpu) == :gpu
+        @test Array(epr_gpu.mean) ≈ Float32.(epr_cpu.mean)
+        @test Array(epr_gpu.covar) ≈ Float32.(epr_cpu.covar) rtol=1e-6
+    end
+end
+
+@testitem "GPU Foundation - Unitary Operations" tags=[:cuda] begin
+    using Gabs
+    using Gabs: device
+    using LinearAlgebra
+    using CUDA
+        
+    @testset "GPU Displacement" begin
+        basis = QuadPairBasis(1)
+        α = 0.5f0 + 0.3f0im
+        disp_cpu = displace(basis, α)
+        disp_gpu = displace(basis, α) |> gpu
+        @test disp_gpu.disp isa CuVector{Float32}
+        @test disp_gpu.symplectic isa CuMatrix{Float32}
+        @test device(disp_gpu) == :gpu
+        @test Array(disp_gpu.disp) ≈ Float32.(disp_cpu.disp)
+        @test Array(disp_gpu.symplectic) ≈ Float32.(disp_cpu.symplectic)
+        α_gpu = CuArray([α])
+        disp_auto = displace(basis, α_gpu)
+        @test device(disp_auto) == :gpu
+        @test Array(disp_auto.disp) ≈ Float32.(disp_cpu.disp)
+        vac_gpu = vacuumstate(basis) |> gpu
+        displaced_gpu = disp_gpu * vac_gpu
+        vac_cpu = vacuumstate(basis)
+        displaced_cpu = disp_cpu * vac_cpu
+        @test Array(displaced_gpu.mean) ≈ Float32.(displaced_cpu.mean) rtol=1e-6
+        @test Array(displaced_gpu.covar) ≈ Float32.(displaced_cpu.covar) rtol=1e-6
+    end
+    
+    @testset "Mixed Device Operations" begin
+        basis = QuadPairBasis(1)
+        vac_cpu = vacuumstate(basis)
+        disp_gpu = displace(basis, 1.0f0) |> gpu
+        result = disp_gpu * vac_cpu
+        @test device(result) == :gpu
+        vac_gpu = vacuumstate(basis) |> gpu
+        disp_cpu = displace(basis, 1.0f0)
+        result2 = disp_cpu * vac_gpu
+        @test device(result2) == :gpu
+    end
+    
+    @testset "GPU Squeeze" begin
+        basis = QuadPairBasis(1)
+        r, θ = 0.3f0, Float32(π/4)
+        squeeze_cpu = squeeze(basis, r, θ)
+        squeeze_gpu = squeeze(basis, r, θ) |> gpu
+        @test Array(squeeze_gpu.disp) ≈ Float32.(squeeze_cpu.disp)
+        @test Array(squeeze_gpu.symplectic) ≈ Float32.(squeeze_cpu.symplectic) rtol=1e-6
+        @test device(squeeze_gpu) == :gpu
+        r_gpu = CuArray([r])
+        squeeze_auto = squeeze(basis, r_gpu, θ)
+        @test device(squeeze_auto) == :gpu
+        vac_gpu = vacuumstate(basis) |> gpu
+        squeezed_gpu = squeeze_gpu * vac_gpu
+        vac_cpu = vacuumstate(basis)
+        squeezed_cpu = squeeze_cpu * vac_cpu
+        @test Array(squeezed_gpu.mean) ≈ Float32.(squeezed_cpu.mean) rtol=1e-6
+        @test Array(squeezed_gpu.covar) ≈ Float32.(squeezed_cpu.covar) rtol=1e-5
+    end
+    
+    @testset "GPU Phase Shift" begin
+        basis = QuadPairBasis(1)
+        θ = Float32(π/3)
+        phase_cpu = phaseshift(basis, θ)
+        phase_gpu = phaseshift(basis, θ) |> gpu
+        @test Array(phase_gpu.disp) ≈ Float32.(phase_cpu.disp)
+        @test Array(phase_gpu.symplectic) ≈ Float32.(phase_cpu.symplectic) rtol=1e-6
+        @test device(phase_gpu) == :gpu
+    end
+    
+    @testset "GPU Beam Splitter" begin
+        basis = QuadPairBasis(2)
+        transmit = 0.7f0
+        bs_cpu = beamsplitter(basis, transmit)
+        bs_gpu = beamsplitter(basis, transmit) |> gpu
+        @test Array(bs_gpu.disp) ≈ Float32.(bs_cpu.disp)
+        @test Array(bs_gpu.symplectic) ≈ Float32.(bs_cpu.symplectic) rtol=1e-6
+        @test device(bs_gpu) == :gpu
+    end
+    
+    @testset "GPU Two-Mode Squeeze" begin
+        basis = QuadPairBasis(2)
+        r, θ = 0.2f0, Float32(π/6)
+        twosq_cpu = twosqueeze(basis, r, θ)
+        twosq_gpu = twosqueeze(basis, r, θ) |> gpu
+        @test Array(twosq_gpu.disp) ≈ Float32.(twosq_cpu.disp)
+        @test Array(twosq_gpu.symplectic) ≈ Float32.(twosq_cpu.symplectic) rtol=1e-6
+        @test device(twosq_gpu) == :gpu
+    end
+end
+
+@testitem "GPU Foundation - Channel Operations" tags=[:cuda] begin
+    using Gabs
+    using Gabs: device
+    using LinearAlgebra
+    using CUDA        
+        
+    @testset "GPU Attenuator Channel" begin
+        basis = QuadPairBasis(1)
+        θ, n = Float32(π/6), 2.0f0
+        att_cpu = attenuator(basis, θ, n)
+        att_gpu = attenuator(basis, θ, n) |> gpu
+        @test att_gpu.disp isa CuVector{Float32}
+        @test att_gpu.transform isa CuMatrix{Float32}
+        @test att_gpu.noise isa CuMatrix{Float32}
+        @test device(att_gpu) == :gpu
+        @test Array(att_gpu.disp) ≈ Float32.(att_cpu.disp)
+        @test Array(att_gpu.transform) ≈ Float32.(att_cpu.transform) rtol=1e-6
+        @test Array(att_gpu.noise) ≈ Float32.(att_cpu.noise) rtol=1e-6
+        coh_gpu = coherentstate(basis, 1.0f0) |> gpu
+        attenuated_gpu = att_gpu * coh_gpu
+        coh_cpu = coherentstate(basis, 1.0f0)
+        attenuated_cpu = att_cpu * coh_cpu
+        @test Array(attenuated_gpu.mean) ≈ Float32.(attenuated_cpu.mean) rtol=1e-6
+        @test Array(attenuated_gpu.covar) ≈ Float32.(attenuated_cpu.covar) rtol=1e-5
+    end
+    
+    @testset "GPU Amplifier Channel" begin
+        basis = QuadPairBasis(1)
+        r, n = 0.2f0, 1.5f0
+        amp_cpu = amplifier(basis, r, n)
+        amp_gpu = amplifier(basis, r, n) |> gpu
+        @test Array(amp_gpu.disp) ≈ Float32.(amp_cpu.disp)
+        @test Array(amp_gpu.transform) ≈ Float32.(amp_cpu.transform) rtol=1e-6
+        @test Array(amp_gpu.noise) ≈ Float32.(amp_cpu.noise) rtol=1e-5
+        @test device(amp_gpu) == :gpu
+    end
+end
+
+@testitem "GPU Foundation - Wigner Functions" tags=[:cuda] begin
+    using Gabs
+    using Gabs: device
+    using LinearAlgebra
+    using CUDA
+        
+    @testset "GPU Single-Point Wigner" begin
+        basis = QuadPairBasis(1)
+        vac_cpu = vacuumstate(basis)
+        vac_gpu = vacuumstate(basis) |> gpu
+        coh_cpu = coherentstate(basis, 1.0f0)
+        coh_gpu = coherentstate(basis, 1.0f0) |> gpu
+        test_points = [[0.0f0, 0.0f0], [1.0f0, 0.5f0], [-0.5f0, 1.0f0]]
+        for x in test_points
+            w_cpu_vac = wigner(vac_cpu, x)
+            w_gpu_vac = wigner(vac_gpu, x)
+            @test w_gpu_vac ≈ w_cpu_vac rtol=1e-5
+            w_cpu_coh = wigner(coh_cpu, Float32.(x))
+            w_gpu_coh = wigner(coh_gpu, x)
+            @test w_gpu_coh ≈ w_cpu_coh rtol=1e-5
+        end
+    end
+    
+    @testset "GPU Batch Wigner Evaluation" begin
+        basis = QuadPairBasis(1)
+        coh_gpu = coherentstate(basis, 1.0f0) |> gpu
+        n_points = 100
+        x_points = CuMatrix{Float32}(randn(Float32, 2, n_points))
+        w_batch = wigner(coh_gpu, x_points)
+        @test w_batch isa CuVector{Float32}
+        @test length(w_batch) == n_points
+        @test all(isfinite.(Array(w_batch)))
+        coh_cpu = coherentstate(basis, 1.0f0)
+        for i in 1:min(10, n_points)
+            x_point = Array(x_points[:, i])
+            w_individual = wigner(coh_cpu, x_point)
+            w_batch_individual = Array(w_batch)[i]
+            @test w_batch_individual ≈ w_individual rtol=1e-4
+        end
+    end
+    
+    @testset "GPU Single-Point Wigner Characteristic" begin
+        basis = QuadPairBasis(1)
+        vac_cpu = vacuumstate(basis)
+        vac_gpu = vacuumstate(basis) |> gpu
+        test_points = [[0.0f0, 0.0f0], [0.5f0, 0.3f0], [-0.2f0, 0.8f0]]
+        for xi in test_points
+            chi_cpu = wignerchar(vac_cpu, xi)
+            chi_gpu = wignerchar(vac_gpu, xi)
+            @test real(chi_gpu) ≈ real(chi_cpu) rtol=1e-5
+            @test imag(chi_gpu) ≈ imag(chi_cpu) rtol=1e-5
+        end
+    end
+    
+    @testset "GPU Batch Wigner Characteristic" begin
+        basis = QuadPairBasis(1)
+        sq_gpu = squeezedstate(basis, 0.3f0, Float32(π/4)) |> gpu
+        n_points = 50
+        xi_points = CuMatrix{Float32}(randn(Float32, 2, n_points) * 0.5f0)
+        chi_batch = wignerchar(sq_gpu, xi_points)
+        @test chi_batch isa CuVector{ComplexF32}
+        @test length(chi_batch) == n_points
+        @test all(isfinite.(real(Array(chi_batch))))
+        @test all(isfinite.(imag(Array(chi_batch))))
+    end
+    
+    @testset "GPU Wigner Grid Utility" begin
+        basis = QuadPairBasis(1)
+        coh_gpu = coherentstate(basis, 0.5f0) |> gpu
+        x_range = (-2.0f0, 2.0f0)
+        p_range = (-2.0f0, 2.0f0)
+        nx, np = 20, 25
+        x_vals = collect(range(x_range[1], x_range[2], length=nx))
+        p_vals = collect(range(p_range[1], p_range[2], length=np))
+        x_grid = repeat(x_vals, 1, np)[:]
+        p_grid = repeat(p_vals', nx, 1)[:]
+        points = CuArray(Float32.([x_grid'; p_grid']))
+        w_grid = wigner(coh_gpu, points)
+        @test length(w_grid) == nx * np
+        @test all(Array(w_grid) .> 0)
+    end
+end
+
+@testitem "GPU Foundation - Tensor Products and Partial Traces" tags=[:cuda] begin
+    using Gabs
+    using Gabs: device
+    using LinearAlgebra
+    using CUDA  
+        
+    @testset "GPU State Tensor Products" begin
+        basis1 = QuadPairBasis(1)
+        basis2 = QuadPairBasis(1)
+        coh1_gpu = coherentstate(basis1, 1.0f0) |> gpu
+        vac2_gpu = vacuumstate(basis2) |> gpu
+        coh1_cpu = coherentstate(basis1, 1.0f0)
+        vac2_cpu = vacuumstate(basis2)
+        tensor_gpu = tensor(coh1_gpu, vac2_gpu)
+        tensor_cpu = tensor(coh1_cpu, vac2_cpu)
+        @test tensor_gpu.basis.nmodes == 2
+        @test device(tensor_gpu) == :gpu
+        @test Array(tensor_gpu.mean) ≈ Float32.(tensor_cpu.mean) rtol=1e-6
+        @test Array(tensor_gpu.covar) ≈ Float32.(tensor_cpu.covar) rtol=1e-6
+    end
+    
+    @testset "Mixed Device Tensor Products" begin
+        basis1 = QuadPairBasis(1)
+        coh_cpu = coherentstate(basis1, 1.0f0)
+        vac_gpu = vacuumstate(basis1) |> gpu
+        tensor_mixed = tensor(coh_cpu, vac_gpu)
+        @test device(tensor_mixed) == :gpu
+    end
+    
+    @testset "GPU Unitary Tensor Products" begin
+        basis1 = QuadPairBasis(1)
+        disp1_gpu = displace(basis1, 0.5f0) |> gpu
+        disp2_gpu = displace(basis1, -0.3f0) |> gpu
+        disp1_cpu = displace(basis1, 0.5f0)
+        disp2_cpu = displace(basis1, -0.3f0)
+        tensor_gpu = tensor(disp1_gpu, disp2_gpu)
+        tensor_cpu = tensor(disp1_cpu, disp2_cpu)
+        @test device(tensor_gpu) == :gpu
+        @test Array(tensor_gpu.disp) ≈ Float32.(tensor_cpu.disp) rtol=1e-6
+        @test Array(tensor_gpu.symplectic) ≈ Float32.(tensor_cpu.symplectic) rtol=1e-6
+    end
+    
+    @testset "GPU Partial Traces" begin
+        basis = QuadPairBasis(2)
+        epr_gpu = eprstate(basis, 0.3f0, Float32(π/4)) |> gpu
+        epr_cpu = eprstate(basis, 0.3f0, Float32(π/4))
+        traced_gpu = ptrace(epr_gpu, [1])
+        traced_cpu = ptrace(epr_cpu, [1])
+        @test traced_gpu.basis.nmodes == 1
+        @test device(traced_gpu) == :gpu
+        @test Array(traced_gpu.mean) ≈ Float32.(traced_cpu.mean) rtol=1e-6
+        @test Array(traced_gpu.covar) ≈ Float32.(traced_cpu.covar) rtol=1e-5
+    end
+end
+
+@testitem "GPU Foundation - Device Management" tags=[:cuda] begin
+    using Gabs
+    using Gabs: device
+    using LinearAlgebra
+     using CUDA
+        
+    @testset "Device Transfer Functions" begin
+        basis = QuadPairBasis(1)
+        vac_cpu = vacuumstate(basis)
+        @test device(vac_cpu) == :cpu
+        vac_gpu = vac_cpu |> gpu
+        @test device(vac_gpu) == :gpu
+        @test vac_gpu.mean isa CuVector{Float32}
+        vac_back = vac_gpu |> cpu
+        @test device(vac_back) == :cpu
+        @test vac_back.mean isa Vector{Float32}
+        disp_cpu = displace(basis, 1.0)
+        disp_gpu = disp_cpu |> gpu
+        @test device(disp_gpu) == :gpu
+        disp_back = disp_gpu |> cpu
+        @test device(disp_back) == :cpu
+        att_cpu = attenuator(basis, π/4, 1.0)
+        att_gpu = att_cpu |> gpu
+        @test device(att_gpu) == :gpu
+        att_back = att_gpu |> cpu
+        @test device(att_back) == :cpu
+    end
+    
+    @testset "Precision Control" begin
+        basis = QuadPairBasis(1)
+        vac = vacuumstate(basis)
+        vac_f32 = gpu(vac, precision=Float32)
+        vac_f64 = gpu(vac, precision=Float64)
+        @test eltype(vac_f32.mean) == Float32
+        @test eltype(vac_f64.mean) == Float64
+        disp = displace(basis, 1.0)
+        disp_f32 = gpu(disp, precision=Float32)
+        disp_f64 = gpu(disp, precision=Float64)
+        @test eltype(disp_f32.disp) == Float32
+        @test eltype(disp_f64.disp) == Float64
+    end
+    
+    @testset "Array Device Functions" begin
+        x_cpu = randn(10)
+        x_gpu = x_cpu |> gpu
+        @test x_gpu isa CuVector{Float32}
+        @test device(x_gpu) == :gpu
+        x_back = x_gpu |> cpu
+        @test x_back isa Vector{Float32}
+        @test device(x_back) == :cpu
+        @test x_back ≈ Float32.(x_cpu)
+    end
+end
+
+@testitem "GPU Tensor Products - Operators" tags=[:cuda] begin
+    using CUDA
+    using Gabs
+    using Gabs: device
+    using LinearAlgebra
+    
+    @testset "GPU GaussianUnitary Tensor Products" begin
+        basis1 = QuadPairBasis(1)
+        basis2 = QuadPairBasis(1)
+        disp1_cpu = displace(basis1, 0.5f0)
+        disp2_cpu = displace(basis2, -0.3f0)
+        disp1_gpu = disp1_cpu |> gpu
+        disp2_gpu = disp2_cpu |> gpu
+        @test device(disp1_gpu) == :gpu
+        @test device(disp2_gpu) == :gpu
+        tensor_gpu = tensor(disp1_gpu, disp2_gpu)
+        tensor_cpu = tensor(disp1_cpu, disp2_cpu)
+        @test device(tensor_gpu) == :gpu
+        @test tensor_gpu.basis.nmodes == 2
+        @test Array(tensor_gpu.disp) ≈ Float32.(tensor_cpu.disp) rtol=1e-6
+        @test Array(tensor_gpu.symplectic) ≈ Float32.(tensor_cpu.symplectic) rtol=1e-6
+        tensor_typed = tensor(CuVector{Float64}, CuMatrix{Float64}, disp1_gpu, disp2_gpu)
+        @test eltype(tensor_typed.disp) == Float64
+        @test eltype(tensor_typed.symplectic) == Float64
+        vac1_gpu = vacuumstate(basis1) |> gpu
+        vac2_gpu = vacuumstate(basis2) |> gpu
+        tensor_state_gpu = tensor(vac1_gpu, vac2_gpu)
+        result_gpu = tensor_gpu * tensor_state_gpu
+        vac1_cpu = vacuumstate(basis1)
+        vac2_cpu = vacuumstate(basis2)  
+        tensor_state_cpu = tensor(vac1_cpu, vac2_cpu)
+        result_cpu = tensor_cpu * tensor_state_cpu
+        @test Array(result_gpu.mean) ≈ Float32.(result_cpu.mean) rtol=1e-5
+        @test Array(result_gpu.covar) ≈ Float32.(result_cpu.covar) rtol=1e-5
+    end
+    
+    @testset "GPU GaussianChannel Tensor Products" begin
+        basis1 = QuadPairBasis(1)
+        basis2 = QuadPairBasis(1)
+        att1_cpu = attenuator(basis1, π/6, 1.5)
+        att2_cpu = attenuator(basis2, π/4, 2.0)
+        att1_gpu = att1_cpu |> gpu
+        att2_gpu = att2_cpu |> gpu
+        @test device(att1_gpu) == :gpu
+        @test device(att2_gpu) == :gpu
+        tensor_gpu = tensor(att1_gpu, att2_gpu)
+        tensor_cpu = tensor(att1_cpu, att2_cpu)
+        @test device(tensor_gpu) == :gpu
+        @test tensor_gpu.basis.nmodes == 2
+        @test Array(tensor_gpu.disp) ≈ Float32.(tensor_cpu.disp) rtol=1e-6
+        @test Array(tensor_gpu.transform) ≈ Float32.(tensor_cpu.transform) rtol=1e-5
+        @test Array(tensor_gpu.noise) ≈ Float32.(tensor_cpu.noise) rtol=1e-5
+        tensor_typed = tensor(CuVector{Float64}, CuMatrix{Float64}, att1_gpu, att2_gpu)
+        @test eltype(tensor_typed.disp) == Float64
+        @test eltype(tensor_typed.transform) == Float64
+        @test eltype(tensor_typed.noise) == Float64
+    end
+    
+    @testset "Mixed Operator Types" begin
+        basis = QuadPairBasis(1)
+        disp_gpu = displace(basis, 1.0f0) |> gpu
+        att_gpu = attenuator(basis, π/4, 1.0) |> gpu
+        @test_throws ArgumentError tensor(disp_gpu, att_gpu)
+    end
+    
+    @testset "Error Handling" begin
+        basis1 = QuadPairBasis(1)  
+        basis2 = QuadBlockBasis(1)
+        disp1_gpu = displace(basis1, 1.0f0) |> gpu
+        disp2_gpu = displace(basis2, 1.0f0) |> gpu
+        @test_throws ArgumentError tensor(disp1_gpu, disp2_gpu)
+        disp1_h1 = displace(basis1, 1.0f0, ħ=1) |> gpu  
+        disp1_h2 = displace(basis1, 1.0f0, ħ=2) |> gpu
+        @test_throws ArgumentError tensor(disp1_h1, disp1_h2)
+    end
+end
+
+@testitem "GPU Linear Combinations - Core Functionality" tags=[:cuda] begin
+    using CUDA
+    using Gabs
+    using Gabs: device, GaussianLinearCombination
+    using LinearAlgebra
+
+    @testset "Device Management & Transfer" begin
+        basis = QuadPairBasis(1)
+        state1 = coherentstate(basis, 1.0f0 + 0.5f0im)
+        state2 = coherentstate(basis, -1.0f0 + 0.3f0im)
+        coeffs = [0.6f0, 0.8f0]
+        lc_cpu = GaussianLinearCombination(basis, coeffs, [state1, state2])
+        @test device(lc_cpu) == :cpu
+        @test lc_cpu.coeffs isa Vector{Float32}
+        @test device(lc_cpu.states[1]) == :cpu
+        lc_gpu = lc_cpu |> gpu
+        @test device(lc_gpu) == :gpu
+        @test lc_gpu.coeffs isa Vector{Float32}
+        @test device(lc_gpu.states[1]) == :gpu
+        @test device(lc_gpu.states[2]) == :gpu
+        @test lc_gpu.states[1].mean isa CuVector{Float32}
+        @test lc_gpu.states[1].covar isa CuMatrix{Float32}
+        @test lc_gpu.coeffs ≈ lc_cpu.coeffs
+        lc_back = lc_gpu |> cpu
+        @test device(lc_back) == :cpu
+        @test lc_back.coeffs ≈ lc_cpu.coeffs
+        @test Array(lc_gpu.states[1].mean) ≈ lc_back.states[1].mean rtol=1e-6
+        lc_gpu_f64 = gpu(lc_cpu, precision=Float64)
+        @test eltype(lc_gpu_f64.coeffs) == Float64
+        @test lc_gpu_f64.states[1].mean isa CuVector{Float64}
+    end
+    
+    @testset "Creating GPU Linear Combinations Directly" begin
+        basis = QuadPairBasis(1)
+        state1_cpu = coherentstate(basis, 1.0f0)
+        state2_cpu = squeezedstate(basis, 0.3f0, Float32(π/4))
+        lc1 = GaussianLinearCombination(basis, [0.7f0, 0.3f0], [state1_cpu, state2_cpu]) |> gpu
+        @test device(lc1) == :gpu
+        state1_gpu = coherentstate(basis, 1.0f0) |> gpu
+        state2_gpu = squeezedstate(basis, 0.3f0, Float32(π/4)) |> gpu
+        lc2 = GaussianLinearCombination(basis, [0.7f0, 0.3f0], [state1_gpu, state2_gpu])
+        @test device(lc2) == :gpu
+        @test lc1.coeffs ≈ lc2.coeffs
+        @test Array(lc1.states[1].mean) ≈ Array(lc2.states[1].mean) rtol=1e-6
+    end
+
+    @testset "Arithmetic Operations" begin
+        basis = QuadPairBasis(1)
+        state1 = coherentstate(basis, 1.0f0) |> gpu
+        state2 = coherentstate(basis, -1.0f0) |> gpu
+        state3 = squeezedstate(basis, 0.2f0, 0.0f0) |> gpu
+        lc1 = GaussianLinearCombination(basis, [0.6f0, 0.4f0], [state1, state2])
+        lc2 = GaussianLinearCombination(basis, [0.3f0], [state3])
+        @test device(lc1) == :gpu
+        @test device(lc2) == :gpu
+        lc_sum = lc1 + lc2
+        @test length(lc_sum) == 3
+        @test device(lc_sum) == :gpu
+        @test lc_sum.coeffs ≈ [0.6f0, 0.4f0, 0.3f0]
+        lc_diff = lc1 - lc2
+        @test length(lc_diff) == 3
+        @test device(lc_diff) == :gpu
+        @test lc_diff.coeffs ≈ [0.6f0, 0.4f0, -0.3f0]
+        lc_scaled = 2.0f0 * lc1
+        @test length(lc_scaled) == 2
+        @test device(lc_scaled) == :gpu
+        @test lc_scaled.coeffs ≈ [1.2f0, 0.8f0]
+        lc_scaled2 = lc1 * 0.5f0
+        @test lc_scaled2.coeffs ≈ [0.3f0, 0.2f0]
+        lc_neg = -lc1
+        @test lc_neg.coeffs ≈ [-0.6f0, -0.4f0]
+        @test device(lc_neg) == :gpu
+    end
+
+    @testset "Normalization" begin
+        basis = QuadPairBasis(1)
+        state1 = vacuumstate(basis) |> gpu
+        state2 = coherentstate(basis, 1.0f0) |> gpu
+        coeffs = [3.0f0, 4.0f0]
+        lc = GaussianLinearCombination(basis, coeffs, [state1, state2])
+        @test device(lc) == :gpu
+        initial_norm = sqrt(sum(abs2, lc.coeffs))
+        @test initial_norm ≈ 5.0f0
+        Gabs.normalize!(lc)
+        @test device(lc) == :gpu
+        final_norm = sqrt(sum(abs2, lc.coeffs))
+        @test final_norm ≈ 1.0f0 rtol=1e-6
+        @test lc.coeffs ≈ [0.6f0, 0.8f0] rtol=1e-6
+        lc_zero = GaussianLinearCombination(basis, [0.0f0, 0.0f0], [state1, state2])
+        Gabs.normalize!(lc_zero)
+        @test lc_zero.coeffs == [0.0f0, 0.0f0]
+    end
+
+    @testset "Simplification" begin
+        basis = QuadPairBasis(1)
+        vac = vacuumstate(basis) |> gpu
+        coh = coherentstate(basis, 1.0f0) |> gpu
+        coeffs1 = CuArray([0.9f0, Float32(1e-15), 0.1f0])
+        lc1 = GaussianLinearCombination(basis, coeffs1, [vac, coh, vac])
+        @test length(lc1) == 3
+        Gabs.simplify!(lc1)
+        @test length(lc1) == 1
+        @test device(lc1) == :gpu
+        coh2 = coherentstate(basis, 1.0f0) |> gpu
+        coeffs2 = CuArray([0.5f0, 0.3f0, 0.2f0])
+        lc2 = GaussianLinearCombination(basis, coeffs2, [vac, coh, coh2])
+        @test length(lc2) == 3
+        Gabs.simplify!(lc2)
+        @test length(lc2) == 2
+        @test device(lc2) == :gpu
+        cpu_coeffs = Array(lc2.coeffs)
+        cpu_vac_mean = Array(vac.mean)
+        cpu_coh_mean = Array(coh.mean)
+        vac_idx = findfirst(i -> Array(lc2.states[i].mean) ≈ cpu_vac_mean, 1:length(lc2))
+        coh_idx = findfirst(i -> Array(lc2.states[i].mean) ≈ cpu_coh_mean, 1:length(lc2))
+        @test vac_idx !== nothing
+        @test coh_idx !== nothing
+        @test cpu_coeffs[vac_idx] ≈ 0.5f0
+        @test cpu_coeffs[coh_idx] ≈ 0.5f0
+        coeffs3 = CuArray([Float32(1e-16), Float32(1e-17)])
+        lc3 = GaussianLinearCombination(basis, coeffs3, [vac, coh])
+        Gabs.simplify!(lc3)
+        @test length(lc3) == 1
+        @test device(lc3) == :gpu
+    end
+
+    @testset "GPU Operations - Gaussian Unitaries" begin
+        basis = QuadPairBasis(1)
+        state1 = coherentstate(basis, 1.0f0) |> gpu
+        state2 = coherentstate(basis, -1.0f0) |> gpu
+        lc_gpu = GaussianLinearCombination(basis, [0.7f0, 0.3f0], [state1, state2])
+        @test device(lc_gpu) == :gpu
+        disp_gpu = displace(basis, 0.5f0) |> gpu
+        result_gpu = disp_gpu * lc_gpu
+        @test device(result_gpu) == :gpu
+        @test length(result_gpu) == 2
+        @test result_gpu.coeffs ≈ lc_gpu.coeffs
+        expected1 = disp_gpu * state1
+        expected2 = disp_gpu * state2
+        @test Array(result_gpu.states[1].mean) ≈ Array(expected1.mean) rtol=1e-6
+        @test Array(result_gpu.states[2].mean) ≈ Array(expected2.mean) rtol=1e-6
+        disp_cpu = displace(basis, 0.2f0)
+        result_mixed = disp_cpu * lc_gpu
+        @test device(result_mixed) == :gpu
+        squeeze_gpu = squeeze(basis, 0.3f0, Float32(π/4)) |> gpu
+        squeezed_lc = squeeze_gpu * lc_gpu
+        @test device(squeezed_lc) == :gpu
+        @test length(squeezed_lc) == 2
+        phase_gpu = phaseshift(basis, π/3) |> gpu
+        phase_shifted = phase_gpu * lc_gpu
+        @test device(phase_shifted) == :gpu
+    end
+
+    @testset "GPU Operations - Gaussian Channels" begin
+        basis = QuadPairBasis(1)
+        coh1 = coherentstate(basis, 1.0f0) |> gpu
+        coh2 = coherentstate(basis, 0.5f0) |> gpu
+        lc_gpu = GaussianLinearCombination(basis, [0.8f0, 0.2f0], [coh1, coh2])
+        att_gpu = attenuator(basis, π/6, 2.0f0) |> gpu
+        attenuated = att_gpu * lc_gpu
+        @test device(attenuated) == :gpu
+        @test length(attenuated) == 2
+        @test attenuated.coeffs ≈ lc_gpu.coeffs
+        expected1 = att_gpu * coh1
+        expected2 = att_gpu * coh2
+        @test Array(attenuated.states[1].mean) ≈ Array(expected1.mean) rtol=1e-6
+        @test Array(attenuated.states[2].covar) ≈ Array(expected2.covar) rtol=1e-5
+        amp_gpu = amplifier(basis, 0.2f0, 1.5f0) |> gpu
+        amplified = amp_gpu * lc_gpu
+        @test device(amplified) == :gpu
+        att_cpu = attenuator(basis, π/8, 1.0f0)
+        result_mixed = att_cpu * lc_gpu
+        @test device(result_mixed) == :gpu
+    end
+
+    @testset "State Metrics" begin
+        basis = QuadPairBasis(1)
+        state1 = coherentstate(basis, 1.0f0) |> gpu
+        state2 = squeezedstate(basis, 0.3f0, Float32(π/4)) |> gpu
+        lc_gpu = GaussianLinearCombination(basis, [0.6f0, 0.8f0], [state1, state2])
+        @test device(lc_gpu) == :gpu
+        p = purity(lc_gpu)
+        @test p == 1.0
+        s = entropy_vn(lc_gpu)
+        @test s == 0.0
+        basis_multi = QuadPairBasis(2)
+        epr = eprstate(basis_multi, 0.5f0, Float32(π/3)) |> gpu
+        lc_multi = GaussianLinearCombination(basis_multi, [1.0f0], [epr])
+        @test purity(lc_multi) == 1.0
+        @test entropy_vn(lc_multi) == 0.0
+    end
+
+    @testset "Mixed Device Operations" begin
+        basis = QuadPairBasis(1)
+        state1_cpu = coherentstate(basis, 1.0f0)
+        state2_cpu = coherentstate(basis, -1.0f0)
+        lc_cpu = GaussianLinearCombination(basis, [0.7f0, 0.3f0], [state1_cpu, state2_cpu])
+        state3_gpu = squeezedstate(basis, 0.2f0, 0.0f0) |> gpu
+        lc_gpu = GaussianLinearCombination(basis, [0.5f0], [state3_gpu])
+        @test device(lc_cpu) == :cpu
+        @test device(lc_gpu) == :gpu
+        lc_sum = lc_cpu + lc_gpu
+        disp_cpu = displace(basis, 0.3f0)
+        disp_gpu = displace(basis, 0.4f0) |> gpu
+        result1 = disp_cpu * lc_gpu
+        @test device(result1) == :gpu
+        result2 = disp_gpu * lc_cpu
+        @test device(result2) == :gpu
+    end
+
+    @testset "Error Handling & Edge Cases" begin
+        basis1 = QuadPairBasis(1)
+        basis2 = QuadBlockBasis(1)
+        state1 = coherentstate(basis1, 1.0f0) |> gpu
+        state2 = coherentstate(basis2, 1.0f0) |> gpu
+        lc1 = GaussianLinearCombination(basis1, [0.5f0], [state1])
+        lc2 = GaussianLinearCombination(basis2, [0.5f0], [state2])
+        @test_throws ArgumentError lc1 + lc2
+        state3 = GaussianState(basis1, CuArray([0.0f0, 0.0f0]), CuMatrix{Float32}(I(2)); ħ = 4)
+        lc3 = GaussianLinearCombination(basis1, [1.0f0], [state3])
+        @test_throws ArgumentError lc1 + lc3
+        disp_wrong = displace(basis2, 1.0f0) |> gpu
+        @test_throws ArgumentError disp_wrong * lc1
+        @test_throws ArgumentError GaussianLinearCombination(basis1, Float32[], typeof(state1)[])
+        tiny_coeffs = CuArray([Float32(1e-20), Float32(1e-21)])
+        tiny_states = [state1, coherentstate(basis1, 2.0f0) |> gpu]
+        lc_tiny = GaussianLinearCombination(basis1, tiny_coeffs, tiny_states)
+        Gabs.simplify!(lc_tiny)
+        @test length(lc_tiny) == 1
+        @test device(lc_tiny) == :gpu
+        if CUDA.functional()
+            @test true
+        else
+            @warn "CUDA not functional, testing fallback behavior"
+        end
+    end
+
+    @testset "Type Stability & Performance" begin
+        basis = QuadPairBasis(1)
+        state1 = coherentstate(basis, 1.0f0) |> gpu
+        state2 = squeezedstate(basis, 0.3f0, Float32(π/4)) |> gpu
+        lc = GaussianLinearCombination(basis, [0.6f0, 0.8f0], [state1, state2])
+        @inferred device(lc)
+        @inferred purity(lc)
+        @inferred entropy_vn(lc)
+        disp = displace(basis, 0.5f0) |> gpu
+        result = disp * lc
+        @test result.states[1].mean isa CuVector{Float32}
+        @test result.states[1].covar isa CuMatrix{Float32}
+        lc_scaled = 2.0f0 * lc
+        @test lc_scaled.coeffs isa Vector{Float32}
+        @test lc_scaled.states[1].mean isa CuVector{Float32}
+        lc_copy = GaussianLinearCombination(basis, copy(lc.coeffs), copy(lc.states))
+        Gabs.normalize!(lc_copy)
+        @test lc_copy.coeffs isa Vector{Float32}
+        @test device(lc_copy) == :gpu
+    end
+
+    @testset "Multi-Mode GPU Linear Combinations" begin
+        basis = QuadPairBasis(2)
+        α_vec1 = [1.0f0 + 0.5f0im, -0.3f0 + 0.8f0im]
+        α_vec2 = [0.5f0, 0.2f0 - 0.4f0im]
+        coh1 = coherentstate(basis, α_vec1) |> gpu
+        coh2 = coherentstate(basis, α_vec2) |> gpu
+        epr = eprstate(basis, 0.3f0, Float32(π/6)) |> gpu
+        coeffs = [0.5f0, 0.3f0, 0.2f0]
+        lc_multi = GaussianLinearCombination(basis, coeffs, [coh1, coh2, epr])
+        @test device(lc_multi) == :gpu
+        @test length(lc_multi) == 3
+        @test lc_multi.basis.nmodes == 2
+        squeeze_op = squeeze(basis, [0.2f0, 0.3f0], [0.0f0, Float32(π/4)]) |> gpu
+        squeezed_multi = squeeze_op * lc_multi
+        @test device(squeezed_multi) == :gpu
+        @test length(squeezed_multi) == 3
+        @test squeezed_multi.coeffs ≈ coeffs
+        bs = beamsplitter(basis, 0.7f0) |> gpu
+        bs_result = bs * lc_multi
+        @test device(bs_result) == :gpu
+        twosq = twosqueeze(basis, 0.2f0, Float32(π/3)) |> gpu
+        twosq_result = twosq * lc_multi
+        @test device(twosq_result) == :gpu
+    end
+
+    @testset "Professional API Validation" begin
+        @info "=== Testing Professional GPU API for Linear Combinations ==="
+        basis = QuadPairBasis(1)
+        state1 = coherentstate(basis, 1.0f0) |> gpu
+        state2 = squeezedstate(basis, 0.3f0, Float32(π/4)) |> gpu
+        lc = GaussianLinearCombination(basis, [0.7f0, 0.3f0], [state1, state2])
+        op = displace(basis, 0.5f0) |> gpu
+        result = op * lc
+        @test device(lc) == :gpu
+        @test device(result) == :gpu
+        @test result.coeffs ≈ lc.coeffs
+        α_gpu = CuArray([1.0f0 + 0.5f0im])
+        auto_state = coherentstate(basis, α_gpu)
+        lc_auto = GaussianLinearCombination(basis, [1.0f0], [auto_state])
+        @test device(lc_auto) == :gpu
+        cpu_op = squeeze(basis, 0.2f0, 0.0f0)
+        mixed_result = cpu_op * lc
+        @test device(mixed_result) == :gpu
+    end
+end
+
+@testitem "GPU Linear Combinations Further Coverage" tags=[:cuda] begin
+    using CUDA
+    using Gabs
+    using Gabs: device, GaussianLinearCombination
+    using LinearAlgebra
+    using Test
+
+    @testset "Basic GPU LC Creation and Device Detection" begin
+        basis = QuadPairBasis(1)
+        cpu_state = coherentstate(basis, 1.0f0)
+        cpu_lc = GaussianLinearCombination(basis, [0.7f0], [cpu_state])
+        gpu_lc = cpu_lc |> gpu
+        @test device(cpu_lc) == :cpu
+        @test device(gpu_lc) == :gpu
+        @test gpu_lc.coeffs isa Vector{Float32}
+        @test Gabs._is_gpu_array(gpu_lc.coeffs) == false
+        @test gpu_lc.states[1].mean isa CuVector{Float32}
+        gpu_state = coherentstate(basis, 1.0f0) |> gpu
+        direct_gpu_lc = GaussianLinearCombination(basis, [0.8f0], [gpu_state])
+        @test device(direct_gpu_lc) == :gpu
+        @test direct_gpu_lc.coeffs isa Vector{Float32}
+        cpu_state2 = squeezedstate(basis, 0.3f0, Float32(π/4))
+        gpu_state2 = cpu_state2 |> gpu
+        mixed_lc = GaussianLinearCombination(basis, [0.6f0, 0.4f0], [cpu_state, gpu_state2])
+        @test device(mixed_lc) == :cpu
+        full_gpu_lc = mixed_lc |> gpu
+        @test device(full_gpu_lc) == :gpu
+        @test all(device(s) == :gpu for s in full_gpu_lc.states)
+    end
+
+    @testset "Arithmetic Operations - Comprehensive Device Promotion" begin
+        basis = QuadPairBasis(1)
+        cpu_state1 = coherentstate(basis, 1.0f0)
+        cpu_state2 = coherentstate(basis, -1.0f0)
+        gpu_state1 = coherentstate(basis, 0.5f0) |> gpu
+        gpu_state2 = squeezedstate(basis, 0.2f0, 0.0f0) |> gpu
+        cpu_lc = GaussianLinearCombination(basis, [0.7f0, 0.3f0], [cpu_state1, cpu_state2])
+        gpu_lc = GaussianLinearCombination(basis, [0.6f0, 0.4f0], [gpu_state1, gpu_state2])
+        result1 = cpu_lc + gpu_lc
+        @test device(result1) == :gpu
+        @test length(result1) == 4
+        @test result1.coeffs isa Vector{Float32}
+        @test all(device(s) == :gpu for s in result1.states)
+        result2 = gpu_lc + cpu_lc  
+        @test device(result2) == :gpu
+        @test length(result2) == 4
+        cpu_lc2 = GaussianLinearCombination(basis, [0.5f0], [cpu_state1])
+        result3 = cpu_lc + cpu_lc2
+        @test device(result3) == :cpu
+        @test length(result3) == 3
+        gpu_lc2 = GaussianLinearCombination(basis, [0.8f0], [gpu_state1])
+        result4 = gpu_lc + gpu_lc2
+        @test device(result4) == :gpu
+        @test length(result4) == 3
+        scaled_cpu = 2.0f0 * cpu_lc
+        scaled_gpu = 2.0f0 * gpu_lc
+        @test device(scaled_cpu) == :cpu
+        @test device(scaled_gpu) == :gpu
+        @test scaled_gpu.coeffs ≈ [1.2f0, 0.8f0]
+        result5 = cpu_state1 + gpu_lc
+        @test device(result5) == :gpu
+        @test length(result5) == 3
+        result6 = gpu_state1 + cpu_lc
+        @test device(result6) == :gpu
+        result7 = cpu_lc - gpu_lc
+        @test device(result7) == :gpu
+        result8 = gpu_state1 - cpu_lc
+        @test device(result8) == :gpu
+    end
+
+    @testset "GPU Operator Applications - All Combinations" begin
+        basis = QuadPairBasis(1)
+        cpu_state1 = coherentstate(basis, 1.0f0)
+        cpu_state2 = squeezedstate(basis, 0.2f0, Float32(π/4))
+        gpu_lc = GaussianLinearCombination(basis, [0.7f0, 0.3f0], [cpu_state1, cpu_state2]) |> gpu
+        cpu_lc = GaussianLinearCombination(basis, [0.6f0, 0.4f0], [cpu_state1, cpu_state2])
+        gpu_disp = displace(basis, 0.5f0) |> gpu
+        result1 = gpu_disp * gpu_lc
+        @test device(result1) == :gpu
+        @test result1.coeffs ≈ gpu_lc.coeffs
+        @test length(result1) == 2
+        cpu_disp = displace(basis, 0.3f0)
+        result2 = cpu_disp * gpu_lc
+        @test device(result2) == :gpu
+        @test result2.coeffs ≈ gpu_lc.coeffs
+        result3 = gpu_disp * cpu_lc
+        @test device(result3) == :gpu
+        @test result3.coeffs ≈ cpu_lc.coeffs
+        result4 = cpu_disp * cpu_lc
+        @test device(result4) == :cpu
+        gpu_squeeze = squeeze(basis, 0.2f0, Float32(π/3)) |> gpu
+        gpu_phase = phaseshift(basis, Float32(π/6)) |> gpu
+        chained = gpu_phase * (gpu_squeeze * gpu_lc)
+        @test device(chained) == :gpu
+        @test chained.coeffs ≈ gpu_lc.coeffs
+        gpu_atten = attenuator(basis, π/4, 2.0f0) |> gpu
+        cpu_amp = amplifier(basis, 0.1f0, 1.5f0)
+        channel_result1 = gpu_atten * gpu_lc
+        @test device(channel_result1) == :gpu
+        channel_result2 = cpu_amp * gpu_lc
+        @test device(channel_result2) == :gpu
+        channel_result3 = gpu_atten * cpu_lc
+        @test device(channel_result3) == :gpu
+        cpu_result = cpu_disp * cpu_lc
+        gpu_equivalent = cpu_disp * (cpu_lc |> gpu)
+        cpu_back = gpu_equivalent |> cpu
+        @test cpu_back.coeffs ≈ cpu_result.coeffs rtol=1e-6
+        @test cpu_back.states[1].mean ≈ cpu_result.states[1].mean rtol=1e-6
+        @test cpu_back.states[1].covar ≈ cpu_result.states[1].covar rtol=1e-5
+    end
+
+    @testset "Memory Management and Efficiency" begin
+        basis = QuadPairBasis(1)
+        states = [coherentstate(basis, Float32(i)) for i in 1:10]
+        coeffs = Float32.(rand(10))
+        cpu_lc = GaussianLinearCombination(basis, coeffs, states)
+        gpu_lc = cpu_lc |> gpu
+        @test sizeof(gpu_lc.coeffs) == sizeof(coeffs)
+        @test gpu_lc.coeffs isa Vector{Float32}
+        large_states = [coherentstate(basis, Float32(i)*0.1f0) for i in 1:100]
+        large_coeffs = Float32.(randn(100))
+        large_lc = GaussianLinearCombination(basis, large_coeffs, large_states)
+        large_gpu_lc = large_lc |> gpu
+        @test device(large_gpu_lc) == :gpu
+        @test length(large_gpu_lc) == 100
+        @test large_gpu_lc.coeffs isa Vector{Float32}
+        transferred_back = large_gpu_lc |> cpu
+        retransferred = transferred_back |> gpu
+        @test device(transferred_back) == :cpu
+        @test device(retransferred) == :gpu
+        @test transferred_back.coeffs ≈ large_lc.coeffs
+    end
+
+    @testset "Normalization and Simplification on GPU" begin
+        basis = QuadPairBasis(1)
+        state1 = coherentstate(basis, 1.0f0) |> gpu
+        state2 = squeezedstate(basis, 0.3f0, Float32(π/4)) |> gpu
+        unnormalized_lc = GaussianLinearCombination(basis, [3.0f0, 4.0f0], [state1, state2])
+        @test device(unnormalized_lc) == :gpu
+        initial_norm = sqrt(sum(abs2, unnormalized_lc.coeffs))
+        @test initial_norm ≈ 5.0f0
+        Gabs.normalize!(unnormalized_lc)
+        @test device(unnormalized_lc) == :gpu
+        final_norm = sqrt(sum(abs2, unnormalized_lc.coeffs))
+        @test final_norm ≈ 1.0f0 rtol=1e-6
+        @test unnormalized_lc.coeffs ≈ [0.6f0, 0.8f0] rtol=1e-6
+        small_coeffs = [0.9f0, Float32(1e-15), 0.1f0]
+        small_states = [state1, state2, coherentstate(basis, 2.0f0) |> gpu]
+        to_simplify = GaussianLinearCombination(basis, small_coeffs, small_states)
+        @test length(to_simplify) == 3
+        @test device(to_simplify) == :gpu
+        Gabs.simplify!(to_simplify)
+        @test device(to_simplify) == :gpu
+        @test length(to_simplify) == 2
+        identical_state = coherentstate(basis, 1.5f0) |> gpu
+        combine_coeffs = [0.3f0, 0.7f0, 0.2f0]
+        combine_states = [state1, identical_state, identical_state]
+        to_combine = GaussianLinearCombination(basis, combine_coeffs, combine_states)
+        @test length(to_combine) == 3
+        Gabs.simplify!(to_combine)
+        @test length(to_combine) == 2
+        @test device(to_combine) == :gpu
+        cpu_coeffs = Array(to_combine.coeffs)
+        @test any(c -> isapprox(c, 0.9f0, rtol=1e-6), cpu_coeffs)
+    end
+
+    @testset "Complex Coefficients and Precision Handling" begin
+        basis = QuadPairBasis(1)
+        state1 = coherentstate(basis, 1.0f0) |> gpu
+        state2 = squeezedstate(basis, 0.2f0, Float32(π/4)) |> gpu
+        complex_coeffs = ComplexF32[0.6f0 + 0.8f0im, 0.3f0 - 0.4f0im]
+        complex_lc = GaussianLinearCombination(basis, complex_coeffs, [state1, state2])
+        @test device(complex_lc) == :gpu
+        @test complex_lc.coeffs isa Vector{ComplexF32}
+        @test eltype(complex_lc.coeffs) == ComplexF32
+        float64_state = coherentstate(basis, 1.0) |> gpu
+        mixed_lc = GaussianLinearCombination(basis, [0.5f0], [float64_state])
+        @test device(mixed_lc) == :gpu
+        precise_op = displace(basis, 1.0) |> gpu
+        result = precise_op * complex_lc
+        @test device(result) == :gpu
+        @test result.coeffs ≈ complex_coeffs
+        large_coeffs = ComplexF32[1000.0f0, 0.001f0]
+        large_lc = GaussianLinearCombination(basis, large_coeffs, [state1, state2])
+        Gabs.normalize!(large_lc)
+        @test device(large_lc) == :gpu
+        @test abs(sqrt(sum(abs2, large_lc.coeffs)) - 1.0f0) < 1e-6
+    end
+
+    @testset "Multi-Mode GPU Linear Combinations" begin
+        basis2 = QuadPairBasis(2)
+        basis3 = QuadPairBasis(3)
+        α_vec = [1.0f0 + 0.5f0im, -0.3f0 + 0.8f0im]
+        coh2mode = coherentstate(basis2, α_vec) |> gpu
+        epr_state = eprstate(basis2, 0.4f0, Float32(π/3)) |> gpu
+        lc_2mode = GaussianLinearCombination(basis2, [0.7f0, 0.3f0], [coh2mode, epr_state])
+        @test device(lc_2mode) == :gpu
+        @test lc_2mode.basis.nmodes == 2
+        bs = beamsplitter(basis2, 0.6f0) |> gpu
+        bs_result = bs * lc_2mode
+        @test device(bs_result) == :gpu
+        @test bs_result.basis.nmodes == 2
+        α_vec3 = [1.0f0, 0.5f0im, -0.3f0 + 0.2f0im]
+        coh3mode = coherentstate(basis3, α_vec3) |> gpu
+        vac3mode = vacuumstate(basis3) |> gpu
+        lc_3mode = GaussianLinearCombination(basis3, [0.8f0, 0.2f0], [coh3mode, vac3mode])
+        @test device(lc_3mode) == :gpu
+        @test lc_3mode.basis.nmodes == 3
+        squeeze_multi = squeeze(basis3, [0.1f0, 0.2f0, 0.15f0], [0.0f0, Float32(π/4), Float32(π/2)]) |> gpu
+        multi_result = squeeze_multi * lc_3mode
+        @test device(multi_result) == :gpu
+        @test length(multi_result) == 2
+    end
+
+    @testset "Error Handling and Edge Cases" begin
+        basis1 = QuadPairBasis(1)
+        basis2 = QuadBlockBasis(1)
+        state1 = coherentstate(basis1, 1.0f0) |> gpu
+        state2 = coherentstate(basis2, 1.0f0) |> gpu
+        lc1 = GaussianLinearCombination(basis1, [0.5f0], [state1])
+        lc2 = GaussianLinearCombination(basis2, [0.5f0], [state2])
+        @test_throws ArgumentError lc1 + lc2
+        @test_throws ArgumentError lc1 - lc2
+        state_different_hbar = GaussianState(basis1, CuArray([0.0f0, 0.0f0]), 
+                                           CuMatrix{Float32}(I(2)); ħ = 4)
+        lc_different = GaussianLinearCombination(basis1, [1.0f0], [state_different_hbar])
+        @test_throws ArgumentError lc1 + lc_different
+        @test_throws ArgumentError GaussianLinearCombination(basis1, Float32[], typeof(state1)[])
+        wrong_op = displace(basis2, 1.0f0) |> gpu
+        @test_throws ArgumentError wrong_op * lc1
+        tiny_lc = GaussianLinearCombination(basis1, [Float32(1e-20)], [state1])
+        Gabs.simplify!(tiny_lc)
+        @test length(tiny_lc) == 1
+        @test device(tiny_lc) == :gpu
+        if CUDA.functional()
+            @test device(lc1) == :gpu
+        else
+            @warn "CUDA not functional - some GPU tests may not be meaningful"
+        end
+    end
+
+    @testset "Performance and Type Stability" begin
+        basis = QuadPairBasis(1)
+        state1 = coherentstate(basis, 1.0f0) |> gpu
+        state2 = squeezedstate(basis, 0.3f0, Float32(π/4)) |> gpu
+        gpu_lc = GaussianLinearCombination(basis, [0.6f0, 0.8f0], [state1, state2])
+        @inferred device(gpu_lc)
+        @inferred length(gpu_lc) 
+        @inferred gpu_lc[1]
+        @inferred purity(gpu_lc)
+        @inferred entropy_vn(gpu_lc)
+        lc2 = GaussianLinearCombination(basis, [0.4f0], [state1])
+        op = displace(basis, 0.5f0) |> gpu
+        sum_result = gpu_lc + lc2
+        mul_result = 2.0f0 * gpu_lc  
+        op_result = op * gpu_lc
+        @test device(sum_result) == :gpu
+        @test device(mul_result) == :gpu
+        @test device(op_result) == :gpu
+        @test sum_result.coeffs isa Vector{Float32}
+        @test mul_result.coeffs isa Vector{Float32}
+        @test op_result.coeffs isa Vector{Float32}
+        @test op_result.states[1].mean isa CuVector{Float32}
+        @test op_result.states[1].covar isa CuMatrix{Float32}
+        many_states = [coherentstate(basis, Float32(i)*0.1f0) |> gpu for i in 1:50]
+        many_coeffs = Float32.(randn(50))
+        big_lc = GaussianLinearCombination(basis, many_coeffs, many_states)
+        batch_op = phaseshift(basis, π/3) |> gpu
+        batch_result = batch_op * big_lc
+        @test device(batch_result) == :gpu
+        @test length(batch_result) == 50
+        @test batch_result.coeffs isa Vector{Float32}
+    end
+    
+    @testset "Integration with Existing Gabs Functionality" begin
+        basis = QuadPairBasis(1)
+        state1 = coherentstate(basis, 1.0f0) |> gpu
+        state2 = squeezedstate(basis, 0.2f0, Float32(π/4)) |> gpu
+        gpu_lc = GaussianLinearCombination(basis, [0.6f0, 0.8f0], [state1, state2])
+        @test purity(gpu_lc) == 1.0
+        @test entropy_vn(gpu_lc) == 0.0
+        block_basis = QuadBlockBasis(1)
+        pair_state = coherentstate(basis, 1.0f0) |> gpu
+        block_state = changebasis(QuadBlockBasis, pair_state)
+        mixed_basis_lc = GaussianLinearCombination(block_basis, [1.0f0], [block_state])
+        @test device(mixed_basis_lc) == :gpu
+        random_basis = QuadPairBasis(1)
+        random_state = randstate(random_basis; pure=true) |> gpu
+        random_lc = GaussianLinearCombination(random_basis, [1.0f0], [random_state])
+        @test device(random_lc) == :gpu
+        att = attenuator(basis, π/6, 2.0f0) |> gpu
+        amp = amplifier(basis, 0.1f0, 1.2f0) |> gpu
+        composed_result = amp * (att * gpu_lc)
+        @test device(composed_result) == :gpu
+        @test composed_result.coeffs ≈ gpu_lc.coeffs
+        x_test = [0.0f0, 0.0f0]
+    end
+
+    @testset "Professional API Usage Examples" begin
+        basis = QuadPairBasis(1)
+        state1 = coherentstate(basis, 1.0f0) |> gpu
+        state2 = squeezedstate(basis, 0.3f0, Float32(π/4)) |> gpu
+        cat_like = GaussianLinearCombination(basis, [0.7f0, 0.3f0], [state1, state2])
+        @test device(cat_like) == :gpu
+        cpu_disp = displace(basis, 0.5f0)
+        result = cpu_disp * cat_like
+        @test device(result) == :gpu
+        cpu_state = thermalstate(basis, 1.0f0)
+        mixed_result = cpu_state + cat_like
+        @test device(mixed_result) == :gpu
+        @test cat_like.coeffs isa Vector{Float32}
+        @test Gabs._is_gpu_array(cat_like.coeffs) == false
+        @test cat_like.states[1].mean isa CuVector{Float32}
+        ops = [squeeze(basis, Float32(0.1f0*i), Float32(π/6*i)) |> gpu for i in 1:5]
+        results = [op * cat_like for op in ops]
+        @test all(device(r) == :gpu for r in results)
+    end
+end
+
+@testitem "GPU Basis Operations" tags=[:cuda] begin
+    using Gabs
+    using Gabs: device, GaussianLinearCombination
+    using CUDA
+    using LinearAlgebra
+    using Test
+
+    if !CUDA.functional()
+        @warn "CUDA not functional. Skipping GPU basis operations tests."
+        return
+    end
+
+    @testset "Public API Device Management" begin
+        
+        @testset "Device Detection with Public API" begin
+            cpu_array = rand(Float32, 10)
+            gpu_array = CuArray(cpu_array)
+            @test device(cpu_array) == :cpu
+            @test device(gpu_array) == :gpu
+            basis = QuadPairBasis(2)
+            cpu_state = coherentstate(basis, 1.0 + 0.5im)
+            gpu_state = gpu(cpu_state, precision=Float32)
+            @test device(cpu_state) == :cpu
+            @test device(gpu_state) == :gpu
+            cpu_op = displace(basis, 0.5 + 0.3im)
+            gpu_op = gpu(cpu_op, precision=Float32)
+            @test device(cpu_op) == :cpu
+            @test device(gpu_op) == :gpu
+        end
+        
+        @testset "Device Transfer with Public API" begin
+            basis = QuadPairBasis(2)
+            cpu_state = coherentstate(basis, 1.0 + 0.5im)
+            gpu_state_f32 = gpu(cpu_state, precision=Float32)
+            gpu_state_f64 = gpu(cpu_state, precision=Float64)
+            @test device(gpu_state_f32) == :gpu
+            @test device(gpu_state_f64) == :gpu
+            @test eltype(gpu_state_f32.mean) == Float32
+            @test eltype(gpu_state_f64.mean) == Float64
+            cpu_back_f32 = cpu(gpu_state_f32)
+            cpu_back_f64 = cpu(gpu_state_f64)
+            @test device(cpu_back_f32) == :cpu
+            @test device(cpu_back_f64) == :cpu
+            @test isapprox(cpu_back_f32.mean, cpu_state.mean, rtol=1e-6)
+            @test isapprox(cpu_back_f64.mean, cpu_state.mean, rtol=1e-12)
+        end
+        
+        @testset "Adapt Device Functionality" begin
+            basis = QuadPairBasis(1)
+            cpu_state = coherentstate(basis, 1.0)
+            gpu_state = gpu(cpu_state, precision=Float32)
+            new_cpu_state = adapt_device(coherentstate, cpu_state, basis, 2.0)
+            new_gpu_state = adapt_device(coherentstate, gpu_state, basis, 2.0)
+            @test device(new_cpu_state) == :cpu
+            @test device(new_gpu_state) == :gpu
+        end
+    end
+
+    @testset "GPU GaussianState Basis Conversions" begin
+        nmodes_list = [1, 2, 5]
+        precisions = [Float32, Float64]
+        
+        for nmodes in nmodes_list
+            for T in precisions
+                @testset "$(nmodes) modes, $(T) precision" begin
+                    pair_basis = QuadPairBasis(nmodes)
+                    block_basis = QuadBlockBasis(nmodes)
+                    cpu_coherent_pair = coherentstate(pair_basis, 1.0 + 0.5im)
+                    cpu_coherent_block = coherentstate(block_basis, 1.0 + 0.5im)
+                    cpu_squeezed_pair = squeezedstate(pair_basis, 0.3, π/4)
+                    cpu_squeezed_block = squeezedstate(block_basis, 0.3, π/4)
+                    gpu_coherent_pair = gpu(cpu_coherent_pair, precision=T)
+                    gpu_coherent_block = gpu(cpu_coherent_block, precision=T)
+                    gpu_squeezed_pair = gpu(cpu_squeezed_pair, precision=T)
+                    gpu_squeezed_block = gpu(cpu_squeezed_block, precision=T)
+                    
+                    @testset "QuadPairBasis -> QuadBlockBasis" begin
+                        gpu_converted = changebasis(QuadBlockBasis, gpu_coherent_pair)
+                        cpu_converted = changebasis(QuadBlockBasis, cpu_coherent_pair)
+                        @test gpu_converted.basis isa QuadBlockBasis
+                        @test gpu_converted.basis.nmodes == nmodes
+                        @test device(gpu_converted) == :gpu
+                        @test eltype(gpu_converted.mean) == T
+                        @test eltype(gpu_converted.covar) == T
+                        @test gpu_converted.ħ == cpu_coherent_pair.ħ
+                        @test isapprox(Array(gpu_converted.mean), cpu_converted.mean, atol=1e-6)
+                        @test isapprox(Array(gpu_converted.covar), cpu_converted.covar, atol=1e-6)
+                        gpu_squeezed_converted = changebasis(QuadBlockBasis, gpu_squeezed_pair)
+                        cpu_squeezed_converted = changebasis(QuadBlockBasis, cpu_squeezed_pair)
+                        @test device(gpu_squeezed_converted) == :gpu
+                        @test isapprox(Array(gpu_squeezed_converted.mean), cpu_squeezed_converted.mean, atol=1e-6)
+                        @test isapprox(Array(gpu_squeezed_converted.covar), cpu_squeezed_converted.covar, atol=1e-6)
+                    end
+                    
+                    @testset "QuadBlockBasis -> QuadPairBasis" begin
+                        gpu_converted = changebasis(QuadPairBasis, gpu_coherent_block)
+                        cpu_converted = changebasis(QuadPairBasis, cpu_coherent_block)
+                        @test gpu_converted.basis isa QuadPairBasis
+                        @test gpu_converted.basis.nmodes == nmodes
+                        @test device(gpu_converted) == :gpu
+                        @test eltype(gpu_converted.mean) == T
+                        @test eltype(gpu_converted.covar) == T
+                        @test isapprox(Array(gpu_converted.mean), cpu_converted.mean, atol=1e-6)
+                        @test isapprox(Array(gpu_converted.covar), cpu_converted.covar, atol=1e-6)
+                        gpu_squeezed_converted = changebasis(QuadPairBasis, gpu_squeezed_block)
+                        cpu_squeezed_converted = changebasis(QuadPairBasis, cpu_squeezed_block)
+                        @test device(gpu_squeezed_converted) == :gpu
+                        @test isapprox(Array(gpu_squeezed_converted.mean), cpu_squeezed_converted.mean, atol=1e-6)
+                        @test isapprox(Array(gpu_squeezed_converted.covar), cpu_squeezed_converted.covar, atol=1e-6)
+                    end
+                    
+                    @testset "Same basis no-op conversions" begin
+                        same_basis_pair = changebasis(QuadPairBasis, gpu_coherent_pair)
+                        same_basis_block = changebasis(QuadBlockBasis, gpu_coherent_block)
+                        @test device(same_basis_pair) == :gpu
+                        @test device(same_basis_block) == :gpu
+                        @test same_basis_pair.basis isa QuadPairBasis
+                        @test same_basis_block.basis isa QuadBlockBasis
+                        @test isapprox(Array(same_basis_pair.mean), Array(gpu_coherent_pair.mean), atol=1e-12)
+                        @test isapprox(Array(same_basis_block.mean), Array(gpu_coherent_block.mean), atol=1e-12)
+                    end
+                    
+                    @testset "Round-trip conversion consistency" begin
+                        gpu_pair_to_block = changebasis(QuadBlockBasis, gpu_coherent_pair)
+                        gpu_round_trip = changebasis(QuadPairBasis, gpu_pair_to_block)
+                        @test device(gpu_round_trip) == :gpu
+                        @test isapprox(Array(gpu_round_trip.mean), Array(gpu_coherent_pair.mean), atol=1e-6)
+                        @test isapprox(Array(gpu_round_trip.covar), Array(gpu_coherent_pair.covar), atol=1e-6)
+                        gpu_block_to_pair = changebasis(QuadPairBasis, gpu_coherent_block)
+                        gpu_round_trip2 = changebasis(QuadBlockBasis, gpu_block_to_pair)
+                        @test device(gpu_round_trip2) == :gpu
+                        @test isapprox(Array(gpu_round_trip2.mean), Array(gpu_coherent_block.mean), atol=1e-6)
+                        @test isapprox(Array(gpu_round_trip2.covar), Array(gpu_coherent_block.covar), atol=1e-6)
+                    end
+                end
+            end
+        end
+    end
+
+    @testset "GPU GaussianUnitary Basis Conversions" begin
+        for nmodes in [1, 2]
+            for T in [Float32, Float64]
+                @testset "$(nmodes) modes, $(T) precision" begin
+                    pair_basis = QuadPairBasis(nmodes)
+                    block_basis = QuadBlockBasis(nmodes)
+                    cpu_displace_pair = displace(pair_basis, 0.5 + 0.3im)
+                    cpu_displace_block = displace(block_basis, 0.5 + 0.3im)
+                    cpu_squeeze_pair = squeeze(pair_basis, 0.2, π/3)
+                    cpu_squeeze_block = squeeze(block_basis, 0.2, π/3)
+                    gpu_displace_pair = gpu(cpu_displace_pair, precision=T)
+                    gpu_displace_block = gpu(cpu_displace_block, precision=T)
+                    gpu_squeeze_pair = gpu(cpu_squeeze_pair, precision=T)
+                    gpu_squeeze_block = gpu(cpu_squeeze_block, precision=T)
+                    
+                    @testset "Unitary basis conversions" begin
+                        gpu_converted = changebasis(QuadBlockBasis, gpu_displace_pair)
+                        cpu_converted = changebasis(QuadBlockBasis, cpu_displace_pair)
+                        @test gpu_converted.basis isa QuadBlockBasis
+                        @test device(gpu_converted) == :gpu
+                        @test isapprox(Array(gpu_converted.disp), cpu_converted.disp, atol=1e-6)
+                        @test isapprox(Array(gpu_converted.symplectic), cpu_converted.symplectic, atol=1e-6)
+                        gpu_converted2 = changebasis(QuadPairBasis, gpu_displace_block)
+                        cpu_converted2 = changebasis(QuadPairBasis, cpu_displace_block)
+                        @test gpu_converted2.basis isa QuadPairBasis
+                        @test device(gpu_converted2) == :gpu
+                        @test isapprox(Array(gpu_converted2.disp), cpu_converted2.disp, atol=1e-6)
+                        @test isapprox(Array(gpu_converted2.symplectic), cpu_converted2.symplectic, atol=1e-6)
+                        gpu_squeeze_converted = changebasis(QuadBlockBasis, gpu_squeeze_pair)
+                        cpu_squeeze_converted = changebasis(QuadBlockBasis, cpu_squeeze_pair)
+                        @test device(gpu_squeeze_converted) == :gpu
+                        @test isapprox(Array(gpu_squeeze_converted.disp), cpu_squeeze_converted.disp, atol=1e-6)
+                        @test isapprox(Array(gpu_squeeze_converted.symplectic), cpu_squeeze_converted.symplectic, atol=1e-6)
+                    end
+                    
+                    @testset "Unitary no-op conversions" begin
+                        same_basis_pair = changebasis(QuadPairBasis, gpu_displace_pair)
+                        same_basis_block = changebasis(QuadBlockBasis, gpu_displace_block)
+                        @test device(same_basis_pair) == :gpu
+                        @test device(same_basis_block) == :gpu
+                        @test same_basis_pair.basis isa QuadPairBasis
+                        @test same_basis_block.basis isa QuadBlockBasis
+                    end
+                end
+            end
+        end
+    end
+
+    @testset "GPU GaussianChannel Basis Conversions" begin
+        for nmodes in [1, 2]
+            for T in [Float32, Float64]
+                @testset "$(nmodes) modes, $(T) precision" begin
+                    pair_basis = QuadPairBasis(nmodes)
+                    block_basis = QuadBlockBasis(nmodes)
+                    cpu_attenuator_pair = attenuator(pair_basis, π/4, 2)
+                    cpu_attenuator_block = attenuator(block_basis, π/4, 2)
+                    cpu_amplifier_pair = amplifier(pair_basis, 0.3, 3)
+                    cpu_amplifier_block = amplifier(block_basis, 0.3, 3)
+                    gpu_attenuator_pair = gpu(cpu_attenuator_pair, precision=T)
+                    gpu_attenuator_block = gpu(cpu_attenuator_block, precision=T)
+                    gpu_amplifier_pair = gpu(cpu_amplifier_pair, precision=T)
+                    gpu_amplifier_block = gpu(cpu_amplifier_block, precision=T)
+                    
+                    @testset "Channel basis conversions" begin
+                        gpu_converted = changebasis(QuadBlockBasis, gpu_attenuator_pair)
+                        cpu_converted = changebasis(QuadBlockBasis, cpu_attenuator_pair)
+                        @test gpu_converted.basis isa QuadBlockBasis
+                        @test device(gpu_converted) == :gpu
+                        @test isapprox(Array(gpu_converted.disp), cpu_converted.disp, atol=1e-6)
+                        @test isapprox(Array(gpu_converted.transform), cpu_converted.transform, atol=1e-6)
+                        @test isapprox(Array(gpu_converted.noise), cpu_converted.noise, atol=1e-6)
+                        gpu_converted2 = changebasis(QuadPairBasis, gpu_attenuator_block)
+                        cpu_converted2 = changebasis(QuadPairBasis, cpu_attenuator_block)
+                        @test gpu_converted2.basis isa QuadPairBasis
+                        @test device(gpu_converted2) == :gpu
+                        @test isapprox(Array(gpu_converted2.disp), cpu_converted2.disp, atol=1e-6)
+                        @test isapprox(Array(gpu_converted2.transform), cpu_converted2.transform, atol=1e-6)
+                        @test isapprox(Array(gpu_converted2.noise), cpu_converted2.noise, atol=1e-6)
+                        gpu_amp_converted = changebasis(QuadBlockBasis, gpu_amplifier_pair)
+                        cpu_amp_converted = changebasis(QuadBlockBasis, cpu_amplifier_pair)
+                        @test device(gpu_amp_converted) == :gpu
+                        @test isapprox(Array(gpu_amp_converted.disp), cpu_amp_converted.disp, atol=1e-6)
+                        @test isapprox(Array(gpu_amp_converted.transform), cpu_amp_converted.transform, atol=1e-6)
+                        @test isapprox(Array(gpu_amp_converted.noise), cpu_amp_converted.noise, atol=1e-6)
+                    end
+                    
+                    @testset "Channel no-op conversions" begin
+                        same_basis_pair = changebasis(QuadPairBasis, gpu_attenuator_pair)
+                        same_basis_block = changebasis(QuadBlockBasis, gpu_attenuator_block)
+                        @test device(same_basis_pair) == :gpu
+                        @test device(same_basis_block) == :gpu
+                        @test same_basis_pair.basis isa QuadPairBasis
+                        @test same_basis_block.basis isa QuadBlockBasis
+                    end
+                end
+            end
+        end
+    end
+
+    @testset "CPU/GPU Mixed Operations with Basis Conversion" begin
+        basis_pair = QuadPairBasis(2)
+        basis_block = QuadBlockBasis(2)
+        cpu_state_pair = coherentstate(basis_pair, 1.0 + 0.5im)
+        gpu_state_pair = gpu(coherentstate(basis_pair, 0.5 - 0.3im), precision=Float32)
+        cpu_state_block = coherentstate(basis_block, 2.0 + 0.1im)
+        gpu_state_block = gpu(coherentstate(basis_block, -0.8 + 0.7im), precision=Float32)
+        
+        @testset "Mixed device basis conversions" begin
+            cpu_pair_to_block = changebasis(QuadBlockBasis, cpu_state_pair)
+            @test device(cpu_pair_to_block) == :cpu
+            @test cpu_pair_to_block.basis isa QuadBlockBasis
+            gpu_pair_to_block = changebasis(QuadBlockBasis, gpu_state_pair)
+            @test device(gpu_pair_to_block) == :gpu
+            @test gpu_pair_to_block.basis isa QuadBlockBasis
+            cpu_to_gpu_converted = gpu(changebasis(QuadBlockBasis, cpu_state_pair), precision=Float32)
+            @test device(cpu_to_gpu_converted) == :gpu
+            @test cpu_to_gpu_converted.basis isa QuadBlockBasis
+            gpu_to_cpu_converted = cpu(changebasis(QuadPairBasis, gpu_state_block))
+            @test device(gpu_to_cpu_converted) == :cpu
+            @test gpu_to_cpu_converted.basis isa QuadPairBasis
+        end
+        
+        @testset "Operations across converted states" begin
+            cpu_op_pair = displace(basis_pair, 0.3)
+            gpu_op_block = gpu(displace(basis_block, 0.4), precision=Float32)
+            converted_state = changebasis(QuadBlockBasis, gpu_state_pair)
+            result = gpu_op_block * converted_state
+            @test device(result) == :gpu
+            @test result.basis isa QuadBlockBasis
+            cpu_state_converted = changebasis(QuadPairBasis, cpu_state_block)
+            result2 = cpu_op_pair * cpu_state_converted
+            @test device(result2) == :cpu
+            @test result2.basis isa QuadPairBasis
+        end
+    end
+
+    @testset "GPU Basis Conversion Error Handling" begin
+        basis_pair = QuadPairBasis(2)
+        cpu_state = coherentstate(basis_pair, 1.0 + 0.5im)
+        converted_cpu = changebasis(QuadBlockBasis, cpu_state)
+        @test converted_cpu.basis isa QuadBlockBasis
+        @test device(converted_cpu) == :cpu
+        @test isgaussian(converted_cpu, atol=1e-4)
+        squeezed_cpu = squeezedstate(basis_pair, 0.5, π/4)
+        converted_squeezed = changebasis(QuadBlockBasis, squeezed_cpu)
+        @test isgaussian(converted_squeezed, atol=1e-4)
+        thermal_cpu = thermalstate(basis_pair, 2.0)
+        converted_thermal = changebasis(QuadBlockBasis, thermal_cpu)
+        @test isgaussian(converted_thermal, atol=1e-4)
+    end
+
+    @testset "Memory Efficiency and Performance Validation" begin
+        basis_pair = QuadPairBasis(10)
+        large_state = gpu(squeezedstate(basis_pair, 0.5, π/6), precision=Float32)
+        @test device(large_state) == :gpu
+        @test eltype(large_state.mean) == Float32
+        converted_state = changebasis(QuadBlockBasis, large_state)
+        @test device(converted_state) == :gpu
+        @test converted_state.basis isa QuadBlockBasis
+        @test eltype(converted_state.mean) == Float32
+        @test eltype(converted_state.covar) == Float32
+        round_trip = changebasis(QuadPairBasis, converted_state)
+        @test device(round_trip) == :gpu
+        @test isapprox(Array(round_trip.mean), Array(large_state.mean), atol=1e-6)
+        @test isapprox(Array(round_trip.covar), Array(large_state.covar), atol=1e-6)
+    end
+
+    @testset "Integration with Linear Combinations" begin
+        basis_pair = QuadPairBasis(2)
+        state1 = gpu(coherentstate(basis_pair, 1.0), precision=Float32)
+        state2 = gpu(coherentstate(basis_pair, -1.0), precision=Float32)
+        lc = GaussianLinearCombination(basis_pair, [0.7f0, 0.3f0], [state1, state2])
+        @test device(lc) == :gpu
+        @test lc.basis isa QuadPairBasis
+        converted_state1 = changebasis(QuadBlockBasis, state1)
+        converted_state2 = changebasis(QuadBlockBasis, state2)
+        @test device(converted_state1) == :gpu
+        @test device(converted_state2) == :gpu
+        @test converted_state1.basis isa QuadBlockBasis
+        @test converted_state2.basis isa QuadBlockBasis
+        lc_converted = GaussianLinearCombination(QuadBlockBasis(2), [0.7f0, 0.3f0], 
+                                                [converted_state1, converted_state2])
+        @test device(lc_converted) == :gpu
+        @test lc_converted.basis isa QuadBlockBasis
+    end
+end
+
+@testitem "GPU Random Generation Suite" tags=[:cuda] begin
+    using Gabs
+    using Gabs: device, GaussianState, GaussianUnitary, GaussianChannel
+    using CUDA
+    using LinearAlgebra: adjoint, inv, eigvals, det
+    using Test
+
+    if !CUDA.functional()
+        @warn "CUDA not functional. Skipping GPU random generation tests."
+        return
+    end
+
+    @testset "GPU Random States - Physical Properties" begin
+        nmodes = rand(1:3)
+        qpairbasis = QuadPairBasis(nmodes)
+        qblockbasis = QuadBlockBasis(nmodes)
+        
+        @testset "Pure Random States - Convenience API" begin
+            rs_pair_gpu = randstate_gpu(qpairbasis; pure=true)
+            rs_block_gpu = randstate_gpu(qblockbasis; pure=true)
+            @test rs_pair_gpu isa GaussianState
+            @test rs_block_gpu isa GaussianState
+            @test device(rs_pair_gpu) == :gpu
+            @test device(rs_block_gpu) == :gpu
+            @test rs_pair_gpu.ħ == 2
+            @test rs_block_gpu.ħ == 2
+            rs_pair_cpu = cpu(rs_pair_gpu)
+            rs_block_cpu = cpu(rs_block_gpu)
+            @test isgaussian(rs_pair_cpu, atol = 1e-3)
+            @test isgaussian(rs_block_cpu, atol = 1e-3)
+            @test isapprox(purity(rs_pair_cpu), 1.0, atol = 1e-3)
+            @test isapprox(purity(rs_block_cpu), 1.0, atol = 1e-3)
+        end
+        
+        @testset "Mixed Random States - Convenience API" begin
+            rs_mixed_gpu = randstate_gpu(qpairbasis; pure=false)
+            @test rs_mixed_gpu isa GaussianState
+            @test device(rs_mixed_gpu) == :gpu
+            rs_mixed_cpu = cpu(rs_mixed_gpu)
+            @test isgaussian(rs_mixed_cpu, atol = 1e-3)
+        end
+        
+        @testset "Typed Random States - Your GPU Extensions" begin
+            rs_typed_gpu = randstate(CuVector{Float32}, CuMatrix{Float32}, qpairbasis; pure=true)
+            @test rs_typed_gpu isa GaussianState
+            @test device(rs_typed_gpu) == :gpu
+            @test eltype(rs_typed_gpu.mean) == Float32
+            @test eltype(rs_typed_gpu.covar) == Float32
+            rs_typed_cpu = cpu(rs_typed_gpu)
+            @test isgaussian(rs_typed_cpu, atol = 1e-3)
+            @test isapprox(purity(rs_typed_cpu), 1.0, atol = 1e-3)
+        end
+        
+        @testset "Batch States - Your Batch Functions" begin
+            batch_size = 3
+            states_gpu = batch_randstate_gpu(qpairbasis, batch_size; pure=true)
+            @test length(states_gpu) == batch_size
+            @test all(s isa GaussianState for s in states_gpu)
+            @test all(device(s) == :gpu for s in states_gpu)
+            state_cpu = cpu(states_gpu[1])
+            @test isgaussian(state_cpu, atol = 1e-3)
+            @test isapprox(purity(state_cpu), 1.0, atol = 1e-3)
+        end
+    end
+
+    @testset "GPU Random Unitaries - Your GPU Functions" begin
+        nmodes = rand(1:3)
+        qpairbasis = QuadPairBasis(nmodes)
+        
+        @testset "Passive Unitaries - Convenience API" begin
+            ru_passive_gpu = randunitary_gpu(qpairbasis; passive=true)
+            @test ru_passive_gpu isa GaussianUnitary
+            @test device(ru_passive_gpu) == :gpu
+            @test ru_passive_gpu.ħ == 2
+            S_cpu = Array(ru_passive_gpu.symplectic)
+            @test isapprox(S_cpu', inv(S_cpu), atol = 1e-3)
+            @test issymplectic(qpairbasis, S_cpu, atol = 1e-3)
+        end
+        
+        @testset "Active Unitaries - Convenience API" begin
+            ru_active_gpu = randunitary_gpu(qpairbasis; passive=false)
+            @test ru_active_gpu isa GaussianUnitary
+            @test device(ru_active_gpu) == :gpu
+            @test ru_active_gpu.ħ == 2
+            S_cpu = Array(ru_active_gpu.symplectic)
+            @test issymplectic(qpairbasis, S_cpu, atol = 1e-3)
+        end
+        
+        @testset "Typed Unitaries - Your GPU Extensions" begin
+            ru_typed_gpu = randunitary(CuVector{Float32}, CuMatrix{Float32}, qpairbasis; passive=false)
+            @test ru_typed_gpu isa GaussianUnitary
+            @test device(ru_typed_gpu) == :gpu
+            @test eltype(ru_typed_gpu.disp) == Float32
+            @test eltype(ru_typed_gpu.symplectic) == Float32
+            S_cpu = Array(ru_typed_gpu.symplectic)
+            @test issymplectic(qpairbasis, S_cpu, atol = 1e-3)
+        end
+        
+        @testset "Batch Unitaries - Your Batch Functions" begin
+            batch_size = 3
+            unitaries_gpu = batch_randunitary_gpu(qpairbasis, batch_size; passive=false)
+            @test length(unitaries_gpu) == batch_size
+            @test all(u isa GaussianUnitary for u in unitaries_gpu)
+            @test all(device(u) == :gpu for u in unitaries_gpu)
+            S_cpu = Array(unitaries_gpu[1].symplectic)
+            @test issymplectic(qpairbasis, S_cpu, atol = 1e-3)
+        end
+    end
+
+    @testset "GPU Random Channels - Your GPU Functions" begin
+        nmodes = rand(1:2)
+        qpairbasis = QuadPairBasis(nmodes)
+        
+        @testset "Basic Channel Generation - Convenience API" begin
+            rc_gpu = randchannel_gpu(qpairbasis)
+            @test rc_gpu isa GaussianChannel
+            @test device(rc_gpu) == :gpu
+            @test rc_gpu.ħ == 2
+            rc_cpu = cpu(rc_gpu)
+            @test isgaussian(rc_cpu, atol = 1e-3)
+            state_gpu = randstate_gpu(qpairbasis)
+            output_gpu = rc_gpu * state_gpu
+            @test output_gpu isa GaussianState
+            @test device(output_gpu) == :gpu
+        end
+        
+        @testset "Typed Channels - Your GPU Extensions" begin
+            rc_typed_gpu = randchannel(CuVector{Float32}, CuMatrix{Float32}, qpairbasis)
+            @test rc_typed_gpu isa GaussianChannel
+            @test device(rc_typed_gpu) == :gpu
+            @test eltype(rc_typed_gpu.disp) == Float32
+            @test eltype(rc_typed_gpu.transform) == Float32
+            @test eltype(rc_typed_gpu.noise) == Float32
+            rc_cpu = cpu(rc_typed_gpu)
+            @test isgaussian(rc_cpu, atol = 1e-3)
+        end
+    end
+
+    @testset "GPU Random Symplectic Matrices - Your GPU Functions" begin
+        nmodes = rand(1:3)
+        qpairbasis = QuadPairBasis(nmodes)
+        qblockbasis = QuadBlockBasis(nmodes)
+        
+        @testset "Passive Symplectic - Convenience API" begin
+            S_passive_gpu = randsymplectic_gpu(qpairbasis; passive=true)
+            @test S_passive_gpu isa CuMatrix
+            @test size(S_passive_gpu) == (2*nmodes, 2*nmodes)
+            S_passive_cpu = Array(S_passive_gpu)
+            @test isapprox(S_passive_cpu', inv(S_passive_cpu), atol = 1e-3)
+            @test issymplectic(qpairbasis, S_passive_cpu, atol = 1e-3)
+        end
+        
+        @testset "Active Symplectic - Convenience API" begin
+            S_active_gpu = randsymplectic_gpu(qblockbasis; passive=false)
+            @test S_active_gpu isa CuMatrix
+            @test size(S_active_gpu) == (2*nmodes, 2*nmodes)
+            S_active_cpu = Array(S_active_gpu)
+            @test issymplectic(qblockbasis, S_active_cpu, atol = 1e-3)
+        end
+        
+        @testset "Typed Symplectic - Your GPU Extensions" begin
+            S_typed_gpu = randsymplectic(CuMatrix{Float32}, qpairbasis; passive=true)
+            @test S_typed_gpu isa CuMatrix{Float32}
+            @test eltype(S_typed_gpu) == Float32
+            @test size(S_typed_gpu) == (2*nmodes, 2*nmodes)
+            S_typed_cpu = Array(S_typed_gpu)
+            @test isapprox(S_typed_cpu', inv(S_typed_cpu), atol = 1e-3)
+            @test issymplectic(qpairbasis, S_typed_cpu, atol = 1e-3)
+        end
+    end
+
+    @testset "GPU Random Integration - Your Functions Working Together" begin
+        nmodes = 2
+        qpairbasis = QuadPairBasis(nmodes)
+        
+        @testset "State-Unitary Operations" begin
+            state_gpu = randstate_gpu(qpairbasis; pure=true)
+            unitary_gpu = randunitary_gpu(qpairbasis; passive=false)
+            transformed_gpu = unitary_gpu * state_gpu
+            @test transformed_gpu isa GaussianState
+            @test device(transformed_gpu) == :gpu
+            transformed_cpu = cpu(transformed_gpu)
+            @test isgaussian(transformed_cpu, atol = 1e-3)
+        end
+        
+        @testset "State-Channel Operations" begin
+            state_gpu = randstate_gpu(qpairbasis; pure=false)
+            channel_gpu = randchannel_gpu(qpairbasis)
+            output_gpu = channel_gpu * state_gpu
+            @test output_gpu isa GaussianState
+            @test device(output_gpu) == :gpu
+            output_cpu = cpu(output_gpu)
+            @test isgaussian(output_cpu, atol = 1e-3)
+        end
+    end
+
+    @testset "GPU Random Error Handling - Your API" begin
+        qpairbasis = QuadPairBasis(1)
+        @test_nowarn randstate(CuVector{Float32}, CuMatrix{Float32}, qpairbasis)
+        @test_nowarn randunitary(CuVector{Float32}, CuMatrix{Float32}, qpairbasis)
+        @test_nowarn randchannel(CuVector{Float32}, CuMatrix{Float32}, qpairbasis)
+        @test_nowarn randsymplectic(CuMatrix{Float32}, qpairbasis)
+        @test_nowarn randstate_gpu(qpairbasis)
+        @test_nowarn randunitary_gpu(qpairbasis)
+        @test_nowarn randchannel_gpu(qpairbasis)
+        @test_nowarn randsymplectic_gpu(qpairbasis)
+        @test_nowarn batch_randstate_gpu(qpairbasis, 2)
+        @test_nowarn batch_randunitary_gpu(qpairbasis, 2)
+    end
+
+    @testset "GPU Random Precision Control - Your Features" begin
+        qpairbasis = QuadPairBasis(1)
+        
+        @testset "Float32 Precision" begin
+            state_f32 = randstate_gpu(qpairbasis; precision=Float32)
+            unitary_f32 = randunitary_gpu(qpairbasis; precision=Float32)
+            symp_f32 = randsymplectic_gpu(qpairbasis; precision=Float32)
+            @test eltype(state_f32.mean) == Float32
+            @test eltype(unitary_f32.disp) == Float32
+            @test eltype(symp_f32) == Float32
+        end
+        
+        @testset "Float64 Precision" begin
+            state_f64 = randstate_gpu(qpairbasis; precision=Float64)
+            unitary_f64 = randunitary_gpu(qpairbasis; precision=Float64)
+            symp_f64 = randsymplectic_gpu(qpairbasis; precision=Float64)
+            @test eltype(state_f64.mean) == Float64
+            @test eltype(unitary_f64.disp) == Float64
+            @test eltype(symp_f64) == Float64
+        end
+    end
+
+    @testset "GPU Random Performance - Your Batch Functions" begin
+        qpairbasis = QuadPairBasis(2)
+        
+        @testset "Batch vs Individual Performance" begin
+            batch_size = 5
+            batch_states = batch_randstate_gpu(qpairbasis, batch_size; pure=true)
+            batch_unitaries = batch_randunitary_gpu(qpairbasis, batch_size; passive=false)
+            @test length(batch_states) == batch_size
+            @test length(batch_unitaries) == batch_size
+            @test all(device(s) == :gpu for s in batch_states)
+            @test all(device(u) == :gpu for u in batch_unitaries)
+            state_cpu = cpu(batch_states[1])
+            unitary_cpu = cpu(batch_unitaries[1])
+            @test isgaussian(state_cpu, atol = 1e-3)
+            @test issymplectic(qpairbasis, Array(batch_unitaries[1].symplectic), atol = 1e-3)
+        end
+    end
+end
+
+@testitem "GPU Cross-Wigner Foundation - single thread, but accuracy tests" tags=[:cuda] begin
+    using Gabs
+    using Gabs: device, cross_wigner, cross_wignerchar
+    using LinearAlgebra
+    using CUDA
+
+    @testset "GPU Cross-Wigner Function Correctness" begin
+        basis = QuadPairBasis(1)
+        state1_cpu = coherentstate(basis, 1.0 + 0.5im)
+        state2_cpu = coherentstate(basis, -0.8 + 0.3im)
+        state1_gpu = GaussianState(basis, CuArray{Float32}(state1_cpu.mean), CuArray{Float32}(state1_cpu.covar); ħ = state1_cpu.ħ)
+        state2_gpu = GaussianState(basis, CuArray{Float32}(state2_cpu.mean), CuArray{Float32}(state2_cpu.covar); ħ = state2_cpu.ħ)
+        test_points = [
+            [0.0, 0.0],
+            [1.0, 0.5],
+            [-0.5, 1.2],
+            [0.3, -0.8],
+            [2.1, -1.4]
+        ]
+        
+        @testset "Cross-Wigner vs CPU Implementation" begin
+            for x in test_points
+                cpu_result = cross_wigner(state1_cpu, state2_cpu, x)
+                gpu_result = cross_wigner(state1_gpu, state2_gpu, x)
+                @test real(cpu_result) ≈ real(gpu_result) atol=1e-6 rtol=1e-5
+                @test imag(cpu_result) ≈ imag(gpu_result) atol=1e-6 rtol=1e-5
+            end
+        end
+        
+        @testset "Cross-Wigner Hermiticity Property" begin
+            for x in test_points
+                w12 = cross_wigner(state1_gpu, state2_gpu, x)
+                w21 = cross_wigner(state2_gpu, state1_gpu, x)
+                @test real(w12) ≈ real(w21) atol=1e-6 rtol=1e-5
+                @test imag(w12) ≈ -imag(w21) atol=1e-6 rtol=1e-5
+                @test w12 ≈ conj(w21) atol=1e-6 rtol=1e-5
+            end
+        end
+        
+        @testset "Cross-Wigner Identity Property" begin  
+            for x in test_points
+                cross_diag = cross_wigner(state1_gpu, state1_gpu, x)
+                regular_wigner = wigner(state1_gpu, x)
+                @test real(cross_diag) ≈ regular_wigner atol=1e-6 rtol=1e-5
+                @test abs(imag(cross_diag)) < 1e-6
+            end
+        end
+    end
+
+    @testset "GPU Cross-Wigner Characteristic Function Correctness" begin
+        basis = QuadPairBasis(1)
+        state1_cpu = squeezedstate(basis, 0.3, π/4)
+        state2_cpu = coherentstate(basis, 0.5 - 0.2im)
+        state1_gpu = GaussianState(basis, CuArray{Float32}(state1_cpu.mean), CuArray{Float32}(state1_cpu.covar); ħ = state1_cpu.ħ)
+        state2_gpu = GaussianState(basis, CuArray{Float32}(state2_cpu.mean), CuArray{Float32}(state2_cpu.covar); ħ = state2_cpu.ħ)
+        test_xi = [
+            [0.0, 0.0], 
+            [0.1, -0.3],
+            [-0.7, 0.4],
+            [1.2, 0.8],
+            [-0.9, -1.1]
+        ]
+        
+        @testset "Cross-WignerChar vs CPU Implementation" begin
+            for xi in test_xi
+                cpu_result = cross_wignerchar(state1_cpu, state2_cpu, xi)
+                gpu_result = cross_wignerchar(state1_gpu, state2_gpu, xi)
+                @test real(cpu_result) ≈ real(gpu_result) atol=1e-6 rtol=1e-5
+                @test imag(cpu_result) ≈ imag(gpu_result) atol=1e-6 rtol=1e-5
+            end
+        end
+        
+        @testset "Cross-WignerChar Hermiticity Property" begin
+            for xi in test_xi
+                chi12 = cross_wignerchar(state1_gpu, state2_gpu, xi)
+                chi21 = cross_wignerchar(state2_gpu, state1_gpu, -xi)
+                @test real(chi12) ≈ real(chi21) atol=1e-6 rtol=1e-5
+                @test imag(chi12) ≈ -imag(chi21) atol=1e-6 rtol=1e-5
+            end
+        end
+        
+        @testset "Cross-WignerChar Identity Property" begin
+            for xi in test_xi
+                cross_diag = cross_wignerchar(state1_gpu, state1_gpu, xi)
+                regular_char = wignerchar(state1_gpu, xi)
+                @test abs(cross_diag - regular_char) < 1e-6
+            end
+        end
+    end
+
+    @testset "GPU Cross-Wigner Multi-mode Systems" begin
+        basis = QuadPairBasis(2)
+        state1_cpu = coherentstate(basis, [1.0 + 0.5im, -0.3 + 0.8im])
+        state2_cpu = squeezedstate(basis, [0.2, 0.4], [π/6, π/3])
+        state1_gpu = GaussianState(basis, CuArray{Float32}(state1_cpu.mean), CuArray{Float32}(state1_cpu.covar); ħ = state1_cpu.ħ)
+        state2_gpu = GaussianState(basis, CuArray{Float32}(state2_cpu.mean), CuArray{Float32}(state2_cpu.covar); ħ = state2_cpu.ħ)
+        test_points_2mode = [
+            [0.0, 0.0, 0.0, 0.0],
+            [1.0, 0.5, -0.3, 0.8],
+            [-0.2, 1.1, 0.7, -0.4]
+        ]
+        
+        @testset "Multi-mode Cross-Wigner Correctness" begin
+            for x in test_points_2mode
+                cpu_result = cross_wigner(state1_cpu, state2_cpu, x)
+                gpu_result = cross_wigner(state1_gpu, state2_gpu, x)
+                @test real(cpu_result) ≈ real(gpu_result) atol=1e-5 rtol=1e-4
+                @test imag(cpu_result) ≈ imag(gpu_result) atol=1e-5 rtol=1e-4
+            end
+        end
+        
+        @testset "Multi-mode Cross-WignerChar Correctness" begin
+            for xi in test_points_2mode
+                cpu_result = cross_wignerchar(state1_cpu, state2_cpu, xi)
+                gpu_result = cross_wignerchar(state1_gpu, state2_gpu, xi)
+                @test real(cpu_result) ≈ real(gpu_result) atol=1e-5 rtol=1e-4
+                @test imag(cpu_result) ≈ imag(gpu_result) atol=1e-5 rtol=1e-4
+            end
+        end
+    end
+
+    @testset "GPU Cross-Wigner Mixed Device Support" begin
+        basis = QuadPairBasis(1)
+        state1_gpu = coherentstate(CuVector{Float32}, CuMatrix{Float32}, basis, 0.8 + 0.6im)
+        state2_gpu = squeezedstate(CuVector{Float32}, CuMatrix{Float32}, basis, 0.4, π/8)
+        x_cpu = [0.5, -0.3]
+        xi_cpu = [0.2, 0.8]
+        
+        @testset "GPU States with CPU Points" begin
+            result_w = cross_wigner(state1_gpu, state2_gpu, x_cpu)
+            result_chi = cross_wignerchar(state1_gpu, state2_gpu, xi_cpu)
+            @test result_w isa ComplexF32
+            @test result_chi isa ComplexF32
+            @test isfinite(real(result_w)) && isfinite(imag(result_w))
+            @test isfinite(real(result_chi)) && isfinite(imag(result_chi))
+        end
+    end
+
+    @testset "GPU Cross-Wigner Error Handling" begin
+        basis1 = QuadPairBasis(1)
+        basis2 = QuadBlockBasis(1)
+        state1 = coherentstate(CuVector{Float32}, CuMatrix{Float32}, basis1, 1.0)
+        state2 = coherentstate(CuVector{Float32}, CuMatrix{Float32}, basis2, 1.0)
+        state3 = GaussianState(basis1, CuArray{Float32}([1.0, 0.0]), CuArray{Float32}([1.0 0.0; 0.0 1.0]); ħ = 4)
+        x = [0.0, 0.0]
+        
+        @testset "Basis Mismatch Detection" begin
+            @test_throws ArgumentError cross_wigner(state1, state2, x)
+            @test_throws ArgumentError cross_wignerchar(state1, state2, x)
+        end
+        
+        @testset "ħ Mismatch Detection" begin
+            @test_throws ArgumentError cross_wigner(state1, state3, x)
+            @test_throws ArgumentError cross_wignerchar(state1, state3, x)
+        end
+        
+        @testset "Dimension Mismatch Detection" begin
+            wrong_x = [0.0]
+            @test_throws ArgumentError cross_wigner(state1, state1, wrong_x)
+            @test_throws ArgumentError cross_wignerchar(state1, state1, wrong_x)
+        end
+    end
+
+    @testset "GPU Cross-Wigner Performance Validation" begin
+        basis = QuadPairBasis(2)
+        state1_gpu = coherentstate(CuVector{Float32}, CuMatrix{Float32}, basis, [1.0 + 0.5im, -0.3 + 0.8im])
+        state2_gpu = eprstate(CuVector{Float32}, CuMatrix{Float32}, basis, 0.3, π/4)
+        x = [0.1, 0.2, 0.3, 0.4]
+        xi = [0.5, 0.6, 0.7, 0.8]
+        
+        @testset "GPU Functions Execute Successfully" begin
+            @test begin
+                result_w = cross_wigner(state1_gpu, state2_gpu, x)
+                result_chi = cross_wignerchar(state1_gpu, state2_gpu, xi)
+                isfinite(real(result_w)) && isfinite(imag(result_w)) &&
+                isfinite(real(result_chi)) && isfinite(imag(result_chi))
+            end
+        end
+    end
+end
+
+@testitem "GPU Full Interference Wigner Functions" tags=[:cuda] begin
+    using Gabs
+    using Gabs: device
+    using LinearAlgebra
+    using CUDA
+    
+    if CUDA.functional()
+        @testset "Single-State Linear Combinations" begin
+            basis = QuadPairBasis(1)
+            alpha = 1.0f0 + 0.5f0*im
+            gpu_state = coherentstate(CuVector{Float32}, CuMatrix{Float32}, basis, alpha)
+            coeffs = [1.0f0]
+            lc_single = GaussianLinearCombination(basis, coeffs, [gpu_state])
+            x_cpu = [0.1f0, 0.3f0]
+            xi_cpu = [0.2f0, -0.4f0]
+            x_gpu = CuArray(x_cpu)
+            xi_gpu = CuArray(xi_cpu)
+            w_single_direct = wigner(gpu_state, x_gpu)
+            w_single_interference = wigner(lc_single, x_gpu)
+            @test w_single_direct ≈ w_single_interference rtol=1e-6
+            char_single_direct = wignerchar(gpu_state, xi_gpu)
+            char_single_interference = wignerchar(lc_single, xi_gpu)
+            @test char_single_direct ≈ char_single_interference rtol=1e-6
+            w_mixed = wigner(lc_single, x_cpu)
+            @test w_single_direct ≈ w_mixed rtol=1e-6
+            char_mixed = wignerchar(lc_single, xi_cpu)
+            @test char_single_direct ≈ char_mixed rtol=1e-6
+        end
+        
+        @testset "Two-State Interference" begin
+            basis = QuadPairBasis(1)
+            alpha = 1.0f0
+            state_plus = coherentstate(CuVector{Float32}, CuMatrix{Float32}, basis, alpha)
+            state_minus = coherentstate(CuVector{Float32}, CuMatrix{Float32}, basis, -alpha)
+            norm_factor = 1.0f0 / sqrt(2.0f0 * (1.0f0 + exp(-2.0f0 * abs2(alpha))))
+            coeffs = [norm_factor, norm_factor]
+            cat_even = GaussianLinearCombination(basis, coeffs, [state_plus, state_minus])
+            x_test = [0.0f0, 0.0f0]
+            xi_test = [0.1f0, 0.2f0]
+            w_interference = wigner(cat_even, x_test)
+            char_interference = wignerchar(cat_even, xi_test)
+            w_plus = wigner(state_plus, CuArray(x_test))
+            w_minus = wigner(state_minus, CuArray(x_test))
+            cross_w = cross_wigner(state_plus, state_minus, CuArray(x_test))
+            w_manual = abs2(coeffs[1]) * w_plus + abs2(coeffs[2]) * w_minus + 
+                       2 * real(conj(coeffs[1]) * coeffs[2] * cross_w)
+            @test w_interference ≈ w_manual rtol=1e-6
+            @test isreal(w_interference)
+            char_plus = wignerchar(state_plus, CuArray(xi_test))
+            char_minus = wignerchar(state_minus, CuArray(xi_test))
+            cross_char = cross_wignerchar(state_plus, state_minus, CuArray(xi_test))
+            char_manual = abs2(coeffs[1]) * char_plus + abs2(coeffs[2]) * char_minus + 
+                          2 * real(conj(coeffs[1]) * coeffs[2] * cross_char)
+            @test char_interference ≈ char_manual rtol=1e-6
+        end
+        
+        @testset "Multi-State Interference" begin
+            basis = QuadPairBasis(1)
+            alphas = [1.0f0, 0.0f0, -1.0f0]
+            states = [coherentstate(CuVector{Float32}, CuMatrix{Float32}, basis, α) for α in alphas]
+            coeffs = [0.5f0, 0.3f0, 0.2f0]
+            lc_multi = GaussianLinearCombination(basis, coeffs, states)
+            x_test = [0.5f0, -0.3f0]
+            xi_test = [0.2f0, 0.4f0]
+            w_gpu = wigner(lc_multi, x_test)
+            char_gpu = wignerchar(lc_multi, xi_test)
+            w_diagonal = sum(abs2(coeffs[i]) * wigner(states[i], CuArray(x_test)) for i in 1:3)
+            w_cross = 0.0f0
+            for i in 1:2
+                for j in (i+1):3
+                    cross_term = 2 * real(conj(coeffs[i]) * coeffs[j] * 
+                                         cross_wigner(states[i], states[j], CuArray(x_test)))
+                    w_cross += cross_term
+                end
+            end
+            w_manual = w_diagonal + w_cross
+            @test w_gpu ≈ w_manual rtol=1e-5
+            char_diagonal = sum(abs2(coeffs[i]) * wignerchar(states[i], CuArray(xi_test)) for i in 1:3)
+            char_cross = complex(0.0f0)
+            for i in 1:2
+                for j in (i+1):3
+                    cross_term = 2 * real(conj(coeffs[i]) * coeffs[j] * 
+                                         cross_wignerchar(states[i], states[j], CuArray(xi_test)))
+                    char_cross += cross_term
+                end
+            end
+            char_manual = char_diagonal + char_cross
+            @test char_gpu ≈ char_manual rtol=1e-5
+        end
+        
+        @testset "Complex Coefficients" begin
+            basis = QuadPairBasis(1)
+            alpha1, alpha2 = 1.0f0, -0.5f0
+            state1 = coherentstate(CuVector{Float32}, CuMatrix{Float32}, basis, alpha1)
+            state2 = coherentstate(CuVector{Float32}, CuMatrix{Float32}, basis, alpha2)
+            c1 = 0.6f0 + 0.2f0*im
+            c2 = 0.3f0 - 0.4f0*im
+            coeffs = ComplexF32[c1, c2]
+            lc_complex = GaussianLinearCombination(basis, coeffs, [state1, state2])
+            x_test = [0.2f0, 0.1f0]
+            w_gpu = wigner(lc_complex, x_test)
+            w1 = wigner(state1, CuArray(x_test))
+            w2 = wigner(state2, CuArray(x_test))
+            cross_w = cross_wigner(state1, state2, CuArray(x_test))
+            w_manual = abs2(c1) * w1 + abs2(c2) * w2 + 2 * real(conj(c1) * c2 * cross_w)
+            @test w_gpu ≈ w_manual rtol=1e-6
+            @test isreal(w_gpu)
+        end
+        
+        @testset "Multi-Mode Systems" begin
+            basis = QuadPairBasis(2)
+            alpha1 = ComplexF32[1.0f0, 0.5f0]
+            alpha2 = ComplexF32[-0.5f0, 1.0f0]
+            state1 = coherentstate(CuVector{Float32}, CuMatrix{Float32}, basis, alpha1)
+            state2 = coherentstate(CuVector{Float32}, CuMatrix{Float32}, basis, alpha2)
+            coeffs = [0.7f0, 0.3f0]
+            lc_multimode = GaussianLinearCombination(basis, coeffs, [state1, state2])
+            x_test = [0.1f0, 0.2f0, -0.1f0, 0.3f0]
+            xi_test = [0.05f0, -0.1f0, 0.15f0, -0.05f0]
+            w_gpu = wigner(lc_multimode, x_test)
+            char_gpu = wignerchar(lc_multimode, xi_test)
+            @test isfinite(w_gpu)
+            @test isreal(w_gpu)
+            @test isfinite(char_gpu)
+            w1 = wigner(state1, CuArray(x_test))
+            w2 = wigner(state2, CuArray(x_test))
+            cross_w = cross_wigner(state1, state2, CuArray(x_test))
+            w_manual = abs2(coeffs[1]) * w1 + abs2(coeffs[2]) * w2 + 
+                    2 * real(conj(coeffs[1]) * coeffs[2] * cross_w)
+            @test w_gpu ≈ w_manual rtol=1e-6
+        end
+        
+        @testset "Error Handling" begin
+            basis = QuadPairBasis(1)
+            state = coherentstate(CuVector{Float32}, CuMatrix{Float32}, basis, 1.0f0)
+            lc = GaussianLinearCombination(basis, [1.0f0], [state])
+            @test_throws ArgumentError wigner(lc, [0.0f0])
+            @test_throws ArgumentError wignerchar(lc, [0.0f0, 0.0f0, 0.0f0])
+            basis2 = QuadPairBasis(2)
+            state1_2mode = tensor(state, state)
+            lc2 = GaussianLinearCombination(basis2, [1.0f0], [state1_2mode])
+            @test_throws ArgumentError wigner(lc2, [0.0f0, 0.0f0])
+            @test_throws ArgumentError wignerchar(lc2, [0.0f0, 0.0f0, 0.0f0])
+        end
+        
+        @testset "Performance and Memory Efficiency" begin
+            basis = QuadPairBasis(1)
+            n_states = 5
+            states = [coherentstate(CuVector{Float32}, CuMatrix{Float32}, basis, 
+                                   Float32(i) * (1.0f0 + 0.1f0*im)) for i in 1:n_states]
+            coeffs = normalize([Float32(i) for i in 1:n_states])
+            lc_large = GaussianLinearCombination(basis, coeffs, states)
+            x_test = [0.0f0, 0.0f0]
+            w_result = wigner(lc_large, x_test)
+            elapsed_time = @elapsed begin
+                for _ in 1:100
+                    wigner(lc_large, x_test)
+                end
+            end
+            @test elapsed_time < 1.0
+            @test isfinite(w_result)
+            @test isreal(w_result)
+            CUDA.reclaim()
+            initial_memory = CUDA.used_memory()
+            w_large = wigner(lc_large, x_test)
+            peak_memory = CUDA.used_memory()
+            memory_increase = peak_memory - initial_memory
+            @test memory_increase < 100_000_000
+            @test isfinite(w_large)
+        end
+        
+        @testset "Numerical Stability" begin
+            basis = QuadPairBasis(1)
+            small_alpha = 1.0f-6
+            large_alpha = 1.0f6
+            small_state = coherentstate(CuVector{Float32}, CuMatrix{Float32}, basis, small_alpha)
+            large_state = coherentstate(CuVector{Float32}, CuMatrix{Float32}, basis, large_alpha)
+            coeffs = [1.0f0/sqrt(2.0f0), 1.0f0/sqrt(2.0f0)]
+            lc_extreme = GaussianLinearCombination(basis, coeffs, [small_state, large_state])
+            x_test = [0.0f0, 0.0f0]
+            w_extreme = wigner(lc_extreme, x_test)
+            char_extreme = wignerchar(lc_extreme, [0.1f0, 0.1f0])
+            @test isfinite(w_extreme)
+            @test isfinite(char_extreme)
+            @test !isnan(w_extreme)
+            @test !isnan(char_extreme)
+        end
+    else
+        @test_skip "CUDA not available, skipping GPU interference tests"
+    end
+end
+
+@testitem "GPU Batched Interference Test" tags=[:cuda] begin
+    using Gabs
+    using Gabs: device
+    using LinearAlgebra  
+    using CUDA
+    using Test
+
+    @testset "GPU Batched Interference Wigner Functions" begin
+        basis = QuadPairBasis(1)
+        α1, α2 = 1.0f0 + 0.5f0*im, -1.0f0 + 0.3f0*im
+        state1 = coherentstate(CuVector{Float32}, CuMatrix{Float32}, basis, α1)
+        state2 = coherentstate(CuVector{Float32}, CuMatrix{Float32}, basis, α2)
+        coeffs = Float32[0.6, 0.8]
+        lc = GaussianLinearCombination(basis, coeffs, [state1, state2])
+        @test device(lc) == :gpu
+        @test length(lc) == 2
+        
+        @testset "Batched Wigner Evaluation" begin
+            num_points = 100
+            x_points = CuArray(randn(Float32, 2, num_points))
+            w_batch = wigner(lc, x_points)
+            @test w_batch isa CuArray{Float32}
+            @test size(w_batch) == (num_points,)
+            @test device(w_batch) == :gpu
+            @test all(isfinite, Array(w_batch))
+            w_single = [wigner(lc, @view x_points[:, i]) for i in 1:min(10, num_points)]
+            w_batch_sample = Array(w_batch[1:length(w_single)])
+            @test w_batch_sample ≈ w_single rtol=1e-5
+        end
+        
+        @testset "Batched Wigner Characteristic Function" begin  
+            num_points = 50
+            xi_points = CuArray(randn(Float32, 2, num_points))
+            char_batch = wignerchar(lc, xi_points)
+            @test char_batch isa CuArray{ComplexF32}
+            @test size(char_batch) == (num_points,)
+            @test device(char_batch) == :gpu
+            @test all(isfinite, Array(real.(char_batch)))
+            @test all(isfinite, Array(imag.(char_batch)))
+            char_single = [wignerchar(lc, @view xi_points[:, i]) for i in 1:min(5, num_points)]
+            char_batch_sample = Array(char_batch[1:length(char_single)])
+            @test char_batch_sample ≈ char_single rtol=1e-5
+        end
+        
+        @testset "Automatic CPU->GPU Promotion" begin
+            num_points = 20
+            x_points_cpu = randn(Float32, 2, num_points)
+            xi_points_cpu = randn(Float32, 2, num_points)
+            w_auto = wigner(lc, x_points_cpu)
+            char_auto = wignerchar(lc, xi_points_cpu)
+            @test device(w_auto) == :gpu
+            @test device(char_auto) == :gpu
+            w_direct = wigner(lc, CuArray(x_points_cpu))
+            char_direct = wignerchar(lc, CuArray(xi_points_cpu))
+            @test Array(w_auto) ≈ Array(w_direct) rtol=1e-6
+            @test Array(char_auto) ≈ Array(char_direct) rtol=1e-6
+        end
+        
+        @testset "Performance Scaling" begin
+            large_num_points = 1000
+            x_large = CuArray(randn(Float32, 2, large_num_points))
+            @test_nowarn begin
+                w_large = wigner(lc, x_large)
+                @test length(w_large) == large_num_points
+                @test device(w_large) == :gpu
+            end
+        end
+        
+        @testset "Multi-State Interference" begin
+            state3 = squeezedstate(CuVector{Float32}, CuMatrix{Float32}, basis, 0.2f0, Float32(π/6))
+            coeffs_multi = Float32[0.5, 0.3, 0.2]
+            lc_multi = GaussianLinearCombination(basis, coeffs_multi, [state1, state2, state3])
+            num_points = 50
+            x_points = CuArray(randn(Float32, 2, num_points))
+            w_multi = wigner(lc_multi, x_points)
+            char_multi = wignerchar(lc_multi, x_points)
+            @test w_multi isa CuArray{Float32}
+            @test char_multi isa CuArray{ComplexF32}
+            @test size(w_multi) == (num_points,)
+            @test size(char_multi) == (num_points,)
+            @test all(isfinite, Array(w_multi))
+            @test all(isfinite, Array(real.(char_multi)))
+            @test all(isfinite, Array(imag.(char_multi)))
+        end
+        
+        @testset "Error Handling and Edge Cases" begin
+            x_wrong_dim = CuArray(randn(Float32, 3, 10))
+            @test_throws ArgumentError wigner(lc, x_wrong_dim)
+            @test_throws ArgumentError wignerchar(lc, x_wrong_dim)
+            @test_throws DimensionMismatch GaussianLinearCombination(basis, Float32[0.5, 0.3], [state1])
+            basis_wrong = QuadBlockBasis(1)
+            state_wrong = coherentstate(CuVector{Float32}, CuMatrix{Float32}, basis_wrong, 1.0f0)
+            @test_throws ArgumentError GaussianLinearCombination(basis, Float32[0.5], [state_wrong])
+            x_single = CuArray(randn(Float32, 2, 1))
+            w_single_batch = wigner(lc, x_single)
+            @test length(w_single_batch) == 1
+            @test isfinite(Array(w_single_batch)[1])
+        end
+    end
+
+    @testset "GPU Batched Interference - Float64 Precision" begin
+        basis = QuadPairBasis(1)
+        state1_f64 = coherentstate(CuVector{Float64}, CuMatrix{Float64}, basis, 1.0 + 0.5*im)
+        state2_f64 = coherentstate(CuVector{Float64}, CuMatrix{Float64}, basis, -1.0 + 0.3*im)
+        coeffs_f64 = [0.6, 0.8]
+        lc_f64 = GaussianLinearCombination(basis, coeffs_f64, [state1_f64, state2_f64])
+        x_points_f64 = CuArray(randn(Float64, 2, 20))
+        w_f64 = wigner(lc_f64, x_points_f64)
+        char_f64 = wignerchar(lc_f64, x_points_f64)
+        @test w_f64 isa CuArray{Float64}
+        @test char_f64 isa CuArray{ComplexF64}
+        @test all(isfinite, Array(w_f64))
+        @test all(isfinite, Array(real.(char_f64)))
+        @test all(isfinite, Array(imag.(char_f64)))
+    end
+
+    @testset "GPU Batched Interference - Multi-Mode" begin
+        basis_2mode = QuadPairBasis(2)
+        state1_2m = coherentstate(CuVector{Float32}, CuMatrix{Float32}, basis_2mode, 1.0f0)
+        state2_2m = squeezedstate(CuVector{Float32}, CuMatrix{Float32}, basis_2mode, 0.3f0, 0.0f0)
+        coeffs_2m = Float32[0.7, 0.3]
+        lc_2m = GaussianLinearCombination(basis_2mode, coeffs_2m, [state1_2m, state2_2m])
+        num_points = 30
+        x_points_2m = CuArray(randn(Float32, 4, num_points))
+        w_2m = wigner(lc_2m, x_points_2m)
+        char_2m = wignerchar(lc_2m, x_points_2m)
+        @test w_2m isa CuArray{Float32}
+        @test char_2m isa CuArray{ComplexF32}
+        @test size(w_2m) == (num_points,)
+        @test size(char_2m) == (num_points,)
+        @test all(isfinite, Array(w_2m))
+        @test all(isfinite, Array(real.(char_2m)))
+        @test all(isfinite, Array(imag.(char_2m)))
+    end
+
+end
+
+#Benchmark code
+#=
+@testitem "GPU Performance Benchmarks for Wigner and Tensor" tags=[:cuda] begin
+    using CUDA
+    using Gabs
+    using Gabs: device
+    using LinearAlgebra
+    using Statistics
+    
+    function gpu_warmup()
+        basis = QuadPairBasis(1)
+        try
+            for _ in 1:5
+                vac = vacuumstate(basis) |> gpu
+                coh = coherentstate(basis, 1.0f0) |> gpu
+            end
+            vac = vacuumstate(basis) |> gpu
+            disp = displace(basis, 1.0f0) |> gpu
+            for _ in 1:5
+                displaced = disp * vac
+            end
+            coh = coherentstate(basis, 0.5f0) |> gpu
+            test_points = CuArray(randn(Float32, 2, 100))
+            for _ in 1:3
+                w = wigner(coh, test_points)
+            end
+            CUDA.synchronize()
+            GC.gc()
+        catch e
+            @error "GPU warmup failed" exception=e
+            rethrow(e)
+        end
+    end
+    
+    function benchmark_operation(gpu_func, cpu_func, name::String; n_trials=5, min_time=1e-6)
+        gpu_times = Float64[]
+        for trial in 1:n_trials
+            CUDA.synchronize()
+            t_start = time_ns()
+            result_gpu = gpu_func()
+            CUDA.synchronize()
+            t_end = time_ns()
+            elapsed = max((t_end - t_start) / 1e9, min_time)
+            push!(gpu_times, elapsed)
+        end
+        cpu_times = Float64[]
+        for trial in 1:n_trials
+            GC.gc()
+            t_start = time_ns()
+            result_cpu = cpu_func()
+            t_end = time_ns()
+            elapsed = max((t_end - t_start) / 1e9, min_time)
+            push!(cpu_times, elapsed)
+        end
+        gpu_median = median(gpu_times)
+        cpu_median = median(cpu_times)
+        speedup = cpu_median / gpu_median
+        @info "Benchmark: $name" cpu_time=round(cpu_median, digits=4) gpu_time=round(gpu_median, digits=4) speedup=round(speedup, digits=2)
+        return speedup, gpu_median, cpu_median
+    end
+    
+    @testset "GPU Warmup" begin
+        gpu_warmup()
+        @test true
+    end
+    
+    @testset "API Usage" begin
+        basis = QuadPairBasis(1)
+        state = coherentstate(basis, 1.0) |> gpu
+        op = displace(basis, 0.5) |> gpu
+        result = op * state
+        @test device(state) == :gpu
+        @test device(op) == :gpu
+        @test device(result) == :gpu
+        α_gpu = CuArray([1.0f0 + 0.5f0im])
+        auto_state = coherentstate(basis, α_gpu)
+        @test device(auto_state) == :gpu
+    end
+    
+    @testset "State Creation Benchmarks" begin
+        
+        @testset "Single-Mode State Creation" begin
+            basis = QuadPairBasis(1)
+            α = 1.0f0 + 0.5f0im
+            gpu_func = () -> coherentstate(basis, α) |> gpu
+            cpu_func = () -> coherentstate(basis, α)
+            speedup, _, _ = benchmark_operation(gpu_func, cpu_func, "Single-mode coherent state creation")
+            coh_gpu = gpu_func()
+            coh_cpu = cpu_func()
+            @test Array(coh_gpu.mean) ≈ Float32.(coh_cpu.mean) rtol=1e-6
+            @test speedup > 0.01
+        end
+        
+        @testset "Multi-Mode State Creation" begin
+            basis = QuadPairBasis(10)
+            α_vec = randn(ComplexF32, 10)
+            gpu_func = () -> coherentstate(basis, α_vec) |> gpu
+            cpu_func = () -> coherentstate(basis, α_vec)
+            speedup, _, _ = benchmark_operation(gpu_func, cpu_func, "10-mode coherent state creation")
+            @test speedup > 0.03
+        end
+        
+        @testset "Very Large State Creation" begin
+            basis = QuadPairBasis(20)
+            α_vec = randn(ComplexF32, 20)
+            gpu_func = () -> coherentstate(basis, α_vec) |> gpu
+            cpu_func = () -> coherentstate(basis, α_vec)
+            speedup, _, _ = benchmark_operation(gpu_func, cpu_func, "20-mode coherent state creation")
+            @test speedup > 0.04
+            if speedup > 1.0
+                @info "GPU advantage emerging for very large state creation: $(round(speedup, digits=2))x"
+            end
+        end
+    end
+    
+    @testset "Operation Application Benchmarks" begin
+        
+        @testset "Single-Mode Operations" begin
+            basis = QuadPairBasis(1)
+            vac_gpu = vacuumstate(basis) |> gpu
+            vac_cpu = vacuumstate(basis)
+            disp_gpu = displace(basis, 1.0f0) |> gpu
+            disp_cpu = displace(basis, 1.0f0)
+            gpu_func = () -> disp_gpu * vac_gpu
+            cpu_func = () -> disp_cpu * vac_cpu
+            speedup, _, _ = benchmark_operation(gpu_func, cpu_func, "Single-mode operation application")
+            result_gpu = gpu_func()
+            result_cpu = cpu_func()
+            @test Array(result_gpu.mean) ≈ Float32.(result_cpu.mean) rtol=1e-6
+            @test speedup > 0.01
+        end
+        
+        @testset "Multi-Mode Operations" begin
+            basis = QuadPairBasis(5)
+            α_vec = randn(ComplexF32, 5)
+            coh_gpu = coherentstate(basis, α_vec) |> gpu
+            coh_cpu = coherentstate(basis, α_vec)
+            r_vec = randn(Float32, 5) * 0.3f0
+            θ_vec = randn(Float32, 5)
+            squeeze_gpu = squeeze(basis, r_vec, θ_vec) |> gpu
+            squeeze_cpu = squeeze(basis, r_vec, θ_vec)
+            gpu_func = () -> squeeze_gpu * coh_gpu
+            cpu_func = () -> squeeze_cpu * coh_cpu
+            speedup, _, _ = benchmark_operation(gpu_func, cpu_func, "5-mode squeeze operation")
+            @test speedup > 0.02
+        end
+    end
+    
+    @testset "Tensor Product Benchmarks" begin
+        
+        @testset "Small Tensor Products" begin
+            basis1 = QuadPairBasis(2)
+            basis2 = QuadPairBasis(2)
+            α1 = randn(ComplexF32, 2)
+            α2 = randn(ComplexF32, 2)
+            coh1_gpu = coherentstate(basis1, α1) |> gpu
+            coh2_gpu = coherentstate(basis2, α2) |> gpu
+            coh1_cpu = coherentstate(basis1, α1)
+            coh2_cpu = coherentstate(basis2, α2)
+            gpu_func = () -> tensor(coh1_gpu, coh2_gpu)
+            cpu_func = () -> tensor(coh1_cpu, coh2_cpu)
+            speedup, _, _ = benchmark_operation(gpu_func, cpu_func, "Small tensor product (2⊗2 modes)")
+            result_gpu = gpu_func()
+            result_cpu = cpu_func()
+            @test result_gpu.basis.nmodes == 4
+            @test Array(result_gpu.mean) ≈ Float32.(result_cpu.mean) rtol=1e-6
+            @test Array(result_gpu.covar) ≈ Float32.(result_cpu.covar) rtol=1e-6
+            @test speedup > 0.05
+        end
+        
+        @testset "Larger Tensor Products" begin
+            basis1 = QuadPairBasis(3)
+            basis2 = QuadPairBasis(4)
+            α1 = randn(ComplexF32, 3)
+            α2 = randn(ComplexF32, 4)
+            coh1_gpu = coherentstate(basis1, α1) |> gpu
+            coh2_gpu = coherentstate(basis2, α2) |> gpu
+            coh1_cpu = coherentstate(basis1, α1)
+            coh2_cpu = coherentstate(basis2, α2)
+            gpu_func = () -> tensor(coh1_gpu, coh2_gpu)
+            cpu_func = () -> tensor(coh1_cpu, coh2_cpu)
+            speedup, _, _ = benchmark_operation(gpu_func, cpu_func, "Large tensor product (3⊗4 modes)")
+            result_gpu = gpu_func()
+            result_cpu = cpu_func()
+            @test result_gpu.basis.nmodes == 7
+            @test Array(result_gpu.mean) ≈ Float32.(result_cpu.mean) rtol=1e-6
+            @test speedup > 0.05
+            if speedup > 1.0
+                @info "GPU tensor product advantage: $(round(speedup, digits=2))x"
+            end
+        end
+    end
+    
+    @testset "Wigner Function Benchmarks - The GPU Sweet Spot" begin
+        
+        @testset "Single-Point Wigner" begin
+            basis = QuadPairBasis(1)
+            coh_gpu = coherentstate(basis, 1.0f0) |> gpu
+            coh_cpu = coherentstate(basis, 1.0f0)
+            x_point = [0.5f0, 0.3f0]
+            gpu_func = () -> wigner(coh_gpu, x_point)
+            cpu_func = () -> wigner(coh_cpu, x_point)
+            speedup, _, _ = benchmark_operation(gpu_func, cpu_func, "Single-point Wigner evaluation")
+            @test gpu_func() ≈ cpu_func() rtol=1e-5
+            @test speedup > 0.05
+        end
+        
+        @testset "Batch Wigner Evaluation - Small" begin
+            basis = QuadPairBasis(1)
+            coh_gpu = coherentstate(basis, 0.8f0) |> gpu
+            coh_cpu = coherentstate(basis, 0.8f0)
+            n_points = 1000
+            x_points_gpu = CuArray(randn(Float32, 2, n_points))
+            x_points_cpu = Array(x_points_gpu)
+            gpu_func = () -> wigner(coh_gpu, x_points_gpu)
+            cpu_func = () -> [wigner(coh_cpu, x_points_cpu[:, i]) for i in 1:n_points]
+            speedup, _, _ = benchmark_operation(gpu_func, cpu_func, "Batch Wigner (1000 points)", n_trials=3)
+            @test speedup > 2.0
+            w_gpu = Array(gpu_func())
+            w_cpu = cpu_func()
+            @test w_gpu ≈ w_cpu rtol=1e-4
+            @info "GPU batch Wigner showing advantage: $(round(speedup, digits=1))x speedup"
+        end
+        
+        @testset "Batch Wigner Evaluation - Large" begin
+            basis = QuadPairBasis(1)
+            coh_gpu = coherentstate(basis, 0.5f0) |> gpu
+            coh_cpu = coherentstate(basis, 0.5f0)
+            n_points = 25000
+            x_points_gpu = CuArray(randn(Float32, 2, n_points))
+            x_points_cpu = Array(x_points_gpu)
+            gpu_func = () -> wigner(coh_gpu, x_points_gpu)
+            cpu_func = () -> [wigner(coh_cpu, x_points_cpu[:, i]) for i in 1:n_points]
+            speedup, gpu_time, cpu_time = benchmark_operation(gpu_func, cpu_func, "Large batch Wigner (25,000 points)", n_trials=3)
+            @test speedup > 10.0
+            @test gpu_time < 0.1
+            w_gpu = Array(gpu_func())
+            for i in 1:min(10, n_points)
+                w_single = wigner(coh_cpu, x_points_cpu[:, i])
+                @test w_gpu[i] ≈ w_single rtol=1e-4
+            end
+            @info "Large batch GPU advantage: $(round(speedup, digits=1))x speedup"
+        end
+        
+        @testset "Multi-Mode Wigner" begin
+            basis = QuadPairBasis(2)
+            α_vec = [0.5f0 + 0.3f0im, -0.2f0 + 0.8f0im]
+            coh_gpu = coherentstate(basis, α_vec) |> gpu
+            coh_cpu = coherentstate(basis, α_vec)
+            n_points = 5000
+            x_points_gpu = CuArray(randn(Float32, 4, n_points))
+            x_points_cpu = Array(x_points_gpu)
+            gpu_func = () -> wigner(coh_gpu, x_points_gpu)
+            cpu_func = () -> [wigner(coh_cpu, x_points_cpu[:, i]) for i in 1:n_points]
+            speedup, _, _ = benchmark_operation(gpu_func, cpu_func, "Multi-mode Wigner (2 modes, 5000 points)", n_trials=3)
+            @test speedup > 5.0
+            @info "Multi-mode GPU advantage: $(round(speedup, digits=1))x speedup"
+        end
+    end
+    
+    @testset "Wigner Characteristic Function Benchmarks" begin
+        
+        @testset "Batch Wigner Characteristic - Professional API" begin
+            basis = QuadPairBasis(1)
+            sq_gpu = squeezedstate(basis, 0.3f0, Float32(π/4)) |> gpu
+            sq_cpu = squeezedstate(basis, 0.3f0, Float32(π/4))
+            n_points = 5000
+            xi_points_gpu = CuArray(randn(Float32, 2, n_points) * 0.5f0)
+            xi_points_cpu = Array(xi_points_gpu)
+            gpu_func = () -> wignerchar(sq_gpu, xi_points_gpu)
+            cpu_func = () -> [wignerchar(sq_cpu, xi_points_cpu[:, i]) for i in 1:n_points]
+            speedup, _, _ = benchmark_operation(gpu_func, cpu_func, "Batch Wigner characteristic (5,000 points)", n_trials=3)
+            @test speedup > 2.0
+            chi_gpu = Array(gpu_func())
+            for i in 1:min(5, n_points)
+                chi_single = wignerchar(sq_cpu, xi_points_cpu[:, i])
+                @test real(chi_gpu[i]) ≈ real(chi_single) rtol=1e-4
+                @test imag(chi_gpu[i]) ≈ imag(chi_single) rtol=1e-4
+            end
+            @info "Wigner characteristic GPU advantage: $(round(speedup, digits=1))x speedup"
+        end
+    end
+    
+    @testset " Performance Tests" begin
+        basis = QuadPairBasis(1)
+        coh_gpu = coherentstate(basis, 1.0f0) |> gpu
+        coh_cpu = coherentstate(basis, 1.0f0)
+        n_points = 15000
+        x_points_gpu = CuArray(randn(Float32, 2, n_points))
+        x_points_cpu = Array(x_points_gpu)
+        gpu_func = () -> wigner(coh_gpu, x_points_gpu)
+        cpu_func = () -> [wigner(coh_cpu, x_points_cpu[:, i]) for i in 1:n_points]
+        speedup, gpu_time, cpu_time = benchmark_operation(gpu_func, cpu_func, " GPU validation")
+        @test speedup > 15.0
+        @test gpu_time < cpu_time / 10  
+        w_gpu = Array(gpu_func())
+        w_sample = [wigner(coh_cpu, x_points_cpu[:, i]) for i in 1:min(20, n_points)]
+        @test w_gpu[1:length(w_sample)] ≈ w_sample rtol=1e-4
+    end
+    
+    @testset "Scaling Analysis" begin
+        basis = QuadPairBasis(1)
+        coh_gpu = coherentstate(basis, 1.0f0) |> gpu
+        coh_cpu = coherentstate(basis, 1.0f0)
+        point_counts = [100, 500, 1000, 5000, 10000, 25000]
+        speedups = Float64[]
+        for n_points in point_counts
+            x_points_gpu = CuArray(randn(Float32, 2, n_points))
+            x_points_cpu = Array(x_points_gpu)
+            gpu_func = () -> wigner(coh_gpu, x_points_gpu)
+            cpu_func = () -> [wigner(coh_cpu, x_points_cpu[:, i]) for i in 1:n_points]
+            speedup, gpu_time, cpu_time = benchmark_operation(gpu_func, cpu_func, "Scaling test ($n_points points)", n_trials=3)
+            push!(speedups, speedup)
+            @info "Scaling point" n_points=n_points speedup=round(speedup, digits=2) gpu_time=round(gpu_time, digits=4) cpu_time=round(cpu_time, digits=4)
+        end
+        @test speedups[end] > speedups[1]  
+        @test maximum(speedups) > 10.0    
+        @info "Peak GPU speedup: $(round(maximum(speedups), digits=2))x"
+    end
+end
+=#
+
+#=
+@testitem "GPU Batched Performance Benchmarks" tags=[:cuda] begin
+    using Gabs
+    using Gabs: device
+    using LinearAlgebra  
+    using CUDA
+    using Test
+    using BenchmarkTools
+
+    function cpu_wigner_batch(lc, x_points)
+        results = Vector{Float32}(undef, size(x_points, 2))
+        for i in 1:size(x_points, 2)
+            results[i] = wigner(lc, x_points[:, i])
+        end
+        return results
+    end
+
+    function cpu_wignerchar_batch(lc, xi_points)
+        results = Vector{ComplexF32}(undef, size(xi_points, 2))
+        for i in 1:size(xi_points, 2)
+            results[i] = wignerchar(lc, xi_points[:, i])
+        end
+        return results
+    end
+
+    function cpu_lc_wigner_batch(lc, x_points)
+        results = Vector{Float32}(undef, size(x_points, 2))
+        for i in 1:size(x_points, 2)
+            results[i] = wigner(lc, x_points[:, i])
+        end
+        return results
+    end
+
+    @testset "GPU vs CPU Speedup Benchmarks - Batched Interference" begin
+        println("GPU BATCHED INTERFERENCE PERFORMANCE BENCHMARKS")
+        test_configs = [
+            (1, 100,    "Small: 1-mode, 2-state, 100 points"),
+            (1, 1_000,  "Medium: 1-mode, 2-state, 1K points"), 
+            (1, 10_000, "Large: 1-mode, 2-state, 10K points"),
+            (2, 1_000,  "Multi-mode: 2-mode, 2-state, 1K points"),
+            (1, 50_000, "Very Large: 1-mode, 2-state, 50K points"),
+            (3, 500,    "High-D: 3-mode, 2-state, 500 points"),
+        ]
+        @testset "Wigner Function Benchmarks" begin
+            println("\nWIGNER FUNCTION BENCHMARKS:")        
+            for (nmodes, npoints, description) in test_configs
+                println("\nTesting: $description")
+                basis = QuadPairBasis(nmodes)
+                phase_space_dim = 2 * nmodes
+                state1_cpu = coherentstate(basis, 1.0f0 + 0.5f0*im)
+                state2_cpu = coherentstate(basis, -1.0f0 + 0.3f0*im)
+                state1_gpu = coherentstate(CuVector{Float32}, CuMatrix{Float32}, basis, 1.0f0 + 0.5f0*im)
+                state2_gpu = coherentstate(CuVector{Float32}, CuMatrix{Float32}, basis, -1.0f0 + 0.3f0*im)
+                coeffs = Float32[0.6, 0.8]
+                coeffs ./= norm(coeffs)
+                cpu_states_array = [state1_cpu, state2_cpu]
+                gpu_states_array = [state1_gpu, state2_gpu]
+                cpu_lc = GaussianLinearCombination(basis, coeffs, cpu_states_array)
+                gpu_lc = GaussianLinearCombination(basis, coeffs, gpu_states_array)
+                @test device(gpu_lc) == :gpu
+                x_points_cpu = randn(Float32, phase_space_dim, npoints)
+                x_points_gpu = CuArray(x_points_cpu)
+                if npoints >= 100
+                    wigner(cpu_lc, x_points_cpu[:, 1])
+                    wigner(gpu_lc, x_points_gpu[:, 1:min(10, npoints)])
+                    CUDA.synchronize()
+                end
+                print("   CPU: ")
+                cpu_time = @belapsed cpu_wigner_batch($cpu_lc, $x_points_cpu) samples=3 evals=1
+                cpu_result = cpu_wigner_batch(cpu_lc, x_points_cpu)
+                print("   GPU: ")
+                gpu_time = @belapsed begin
+                    result = wigner($gpu_lc, $x_points_gpu)
+                    CUDA.synchronize()
+                    result
+                end samples=3 evals=1
+                gpu_result = wigner(gpu_lc, x_points_gpu)
+                CUDA.synchronize()
+                sample_size = min(20, npoints)
+                cpu_sample = cpu_result[1:sample_size]
+                gpu_sample = Array(gpu_result[1:sample_size])
+                @test cpu_sample ≈ gpu_sample rtol=1e-4
+                speedup = cpu_time / gpu_time
+                println("   Results:")
+                println("      CPU Time: $(round(cpu_time*1000, digits=2)) ms")
+                println("      GPU Time: $(round(gpu_time*1000, digits=2)) ms")
+                println("      Speedup:  $(round(speedup, digits=1))x")
+            end
+        end
+        
+        @testset "Wigner Characteristic Function Benchmarks" begin
+            println("\n\nWIGNER CHARACTERISTIC FUNCTION BENCHMARKS:")
+            char_configs = [
+                (1, 500,   "Small: 1-mode, 2-state, 500 points"),
+                (1, 5_000, "Medium: 1-mode, 2-state, 5K points"),
+                (2, 1_000, "Multi-mode: 2-mode, 2-state, 1K points"),
+                (1, 20_000,"Large: 1-mode, 2-state, 20K points"),
+            ]
+            
+            for (nmodes, npoints, description) in char_configs
+                println("\nTesting: $description")
+                basis = QuadPairBasis(nmodes)
+                phase_space_dim = 2 * nmodes
+                state1_cpu = coherentstate(basis, 1.0f0 + 0.5f0*im)
+                state2_cpu = coherentstate(basis, -1.0f0 + 0.3f0*im)
+                state1_gpu = coherentstate(CuVector{Float32}, CuMatrix{Float32}, basis, 1.0f0 + 0.5f0*im)
+                state2_gpu = coherentstate(CuVector{Float32}, CuMatrix{Float32}, basis, -1.0f0 + 0.3f0*im)
+                coeffs = Float32[0.6, 0.8]
+                coeffs ./= norm(coeffs)
+                cpu_states_array = [state1_cpu, state2_cpu]
+                gpu_states_array = [state1_gpu, state2_gpu]
+                cpu_lc = GaussianLinearCombination(basis, coeffs, cpu_states_array)
+                gpu_lc = GaussianLinearCombination(basis, coeffs, gpu_states_array)
+                xi_points_cpu = randn(Float32, phase_space_dim, npoints)
+                xi_points_gpu = CuArray(xi_points_cpu)
+                if npoints >= 100
+                    wignerchar(cpu_lc, xi_points_cpu[:, 1])
+                    wignerchar(gpu_lc, xi_points_gpu[:, 1:min(5, npoints)])
+                    CUDA.synchronize()
+                end
+                print("   CPU: ")
+                cpu_time = @belapsed cpu_wignerchar_batch($cpu_lc, $xi_points_cpu) samples=3 evals=1
+                cpu_result = cpu_wignerchar_batch(cpu_lc, xi_points_cpu)
+                print("   GPU: ")
+                gpu_time = @belapsed begin
+                    result = wignerchar($gpu_lc, $xi_points_gpu)
+                    CUDA.synchronize()
+                    result
+                end samples=3 evals=1
+                gpu_result = wignerchar(gpu_lc, xi_points_gpu)
+                CUDA.synchronize()
+                sample_size = min(10, npoints)
+                cpu_sample = cpu_result[1:sample_size]
+                gpu_sample = Array(gpu_result[1:sample_size])
+                @test cpu_sample ≈ gpu_sample rtol=1e-4
+                speedup = cpu_time / gpu_time
+                println("   Results:")
+                println("      CPU Time: $(round(cpu_time*1000, digits=2)) ms")
+                println("      GPU Time: $(round(gpu_time*1000, digits=2)) ms")
+                println("      Speedup:  $(round(speedup, digits=1))x")
+            end
+        end
+        
+        @testset "Cross-Wigner Batch Performance" begin
+            println("\n\nCROSS-WIGNER BATCH PERFORMANCE:")
+            basis = QuadPairBasis(1)
+            state1_cpu = coherentstate(basis, 1.0f0 + 0.5f0*im)
+            state2_cpu = squeezedstate(basis, 0.3f0, Float32(π/4))
+            state1_gpu = coherentstate(CuVector{Float32}, CuMatrix{Float32}, basis, 1.0f0 + 0.5f0*im)
+            state2_gpu = squeezedstate(CuVector{Float32}, CuMatrix{Float32}, basis, 0.3f0, Float32(π/4))
+            batch_sizes = [100, 1_000, 10_000, 50_000]
+            
+            for npoints in batch_sizes
+                println("\nCross-Wigner batch size: $npoints points")
+                x_points_cpu = randn(Float32, 2, npoints)
+                x_points_gpu = CuArray(x_points_cpu)
+                if npoints >= 100
+                    Gabs.cross_wigner(state1_cpu, state2_cpu, x_points_cpu[:, 1])
+                end
+                coeffs = Float32[0.7, 0.3]
+                cpu_states_array = [state1_cpu, state2_cpu]
+                gpu_states_array = [state1_gpu, state2_gpu]
+                cpu_lc = GaussianLinearCombination(basis, coeffs, cpu_states_array)
+                gpu_lc = GaussianLinearCombination(basis, coeffs, gpu_states_array)
+                print("   CPU: ")
+                cpu_time = @belapsed cpu_lc_wigner_batch($cpu_lc, $x_points_cpu) samples=2 evals=1
+                print("   GPU: ")
+                gpu_time = @belapsed begin
+                    result = wigner($gpu_lc, $x_points_gpu)
+                    CUDA.synchronize()
+                    result
+                end samples=2 evals=1
+                speedup = cpu_time / gpu_time
+                println("   Results:")
+                println("      CPU Time: $(round(cpu_time*1000, digits=2)) ms")
+                println("      GPU Time: $(round(gpu_time*1000, digits=2)) ms") 
+                println("      Speedup:  $(round(speedup, digits=1))x")
+            end
+        end
+    end
+end
+=#
\ No newline at end of file