-
Notifications
You must be signed in to change notification settings - Fork 260
Open
Labels
Description
MWE:
julia> CUDA.@time map(y -> y * 2 / 10, CUDA.ones(5_000_000));
0.163260 seconds (285.00 k CPU allocations: 14.995 MiB) (2 GPU allocations: 38.147 MiB, 0.02% memmgmt time)
julia> CUDA.@time map(y -> 2 // 10 * y, CUDA.ones(5_000_000));
0.282400 seconds (322.23 k CPU allocations: 17.699 MiB) (2 GPU allocations: 38.147 MiB, 0.01% memmgmt time)

Note that this has nothing to do with FP64. For comparison, here is the FP64 version:
julia> CUDA.@time map(y -> 2 / 10 * y, CUDA.ones(5_000_000));
0.159753 seconds (279.26 k CPU allocations: 14.702 MiB) (2 GPU allocations: 57.220 MiB, 0.13% memmgmt time)

I see similar slowdowns in larger kernels: just using a simple Rational like this makes the whole kernel 2x slower.
LLVM for 2 * y / 10
julia> CUDA.@device_code_llvm map(y -> 2 * y / 10, CUDA.ones(5_000_000));
; PTX CompilerJob of MethodInstance for (::GPUArrays.var"#gpu_broadcast_kernel_linear#_copyto!##1")(::KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}}}, ::CuDeviceVector{Float32, 1}, ::Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{1, CUDA.DeviceMemory}, Tuple{Base.OneTo{Int64}}, var"#424#425", Tuple{Base.Broadcast.Extruded{CuDeviceVector{Float32, 1}, Tuple{Bool}, Tuple{Int64}}}}) for sm_86
; @ none within `gpu_broadcast_kernel_linear`
define ptx_kernel void @_Z27gpu_broadcast_kernel_linear16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi1E5TupleI5OneToI5Int64EEE7NDRangeILi1ES0_S0_S8_S8_EE13CuDeviceArrayI7Float32Li1ELi1EE11BroadcastedI12CuArrayStyleILi1E12DeviceMemoryES7_4_424S3_I8ExtrudedISE_S3_I4BoolES3_IS5_EEEE({ ptr, i32 } %state, { [1 x [1 x [1 x i64]]], [2 x [1 x [1 x [1 x i64]]]] } %"__ctx__::CompilerMetadata", { ptr addrspace(1), i64, [1 x i64], i64 } %"dest::CuDeviceArray", { [1 x { { ptr addrspace(1), i64, [1 x i64], i64 }, [1 x i8], [1 x i64] }], [1 x [1 x i64]] } %"bc::Broadcasted") local_unnamed_addr {
conversion:
%"__ctx__::CompilerMetadata.fca.0.0.0.0.extract" = extractvalue { [1 x [1 x [1 x i64]]], [2 x [1 x [1 x [1 x i64]]]] } %"__ctx__::CompilerMetadata", 0, 0, 0, 0
%"__ctx__::CompilerMetadata.fca.1.0.0.0.0.extract" = extractvalue { [1 x [1 x [1 x i64]]], [2 x [1 x [1 x [1 x i64]]]] } %"__ctx__::CompilerMetadata", 1, 0, 0, 0, 0
%"__ctx__::CompilerMetadata.fca.1.1.0.0.0.extract" = extractvalue { [1 x [1 x [1 x i64]]], [2 x [1 x [1 x [1 x i64]]]] } %"__ctx__::CompilerMetadata", 1, 1, 0, 0, 0
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
%1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
%2 = add nuw nsw i32 %1, 1
%3 = icmp sgt i64 %"__ctx__::CompilerMetadata.fca.1.0.0.0.0.extract", 0
call void @llvm.assume(i1 %3)
%4 = icmp sgt i64 %"__ctx__::CompilerMetadata.fca.1.1.0.0.0.extract", 0
call void @llvm.assume(i1 %4)
%5 = zext nneg i32 %2 to i64
%6 = zext nneg i32 %0 to i64
%7 = mul i64 %"__ctx__::CompilerMetadata.fca.1.1.0.0.0.extract", %6
%8 = add i64 %7, %5
%9 = icmp slt i64 %8, 1
%10 = icmp sgt i64 %8, %"__ctx__::CompilerMetadata.fca.0.0.0.0.extract"
%.not13 = or i1 %9, %10
br i1 %.not13, label %L328, label %L129
L129: ; preds = %conversion
%"dest::CuDeviceArray.fca.0.extract" = extractvalue { ptr addrspace(1), i64, [1 x i64], i64 } %"dest::CuDeviceArray", 0
; ┌ @ /home/efaulha2/.julia/packages/KernelAbstractions/lGrz7/src/macros.jl:324 within `macro expansion`
; │┌ @ /home/efaulha2/.julia/packages/CUDA/UurkZ/src/device/array.jl:177 within `setindex!`
; ││┌ @ /home/efaulha2/.julia/packages/CUDA/UurkZ/src/device/array.jl:135 within `arrayset`
; │││┌ @ /home/efaulha2/.julia/packages/CUDA/UurkZ/src/device/array.jl:142 within `arrayset_bits`
; ││││┌ @ /home/efaulha2/.julia/packages/LLVM/iza6e/src/interop/pointer.jl:88 within `unsafe_store!`
; │││││┌ @ none within `pointerset`
; ││││││┌ @ none within `macro expansion` @ /home/efaulha2/.julia/packages/LLVM/iza6e/src/interop/base.jl:39
%11 = getelementptr float, ptr addrspace(1) %"dest::CuDeviceArray.fca.0.extract", i64 %8
%12 = getelementptr float, ptr addrspace(1) %11, i64 -1
%"bc::Broadcasted.fca.0.0.0.0.extract" = extractvalue { [1 x { { ptr addrspace(1), i64, [1 x i64], i64 }, [1 x i8], [1 x i64] }], [1 x [1 x i64]] } %"bc::Broadcasted", 0, 0, 0, 0
%"bc::Broadcasted.fca.0.0.1.0.extract" = extractvalue { [1 x { { ptr addrspace(1), i64, [1 x i64], i64 }, [1 x i8], [1 x i64] }], [1 x [1 x i64]] } %"bc::Broadcasted", 0, 0, 1, 0
; │└└└└└└
; │┌ @ broadcast.jl:616 within `getindex`
; ││┌ @ broadcast.jl:620 within `_getindex`
; │││┌ @ broadcast.jl:671 within `_broadcast_getindex`
; ││││┌ @ broadcast.jl:696 within `_getindex`
; │││││┌ @ broadcast.jl:665 within `_broadcast_getindex`
; ││││││┌ @ broadcast.jl:595 within `newindex`
; │││││││┌ @ essentials.jl:799 within `ifelse`
%13 = and i8 %"bc::Broadcasted.fca.0.0.1.0.extract", 1
%.not = icmp eq i8 %13, 0
%"bc::Broadcasted.fca.0.0.2.0.extract" = extractvalue { [1 x { { ptr addrspace(1), i64, [1 x i64], i64 }, [1 x i8], [1 x i64] }], [1 x [1 x i64]] } %"bc::Broadcasted", 0, 0, 2, 0
%14 = select i1 %.not, i64 %"bc::Broadcasted.fca.0.0.2.0.extract", i64 %8
; ││││││└└
; ││││││┌ @ /home/efaulha2/.julia/packages/CUDA/UurkZ/src/device/array.jl:175 within `getindex`
; │││││││┌ @ /home/efaulha2/.julia/packages/CUDA/UurkZ/src/device/array.jl:90 within `arrayref`
; ││││││││┌ @ /home/efaulha2/.julia/packages/CUDA/UurkZ/src/device/array.jl:96 within `arrayref_bits`
; │││││││││┌ @ /home/efaulha2/.julia/packages/LLVM/iza6e/src/interop/pointer.jl:85 within `unsafe_load`
; ││││││││││┌ @ none within `pointerref`
; │││││││││││┌ @ none within `macro expansion` @ /home/efaulha2/.julia/packages/LLVM/iza6e/src/interop/base.jl:39
%15 = getelementptr float, ptr addrspace(1) %"bc::Broadcasted.fca.0.0.0.0.extract", i64 %14
%16 = getelementptr float, ptr addrspace(1) %15, i64 -1
%17 = load float, ptr addrspace(1) %16, align 4
; ││││└└└└└└└└
; ││││ @ broadcast.jl:672 within `_broadcast_getindex`
; ││││┌ @ broadcast.jl:699 within `_broadcast_getindex_evalf`
; │││││┌ @ REPL[89]:1 within `#424`
; ││││││┌ @ promotion.jl:434 within `*` @ float.jl:497
%18 = fmul float %17, 2.000000e+00
; ││││││└
; ││││││┌ @ promotion.jl:436 within `/` @ float.jl:498
%19 = fdiv float %18, 1.000000e+01
; │└└└└└└
; │┌ @ /home/efaulha2/.julia/packages/CUDA/UurkZ/src/device/array.jl:177 within `setindex!`
; ││┌ @ /home/efaulha2/.julia/packages/CUDA/UurkZ/src/device/array.jl:135 within `arrayset`
; │││┌ @ /home/efaulha2/.julia/packages/CUDA/UurkZ/src/device/array.jl:142 within `arrayset_bits`
; ││││┌ @ /home/efaulha2/.julia/packages/LLVM/iza6e/src/interop/pointer.jl:88 within `unsafe_store!`
; │││││┌ @ none within `pointerset`
; ││││││┌ @ none within `macro expansion` @ /home/efaulha2/.julia/packages/LLVM/iza6e/src/interop/base.jl:39
store float %19, ptr addrspace(1) %12, align 4
; │││└└└└
; │││ @ /home/efaulha2/.julia/packages/CUDA/UurkZ/src/device/array.jl:137 within `arrayset`
br label %L328
L328: ; preds = %L129, %conversion
; │└└
ret void
; └
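}

LLVM for 2 // 10 * y (note that, unlike the version above where the constants are folded into the `fmul`/`fdiv`, the Rational is built at run time in every thread: there is a non-inlined call to `divgcd`, a stack slot for its result, and overflow-check branches):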
julia> CUDA.@device_code_llvm map(y -> 2 // 10 * y, CUDA.ones(5_000_000));
; PTX CompilerJob of MethodInstance for (::GPUArrays.var"#gpu_broadcast_kernel_linear#_copyto!##1")(::KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}}}, ::CuDeviceVector{Float32, 1}, ::Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{1, CUDA.DeviceMemory}, Tuple{Base.OneTo{Int64}}, var"#427#428", Tuple{Base.Broadcast.Extruded{CuDeviceVector{Float32, 1}, Tuple{Bool}, Tuple{Int64}}}}) for sm_86
; @ none within `gpu_broadcast_kernel_linear`
define ptx_kernel void @_Z27gpu_broadcast_kernel_linear16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi1E5TupleI5OneToI5Int64EEE7NDRangeILi1ES0_S0_S8_S8_EE13CuDeviceArrayI7Float32Li1ELi1EE11BroadcastedI12CuArrayStyleILi1E12DeviceMemoryES7_4_427S3_I8ExtrudedISE_S3_I4BoolES3_IS5_EEEE({ ptr, i32 } %state, { [1 x [1 x [1 x i64]]], [2 x [1 x [1 x [1 x i64]]]] } %"__ctx__::CompilerMetadata", { ptr addrspace(1), i64, [1 x i64], i64 } %"dest::CuDeviceArray", { [1 x { { ptr addrspace(1), i64, [1 x i64], i64 }, [1 x i8], [1 x i64] }], [1 x [1 x i64]] } %"bc::Broadcasted") local_unnamed_addr {
conversion:
%"__ctx__::CompilerMetadata.fca.0.0.0.0.extract" = extractvalue { [1 x [1 x [1 x i64]]], [2 x [1 x [1 x [1 x i64]]]] } %"__ctx__::CompilerMetadata", 0, 0, 0, 0
%"__ctx__::CompilerMetadata.fca.1.0.0.0.0.extract" = extractvalue { [1 x [1 x [1 x i64]]], [2 x [1 x [1 x [1 x i64]]]] } %"__ctx__::CompilerMetadata", 1, 0, 0, 0, 0
%"__ctx__::CompilerMetadata.fca.1.1.0.0.0.extract" = extractvalue { [1 x [1 x [1 x i64]]], [2 x [1 x [1 x [1 x i64]]]] } %"__ctx__::CompilerMetadata", 1, 1, 0, 0, 0
%"dest::CuDeviceArray.fca.0.extract" = extractvalue { ptr addrspace(1), i64, [1 x i64], i64 } %"dest::CuDeviceArray", 0
%sret_box = alloca [2 x i64], align 8
%0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
%1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
%2 = add nuw nsw i32 %1, 1
%3 = icmp sgt i64 %"__ctx__::CompilerMetadata.fca.1.0.0.0.0.extract", 0
call void @llvm.assume(i1 %3)
%4 = icmp sgt i64 %"__ctx__::CompilerMetadata.fca.1.1.0.0.0.extract", 0
call void @llvm.assume(i1 %4)
%5 = zext nneg i32 %2 to i64
%6 = zext nneg i32 %0 to i64
%7 = mul i64 %"__ctx__::CompilerMetadata.fca.1.1.0.0.0.extract", %6
%8 = add i64 %7, %5
%9 = icmp slt i64 %8, 1
%10 = icmp sgt i64 %8, %"__ctx__::CompilerMetadata.fca.0.0.0.0.extract"
%.not18 = or i1 %9, %10
br i1 %.not18, label %L360, label %L129
L129: ; preds = %conversion
%"bc::Broadcasted.fca.0.0.2.0.extract" = extractvalue { [1 x { { ptr addrspace(1), i64, [1 x i64], i64 }, [1 x i8], [1 x i64] }], [1 x [1 x i64]] } %"bc::Broadcasted", 0, 0, 2, 0
%"bc::Broadcasted.fca.0.0.1.0.extract" = extractvalue { [1 x { { ptr addrspace(1), i64, [1 x i64], i64 }, [1 x i8], [1 x i64] }], [1 x [1 x i64]] } %"bc::Broadcasted", 0, 0, 1, 0
%"bc::Broadcasted.fca.0.0.0.0.extract" = extractvalue { [1 x { { ptr addrspace(1), i64, [1 x i64], i64 }, [1 x i8], [1 x i64] }], [1 x [1 x i64]] } %"bc::Broadcasted", 0, 0, 0, 0
; ┌ @ /home/efaulha2/.julia/packages/KernelAbstractions/lGrz7/src/macros.jl:324 within `macro expansion`
; │┌ @ broadcast.jl:616 within `getindex`
; ││┌ @ broadcast.jl:620 within `_getindex`
; │││┌ @ broadcast.jl:671 within `_broadcast_getindex`
; ││││┌ @ broadcast.jl:696 within `_getindex`
; │││││┌ @ broadcast.jl:665 within `_broadcast_getindex`
; ││││││┌ @ broadcast.jl:595 within `newindex`
; │││││││┌ @ essentials.jl:799 within `ifelse`
%11 = and i8 %"bc::Broadcasted.fca.0.0.1.0.extract", 1
%.not = icmp eq i8 %11, 0
%12 = select i1 %.not, i64 %"bc::Broadcasted.fca.0.0.2.0.extract", i64 %8
; ││││││└└
; ││││││┌ @ /home/efaulha2/.julia/packages/CUDA/UurkZ/src/device/array.jl:175 within `getindex`
; │││││││┌ @ /home/efaulha2/.julia/packages/CUDA/UurkZ/src/device/array.jl:90 within `arrayref`
; ││││││││┌ @ /home/efaulha2/.julia/packages/CUDA/UurkZ/src/device/array.jl:96 within `arrayref_bits`
; │││││││││┌ @ /home/efaulha2/.julia/packages/LLVM/iza6e/src/interop/pointer.jl:85 within `unsafe_load`
; ││││││││││┌ @ none within `pointerref`
; │││││││││││┌ @ none within `macro expansion` @ /home/efaulha2/.julia/packages/LLVM/iza6e/src/interop/base.jl:39
%13 = getelementptr float, ptr addrspace(1) %"bc::Broadcasted.fca.0.0.0.0.extract", i64 %12
%14 = getelementptr float, ptr addrspace(1) %13, i64 -1
%15 = load float, ptr addrspace(1) %14, align 4
; ││││└└└└└└└└
; ││││ @ broadcast.jl:672 within `_broadcast_getindex`
; ││││┌ @ broadcast.jl:699 within `_broadcast_getindex_evalf`
; │││││┌ @ REPL[90]:1 within `#427`
; ││││││┌ @ rational.jl:91 within `//`
; │││││││┌ @ rational.jl:48 within `Rational` @ rational.jl:43
call fastcc void @julia_divgcd_100531({ ptr, i32 } %state, ptr %sret_box, i64 2, i64 10)
; ││││││││┌ @ tuple.jl:162 within `indexed_iterate`
%"sret_box[2]_ptr" = getelementptr inbounds i8, ptr %sret_box, i64 8
; ││││││││└
; ││││││││ @ rational.jl:48 within `Rational` @ rational.jl:44
; ││││││││┌ @ rational.jl:21 within `checked_den`
; │││││││││┌ @ int.jl:139 within `signbit`
; ││││││││││┌ @ int.jl:83 within `<`
%"sret_box[2]_ptr.unbox" = load i64, ptr %"sret_box[2]_ptr", align 8
%16 = icmp sgt i64 %"sret_box[2]_ptr.unbox", -1
; │││││││││└└
br i1 %16, label %L308.L330_crit_edge, label %L313
L308.L330_crit_edge: ; preds = %L129
%value_phi.in.sroa.speculate.load.L308.L330_crit_edge = load i64, ptr %sret_box, align 8
br label %L336
L313: ; preds = %L129
; │││││││││ @ rational.jl:22 within `checked_den`
; │││││││││┌ @ checked.jl:95 within `checked_neg`
; ││││││││││┌ @ checked.jl:231 within `checked_sub`
; │││││││││││┌ @ checked.jl:203 within `sub_with_overflow`
%17 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 0, i64 %"sret_box[2]_ptr.unbox")
%18 = extractvalue { i64, i1 } %17, 0
%19 = extractvalue { i64, i1 } %17, 1
; │││││││││││└
; │││││││││││ @ checked.jl:232 within `checked_sub`
br i1 %19, label %L317, label %L321
L317: ; preds = %L313
call fastcc void @julia_throw_overflowerr_binaryop_100524({ ptr, i32 } %state)
unreachable
L321: ; preds = %L313
; │││││││││└└
; │││││││││ @ rational.jl:23 within `checked_den`
; │││││││││┌ @ checked.jl:95 within `checked_neg`
; ││││││││││┌ @ checked.jl:231 within `checked_sub`
; │││││││││││┌ @ checked.jl:203 within `sub_with_overflow`
%sret_box.unbox = load i64, ptr %sret_box, align 8
%20 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 0, i64 %sret_box.unbox)
%21 = extractvalue { i64, i1 } %20, 1
; │││││││││││└
; │││││││││││ @ checked.jl:232 within `checked_sub`
br i1 %21, label %L325, label %L329
L325: ; preds = %L321
call fastcc void @julia_throw_overflowerr_binaryop_100524({ ptr, i32 } %state)
unreachable
L329: ; preds = %L321
; │││││││││││ @ checked.jl:231 within `checked_sub`
; │││││││││││┌ @ checked.jl:203 within `sub_with_overflow`
%22 = extractvalue { i64, i1 } %20, 0
; │││││││││││└
; │││││││││││ @ checked.jl:232 within `checked_sub`
br label %L336
L336: ; preds = %L329, %L308.L330_crit_edge
%value_phi.in.sroa.speculated = phi i64 [ %22, %L329 ], [ %value_phi.in.sroa.speculate.load.L308.L330_crit_edge, %L308.L330_crit_edge ]
%value_phi46 = phi i64 [ %18, %L329 ], [ %"sret_box[2]_ptr.unbox", %L308.L330_crit_edge ]
; ││││││└└└└└
; ││││││┌ @ promotion.jl:434 within `*`
; │││││││┌ @ promotion.jl:404 within `promote`
; ││││││││┌ @ promotion.jl:379 within `_promote`
; │││││││││┌ @ number.jl:7 within `convert`
; ││││││││││┌ @ rational.jl:164 within `AbstractFloat`
; │││││││││││┌ @ number.jl:7 within `convert`
; ││││││││││││┌ @ float.jl:245 within `Float32`
%23 = sitofp i64 %value_phi.in.sroa.speculated to float
%24 = sitofp i64 %value_phi46 to float
; │││││││││││└└
; │││││││││││┌ @ float.jl:498 within `/`
%25 = fdiv float %23, %24
; │││││││└└└└└
; │││││││ @ promotion.jl:434 within `*` @ float.jl:497
%26 = fmul float %15, %25
; │└└└└└└
; │┌ @ /home/efaulha2/.julia/packages/CUDA/UurkZ/src/device/array.jl:177 within `setindex!`
; ││┌ @ /home/efaulha2/.julia/packages/CUDA/UurkZ/src/device/array.jl:135 within `arrayset`
; │││┌ @ /home/efaulha2/.julia/packages/CUDA/UurkZ/src/device/array.jl:142 within `arrayset_bits`
; ││││┌ @ /home/efaulha2/.julia/packages/LLVM/iza6e/src/interop/pointer.jl:88 within `unsafe_store!`
; │││││┌ @ none within `pointerset`
; ││││││┌ @ none within `macro expansion` @ /home/efaulha2/.julia/packages/LLVM/iza6e/src/interop/base.jl:39
%27 = getelementptr float, ptr addrspace(1) %"dest::CuDeviceArray.fca.0.extract", i64 %8
%28 = getelementptr float, ptr addrspace(1) %27, i64 -1
store float %26, ptr addrspace(1) %28, align 4
; │││└└└└
; │││ @ /home/efaulha2/.julia/packages/CUDA/UurkZ/src/device/array.jl:137 within `arrayset`
br label %L360
L360: ; preds = %L336, %conversion
; │└└
ret void
; └
}

Here is my versioninfo:
julia> CUDA.versioninfo()
CUDA toolchain:
- runtime 12.5, artifact installation
- driver 555.58.2 for 12.5
- compiler 12.9
CUDA libraries:
- CUBLAS: 12.5.3
- CURAND: 10.3.6
- CUFFT: 11.2.3
- CUSOLVER: 11.6.3
- CUSPARSE: 12.5.1
- CUPTI: 2024.2.1 (API 12.5.0)
- NVML: 12.0.0+555.58.2
Julia packages:
- CUDA: 5.9.2
- CUDA_Driver_jll: 13.0.2+0
- CUDA_Compiler_jll: 0.3.0+0
- CUDA_Runtime_jll: 0.19.2+0
Toolchain:
- Julia: 1.12.1
- LLVM: 18.1.7
Preferences:
- CUDA_Runtime_jll.version: 12.5
2 devices:
0: NVIDIA RTX A4500 (sm_86, 19.375 MiB / 19.990 GiB available)
1: NVIDIA RTX A4500 (sm_86, 19.593 GiB / 19.990 GiB available)