Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions compiler/rustc/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ check_only = ['rustc_driver_impl/check_only']
jemalloc = ['dep:tikv-jemalloc-sys']
llvm = ['rustc_driver_impl/llvm']
llvm_enzyme = ['rustc_driver_impl/llvm_enzyme']
llvm_offload = ['rustc_driver_impl/llvm_offload']
max_level_info = ['rustc_driver_impl/max_level_info']
rustc_randomized_layouts = ['rustc_driver_impl/rustc_randomized_layouts']
# tidy-alphabetical-end
Expand Down
1 change: 1 addition & 0 deletions compiler/rustc_codegen_llvm/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -47,5 +47,6 @@ tracing = "0.1"
# tidy-alphabetical-start
check_only = ["rustc_llvm/check_only"]
llvm_enzyme = []
llvm_offload = []
# tidy-alphabetical-end

7 changes: 7 additions & 0 deletions compiler/rustc_codegen_llvm/src/back/write.rs
Original file line number Diff line number Diff line change
Expand Up @@ -765,6 +765,13 @@ pub(crate) unsafe fn llvm_optimize(
llvm_plugins.len(),
)
};

if cgcx.target_is_like_gpu && config.offload.contains(&config::Offload::Enable) {
unsafe {
llvm::LLVMRustBundleImages(module.module_llvm.llmod(), module.module_llvm.tm.raw());
}
}

result.into_result().unwrap_or_else(|()| llvm_err(dcx, LlvmError::RunLlvmPasses))
}

Expand Down
32 changes: 31 additions & 1 deletion compiler/rustc_codegen_llvm/src/llvm/ffi.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1718,6 +1718,37 @@ unsafe extern "C" {
) -> &'a Value;
}

#[cfg(feature = "llvm_offload")]
pub(crate) use self::Offload::*;

#[cfg(feature = "llvm_offload")]
mod Offload {
use super::*;
unsafe extern "C" {
/// Processes the module and writes it in an offload compatible way into a "host.out" file.
pub(crate) fn LLVMRustBundleImages<'a>(M: &'a Module, TM: &'a TargetMachine) -> bool;
pub(crate) fn LLVMRustOffloadMapper<'a>(OldFn: &'a Value, NewFn: &'a Value);
}
}

#[cfg(not(feature = "llvm_offload"))]
pub(crate) use self::Offload_fallback::*;

#[cfg(not(feature = "llvm_offload"))]
mod Offload_fallback {
use super::*;
/// Processes the module and writes it in an offload compatible way into a "host.out" file.
/// Marked as unsafe to match the real offload wrapper which is unsafe due to FFI.
#[allow(unused_unsafe)]
pub(crate) unsafe fn LLVMRustBundleImages<'a>(_M: &'a Module, _TM: &'a TargetMachine) -> bool {
unimplemented!("This rustc version was not built with LLVM Offload support!");
}
#[allow(unused_unsafe)]
pub(crate) unsafe fn LLVMRustOffloadMapper<'a>(_OldFn: &'a Value, _NewFn: &'a Value) {
unimplemented!("This rustc version was not built with LLVM Offload support!");
}
}

// FFI bindings for `DIBuilder` functions in the LLVM-C API.
// Try to keep these in the same order as in `llvm/include/llvm-c/DebugInfo.h`.
//
Expand Down Expand Up @@ -2025,7 +2056,6 @@ unsafe extern "C" {
) -> &Attribute;

// Operations on functions
pub(crate) fn LLVMRustOffloadMapper<'a>(Fn: &'a Value, Fn: &'a Value);
pub(crate) fn LLVMRustGetOrInsertFunction<'a>(
M: &'a Module,
Name: *const c_char,
Expand Down
1 change: 1 addition & 0 deletions compiler/rustc_driver_impl/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ ctrlc = "3.4.4"
check_only = ['rustc_interface/check_only']
llvm = ['rustc_interface/llvm']
llvm_enzyme = ['rustc_interface/llvm_enzyme']
llvm_offload = ['rustc_interface/llvm_offload']
max_level_info = ['rustc_log/max_level_info']
rustc_randomized_layouts = [
'rustc_index/rustc_randomized_layouts',
Expand Down
1 change: 1 addition & 0 deletions compiler/rustc_interface/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -59,4 +59,5 @@ rustc_abi = { path = "../rustc_abi" }
check_only = ['rustc_codegen_llvm?/check_only']
llvm = ['dep:rustc_codegen_llvm']
llvm_enzyme = ['rustc_builtin_macros/llvm_enzyme', 'rustc_codegen_llvm/llvm_enzyme']
llvm_offload = ['rustc_codegen_llvm/llvm_offload']
# tidy-alphabetical-end
4 changes: 4 additions & 0 deletions compiler/rustc_llvm/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,10 @@ fn main() {
cfg.define("ENZYME", None);
}

if tracked_env_var_os("LLVM_OFFLOAD").is_some() {
cfg.define("OFFLOAD", None);
}

if tracked_env_var_os("LLVM_RUSTLLVM").is_some() {
cfg.define("LLVM_RUSTLLVM", None);
}
Expand Down
58 changes: 58 additions & 0 deletions compiler/rustc_llvm/llvm-wrapper/RustWrapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,14 @@
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <iostream>

// Some of the functions below rely on LLVM modules that may not always be
// available. As such, we only try to build it in the first place, if
// llvm.offload is enabled.
#ifdef OFFLOAD
#include "llvm/Object/OffloadBinary.h"
#include "llvm/Target/TargetMachine.h"
#endif

// for raw `write` in the bad-alloc handler
#ifdef _MSC_VER
#include <io.h>
Expand Down Expand Up @@ -144,6 +152,55 @@ extern "C" void LLVMRustPrintStatistics(RustStringRef OutBuf) {
llvm::PrintStatistics(OS);
}

// Some of the functions here rely on LLVM modules that may not always be
// available. As such, we only try to build it in the first place, if
// llvm.offload is enabled.
#ifdef OFFLOAD
static Error writeFile(StringRef Filename, StringRef Data) {
Expected<std::unique_ptr<FileOutputBuffer>> OutputOrErr =
FileOutputBuffer::create(Filename, Data.size());
if (!OutputOrErr)
return OutputOrErr.takeError();
std::unique_ptr<FileOutputBuffer> Output = std::move(*OutputOrErr);
llvm::copy(Data, Output->getBufferStart());
if (Error E = Output->commit())
return E;
return Error::success();
}

// This is the first of many steps in creating a binary using llvm offload,
// to run code on the gpu. Concrete, it replaces the following binary use:
// clang-offload-packager -o host.out
// --image=file=device.bc,triple=amdgcn-amd-amdhsa,arch=gfx90a,kind=openmp
// The input module is the rust code compiled for a gpu target like amdgpu.
// Based on clang/tools/clang-offload-packager/ClangOffloadPackager.cpp
extern "C" bool LLVMRustBundleImages(LLVMModuleRef M, TargetMachine &TM) {
std::string Storage;
llvm::raw_string_ostream OS1(Storage);
llvm::WriteBitcodeToFile(*unwrap(M), OS1);
OS1.flush();
auto MB = llvm::MemoryBuffer::getMemBufferCopy(Storage, "module.bc");

SmallVector<char, 1024> BinaryData;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This buffer could be an argument provided by rustc and then rustc can do the file writing

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The problem is, that the C++ version is resizable. If we provide a buffer from rust, it wouldn't be.
I asked and there is no reasonable default size, so we'd pass a (likely) too-small buffer in, set the needed length and return false, see that in rust, allocate a larger buffer with the requested size, call the method again, and hope that it now passes.
It's just 3 lines extra on the Rust side, and I don't expect it to become a compile-time bottleneck, since no one (famous last words) will compile >10k kernels, but it still feels ugly.

Copy link
Member Author

@ZuseZ4 ZuseZ4 Nov 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've looked deeper into the next steps, and I think it's probably not worth cleaning up this write, since it will be fused with the next step, where we consume the in-memory host.out file.

I'll implement a save-temps equivalent later for debugging where we'll still write it out, but then we can handle all intermediate writes at once.
Similar to my offload frontend, which Marcello also just rewrote after we figured out, what we actually need.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

wip #149202

raw_svector_ostream OS2(BinaryData);

OffloadBinary::OffloadingImage ImageBinary{};
ImageBinary.TheImageKind = object::IMG_Bitcode;
ImageBinary.Image = std::move(MB);
ImageBinary.TheOffloadKind = object::OFK_OpenMP;
ImageBinary.StringData["triple"] = TM.getTargetTriple().str();
ImageBinary.StringData["arch"] = TM.getTargetCPU();
llvm::SmallString<0> Buffer = OffloadBinary::write(ImageBinary);
if (Buffer.size() % OffloadBinary::getAlignment() != 0)
// Offload binary has invalid size alignment
return false;
OS2 << Buffer;
if (Error E = writeFile("host.out",
StringRef(BinaryData.begin(), BinaryData.size())))
return false;
return true;
}

extern "C" void LLVMRustOffloadMapper(LLVMValueRef OldFn, LLVMValueRef NewFn) {
llvm::Function *oldFn = llvm::unwrap<llvm::Function>(OldFn);
llvm::Function *newFn = llvm::unwrap<llvm::Function>(NewFn);
Expand All @@ -163,6 +220,7 @@ extern "C" void LLVMRustOffloadMapper(LLVMValueRef OldFn, LLVMValueRef NewFn) {
llvm::CloneFunctionChangeType::LocalChangesOnly,
returns);
}
#endif

extern "C" LLVMValueRef LLVMRustGetNamedValue(LLVMModuleRef M, const char *Name,
size_t NameLen) {
Expand Down
3 changes: 3 additions & 0 deletions src/bootstrap/src/core/build_steps/compile.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1436,6 +1436,9 @@ fn rustc_llvm_env(builder: &Builder<'_>, cargo: &mut Cargo, target: TargetSelect
if builder.config.llvm_enzyme {
cargo.env("LLVM_ENZYME", "1");
}
if builder.config.llvm_offload {
cargo.env("LLVM_OFFLOAD", "1");
}
let llvm::LlvmResult { host_llvm_config, .. } = builder.ensure(llvm::Llvm { target });
cargo.env("LLVM_CONFIG", &host_llvm_config);

Expand Down
3 changes: 3 additions & 0 deletions src/bootstrap/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -873,6 +873,9 @@ impl Build {
if self.config.llvm_enzyme {
features.push("llvm_enzyme");
}
if self.config.llvm_offload {
features.push("llvm_offload");
}
// keep in sync with `bootstrap/compile.rs:rustc_cargo_env`
if self.config.rust_randomize_layout && check("rustc_randomized_layouts") {
features.push("rustc_randomized_layouts");
Expand Down
14 changes: 2 additions & 12 deletions src/doc/rustc-dev-guide/src/offload/usage.md
Original file line number Diff line number Diff line change
Expand Up @@ -79,27 +79,17 @@ Now we generate the device code. Replace the target-cpu with the right code for
RUSTFLAGS="-Ctarget-cpu=gfx90a --emit=llvm-bc,llvm-ir -Zoffload=Enable -Zunstable-options" cargo +offload build -Zunstable-options -r -v --target amdgcn-amd-amdhsa -Zbuild-std=core
```

Now find the `<libname>.ll` under target/amdgcn-amd-amdhsa folder and copy it to a device.ll file (or adjust the file names below).
If you work on an NVIDIA or Intel gpu, please adjust the names acordingly and open an issue to share your results (either if you succeed or fail).
First we compile our .ll files (good for manual inspections) to .bc files and clean up leftover artifacts. The cleanup is important, otherwise caching might interfere on following runs.
```
opt lib.ll -o lib.bc
opt device.ll -o device.bc
rm *.o
rm bare.amdgcn.gfx90a.img*
```

```
"clang-offload-packager" "-o" "host.out" "--image=file=device.bc,triple=amdgcn-amd-amdhsa,arch=gfx90a,kind=openmp"

"clang-21" "-cc1" "-triple" "x86_64-unknown-linux-gnu" "-S" "-save-temps=cwd" "-disable-free" "-clear-ast-before-backend" "-main-file-name" "lib.rs" "-mrelocation-model" "pic" "-pic-level" "2" "-pic-is-pie" "-mframe-pointer=all" "-fmath-errno" "-ffp-contract=on" "-fno-rounding-math" "-mconstructor-aliases" "-funwind-tables=2" "-target-cpu" "x86-64" "-tune-cpu" "generic" "-resource-dir" "/<ABSOLUTE_PATH_TO>/rust/build/x86_64-unknown-linux-gnu/llvm/lib/clang/21" "-ferror-limit" "19" "-fopenmp" "-fopenmp-offload-mandatory" "-fgnuc-version=4.2.1" "-fskip-odr-check-in-gmf" "-fembed-offload-object=host.out" "-fopenmp-targets=amdgcn-amd-amdhsa" "-faddrsig" "-D__GCC_HAVE_DWARF2_CFI_ASM=1" "-o" "host.s" "-x" "ir" "lib.bc"

"clang-21" "-cc1as" "-triple" "x86_64-unknown-linux-gnu" "-filetype" "obj" "-main-file-name" "lib.rs" "-target-cpu" "x86-64" "-mrelocation-model" "pic" "-o" "host.o" "host.s"

"clang-linker-wrapper" "--should-extract=gfx90a" "--device-compiler=amdgcn-amd-amdhsa=-g" "--device-compiler=amdgcn-amd-amdhsa=-save-temps=cwd" "--device-linker=amdgcn-amd-amdhsa=-lompdevice" "--host-triple=x86_64-unknown-linux-gnu" "--save-temps" "--linker-path=/ABSOlUTE_PATH_TO/rust/build/x86_64-unknown-linux-gnu/lld/bin/ld.lld" "--hash-style=gnu" "--eh-frame-hdr" "-m" "elf_x86_64" "-pie" "-dynamic-linker" "/lib64/ld-linux-x86-64.so.2" "-o" "bare" "/lib/../lib64/Scrt1.o" "/lib/../lib64/crti.o" "/ABSOLUTE_PATH_TO/crtbeginS.o" "-L/ABSOLUTE_PATH_TO/rust/build/x86_64-unknown-linux-gnu/llvm/bin/../lib/x86_64-unknown-linux-gnu" "-L/ABSOLUTE_PATH_TO/rust/build/x86_64-unknown-linux-gnu/llvm/lib/clang/21/lib/x86_64-unknown-linux-gnu" "-L/lib/../lib64" "-L/usr/lib64" "-L/lib" "-L/usr/lib" "host.o" "-lstdc++" "-lm" "-lomp" "-lomptarget" "-L/ABSOLUTE_PATH_TO/rust/build/x86_64-unknown-linux-gnu/llvm/lib" "-lgcc_s" "-lgcc" "-lpthread" "-lc" "-lgcc_s" "-lgcc" "/ABSOLUTE_PATH_TO/crtendS.o" "/lib/../lib64/crtn.o"
```

Especially for the last command I recommend to not fix the paths, but rather just re-generate them by copying a bare-mode openmp example and compiling it with your clang. By adding `-###` to your clang invocation, you can see the invidual steps.
Especially for the last three commands I recommend to not fix the paths, but rather just re-generate them by copying a bare-mode openmp example and compiling it with your clang. By adding `-###` to your clang invocation, you can see the invidual steps.
You can ignore other steps, e.g. the invocation of a "clang-offload-packager".
```
myclang++ -fuse-ld=lld -O3 -fopenmp -fopenmp-offload-mandatory --offload-arch=gfx90a omp_bare.cpp -o main -###
```
Expand Down
Loading