Skip to content

Commit 565193a

Browse files
authored
Rollup merge of rust-lang#149170 - ZuseZ4:automate-offload-packager, r=oli-obk
automate gpu offloading - part 1 Automates step 1 from the rustc-dev-guide offload section: https://rustc-dev-guide.rust-lang.org/offload/usage.html#compile-instructions `"clang-offload-packager" "-o" "host.out" "--image=file=device.bc,triple=amdgcn-amd-amdhsa,arch=gfx90a,kind=openmp"` Verified on an MI 250X cc ``@jhuber6,`` ``@kevinsala,`` ``@jdoerfert,`` ``@Sa4dUs`` r? oli-obk
2 parents f15f149 + 88ca3bc commit 565193a

File tree

4 files changed

+59
-12
lines changed

4 files changed

+59
-12
lines changed

compiler/rustc_codegen_llvm/src/back/write.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -765,6 +765,13 @@ pub(crate) unsafe fn llvm_optimize(
765765
llvm_plugins.len(),
766766
)
767767
};
768+
769+
if cgcx.target_is_like_gpu && config.offload.contains(&config::Offload::Enable) {
770+
unsafe {
771+
llvm::LLVMRustBundleImages(module.module_llvm.llmod(), module.module_llvm.tm.raw());
772+
}
773+
}
774+
768775
result.into_result().unwrap_or_else(|()| llvm_err(dcx, LlvmError::RunLlvmPasses))
769776
}
770777

compiler/rustc_codegen_llvm/src/llvm/ffi.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1641,6 +1641,9 @@ unsafe extern "C" {
16411641
Name: *const c_char,
16421642
) -> &'a Value;
16431643

1644+
/// Processes the module and writes it in an offload compatible way into a "host.out" file.
/// Returns `true` on success, and `false` if the serialized offload binary is misaligned
/// or the file could not be written.
1645+
pub(crate) fn LLVMRustBundleImages<'a>(M: &'a Module, TM: &'a TargetMachine) -> bool;
1646+
16441647
/// Writes a module to the specified path. Returns 0 on success.
16451648
pub(crate) fn LLVMWriteBitcodeToFile(M: &Module, Path: *const c_char) -> c_int;
16461649

compiler/rustc_llvm/llvm-wrapper/RustWrapper.cpp

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
#include "llvm/IR/Module.h"
2626
#include "llvm/IR/Value.h"
2727
#include "llvm/Object/COFFImportFile.h"
28+
#include "llvm/Object/OffloadBinary.h"
2829
#include "llvm/Remarks/RemarkFormat.h"
2930
#include "llvm/Remarks/RemarkSerializer.h"
3031
#include "llvm/Remarks/RemarkStreamer.h"
@@ -35,6 +36,7 @@
3536
#include "llvm/Support/Signals.h"
3637
#include "llvm/Support/Timer.h"
3738
#include "llvm/Support/ToolOutputFile.h"
39+
#include "llvm/Target/TargetMachine.h"
3840
#include "llvm/Transforms/Utils/Cloning.h"
3941
#include "llvm/Transforms/Utils/ValueMapper.h"
4042
#include <iostream>
@@ -144,6 +146,51 @@ extern "C" void LLVMRustPrintStatistics(RustStringRef OutBuf) {
144146
llvm::PrintStatistics(OS);
145147
}
146148

149+
// Writes `Data` to the file `Filename` through a FileOutputBuffer that is
// sized to hold exactly `Data.size()` bytes. Returns the error produced while
// creating the buffer or while committing it; Error::success() otherwise.
static Error writeFile(StringRef Filename, StringRef Data) {
  auto BufferOrErr = FileOutputBuffer::create(Filename, Data.size());
  // takeError() yields Error::success() when the buffer was created, so this
  // only returns early on a real failure.
  if (Error CreateErr = BufferOrErr.takeError())
    return CreateErr;

  std::unique_ptr<FileOutputBuffer> Buffer = std::move(BufferOrErr.get());
  llvm::copy(Data, Buffer->getBufferStart());
  // commit() already reports success or failure as an Error; forward it.
  return Buffer->commit();
}
160+
161+
// This is the first of many steps in creating a binary using llvm offload,
162+
// to run code on the gpu. Concrete, it replaces the following binary use:
163+
// clang-offload-packager -o host.out
164+
// --image=file=device.bc,triple=amdgcn-amd-amdhsa,arch=gfx90a,kind=openmp
165+
// The input module is the rust code compiled for a gpu target like amdgpu.
166+
// Based on clang/tools/clang-offload-packager/ClangOffloadPackager.cpp
167+
extern "C" bool LLVMRustBundleImages(LLVMModuleRef M, TargetMachine &TM) {
168+
std::string Storage;
169+
llvm::raw_string_ostream OS1(Storage);
170+
llvm::WriteBitcodeToFile(*unwrap(M), OS1);
171+
OS1.flush();
172+
auto MB = llvm::MemoryBuffer::getMemBufferCopy(Storage, "module.bc");
173+
174+
SmallVector<char, 1024> BinaryData;
175+
raw_svector_ostream OS2(BinaryData);
176+
177+
OffloadBinary::OffloadingImage ImageBinary{};
178+
ImageBinary.TheImageKind = object::IMG_Bitcode;
179+
ImageBinary.Image = std::move(MB);
180+
ImageBinary.TheOffloadKind = object::OFK_OpenMP;
181+
ImageBinary.StringData["triple"] = TM.getTargetTriple().str();
182+
ImageBinary.StringData["arch"] = TM.getTargetCPU();
183+
llvm::SmallString<0> Buffer = OffloadBinary::write(ImageBinary);
184+
if (Buffer.size() % OffloadBinary::getAlignment() != 0)
185+
// Offload binary has invalid size alignment
186+
return false;
187+
OS2 << Buffer;
188+
if (Error E = writeFile("host.out",
189+
StringRef(BinaryData.begin(), BinaryData.size())))
190+
return false;
191+
return true;
192+
}
193+
147194
extern "C" void LLVMRustOffloadMapper(LLVMValueRef OldFn, LLVMValueRef NewFn) {
148195
llvm::Function *oldFn = llvm::unwrap<llvm::Function>(OldFn);
149196
llvm::Function *newFn = llvm::unwrap<llvm::Function>(NewFn);

src/doc/rustc-dev-guide/src/offload/usage.md

Lines changed: 2 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -79,27 +79,17 @@ Now we generate the device code. Replace the target-cpu with the right code for
7979
RUSTFLAGS="-Ctarget-cpu=gfx90a --emit=llvm-bc,llvm-ir -Zoffload=Enable -Zunstable-options" cargo +offload build -Zunstable-options -r -v --target amdgcn-amd-amdhsa -Zbuild-std=core
8080
```
8181

82-
Now find the `<libname>.ll` under target/amdgcn-amd-amdhsa folder and copy it to a device.ll file (or adjust the file names below).
83-
If you work on an NVIDIA or Intel gpu, please adjust the names accordingly and open an issue to share your results (whether you succeed or fail).
84-
First we compile our .ll files (good for manual inspections) to .bc files and clean up leftover artifacts. The cleanup is important, otherwise caching might interfere on following runs.
85-
```
86-
opt lib.ll -o lib.bc
87-
opt device.ll -o device.bc
88-
rm *.o
89-
rm bare.amdgcn.gfx90a.img*
90-
```
9182

9283
```
93-
"clang-offload-packager" "-o" "host.out" "--image=file=device.bc,triple=amdgcn-amd-amdhsa,arch=gfx90a,kind=openmp"
94-
9584
"clang-21" "-cc1" "-triple" "x86_64-unknown-linux-gnu" "-S" "-save-temps=cwd" "-disable-free" "-clear-ast-before-backend" "-main-file-name" "lib.rs" "-mrelocation-model" "pic" "-pic-level" "2" "-pic-is-pie" "-mframe-pointer=all" "-fmath-errno" "-ffp-contract=on" "-fno-rounding-math" "-mconstructor-aliases" "-funwind-tables=2" "-target-cpu" "x86-64" "-tune-cpu" "generic" "-resource-dir" "/<ABSOLUTE_PATH_TO>/rust/build/x86_64-unknown-linux-gnu/llvm/lib/clang/21" "-ferror-limit" "19" "-fopenmp" "-fopenmp-offload-mandatory" "-fgnuc-version=4.2.1" "-fskip-odr-check-in-gmf" "-fembed-offload-object=host.out" "-fopenmp-targets=amdgcn-amd-amdhsa" "-faddrsig" "-D__GCC_HAVE_DWARF2_CFI_ASM=1" "-o" "host.s" "-x" "ir" "lib.bc"
9685
9786
"clang-21" "-cc1as" "-triple" "x86_64-unknown-linux-gnu" "-filetype" "obj" "-main-file-name" "lib.rs" "-target-cpu" "x86-64" "-mrelocation-model" "pic" "-o" "host.o" "host.s"
9887
9988
"clang-linker-wrapper" "--should-extract=gfx90a" "--device-compiler=amdgcn-amd-amdhsa=-g" "--device-compiler=amdgcn-amd-amdhsa=-save-temps=cwd" "--device-linker=amdgcn-amd-amdhsa=-lompdevice" "--host-triple=x86_64-unknown-linux-gnu" "--save-temps" "--linker-path=/ABSOLUTE_PATH_TO/rust/build/x86_64-unknown-linux-gnu/lld/bin/ld.lld" "--hash-style=gnu" "--eh-frame-hdr" "-m" "elf_x86_64" "-pie" "-dynamic-linker" "/lib64/ld-linux-x86-64.so.2" "-o" "bare" "/lib/../lib64/Scrt1.o" "/lib/../lib64/crti.o" "/ABSOLUTE_PATH_TO/crtbeginS.o" "-L/ABSOLUTE_PATH_TO/rust/build/x86_64-unknown-linux-gnu/llvm/bin/../lib/x86_64-unknown-linux-gnu" "-L/ABSOLUTE_PATH_TO/rust/build/x86_64-unknown-linux-gnu/llvm/lib/clang/21/lib/x86_64-unknown-linux-gnu" "-L/lib/../lib64" "-L/usr/lib64" "-L/lib" "-L/usr/lib" "host.o" "-lstdc++" "-lm" "-lomp" "-lomptarget" "-L/ABSOLUTE_PATH_TO/rust/build/x86_64-unknown-linux-gnu/llvm/lib" "-lgcc_s" "-lgcc" "-lpthread" "-lc" "-lgcc_s" "-lgcc" "/ABSOLUTE_PATH_TO/crtendS.o" "/lib/../lib64/crtn.o"
10089
```
10190

102-
Especially for the last command, I recommend not fixing the paths by hand, but rather re-generating them by copying a bare-mode OpenMP example and compiling it with your clang. By adding `-###` to your clang invocation, you can see the individual steps.
91+
Especially for the last three commands, I recommend not fixing the paths by hand, but rather re-generating them by copying a bare-mode OpenMP example and compiling it with your clang. By adding `-###` to your clang invocation, you can see the individual steps.
92+
You can ignore other steps, e.g. the invocation of a "clang-offload-packager".
10393
```
10494
myclang++ -fuse-ld=lld -O3 -fopenmp -fopenmp-offload-mandatory --offload-arch=gfx90a omp_bare.cpp -o main -###
10595
```

0 commit comments

Comments
 (0)