[FP16] Improved performance by fusing dequantize with compute in kernels: 20-30% Inference Speedup #135
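The CI workflow below builds TornadoVM from source for each GPU backend (OpenCL and PTX) on a self-hosted runner, builds GPULlama3.java against it, and smoke-tests a set of FP16 and Q8_0 GGUF models: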
```yaml
name: GPULlama3 Build & Run

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]
    types: [opened, synchronize, reopened]

env:
  JAVA_HOME: /opt/jenkins/jdks/graal-23.1.0/jdk-21.0.3
  TORNADO_ROOT: ${{ github.workspace }}/GPULlama3.java/external/tornadovm
  LLAMA_ROOT: ${{ github.workspace }}
  GRAAL_JARS: /opt/graalJars
  MODELS_DIR: /opt/models
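  # NOTE: JAVA_HOME, GRAAL_JARS, and MODELS_DIR point at locations that are
  # assumed to be pre-provisioned on the self-hosted runner.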

jobs:
  code-quality:
    runs-on: self-hosted
    timeout-minutes: 30
    steps:
      - name: Checkout GPULlama3
        uses: actions/checkout@v4

      - name: Check code formatting (Spotless)
        run: |
          cd ${{ github.workspace }}
          ./mvnw -T12C -Pspotless spotless:check

  build-and-run:
    runs-on: self-hosted
    needs: code-quality
    timeout-minutes: 30
    strategy:
      fail-fast: true
      matrix:
        backend:
          - name: opencl
          - name: ptx
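    # The matrix fans this job out once per TornadoVM backend; fail-fast
    # cancels the sibling backend's job on the first failure.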
    steps:
      - name: Checkout GPULlama3
        uses: actions/checkout@v4

      - name: Clone TornadoVM master
        run: |
          git clone --depth 1 --branch master \
            https://github.com/beehive-lab/TornadoVM.git \
            $TORNADO_ROOT
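      # Cloning master (rather than a pinned tag) means CI tracks TornadoVM
      # HEAD, so upstream changes can affect these builds.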

      - name: Set up Python venv for TornadoVM
        run: |
          python3 -m venv $TORNADO_ROOT/venv
          source $TORNADO_ROOT/venv/bin/activate
          python --version
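      # Note: venv activation does not persist across steps, so the build
      # step below re-sources venv/bin/activate before invoking make.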

      - name: Build TornadoVM
        run: |
          cd $TORNADO_ROOT
          mkdir -p graalJars && cp $GRAAL_JARS/* graalJars/
          source venv/bin/activate
          echo "=== Building TornadoVM ==="
          make BACKEND=${{ matrix.backend.name }}
          echo "=== Searching for TornadoVM SDK directory ==="
          SDK_DIR=$(find dist -maxdepth 3 -type d -path "*/tornadovm-*-${{ matrix.backend.name }}" | head -n 1)
          if [ -z "$SDK_DIR" ]; then
            echo "::error::Could not locate TornadoVM SDK directory!"
            find dist -maxdepth 5 -type d
            exit 1
          fi
          FULL_SDK="${PWD}/${SDK_DIR}"
          echo "Detected TornadoVM SDK: $FULL_SDK"
          # Export for current shell session
          export TORNADO_SDK="$FULL_SDK"
          export PATH="$FULL_SDK/bin:$JAVA_HOME/bin:$PATH"
          # Save for subsequent steps
          echo "TORNADO_SDK=$FULL_SDK" >> $GITHUB_ENV
          echo "PATH=$PATH" >> $GITHUB_ENV
          echo "=== Checking tornado CLI ==="
          which tornado || { echo "::error::tornado not in PATH"; exit 1; }
          tornado --devices
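      # TORNADO_SDK persists to later steps via $GITHUB_ENV; each run step
      # below still re-exports PATH defensively before calling the launcher.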

      - name: Build GPULlama3.java
        run: |
          cd ${{ github.workspace }}
          echo "Using TORNADO_SDK=$TORNADO_SDK"
          export PATH="$TORNADO_SDK/bin:$JAVA_HOME/bin:$PATH"
          tornado --version
          ./mvnw clean package -DskipTests
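      # Each step below is a smoke test: it loads one GGUF model and
      # generates a short completion on the backend selected by the matrix.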

      - name: FP16 - Run Llama-3.2-1B-Instruct-F16.gguf
        run: |
          cd ${{ github.workspace }}
          export PATH="$TORNADO_SDK/bin:$JAVA_HOME/bin:$PATH"
          ./llama-tornado --gpu --${{ matrix.backend.name }} \
            --model $MODELS_DIR/Llama-3.2-1B-Instruct-F16.gguf \
            --prompt "Say hello"

      - name: FP16 - Run Qwen3-4B-f16.gguf
        run: |
          cd ${{ github.workspace }}
          export PATH="$TORNADO_SDK/bin:$JAVA_HOME/bin:$PATH"
          ./llama-tornado --gpu --${{ matrix.backend.name }} \
            --model $MODELS_DIR/Qwen3-4B-f16.gguf \
            --prompt "Say hello"

      - name: FP16 - Run Mistral-7B-Instruct-v0.3.fp16.gguf
        run: |
          cd ${{ github.workspace }}
          export PATH="$TORNADO_SDK/bin:$JAVA_HOME/bin:$PATH"
          ./llama-tornado --gpu --${{ matrix.backend.name }} \
            --model $MODELS_DIR/Mistral-7B-Instruct-v0.3.fp16.gguf \
            --prompt "Say hello"

      - name: FP16 - Run Qwen2.5-1.5b-instruct-fp16.gguf
        run: |
          cd ${{ github.workspace }}
          export PATH="$TORNADO_SDK/bin:$JAVA_HOME/bin:$PATH"
          ./llama-tornado --gpu --${{ matrix.backend.name }} \
            --model $MODELS_DIR/qwen2.5-1.5b-instruct-fp16.gguf \
            --prompt "Say hello"

      - name: FP16 - Run Phi-3-mini-4k-instruct-fp16.gguf
        run: |
          cd ${{ github.workspace }}
          export PATH="$TORNADO_SDK/bin:$JAVA_HOME/bin:$PATH"
          ./llama-tornado --gpu --${{ matrix.backend.name }} \
            --model $MODELS_DIR/Phi-3-mini-4k-instruct-fp16.gguf \
            --prompt "Say hello"
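      # The Q8 steps below repeat the same smoke test with Q8_0-quantized
      # variants of (mostly) the same models.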

      - name: Q8 - Run Llama-3.2-1B-Instruct-Q8_0.gguf
        run: |
          cd ${{ github.workspace }}
          export PATH="$TORNADO_SDK/bin:$JAVA_HOME/bin:$PATH"
          ./llama-tornado --gpu --${{ matrix.backend.name }} \
            --model $MODELS_DIR/Llama-3.2-1B-Instruct-Q8_0.gguf \
            --prompt "Say hello"

      - name: Q8 - Run Qwen3-0.6B-Q8_0.gguf
        run: |
          cd ${{ github.workspace }}
          export PATH="$TORNADO_SDK/bin:$JAVA_HOME/bin:$PATH"
          ./llama-tornado --gpu --${{ matrix.backend.name }} \
            --model $MODELS_DIR/Qwen3-0.6B-Q8_0.gguf \
            --prompt "Say hello"

      - name: Q8 - Run Phi-3-mini-4k-instruct-Q8_0.gguf
        run: |
          cd ${{ github.workspace }}
          export PATH="$TORNADO_SDK/bin:$JAVA_HOME/bin:$PATH"
          ./llama-tornado --gpu --${{ matrix.backend.name }} \
            --model $MODELS_DIR/Phi-3-mini-4k-instruct-Q8_0.gguf \
            --prompt "Say hello"

      - name: Q8 - Run Qwen2.5-1.5b-instruct-q8_0.gguf
        run: |
          cd ${{ github.workspace }}
          export PATH="$TORNADO_SDK/bin:$JAVA_HOME/bin:$PATH"
          ./llama-tornado --gpu --${{ matrix.backend.name }} \
            --model $MODELS_DIR/qwen2.5-1.5b-instruct-q8_0.gguf \
            --prompt "Say hello"

      - name: Q8 - Run Mistral-7B-Instruct-v0.3.Q8_0.gguf
        run: |
          cd ${{ github.workspace }}
          export PATH="$TORNADO_SDK/bin:$JAVA_HOME/bin:$PATH"
          ./llama-tornado --gpu --${{ matrix.backend.name }} \
            --model $MODELS_DIR/Mistral-7B-Instruct-v0.3.Q8_0.gguf \
            --prompt "Say hello"
```
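
To reproduce one matrix cell outside Actions, the same commands can be run by hand. A minimal sketch, assuming TornadoVM has already been built under external/tornadovm and the model file is available locally (the paths below mirror the runner's layout and are illustrative):

```bash
# Reproduce the opencl/FP16 smoke test locally (paths are illustrative).
export JAVA_HOME=/opt/jenkins/jdks/graal-23.1.0/jdk-21.0.3
export TORNADO_SDK=$(find external/tornadovm/dist -maxdepth 3 -type d \
    -path "*/tornadovm-*-opencl" | head -n 1)
export PATH="$TORNADO_SDK/bin:$JAVA_HOME/bin:$PATH"

tornado --devices                 # confirm a GPU device is visible
./mvnw clean package -DskipTests  # build GPULlama3.java
./llama-tornado --gpu --opencl \
    --model /opt/models/Llama-3.2-1B-Instruct-F16.gguf \
    --prompt "Say hello"
```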