Skip to content

Commit 64ee3ba

Browse files
committed
shell scripts + dockerfile for CD
1 parent 9c9c0ae commit 64ee3ba

File tree

5 files changed

+376
-0
lines changed

5 files changed

+376
-0
lines changed

Dockerfile

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
# This Dockerfile is for RLFT in a cloud instance (AWS/Azure/GCP)
#
# Author: Abhishek Sriram <noobsiecoder@gmail.com>
# Date: Aug 21st, 2025
# Place: Boston, MA
#
# Stages:
#   1) builder - clones the repository and decodes the base64 secrets into
#                /src/secrets
#   2) runtime - installs the toolchain and copies /src from the builder
#
# Build arguments (consumed by the builder stage only):
#   REPO_URI               Repository URI to clone
#   BRANCH_NAME            Branch to check out
#   GCP_STORAGE_JSON_FILE  base64-encoded content of secrets/gcp-storage.json
#   MODELS_API_ENV_FILE    base64-encoded content of secrets/models-api.env

# Build stage with secrets
FROM nvidia/cuda:12.2.0-base-ubuntu22.04 AS builder

# Get arguments
# Repository URI
ARG REPO_URI
# Branch name of the repository
ARG BRANCH_NAME
# Secrets
ARG GCP_STORAGE_JSON_FILE
ARG MODELS_API_ENV_FILE

RUN apt-get update && apt-get install -y git curl && apt-get clean

WORKDIR /src
RUN git clone "${REPO_URI}" . && git checkout "${BRANCH_NAME}"

# Write secrets
RUN mkdir -p secrets && \
    echo "${GCP_STORAGE_JSON_FILE}" | base64 -d > secrets/gcp-storage.json && \
    echo "${MODELS_API_ENV_FILE}" | base64 -d > secrets/models-api.env

# Image (OS) type
FROM nvidia/cuda:12.2.0-base-ubuntu22.04

# Prevent interactive prompts + added Time Zone
ENV DEBIAN_FRONTEND=noninteractive
ENV TZ=UTC

# Install build dependency tools
# curl is required by the uv installer below.
RUN apt-get update && apt-get install -y \
    python3 \
    python3-pip \
    iverilog \
    yosys \
    curl \
    && apt-get clean

# Install uv
# The installer runs without prompting, so no confirmation flag is passed.
RUN curl -LsSf https://astral.sh/uv/install.sh | sh
ENV PATH="/root/.cargo/bin:/root/.local/bin:${PATH}"

# Work directory of the application
WORKDIR /src

# Take the checked-out repository (including secrets/) from the builder.
# ARG values are scoped to the stage that declares them, so REPO_URI and
# BRANCH_NAME are not available here and the clone cannot be repeated.
COPY --from=builder /src /src

# Install python dependencies
# NOTE: Runs as a cautionary step
RUN uv sync

# Make script executable
RUN chmod +x scripts/cd.sh

# Runner/Executable point
CMD ["./scripts/cd.sh"]

scripts/cd.sh

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
#!/bin/bash
# Container-side runner. It currently performs two steps:
#   - runs a subset of tests from tests/ that exercise the LLM
#   - announces the RLFT step (the actual command is still a TODO)
#
# Author: Abhishek Sriram <noobsiecoder@gmail.com>
# Date: Aug 20th, 2025
# Place: Boston, MA
set -e

# Pytest targets used to check that the LLM loads in the cloud instance.
llm_tests=(
  tests/test_llm_prompts.py::test_prompt_extraction
  tests/test_models.py
)

# Step 1: Run few more tests to check if LLM loads in cloud instance
printf '%s\n' "=== Running LLM Sepecific Tests ==="
uv run pytest -v "${llm_tests[@]}"

# Step 2: Run RLFT
# TODO: insert command(s) here ...
printf '%s\n' "=== Running RLFT ==="

scripts/check_docker_instance.sh

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
#!/bin/bash
# Script checks if there is a docker instance running in a cloud VM.
#
# Prints exactly one status keyword on the first output line:
#   NO_CONTAINERS       nothing is running
#   NO_APP_CONTAINERS   only excluded (system/infra) containers are running
#   VERIGEN_RUNNING     a verigen/llm/rlft container is running (rows follow)
#   OTHER_APPS_RUNNING  other application containers are running (rows follow)
# The exit status is 0 in every case.
#
# Author: Abhishek Sriram <noobsiecoder@gmail.com>
# Date: Aug 22nd, 2025
# Place: Boston, MA
set -e

#######################################
# Print the running containers as "name / image / status" rows,
# without the table header line.
# Outputs: container rows on stdout (empty when docker is unavailable)
#######################################
list_running_containers() {
  docker ps --format "table {{.Names}}\t{{.Image}}\t{{.Status}}" 2>/dev/null | tail -n +2
}

#######################################
# Classify a container listing and print the matching status keyword.
# Arguments: $1 - container rows as produced by list_running_containers
# Outputs:   status keyword, followed by the relevant rows if any
# Returns:   0 always
#######################################
classify_containers() {
  local running_containers=$1
  local app_containers verigen_containers

  if [ -z "$running_containers" ]; then
    echo "NO_CONTAINERS"
    return 0
  fi

  # Check if any non-system containers are running
  # Exclude common system containers and daemons.
  # grep exits 1 when it prints nothing; "|| true" keeps "set -e" from
  # aborting the script before the status keyword is printed.
  app_containers=$(printf '%s\n' "$running_containers" \
    | grep -v -E "^(portainer|watchtower|traefik|nginx-proxy|docker-proxy|registry)" \
    | grep -v -E "(daemon|system)" || true)

  if [ -z "$app_containers" ]; then
    echo "NO_APP_CONTAINERS"
    return 0
  fi

  # Check specifically for VeriGenLLM-v2 related containers
  verigen_containers=$(printf '%s\n' "$app_containers" | grep -i "verigen\|llm\|rlft" || true)

  if [ -n "$verigen_containers" ]; then
    echo "VERIGEN_RUNNING"
    echo "$verigen_containers"
    return 0
  fi

  # Other app containers are running
  echo "OTHER_APPS_RUNNING"
  echo "$app_containers"
  return 0
}

main() {
  classify_containers "$(list_running_containers)"
}

main "$@"

scripts/cloud.sh

Lines changed: 179 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,179 @@
1+
#!/bin/bash
# This file is the orchestrator of workflow to ship it to cloud
# Functionality:
# 1) Checks availability for resource in each cloud services.
# 2) Deploy in a VM that is available; else, schedule-for-later/exit.
#
# Credentials are read from environment variables of the same name.
# A shell expansion such as "$secrets.NAME" does not look anything up: it
# expands the (unset) shell variable "secrets" and appends the literal
# text ".NAME", so every credential would end up as the string ".NAME".
# NOTE(review): presumably the CI job exports these from its secret store -
# confirm the variable names against the workflow definition.
#
# Author: Abhishek Sriram <noobsiecoder@gmail.com>
# Date: Aug 21st, 2025
# Place: Boston, MA
set -e

# ==================== AWS credentials =====================
# TODO: AWS Credentials not used as quota wasn't increased yet (Recorded on: Aug 22nd, 2025)

# ==================== Azure credentials ===================
AZURE_USERNAME="${AZURE_USERNAME:-}"
# NOTE(review): the password is taken from AZURE_APP_ID, mirroring the
# original mapping - confirm this is the service principal secret.
AZURE_PASSWORD="${AZURE_APP_ID:-}"
AZURE_TENANT="${AZURE_TENANT:-}"
AZURE_RESOURCE_GROUP="${AZURE_RESOURCE_GROUP:-}"
AZURE_VM_INSTANCE="${AZURE_VM_INSTANCE:-}"

# =================== GCP credentials =====================
GCP_TYPE="${GCP_TYPE:-}"
GCP_PRIVATE_KEY_ID="${GCP_PRIVATE_KEY_ID:-}"
GCP_PROJECT_ID="${GCP_PROJECT_ID:-}"
GCP_PRIVATE_KEY="${GCP_PRIVATE_KEY:-}"
GCP_CLIENT_EMAIL="${GCP_CLIENT_EMAIL:-}"
GCP_CLIENT_ID="${GCP_CLIENT_ID:-}"
GCP_AUTH_URI="${GCP_AUTH_URI:-}"
GCP_TOKEN_URI="${GCP_TOKEN_URI:-}"
GCP_CERT="${GCP_CERT:-}"
GCP_CERT_URI="${GCP_CERT_URI:-}"
GCP_DOMAIN="${GCP_DOMAIN:-}"
# VM name and zone used by check_gcp; they were never assigned before.
GCP_INSTANCE_NAME="${GCP_INSTANCE_NAME:-}"
GCP_INSTANCE_ZONE="${GCP_INSTANCE_ZONE:-}"

# NOTE(review): "~" is not expanded inside double quotes, so this value
# contains a literal tilde - confirm that is what the remote pgrep
# pattern in check_gcp is meant to match.
FILE_ENTRYPOINT="~/VeriGenLLM-v2/main.py"
36+
37+
# TODO: Yet to work on AWS VMs -> Waiting on quota increase
38+
# Function to check AWS VM
39+
40+
41+
# Function to check AZURE VM
# TODO: Instead of the python script, check dockerfile

#######################################
# Print the lines found between the "[stdout]" and "[stderr]" markers of
# a run-command message read from stdin (markers themselves excluded).
#######################################
_azure_extract_stdout() {
  awk '/\[stderr\]/ { inside = 0 } inside { print } /\[stdout\]/ { inside = 1 }'
}

#######################################
# Print every line of $2 that follows the first line containing $1.
# Arguments: $1 - marker text, $2 - multi-line text to search
#######################################
_azure_lines_after() {
  printf '%s\n' "$2" | awk -v marker="$1" 'found { print } index($0, marker) { found = 1 }'
}

#######################################
# Log in to Azure, run a Docker probe on the VM and report whether the VM
# is free for a deployment.
# Globals (read): AZURE_USERNAME, AZURE_PASSWORD, AZURE_TENANT,
#                 AZURE_RESOURCE_GROUP, AZURE_VM_INSTANCE
# Outputs: human-readable status (and container rows when busy) on stdout
# Returns: 1 - VM available for deployment
#          0 - VM busy (our application or other applications)
#          3 - login/command failed or status could not be determined
# NOTE: the script runs under "set -e"; call this from a conditional
# (or with "|| rc=$?") so a non-zero return does not abort the caller.
#######################################
check_azure() {
  local check_script run_output stdout_content container_info

  # Checking Azure container
  echo "Checking Azure VM for running Docker containers..."

  # Login to Azure
  if ! az login --service-principal \
    -u "$AZURE_USERNAME" -p "$AZURE_PASSWORD" --tenant "$AZURE_TENANT" \
    > /dev/null 2>&1; then
    echo "✗ Failed to log in to Azure"
    return 3 # Cannot reach the VM without a session
  fi

  # Check if any Docker containers are running (excluding system containers)
  # This command will:
  # 1. List all running containers with their names and images
  # 2. Exclude the Docker daemon and system containers
  # 3. Look for actual application containers
  check_script='
# Get list of running containers
running_containers=$(docker ps --format "table {{.Names}}\t{{.Image}}\t{{.Status}}" 2>/dev/null | tail -n +2)

if [ -z "$running_containers" ]; then
  echo "NO_CONTAINERS"
  exit 0
fi

# Check if any non-system containers are running
# Exclude common system containers and daemons
app_containers=$(echo "$running_containers" | grep -v -E "^(portainer|watchtower|traefik|nginx-proxy|docker-proxy|registry)" | grep -v -E "(daemon|system)")

if [ -z "$app_containers" ]; then
  echo "NO_APP_CONTAINERS"
  exit 0
fi

# Check specifically for VeriGenLLM-v2 related containers
verigen_containers=$(echo "$app_containers" | grep -i "verigen\|llm\|rlft")

if [ ! -z "$verigen_containers" ]; then
  echo "VERIGEN_RUNNING"
  echo "$verigen_containers"
  exit 0
fi

# Other app containers are running
echo "OTHER_APPS_RUNNING"
echo "$app_containers"
exit 0
'

  # Execute the check script on Azure VM.
  # The assignment is tested directly: "local var=$(cmd)" would report the
  # status of "local" (always 0) and hide a failed invocation.
  if ! run_output=$(az vm run-command invoke \
    -g "$AZURE_RESOURCE_GROUP" \
    -n "$AZURE_VM_INSTANCE" \
    --command-id RunShellScript \
    --scripts "$check_script" \
    --output json 2>&1); then
    echo "✗ Failed to execute command on Azure VM"
    echo "Error: $run_output"
    return 3 # VM unreachable or command failed
  fi

  # Parse the output: take the message text and keep the lines between the
  # "[stdout]" and "[stderr]" markers. The markers sit on their own lines,
  # so matching only the remainder of the "[stdout]" line yields nothing.
  stdout_content=$(printf '%s\n' "$run_output" \
    | jq -r '.value[0].message' 2>/dev/null \
    | sed 's/\\n/\n/g' \
    | _azure_extract_stdout || true)

  # Determine the status based on output
  case "$stdout_content" in
    *NO_CONTAINERS*)
      echo "✓ No Docker containers running on Azure VM - VM is available"
      return 1 # VM available for deployment
      ;;
    *NO_APP_CONTAINERS*)
      echo "✓ No application containers running on Azure VM - VM is available"
      return 1 # VM available for deployment
      ;;
    *VERIGEN_RUNNING*)
      echo "✗ VeriGenLLM-v2 is already running on Azure VM"
      # Extract and display the container details
      container_info=$(_azure_lines_after "VERIGEN_RUNNING" "$stdout_content")
      echo "Running containers:"
      echo "$container_info"
      return 0 # VM busy with our application
      ;;
    *OTHER_APPS_RUNNING*)
      echo "⚠ Other applications are running on Azure VM"
      # Extract and display the container details
      container_info=$(_azure_lines_after "OTHER_APPS_RUNNING" "$stdout_content")
      echo "Running containers:"
      echo "$container_info"
      return 0 # VM busy with other applications
      ;;
    *)
      echo "✗ Unable to determine Azure VM status"
      return 3 # Unknown status
      ;;
  esac
}
132+
133+
# Function to check GCP VM
# TODO: Instead of the python script, check dockerfile

#######################################
# Write the service-account key file and ask the GCP VM whether the
# entry-point process is running.
# Globals (read):    GCP_TYPE, GCP_PROJECT_ID, GCP_PRIVATE_KEY_ID,
#                    GCP_PRIVATE_KEY, GCP_CLIENT_EMAIL, GCP_CLIENT_ID,
#                    GCP_AUTH_URI, GCP_TOKEN_URI, GCP_CERT, GCP_CERT_URI,
#                    GCP_DOMAIN, GCP_INSTANCE_NAME, GCP_INSTANCE_ZONE,
#                    FILE_ENTRYPOINT
# Globals (written): GOOGLE_APPLICATION_CREDENTIALS (exported)
# Outputs: status line (and process info when running) on stdout
# Returns: 0 - process found on the VM
#          1 - process not found, or the remote command failed
# NOTE: the script runs under "set -e"; call this from a conditional
# (or with "|| rc=$?") so a non-zero return does not abort the caller.
# NOTE(review): confirm that the gcloud CLI actually picks up
# GOOGLE_APPLICATION_CREDENTIALS for "compute ssh" in this setup.
#######################################
check_gcp() {
  local credentials_file="/tmp/gcp-secret.json"
  local newline=$'\n'
  local escaped_newline='\n'
  local json_private_key output

  # A JSON string must not contain raw line breaks, so real newlines in the
  # key are written as the two-character escape "\n"; escapes that are
  # already present are left untouched.
  json_private_key=${GCP_PRIVATE_KEY//"$newline"/"$escaped_newline"}

  # Write security object to /tmp/gcp-secret.json, readable by the owner only
  (
    umask 077
    cat > "$credentials_file" <<EOF
{
  "type": "$GCP_TYPE",
  "project_id": "$GCP_PROJECT_ID",
  "private_key_id": "$GCP_PRIVATE_KEY_ID",
  "private_key": "$json_private_key",
  "client_email": "$GCP_CLIENT_EMAIL",
  "client_id": "$GCP_CLIENT_ID",
  "auth_uri": "$GCP_AUTH_URI",
  "token_uri": "$GCP_TOKEN_URI",
  "auth_provider_x509_cert_url": "$GCP_CERT",
  "client_x509_cert_url": "$GCP_CERT_URI",
  "universe_domain": "$GCP_DOMAIN"
}
EOF
  )
  # umask does not tighten a file that already existed
  chmod 600 "$credentials_file"

  export GOOGLE_APPLICATION_CREDENTIALS="$credentials_file"

  # Run command on VM.
  # The assignment is tested directly: "local var=$(cmd)" would report the
  # status of "local" (always 0) and hide a failed ssh/pgrep.
  if output=$(gcloud compute ssh "$GCP_INSTANCE_NAME" \
    --zone="$GCP_INSTANCE_ZONE" \
    --command="pgrep -af '$FILE_ENTRYPOINT'" \
    --ssh-flag="-o ConnectTimeout=10" \
    2>&1) && [ -n "$output" ]; then
    echo "✓ Script is running"
    echo "Process info: $output"
    return 0
  else
    echo "✗ Script is not running"
    return 1
  fi
}
175+
176+
# Main Runner
# Currently a placeholder that does nothing and returns 0.
# An empty "{}" body is a bash syntax error, so the body holds the
# no-op ":" until the orchestration logic is written.
main() {
  :
}

main "$@" # Run main function

scripts/starter.sh

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
#!/bin/bash
# Base script for CD in Azure/GCP VM:
# 1) Checks Docker (installs if missing)
# 2) Ensures NVIDIA drivers for GPU are installed
# 3) Builds & runs Docker image for RLFT
#
# Usage: starter.sh <repo-uri> <branch> <gcp-secrets-base64> <api-keys-base64>
#   $1  repository URI passed to the image build as REPO_URI
#   $2  branch passed to the image build as BRANCH_NAME
#   $3  base64 text passed to the image build as GCP_STORAGE_JSON_FILE
#   $4  base64 text passed to the image build as MODELS_API_ENV_FILE
#
# Author: Abhishek Sriram <noobsiecoder@gmail.com>
# Date: Aug 21st, 2025
# Place: Boston, MA

set -e
# Without pipefail the status of "docker ... | tee" is tee's status, so a
# failed build would go unnoticed and the run step would still start.
set -o pipefail

if [ "$#" -lt 4 ]; then
  echo "Usage: $0 <repo-uri> <branch> <gcp-secrets-base64> <api-keys-base64>" >&2
  exit 2
fi

GITHUB_REPO_URI=$1
GITHUB_REPO_BRANCH=$2
GCP_SECRETS_FILE=$3
APIKEYS_FILE=$4
DOCKER_IMAGE_NAME="verilog-rlft"
LOGFILE="/var/log/rlft_setup.log"
BUILD_LOGFILE="/var/log/docker_build.log"
RUN_LOGFILE="/var/log/docker_run.log"

# Step 0: Redirect all logs
exec > >(sudo tee -a "$LOGFILE") 2>&1
echo "===== Starting RLFT setup at $(date) ====="

# Step 1: Install Docker if missing
if ! command -v docker &> /dev/null; then
  echo "Docker not found. Installing..."
  sudo apt-get update -y
  sudo apt-get install -y docker.io
  sudo systemctl enable docker
  sudo systemctl start docker
  echo "Docker installed successfully."
else
  echo "Docker is already installed."
fi

# Step 2: Ensure NVIDIA drivers for GPU support
if command -v nvidia-smi &> /dev/null; then
  echo "NVIDIA drivers already installed."
else
  echo "NVIDIA drivers not found. Installing..."
  # GCP official GPU driver installer
  sudo apt-get install -y "linux-headers-$(uname -r)"
  sudo apt-get install -y nvidia-driver-535
  echo "NVIDIA drivers installed. A VM reboot may be required."
fi

# Step 3: Make project directory
# NOTE(review): the build below expects a Dockerfile inside src/ - confirm
# that it is placed there before this script runs.
mkdir -p src/
cd src/

# Step 4: Build Docker image
# The build-arg names must match the ARG names declared in the Dockerfile
# (REPO_URI, BRANCH_NAME, GCP_STORAGE_JSON_FILE, MODELS_API_ENV_FILE).
echo "Building Docker image: $DOCKER_IMAGE_NAME"
docker build -f Dockerfile \
  --build-arg REPO_URI="$GITHUB_REPO_URI" \
  --build-arg BRANCH_NAME="$GITHUB_REPO_BRANCH" \
  --build-arg GCP_STORAGE_JSON_FILE="$GCP_SECRETS_FILE" \
  --build-arg MODELS_API_ENV_FILE="$APIKEYS_FILE" \
  --no-cache \
  -t "$DOCKER_IMAGE_NAME" . \
  2>&1 | sudo tee -a "$BUILD_LOGFILE"

# Step 5: Run container with GPU (if available)
if command -v nvidia-smi &> /dev/null; then
  echo "Running container with GPU support..."
  docker run --gpus all "$DOCKER_IMAGE_NAME" \
    2>&1 | sudo tee -a "$RUN_LOGFILE"
else
  echo "No GPU detected. Running container without GPU..."
  docker run "$DOCKER_IMAGE_NAME" \
    2>&1 | sudo tee -a "$RUN_LOGFILE"
fi

echo "===== Finished RLFT setup at $(date) ====="

0 commit comments

Comments
 (0)