Skip to content

Commit 23114ef

Browse files
committed
add checkpoint storage
1 parent f5419fa commit 23114ef

File tree

1 file changed

+27
-13
lines changed

1 file changed

+27
-13
lines changed

.github/workflows/ec2-pipeline.yml

Lines changed: 27 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -53,22 +53,14 @@ jobs:
5353
# - uses: actions/checkout@v2
5454

5555
# Diff of the CUDA version step: the removed lines guarded the call with `command -v nvcc` and printed a "not found" hint; the added lines always print the header and rely on `|| true` so that a failing nvcc call does not fail the step.
- name: Display CUDA Version
56-
run: |
57-
if command -v nvcc &> /dev/null; then
58-
echo "CUDA Version:"
59-
nvcc --version || true
60-
else
61-
echo "nvcc not found. Ensure CUDA is installed."
62-
fi
56+
run: |
57+
echo "CUDA Version:"
58+
nvcc --version || true
6359
6460
# Diff of the cuDNN version step: the removed lines checked that cudnn_version.h exists before reading it; the added lines always print the header, then the CUDNN_MAJOR line plus the two lines after it, with `|| true` so that a failing pipeline does not fail the step.
- name: Display cuDNN Version
6561
run: |
66-
if [ -f /usr/local/cuda/include/cudnn_version.h ]; then
67-
echo "cuDNN Version:"
68-
cat /usr/local/cuda/include/cudnn_version.h | grep CUDNN_MAJOR -A 2 || true
69-
else
70-
echo "cuDNN not found. Ensure cuDNN is installed."
71-
fi
62+
echo "cuDNN Version:"
63+
cat /usr/local/cuda/include/cudnn_version.h | grep CUDNN_MAJOR -A 2 || true
7264
7365
- name: Verify EC2 Instance
7466
run: |
@@ -153,7 +145,11 @@ jobs:
153145
154146
- name: Run DVC commands in container
  run: |
    # Create the host-side directory and file that are mounted into the
    # container. best_model_checkpoint.txt is read back on the host by the
    # "Read best checkpoint file name" step.
    # -p: do not fail when the directory already exists.
    mkdir -p model_storage
    touch best_model_checkpoint.txt
    # Bind mounts need an absolute host path: a bare name given to -v is
    # treated by Docker as a named volume, so the directory and file created
    # above would not be the ones the container writes to.
    # Every option line must end with a backslash, otherwise the command
    # ends early and the following -e lines run as separate commands.
    docker run --rm --gpus=all \
      -v "$(pwd)/model_storage:/workspace/model_storage" \
      -v "$(pwd)/best_model_checkpoint.txt:/workspace/best_model_checkpoint.txt" \
      -e AWS_ACCESS_KEY_ID=${{ secrets.AWS_ACCESS_KEY_ID }} \
      -e AWS_SECRET_ACCESS_KEY=${{ secrets.AWS_SECRET_ACCESS_KEY }} \
      -e AWS_DEFAULT_REGION=${{ secrets.AWS_REGION }} \
@@ -173,6 +169,24 @@ jobs:
173169
# # Stop the container after retrieving logs
174170
# docker stop $CONTAINER_ID
175171
172+
- name: Read best checkpoint file name
  id: read_checkpoint
  run: |
    # Take the first line of best_model_checkpoint.txt as the checkpoint
    # path and export it as CHECKPOINT_FILE for the following steps.
    checkpoint_file=$(head -n 1 best_model_checkpoint.txt)
    # The file is created empty with `touch` before the container runs, so
    # an empty first line means nothing was written to it. Stop here with a
    # clear message instead of exporting an empty CHECKPOINT_FILE.
    if [ -z "$checkpoint_file" ]; then
      echo "best_model_checkpoint.txt is empty; no checkpoint path to export." >&2
      exit 1
    fi
    echo "CHECKPOINT_FILE=$checkpoint_file" >> "$GITHUB_ENV"
177+
178+
- name: Get latest commit ID
  id: get_commit_id
  run: |
    # Export COMMIT_ID for the following steps.
    # --verify prints nothing when HEAD cannot be resolved; in that case
    # fall back to GITHUB_SHA (provided by the runner) so that a failing
    # git call cannot silently export an empty COMMIT_ID.
    commit_id=$(git rev-parse --verify HEAD 2>/dev/null || printf '%s' "$GITHUB_SHA")
    echo "COMMIT_ID=$commit_id" >> "$GITHUB_ENV"
181+
182+
- name: Upload checkpoint to S3
  run: |
    # CHECKPOINT_FILE and COMMIT_ID are written to GITHUB_ENV by earlier
    # steps, so they are available here as ordinary shell variables; reading
    # them that way avoids pasting their contents into the script text.
    checkpoint_path="$CHECKPOINT_FILE"  # Use the checkpoint path from the file
    # Bucket name only: the key prefix belongs in s3_key. Keeping the prefix
    # (and a trailing slash) here as well produced a destination of
    # s3://<bucket>/session-08-checkpoint//session-08-checkpoint/...
    bucket_name="mybucket-emlo-mumbai"  # Change to your S3 bucket name
    s3_key="session-08-checkpoint/$COMMIT_ID/$(basename "$checkpoint_path")"  # Define S3 key
    echo "Uploading $checkpoint_path to s3://$bucket_name/$s3_key"
    aws s3 cp "$checkpoint_path" "s3://$bucket_name/$s3_key"
189+
176190
- name: Clean previous images and containers
177191
run: |
178192
docker system prune -f

0 commit comments

Comments
 (0)