Skip to content

Commit 77e165f

Browse files
committed
add files
1 parent a40e693 commit 77e165f

File tree

10 files changed

+51
-18
lines changed

10 files changed

+51
-18
lines changed

.github/workflows/ec2-pipeline.yml

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -147,16 +147,15 @@ jobs:
147147
run: |
148148
mkdir -p model_storage
149149
docker run --rm --gpus=all \
150-
-v model_storage:/workspace/model_storage \
150+
-v $(pwd):/workspace \
151151
-e AWS_ACCESS_KEY_ID=${{ secrets.AWS_ACCESS_KEY_ID }} \
152152
-e AWS_SECRET_ACCESS_KEY=${{ secrets.AWS_SECRET_ACCESS_KEY }} \
153153
-e AWS_DEFAULT_REGION=${{ secrets.AWS_REGION }} \
154154
${{ secrets.AWS_ECR_LOGIN_URI }}/${{ secrets.ECR_REPOSITORY_NAME }}:latest \
155155
/bin/bash -c "
156156
dvc pull -r myremote && \
157157
mkdir -p model_storage && \
158-
dvc repro -f && \
159-
cp best_model_checkpoint.txt model_storage/
158+
dvc repro -f
160159
"
161160
162161
# # Wait a moment to ensure the container has started

Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
FROM pytorch/pytorch:2.4.0-cuda12.4-cudnn9-runtime
1+
FROM pytorch/pytorch:2.3.1-cuda11.8-cudnn8-runtime
22

33
WORKDIR /workspace
44
COPY . .

configs/callbacks/model_checkpoint.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,7 @@
33
model_checkpoint:
44
_target_: src.train.CustomModelCheckpiont
55
dirpath: null # directory to save the model file
6-
filename: "/home/runner/work/emlo4-session-07-ajithvcoder/emlo4-session-07-ajithvcoder/model_storage/epoch-checkpoint.ckpt.ckpt" # checkpoint filename
6+
filename: "/workspace/model_storage/epoch-checkpoint.ckpt.ckpt" # checkpoint filename
77
monitor: null # name of the logged metric which determines when model is improving
88
verbose: False # verbosity mode
99
save_last: null # additionally always save an exact copy of the last checkpoint to a file last.ckpt

configs/experiment/catdog_ex.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -46,7 +46,7 @@ callbacks:
4646
mode: "max"
4747
save_top_k: 1
4848
save_last: True
49-
filename: "/home/runner/work/emlo4-session-07-ajithvcoder/emlo4-session-07-ajithvcoder/model_storage/epoch-checkpoint"
49+
filename: "/workspace/model_storage/epoch-checkpoint"
5050

5151
early_stopping:
5252
monitor: "val_acc"

configs/experiment/dogbreed_ex.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -35,7 +35,7 @@ callbacks:
3535
mode: "max"
3636
save_top_k: 1
3737
save_last: True
38-
filename: "/home/runner/work/emlo4-session-07-ajithvcoder/emlo4-session-07-ajithvcoder/model_storage/epoch-checkpoint.ckpt"
38+
filename: "/workspace/model_storage/epoch-checkpoint.ckpt"
3939

4040
early_stopping:
4141
monitor: "val_acc"

configs/experiment/dogbreed_ex_train.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -35,7 +35,7 @@ callbacks:
3535
mode: "max"
3636
save_top_k: 1
3737
save_last: True
38-
filename: "/home/runner/work/emlo4-session-07-ajithvcoder/emlo4-session-07-ajithvcoder/emlo4-session-06-ajithvcoder/model_storage/epoch-checkpoint"
38+
filename: "/workspace/emlo4-session-06-ajithvcoder/model_storage/epoch-checkpoint"
3939

4040
early_stopping:
4141
monitor: "val_acc"

configs/infer.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -31,4 +31,4 @@ callbacks:
3131
mode: "max"
3232
save_top_k: 1
3333
save_last: True
34-
filename: "/home/runner/work/emlo4-session-07-ajithvcoder/emlo4-session-07-ajithvcoder/model_storage/epoch-checkpoint.ckpt.ckpt"
34+
filename: "/workspace/model_storage/epoch-checkpoint.ckpt.ckpt"

dvc.lock

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -5,8 +5,8 @@ stages:
55
deps:
66
- path: configs/experiment/catdog_ex.yaml
77
hash: md5
8-
md5: d9430d3b364df5d20fc94f90d8cc6ad1
9-
size: 1280
8+
md5: 959c64c29a91c7f7b4448e0c5cbc54f1
9+
size: 1215
1010
- path: data/cats_and_dogs_filtered
1111
hash: md5
1212
md5: c57d92307d2679437e80f6682cfaf521.dir
@@ -23,8 +23,8 @@ stages:
2323
deps:
2424
- path: configs/experiment/catdog_ex.yaml
2525
hash: md5
26-
md5: 912d97cb77912129ba1ae8427cddc766
27-
size: 1240
26+
md5: 959c64c29a91c7f7b4448e0c5cbc54f1
27+
size: 1215
2828
- path: src/eval.py
2929
hash: md5
3030
md5: 47a9e50a0213ee71ff67a97b71d1ca6e
@@ -34,8 +34,8 @@ stages:
3434
deps:
3535
- path: configs/experiment/catdog_ex.yaml
3636
hash: md5
37-
md5: 912d97cb77912129ba1ae8427cddc766
38-
size: 1240
37+
md5: 959c64c29a91c7f7b4448e0c5cbc54f1
38+
size: 1215
3939
- path: src/infer.py
4040
hash: md5
4141
md5: 1b360e17f7740c9bd6eba9c52e0dc5d2

scripts/multirun_metrics_fetch.py

Lines changed: 36 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -62,7 +62,7 @@ def get_hyperparams(log_dir, timestamp):
6262
"patch_size": values.get('patch_size'), "embed_dim": values.get('embed_dim')}
6363
return extract_hyperparams
6464

65-
def main():
65+
def main_run():
6666
log_dir = 'logs'
6767
timestamp = get_latest_timestamp(os.path.join(log_dir, 'train/multiruns'))
6868

@@ -177,5 +177,39 @@ def main():
177177
with open('best_model_checkpoint.txt', 'w') as f:
178178
f.write(f"./model_storage/epoch-checkpoint_patch_size-{hparams_data['best_params']['model.patch_size']}_embed_dim-{hparams_data['best_params']['model.embed_dim']}.ckpt")
179179

180+
import shutil
181+
182+
# Define the source file and destination folder
183+
source_file = 'best_model_checkpoint.txt'
184+
destination_folder = 'model_storage/'
185+
186+
# Copy the file to the destination folder
187+
shutil.copy(source_file, destination_folder)
188+
189+
print(f"{source_file} has been copied to {destination_folder}")
190+
191+
192+
# Define the path to the checkpoint file and folder containing .ckpt files
193+
checkpoint_file = 'best_model_checkpoint.txt'
194+
checkpoint_folder = 'model_storage'
195+
196+
# Read the first line of the checkpoint file to get the file to keep
197+
with open(checkpoint_file, 'r') as f:
198+
keep_file = f.readline().strip()
199+
200+
# Get the full path of the file to keep
201+
keep_file_path = os.path.join(checkpoint_folder, os.path.basename(keep_file))
202+
203+
# Iterate over files in the checkpoint folder and delete unwanted .ckpt files
204+
for file in os.listdir(checkpoint_folder):
205+
file_path = os.path.join(checkpoint_folder, file)
206+
if file_path.endswith('.ckpt') and file_path != keep_file_path:
207+
os.remove(file_path)
208+
print(f"Removed: {file_path}")
209+
210+
print(f"Kept: {keep_file_path}")
211+
212+
213+
180214
if __name__ == "__main__":
181-
main()
215+
main_run()

tests/test_infer.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -16,7 +16,7 @@ def config():
1616
with hydra.initialize(version_base=None, config_path="../configs"):
1717
cfg = hydra.compose(
1818
config_name="infer",
19-
overrides=["callbacks.model_checkpoint.filename=/home/runner/work/emlo4-session-07-ajithvcoder/emlo4-session-07-ajithvcoder/model_storage/epoch-checkpoint.ckpt.ckpt"],
19+
overrides=["callbacks.model_checkpoint.filename=/workspace/model_storage/epoch-checkpoint.ckpt.ckpt"],
2020
)
2121
return cfg
2222

0 commit comments

Comments
 (0)