Skip to content

Commit 74dabca

Browse files
authored
Updated CI and docker to pth 1.10 (#2285)
* Updated CI and docker to pth 1.10
  Description:
  - Updated CI and docker to pth 1.10
* More updates and fixes
* Disabled Windows tests and fixed NVIDIA driver update for Linux
* Try to fix driver installation
1 parent a397ffa commit 74dabca

File tree

8 files changed

+28
-28
lines changed

8 files changed

+28
-28
lines changed

.circleci/config.yml

Lines changed: 21 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,11 @@ parameters:
44
pytorch_stable_image:
55
type: string
66
# https://hub.docker.com/r/pytorch/pytorch/tags
7-
default: "pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime"
7+
default: "pytorch/pytorch:1.10.0-cuda11.3-cudnn8-runtime"
88
pytorch_stable_image_devel:
99
type: string
1010
# https://hub.docker.com/r/pytorch/pytorch/tags
11-
default: "pytorch/pytorch:1.9.0-cuda11.1-cudnn8-devel"
11+
default: "pytorch/pytorch:1.10.0-cuda11.3-cudnn8-devel"
1212
workingdir:
1313
type: string
1414
default: "/tmp/ignite"
@@ -26,7 +26,7 @@ parameters:
2626
one_gpu: &one_gpu
2727
machine:
2828
# https://circleci.com/docs/2.0/configuration-reference/#available-linux-gpu-images
29-
image: ubuntu-1604-cuda-11.1:202012-01 # CUDA v11.1, Docker v19.03.13, nvidia-container-toolkit v1.4.0-1
29+
image: ubuntu-2004-cuda-11.2:202103-01 # CUDA v11.2.1, Docker v20.10.5, nvidia-container-toolkit v1.4.2-1
3030
docker_layer_caching: true
3131
# https://circleci.com/product/features/resource-classes/#linux-vm
3232
resource_class: gpu.small
@@ -40,7 +40,7 @@ one_gpu_windows: &one_gpu_windows
4040
two_gpus: &two_gpus
4141
machine:
4242
# https://circleci.com/docs/2.0/configuration-reference/#available-linux-gpu-images
43-
image: ubuntu-1604-cuda-11.1:202012-01 # CUDA v11.1, Docker v19.03.13, nvidia-container-toolkit v1.4.0-1
43+
image: ubuntu-2004-cuda-11.2:202103-01 # CUDA v11.2.1, Docker v20.10.5, nvidia-container-toolkit v1.4.2-1
4444
docker_layer_caching: true
4545
# https://circleci.com/product/features/resource-classes/#linux-vm
4646
resource_class: gpu.medium
@@ -54,12 +54,12 @@ install_latest_nvidia: &install_latest_nvidia
5454
name: Install latest NVidia-driver and CUDA
5555
command: |
5656
sudo apt-get purge nvidia* && sudo apt-get autoremove
57-
sudo apt-get update && sudo apt-get install -y --no-install-recommends nvidia-455 cuda-drivers-455
57+
sudo apt-get update && sudo apt-get install -y --no-install-recommends nvidia-driver-470
5858
# Install nvidia-container-runtime
5959
sudo apt-get install -y nvidia-container-runtime
6060
# Reload driver : https://stackoverflow.com/a/45319156/6309199
6161
# lsof | grep nvidia -> kill Xvfb
62-
sudo lsof | grep "/usr/bin/Xvfb" | head -1 | awk '{print $2}' | xargs -I {} sudo kill -9 {}
62+
sudo lsof | grep "/usr/bin/Xvfb" | head -1 | awk '{print $2}' | xargs -I {} sudo kill -9 {} || echo "Command 'sudo lsof ...' is failed"
6363
# lsmod | grep nvidia
6464
sudo rmmod nvidia_uvm && sudo rmmod nvidia_drm && sudo rmmod nvidia_modeset && sudo rmmod nvidia
6565
# reload driver
@@ -86,9 +86,6 @@ run_pytorch_container: &run_pytorch_container
8686
docker run --gpus=all --rm -itd --shm-size 16G -v ${wd}:/ignite -w /ignite --name pthd << pipeline.parameters.pytorch_stable_image >>
8787
docker exec -it pthd nvidia-smi
8888
docker exec -it pthd ls
89-
# temporarily manually install v1.9.1
90-
# https://github.com/pytorch/ignite/pull/2211#issuecomment-927080841
91-
export update_pth_cmd='conda install -y pytorch==1.9.1 -c pytorch -c nvidia'
9289
docker exec -it pthd /bin/bash -c "$update_pth_cmd"
9390
9491
run_pytorch_devel_container: &run_pytorch_devel_container
@@ -100,9 +97,6 @@ run_pytorch_devel_container: &run_pytorch_devel_container
10097
docker run --gpus=all --rm -itd --shm-size 16G -v ${wd}:/ignite -w /ignite --name pthd << pipeline.parameters.pytorch_stable_image_devel >>
10198
docker exec -it pthd nvidia-smi
10299
docker exec -it pthd ls
103-
# temporarily manually install v1.9.1
104-
# https://github.com/pytorch/ignite/pull/2211#issuecomment-927080841
105-
export update_pth_cmd='conda install -y pytorch==1.9.1 -c pytorch -c nvidia'
106100
docker exec -it pthd /bin/bash -c "$update_pth_cmd"
107101
108102
install_dependencies: &install_dependencies
@@ -208,23 +202,25 @@ jobs:
208202
command: |
209203
bash .circleci/trigger_if_modified.sh "^(ignite|tests|examples|\.circleci).*"
210204
211-
- run:
212-
name: Update CUDA Driver for Windows
213-
command: |
214-
curl -O https://raw.githubusercontent.com/pytorch/pytorch/master/.circleci/scripts/windows_cuda_install.sh
215-
mkdir -p "C:/Program Files (x86)/Microsoft Visual Studio/2019/BuildTools/MSBuild/Microsoft/VC/v160/BuildCustomizations/"
216-
JOB_EXECUTOR="windows-with-nvidia-gpu" CUDA_VERSION="11.3" VC_PRODUCT="BuildTools" VC_YEAR="2019" bash ./windows_cuda_install.sh
217-
bash -c "'/c/Program Files/NVIDIA Corporation/NVSMI/nvidia-smi.exe'"
205+
# - run:
206+
# name: Update CUDA Driver for Windows
207+
# command: |
208+
# curl -O https://raw.githubusercontent.com/pytorch/pytorch/master/.circleci/scripts/windows_cuda_install.sh
209+
# mkdir -p "C:/Program Files (x86)/Microsoft Visual Studio/2019/BuildTools/MSBuild/Microsoft/VC/v160/BuildCustomizations/"
210+
# JOB_EXECUTOR="windows-with-nvidia-gpu" CUDA_VERSION="11.3" VC_PRODUCT="BuildTools" VC_YEAR="2019" bash ./windows_cuda_install.sh
211+
# bash -c "'/c/Program Files/NVIDIA Corporation/NVSMI/nvidia-smi.exe'"
218212

219213
- run:
220214
name: Install dependencies
221215
command: |
222216
conda --version
223217
# We have to use cuda 10.2 on Windows:
224218
# https://github.com/pytorch/ignite/issues/1843
225-
conda install -y pytorch torchvision cudatoolkit=10.2 -c pytorch
219+
conda install -y pytorch==1.9.1 torchvision cudatoolkit=10.2 -c pytorch
226220
pip install -r requirements-dev.txt
227221
pip install .
222+
python -c "import torch; print(torch.__version__, torch.version.cuda, torch.cuda.is_available())"
223+
python -c "import torch; torch.cuda.is_available()"
228224
229225
- run:
230226
# https://github.com/pytorch/ignite/issues/1737
@@ -330,6 +326,7 @@ jobs:
330326
name: Trigger job if modified
331327
command: |
332328
bash .circleci/trigger_if_modified.sh "^(ignite|tests|examples|\.circleci).*"
329+
- <<: *install_latest_nvidia
333330
- <<: *pull_pytorch_stable_devel_image
334331
- <<: *run_pytorch_devel_container
335332
- <<: *install_dependencies
@@ -461,7 +458,10 @@ workflows:
461458
unless: << pipeline.parameters.should_build_docker_images >>
462459
jobs:
463460
- one_gpu_tests
464-
- one_gpu_windows_tests
461+
# Windows tests are disabled because the NVIDIA driver is too old; PyTorch reports:
462+
# > c:\tools\miniconda3\lib\site-packages\torch\cuda\__init__.py:52: UserWarning: CUDA initialization: The NVIDIA driver on your system is too old (found version 10010). Please update your GPU driver by downloading and installing a new version from the URL: http://www.nvidia.com/Download/index.aspx Alternatively, go to: https://pytorch.org to install a PyTorch version that has been compiled with your version of the CUDA driver. (Triggered internally at ..\c10\cuda\CUDAFunctions.cpp:115.)
463+
# > return torch._C._cuda_getDeviceCount() > 0
464+
# - one_gpu_windows_tests
465465
- two_gpus_tests
466466
- two_gpus_check_dist_cifar10_example
467467
- two_gpus_hvd_tests

.github/workflows/pytorch-version-tests.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ jobs:
1414
fail-fast: false
1515
matrix:
1616
python-version: [3.6, 3.7, 3.8]
17-
pytorch-version: [1.8.1, 1.7.1, 1.6.0, 1.5.1, 1.4.0, 1.3.1]
17+
pytorch-version: [1.9.1, 1.8.1, 1.7.1, 1.6.0, 1.5.1, 1.4.0, 1.3.1]
1818
exclude:
1919
- pytorch-version: 1.3.1
2020
python-version: 3.8

docker/docker.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
[DEFAULT]
2-
build_docker_image_pytorch_version = 1.9.0-cuda11.1-cudnn8
2+
build_docker_image_pytorch_version = 1.10.0-cuda11.3-cudnn8
33
build_docker_image_hvd_version = v0.23.0
44
build_docker_image_msdp_version = v0.5.4

docker/hvd/Dockerfile.hvd-apex-nlp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
FROM pytorchignite/hvd-apex:latest
33

44
# Ignite NLP dependencies
5-
RUN pip install --upgrade --no-cache-dir "torchtext<0.10.1" \
5+
RUN pip install --upgrade --no-cache-dir torchtext \
66
transformers \
77
spacy \
88
nltk

docker/hvd/Dockerfile.hvd-nlp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
FROM pytorchignite/hvd-base:latest
33

44
# Ignite NLP dependencies
5-
RUN pip install --upgrade --no-cache-dir "torchtext<0.10.1" \
5+
RUN pip install --upgrade --no-cache-dir torchtext \
66
transformers \
77
spacy \
88
nltk

docker/main/Dockerfile.apex-nlp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
FROM pytorchignite/apex:latest
33

44
# Ignite NLP dependencies
5-
RUN pip install --upgrade --no-cache-dir "torchtext<0.10.1" \
5+
RUN pip install --upgrade --no-cache-dir torchtext \
66
transformers \
77
spacy \
88
nltk

docker/main/Dockerfile.nlp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
FROM pytorchignite/base:latest
33

44
# Ignite NLP dependencies
5-
RUN pip install --upgrade --no-cache-dir "torchtext<0.10.1" \
5+
RUN pip install --upgrade --no-cache-dir torchtext \
66
transformers \
77
spacy \
88
nltk

docker/msdp/Dockerfile.msdp-apex-nlp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
FROM pytorchignite/msdp-apex:latest
33

44
# Ignite NLP dependencies
5-
RUN pip install --upgrade --no-cache-dir "torchtext<0.10.1" \
5+
RUN pip install --upgrade --no-cache-dir torchtext \
66
transformers \
77
spacy \
88
nltk

0 commit comments

Comments
 (0)