File tree Expand file tree Collapse file tree 2 files changed +5
-8
lines changed
ignite/distributed/comp_models Expand file tree Collapse file tree 2 files changed +5
-8
lines changed Original file line number Diff line number Diff line change @@ -86,7 +86,6 @@ run_pytorch_container: &run_pytorch_container
8686 docker run --gpus=all --rm -itd --shm-size 16G -v ${wd}:/ignite -w /ignite --name pthd << pipeline.parameters.pytorch_stable_image >>
8787 docker exec -it pthd nvidia-smi
8888 docker exec -it pthd ls
89- docker exec -it pthd /bin/bash -c "$update_pth_cmd"
9089
9190run_pytorch_devel_container : &run_pytorch_devel_container
9291 - run :
@@ -97,7 +96,6 @@ run_pytorch_devel_container: &run_pytorch_devel_container
9796 docker run --gpus=all --rm -itd --shm-size 16G -v ${wd}:/ignite -w /ignite --name pthd << pipeline.parameters.pytorch_stable_image_devel >>
9897 docker exec -it pthd nvidia-smi
9998 docker exec -it pthd ls
100- docker exec -it pthd /bin/bash -c "$update_pth_cmd"
10199
102100install_dependencies : &install_dependencies
103101 - run :
Original file line number Diff line number Diff line change @@ -150,12 +150,11 @@ def _init_from_context(self) -> None:
150150
151151 def _compute_nproc_per_node (self ) -> int :
152152 local_rank = self .get_local_rank ()
153- device = torch .device ("cpu" )
154- if torch .cuda .is_available ():
155- # we manually set cuda device to local rank in order to avoid a hang on all_reduce
156- device = torch .device (f"cuda:{ local_rank } " )
157- tensor = torch .tensor ([self .get_local_rank () + 1 ]).to (device )
158- dist .all_reduce (tensor , op = dist .ReduceOp .MAX )
153+ # Create a new CPU (gloo) group to compute nproc_per_node so that we avoid
154+ # relying on a badly configured NCCL backend
155+ gloo_group = dist .new_group (backend = "gloo" )
156+ tensor = torch .tensor ([local_rank + 1 ]).to ("cpu" )
157+ dist .all_reduce (tensor , op = dist .ReduceOp .MAX , group = gloo_group )
159158 return int (tensor .item ())
160159
161160 def _get_all_hostnames (self ) -> List [Tuple [str , ...]]:
You can’t perform that action at this time.
0 commit comments