
Commit 4a37e35

Fixes _compute_nproc_per_node in case of bad dist configuration (#2288)
Description:

- Now uses a new gloo group to compute nproc per node.
- Context: with NCCL, if the user misconfigures the CUDA device per process, idist hangs in _compute_nproc_per_node.
- Here is an example: https://app.circleci.com/pipelines/github/pytorch/ignite/2264/workflows/2e3073fd-0859-41c7-91e8-eef0f8eabee2/jobs/7060?invite=true#step-107-872
- However, I couldn't reproduce the issue on my setup.

A standalone sketch of this pattern is included after the diffs below.

cc @sdesrozier
1 parent 74dabca commit 4a37e35

File tree

2 files changed: +5 -8

.circleci/config.yml

Lines changed: 0 additions & 2 deletions
@@ -86,7 +86,6 @@ run_pytorch_container: &run_pytorch_container
 docker run --gpus=all --rm -itd --shm-size 16G -v ${wd}:/ignite -w /ignite --name pthd << pipeline.parameters.pytorch_stable_image >>
 docker exec -it pthd nvidia-smi
 docker exec -it pthd ls
-docker exec -it pthd /bin/bash -c "$update_pth_cmd"

 run_pytorch_devel_container: &run_pytorch_devel_container
 - run:
@@ -97,7 +96,6 @@ run_pytorch_devel_container: &run_pytorch_devel_container
 docker run --gpus=all --rm -itd --shm-size 16G -v ${wd}:/ignite -w /ignite --name pthd << pipeline.parameters.pytorch_stable_image_devel >>
 docker exec -it pthd nvidia-smi
 docker exec -it pthd ls
-docker exec -it pthd /bin/bash -c "$update_pth_cmd"

 install_dependencies: &install_dependencies
 - run:

ignite/distributed/comp_models/native.py

Lines changed: 5 additions & 6 deletions
@@ -150,12 +150,11 @@ def _init_from_context(self) -> None:

     def _compute_nproc_per_node(self) -> int:
         local_rank = self.get_local_rank()
-        device = torch.device("cpu")
-        if torch.cuda.is_available():
-            # we manually set cuda device to local rank in order to avoid a hang on all_reduce
-            device = torch.device(f"cuda:{local_rank}")
-        tensor = torch.tensor([self.get_local_rank() + 1]).to(device)
-        dist.all_reduce(tensor, op=dist.ReduceOp.MAX)
+        # Create a new cpu group to get nproc_per_node so that we avoid using
+        # a badly configured NCCL backend
+        gloo_group = dist.new_group(backend="gloo")
+        tensor = torch.tensor([local_rank + 1]).to("cpu")
+        dist.all_reduce(tensor, op=dist.ReduceOp.MAX, group=gloo_group)
         return int(tensor.item())

     def _get_all_hostnames(self) -> List[Tuple[str, ...]]:
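For context, here is a minimal standalone sketch of the pattern this commit adopts; it is illustrative code, not part of ignite. The idea: the default process group may be NCCL, and a wrongly assigned per-process CUDA device can hang a CUDA all_reduce, so nproc per node is computed over a separate gloo group with a CPU tensor instead. The script name, the compute_nproc_per_node helper, and the torchrun-style launch (LOCAL_RANK environment variable) are assumptions for illustration only.

import os

import torch
import torch.distributed as dist


def compute_nproc_per_node() -> int:
    # Hypothetical helper mirroring _compute_nproc_per_node above: run the MAX
    # all_reduce over a CPU/gloo subgroup so that a badly configured NCCL
    # default group cannot make the collective hang.
    local_rank = int(os.environ["LOCAL_RANK"])
    gloo_group = dist.new_group(backend="gloo")
    tensor = torch.tensor([local_rank + 1])  # stays on CPU
    dist.all_reduce(tensor, op=dist.ReduceOp.MAX, group=gloo_group)
    return int(tensor.item())


if __name__ == "__main__":
    # e.g. launched with: torchrun --nproc_per_node=2 check_nproc.py
    backend = "nccl" if torch.cuda.is_available() else "gloo"
    dist.init_process_group(backend=backend)
    print(compute_nproc_per_node())
    dist.destroy_process_group()

Since gloo collectives run on host memory, the result does not depend on how CUDA devices are mapped to processes; the only overhead is creating one extra, small process group.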
