[WIP] Merge idist into master #1045

Merged: 54 commits merged into master on May 31, 2020

Commits (54) — the diff shown below is from a single commit; a short idist usage sketch follows the commit list
177fb6f
Improved parallel utils (#1023)
vfdev-5 May 11, 2020
91d8875
[WIP] create from context for XLA
vfdev-5 May 11, 2020
3cfccd4
autopep8 fix
May 11, 2020
f71043f
Tests for _sync_model for XLA
vfdev-5 May 11, 2020
093ddb1
autopep8 fix
May 11, 2020
7ad7fcf
More tests and updates
vfdev-5 May 11, 2020
d57b3c9
autopep8 fix
May 11, 2020
7fcadca
[WIP] create from context for Native Torch Dist
vfdev-5 May 12, 2020
5a6e052
autopep8 fix
May 12, 2020
1c362fe
Added tests for idist.* created from context for native dist settings
vfdev-5 May 12, 2020
12512cf
[WIP] Fix tests
vfdev-5 May 13, 2020
228fd89
Fixed metric related tests
vfdev-5 May 13, 2020
b09ea05
autopep8 fix
May 13, 2020
a23da8e
Merge branch 'master' of https://github.com/pytorch/ignite into idist
vfdev-5 May 13, 2020
da72b15
[WIP] idist - Docs & code updates (#1034)
vfdev-5 May 15, 2020
0352bc6
Merge branch 'master' into origin-idist
vfdev-5 May 15, 2020
16256cf
Merge branch 'master' of https://github.com/pytorch/ignite into origi…
vfdev-5 May 16, 2020
914bba9
Tpu metrics (#1042)
vfdev-5 May 16, 2020
feb79b4
Merge branch 'master' into idist
vfdev-5 May 16, 2020
25d38d1
Increased err tol for mse and rmse tests on single TPU
vfdev-5 May 16, 2020
8886948
Fixes #991 (#1047)
vfdev-5 May 16, 2020
add8a4d
Merge branch 'master' into idist
vfdev-5 May 16, 2020
bdae449
add TPU checkpointing to CPU. (#1005)
erip May 16, 2020
d1cc29d
Updated tests on checkpoint and TPU
vfdev-5 May 16, 2020
977ac8c
Merge branch 'master' into idist
vfdev-5 May 17, 2020
15072ae
Added barrier op in idist (#1050)
vfdev-5 May 17, 2020
ac86d46
Merge branch 'master' into idist
vfdev-5 May 18, 2020
037e7f7
Fixed bug with torch.cuda.set_device
vfdev-5 May 19, 2020
2a01cc3
Fixed cuda device index, added warning if cuda device index != local …
vfdev-5 May 19, 2020
1f54ab5
autopep8 fix
May 19, 2020
199224a
Merge branch 'master' into idist
vfdev-5 May 22, 2020
888a654
Issue 1011 (#1053)
vfdev-5 May 22, 2020
ae1bdf5
Improved device() method (#1062)
vfdev-5 May 23, 2020
0fa8c61
Merge branch 'master' into idist
sdesrozis May 23, 2020
537dbd0
Idist kwargs dict (#1064)
vfdev-5 May 23, 2020
727f038
removed badly merged _need_to_sync
vfdev-5 May 23, 2020
530c422
Improved device and setup_common_training_handlers (#1066)
vfdev-5 May 24, 2020
74ddacb
Idist improve2 (#1075)
vfdev-5 May 28, 2020
6735dc0
Merge branch 'master' into idist
vfdev-5 May 28, 2020
b1b5d56
Merge branch 'master' into idist
vfdev-5 May 28, 2020
1e5d7d3
Added support for str input for all gather (#1081)
vfdev-5 May 29, 2020
89e1358
Fix #1055 (#1068)
sdesrozis May 29, 2020
1c34eda
Merge branch 'master' into idist
vfdev-5 May 29, 2020
d277a25
Fix failing tests on multi-gpus
vfdev-5 May 29, 2020
d9a80c6
Fix failing XLA tests
vfdev-5 May 30, 2020
f617787
Merge branch 'master' into idist
vfdev-5 May 30, 2020
a8f03e8
Merge branch 'master' into idist
vfdev-5 May 31, 2020
b41cf6d
Fixes failing tests on multi-GPUs
vfdev-5 May 31, 2020
222cb60
autopep8 fix
May 31, 2020
b3b9aff
Remove useless barriers (#1085)
sdesrozis May 31, 2020
44f4c63
Fixes failing TPU with fork mp
vfdev-5 May 31, 2020
8989e5e
Merge branch 'master' into idist
vfdev-5 May 31, 2020
f4ee4f9
Applied review suggestions
vfdev-5 May 31, 2020
669ef8a
autopep8 fix
May 31, 2020
[WIP] create from context for Native Torch Dist
vfdev-5 committed May 12, 2020
commit 7fcadca046a84051679efb99ac580eae9432e8fc
29 changes: 13 additions & 16 deletions ignite/distributed/comp_models/native.py
@@ -55,9 +55,11 @@ def __init__(self, backend=None, timeout=None, **kwargs):

def _create_from_backend(self, backend, timeout=None, **kwargs):
self.setup_env_vars()

self._local_rank = int(os.environ["LOCAL_RANK"])
# for debug purposes
self._master_port = int(os.environ["MASTER_PORT"])
self._master_addr = os.environ["MASTER_ADDR"]
self._local_rank = int(os.environ["LOCAL_RANK"])

init_pg_kwargs = {}
if timeout is not None:
@@ -70,28 +72,23 @@ def _create_from_backend(self, backend, timeout=None, **kwargs):
if backend == "nccl":
torch.cuda.device(self._local_rank)

self._ntasks_per_node = self._compute_ntasks_per_node()
self._nnodes = self.get_world_size() // self.get_ntasks_per_node()
self._node = self.get_rank() // self._ntasks_per_node
self._setup_attrs()

def _init_from_context(self):

raise NotImplementedError("")
if "LOCAL_RANK" not in os.environ:
raise RuntimeError(
"Can not initialize native dist model without local rank information. "
"Please, set `os.environ['LOCAL_RANK']` with correct local rank index"
)

self._local_rank = int(os.environ["LOCAL_RANK"])
# for debug purposes
self._master_port = None
self._master_addr = None
self._setup_attrs()

# THIS COULD HELP TO GET master addr/port if TCPStore is used.
# HOWEVER, user can use FileStore or any other store.
# try:
# store = dist.distributed_c10d._get_default_store()
# if isinstance(store, torch.distributed.TCPStore):
# self._master_port = None
# self._master_addr = None
# except AttributeError:
# pass

self._local_rank = 0 # self.get_rank() % self._ntasks_per_node
def _setup_attrs(self):
self._ntasks_per_node = self._compute_ntasks_per_node()
self._nnodes = self.get_world_size() // self._ntasks_per_node
self._node = self.get_rank() // self._ntasks_per_node
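The `_init_from_context` change above is the core of this commit: instead of raising `NotImplementedError`, the model now attaches to a process group the user has already initialized, requiring only `LOCAL_RANK` in the environment (master address/port stay unset, since an arbitrary store may have been used). A minimal sketch of that flow, mirroring the new context test in the file below — names come from this diff, the rest is illustrative:

```python
# Sketch of the "create from context" flow exercised by the new context tests.
# Uses a single-process gloo group purely for illustration.
import os

import torch.distributed as dist
from ignite.distributed.comp_models.native import _NativeDistModel

# No process group yet -> nothing to attach to.
assert _NativeDistModel.create_from_context() is None

# The user, not ignite, initializes the process group...
dist.init_process_group("gloo", init_method="tcp://0.0.0.0:2222", world_size=1, rank=0)
# ...and must expose the local rank, which cannot be recovered from the group itself.
os.environ["LOCAL_RANK"] = "0"

model = _NativeDistModel.create_from_context()
assert model.get_world_size() == 1 and model.get_ntasks_per_node() == 1

dist.destroy_process_group()
```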
49 changes: 26 additions & 23 deletions tests/ignite/distributed/comp_models/test_native.py
@@ -8,7 +8,7 @@


@pytest.mark.distributed
def test__dist_model():
def test__native_dist_model():
available_backends = _NativeDistModel.available_backends

if dist.is_nccl_available():
@@ -29,7 +29,7 @@ def test__dist_model():

@pytest.mark.distributed
@pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
def test__dist_model_create_from_backend_bad_config():
def test__native_dist_model_create_from_backend_bad_config():
import os
from datetime import timedelta

@@ -55,7 +55,7 @@ def _assert_model(model, true_conf):
assert model.get_ntasks_per_node() == true_conf["ntasks_per_node"]


def _test__dist_model_create_from_backend_no_dist(backend, true_device):
def _test__native_dist_model_create_from_backend_no_dist(backend, true_device):
from datetime import timedelta

model = _NativeDistModel.create_from_backend(backend=backend, timeout=timedelta(seconds=20))
@@ -79,7 +79,7 @@ def _test__dist_model_create_from_backend_no_dist(backend, true_device):
model.finalize()


def _test__dist_model_create_from_backend_dist(local_rank, rank, world_size, backend, true_device):
def _test__native_dist_model_create_from_backend_dist(local_rank, rank, world_size, backend, true_device):
import os
from datetime import timedelta

@@ -109,13 +109,16 @@ def _test__dist_model_create_from_backend_dist(local_rank, rank, world_size, bac
del os.environ["RANK"]


def _test__dist_model_create_from_context_no_dist(true_backend, true_device):
def _test__native_dist_model_create_from_context_no_dist(true_backend, true_device):

assert _NativeDistModel.create_from_context() is None

dist.init_process_group(true_backend, "tcp://0.0.0.0:2222", world_size=1, rank=0)
dist.barrier()

import os
os.environ["LOCAL_RANK"] = "0"

model = _NativeDistModel.create_from_context()

assert dist.is_available() and dist.is_initialized()
@@ -137,7 +140,7 @@ def _test__dist_model_create_from_context_no_dist(true_backend, true_device):
dist.destroy_process_group()


def _test__dist_model_create_from_context_dist(local_rank, rank, world_size, true_backend, true_device):
def _test__native_dist_model_create_from_context_dist(local_rank, rank, world_size, true_backend, true_device):

dist.init_process_group(true_backend, "tcp://0.0.0.0:2222", world_size=world_size, rank=rank)
dist.barrier()
@@ -165,30 +168,30 @@ def _test__dist_model_create_from_context_dist(local_rank, rank, world_size, tru

@pytest.mark.distributed
@pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
def test__dist_model_create_no_dist_gloo():
_test__dist_model_create_from_backend_no_dist("gloo", "cpu")
# _test__dist_model_create_from_context_no_dist("gloo", "cpu")
def test__native_dist_model_create_no_dist_gloo():
_test__native_dist_model_create_from_backend_no_dist("gloo", "cpu")
_test__native_dist_model_create_from_context_no_dist("gloo", "cpu")


@pytest.mark.distributed
@pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
def test__dist_model_create_no_dist_nccl():
_test__dist_model_create_from_backend_no_dist("nccl", "cuda:0")
# _test__dist_model_create_from_context_no_dist("nccl", "cuda:0")
def test__native_dist_model_create_no_dist_nccl():
_test__native_dist_model_create_from_backend_no_dist("nccl", "cuda:0")
_test__native_dist_model_create_from_context_no_dist("nccl", "cuda:0")


@pytest.mark.distributed
def test__dist_model_create_dist_gloo(local_rank, world_size):
_test__dist_model_create_from_backend_dist(local_rank, local_rank, world_size, "gloo", "cpu")
# _test__dist_model_create_from_context_dist(local_rank, local_rank, world_size, "gloo", "cpu")
def test__native_dist_model_create_dist_gloo(local_rank, world_size):
_test__native_dist_model_create_from_backend_dist(local_rank, local_rank, world_size, "gloo", "cpu")
# _test__native_dist_model_create_from_context_dist(local_rank, local_rank, world_size, "gloo", "cpu")


@pytest.mark.distributed
@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
def test__dist_model_create_dist_nccl(local_rank, world_size):
_test__dist_model_create_from_backend_dist(local_rank, local_rank, world_size, "nccl", "cuda:{}".format(local_rank))
# _test__dist_model_create_from_context_dist(
def test__native_dist_model_create_dist_nccl(local_rank, world_size):
_test__native_dist_model_create_from_backend_dist(local_rank, local_rank, world_size, "nccl", "cuda:{}".format(local_rank))
# _test__native_dist_model_create_from_context_dist(
# local_rank, local_rank, world_size, "nccl", "cuda:{}".format(local_rank)
# )

@@ -209,7 +212,7 @@ def _test_dist_spawn_fn(local_rank, backend, world_size, device):
assert _model.device() == device


def _test__dist_model_spawn(backend, num_workers_per_machine, device):
def _test__native_dist_model_spawn(backend, num_workers_per_machine, device):
_NativeDistModel.spawn(
_test_dist_spawn_fn,
args=(backend, num_workers_per_machine, device),
@@ -220,12 +223,12 @@ def _test__dist_model_spawn(backend, num_workers_per_machine, device):

@pytest.mark.distributed
@pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
def test__dist_model_spawn_gloo():
_test__dist_model_spawn("gloo", num_workers_per_machine=4, device="cpu")
def test__native_dist_model_spawn_gloo():
_test__native_dist_model_spawn("gloo", num_workers_per_machine=4, device="cpu")


@pytest.mark.distributed
@pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
def test__dist_model_spawn_nccl():
_test__dist_model_spawn("nccl", num_workers_per_machine=torch.cuda.device_count(), device="cuda")
def test__native_dist_model_spawn_nccl():
_test__native_dist_model_spawn("nccl", num_workers_per_machine=torch.cuda.device_count(), device="cuda")
6 changes: 6 additions & 0 deletions tests/ignite/distributed/test_utils.py
@@ -168,6 +168,12 @@ def _test_sync_model(cls):
assert isinstance(_model, cls), "{} vs {}".format(type(_model), cls)


def test__sync_model_no_dist():
from ignite.distributed.comp_models import _SerialModel

_test_sync_model(_SerialModel)


@pytest.mark.tpu
@pytest.mark.skipif("NUM_TPU_WORKERS" in os.environ, reason="Skip if NUM_TPU_WORKERS is in env vars")
@pytest.mark.skipif(not has_xla_support, reason="Skip if no PyTorch XLA package")
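The new `test__sync_model_no_dist` covers the serial fallback: with no distributed context at all, syncing leaves a `_SerialModel` in place. In user-facing terms this is the single-process "world" — a hedged sketch, not taken from the diff:

```python
# Minimal sketch of the serial fallback checked by test__sync_model_no_dist:
# without any process group, idist behaves like a single-process world.
import ignite.distributed as idist

assert idist.backend() is None      # no distributed backend configured
assert idist.get_world_size() == 1
assert idist.get_rank() == 0
idist.barrier()                     # effectively a no-op in the serial case
```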