[WIP] Merge idist into master #1045

Merged: 54 commits merged into master on May 31, 2020

Commits (54) — the diff shown below is from a single commit; a short idist usage sketch follows the commit list
177fb6f
Improved parallel utils (#1023)
vfdev-5 May 11, 2020
91d8875
[WIP] create from context for XLA
vfdev-5 May 11, 2020
3cfccd4
autopep8 fix
May 11, 2020
f71043f
Tests for _sync_model for XLA
vfdev-5 May 11, 2020
093ddb1
autopep8 fix
May 11, 2020
7ad7fcf
More tests and updates
vfdev-5 May 11, 2020
d57b3c9
autopep8 fix
May 11, 2020
7fcadca
[WIP] create from context for Native Torch Dist
vfdev-5 May 12, 2020
5a6e052
autopep8 fix
May 12, 2020
1c362fe
Added tests for idist.* created from context for native dist settings
vfdev-5 May 12, 2020
12512cf
[WIP] Fix tests
vfdev-5 May 13, 2020
228fd89
Fixed metric related tests
vfdev-5 May 13, 2020
b09ea05
autopep8 fix
May 13, 2020
a23da8e
Merge branch 'master' of https://github.com/pytorch/ignite into idist
vfdev-5 May 13, 2020
da72b15
[WIP] idist - Docs & code updates (#1034)
vfdev-5 May 15, 2020
0352bc6
Merge branch 'master' into origin-idist
vfdev-5 May 15, 2020
16256cf
Merge branch 'master' of https://github.com/pytorch/ignite into origi…
vfdev-5 May 16, 2020
914bba9
Tpu metrics (#1042)
vfdev-5 May 16, 2020
feb79b4
Merge branch 'master' into idist
vfdev-5 May 16, 2020
25d38d1
Increased err tol for mse and rmse tests on single TPU
vfdev-5 May 16, 2020
8886948
Fixes #991 (#1047)
vfdev-5 May 16, 2020
add8a4d
Merge branch 'master' into idist
vfdev-5 May 16, 2020
bdae449
add TPU checkpointing to CPU. (#1005)
erip May 16, 2020
d1cc29d
Updated tests on checkpoint and TPU
vfdev-5 May 16, 2020
977ac8c
Merge branch 'master' into idist
vfdev-5 May 17, 2020
15072ae
Added barrier op in idist (#1050)
vfdev-5 May 17, 2020
ac86d46
Merge branch 'master' into idist
vfdev-5 May 18, 2020
037e7f7
Fixed bug with torch.cuda.set_device
vfdev-5 May 19, 2020
2a01cc3
Fixed cuda device index, added warning if cuda device index != local …
vfdev-5 May 19, 2020
1f54ab5
autopep8 fix
May 19, 2020
199224a
Merge branch 'master' into idist
vfdev-5 May 22, 2020
888a654
Issue 1011 (#1053)
vfdev-5 May 22, 2020
ae1bdf5
Improved device() method (#1062)
vfdev-5 May 23, 2020
0fa8c61
Merge branch 'master' into idist
sdesrozis May 23, 2020
537dbd0
Idist kwargs dict (#1064)
vfdev-5 May 23, 2020
727f038
removed badly merged _need_to_sync
vfdev-5 May 23, 2020
530c422
Improved device and setup_common_training_handlers (#1066)
vfdev-5 May 24, 2020
74ddacb
Idist improve2 (#1075)
vfdev-5 May 28, 2020
6735dc0
Merge branch 'master' into idist
vfdev-5 May 28, 2020
b1b5d56
Merge branch 'master' into idist
vfdev-5 May 28, 2020
1e5d7d3
Added support for str input for all gather (#1081)
vfdev-5 May 29, 2020
89e1358
Fix #1055 (#1068)
sdesrozis May 29, 2020
1c34eda
Merge branch 'master' into idist
vfdev-5 May 29, 2020
d277a25
Fix failing tests on multi-gpus
vfdev-5 May 29, 2020
d9a80c6
Fix failing XLA tests
vfdev-5 May 30, 2020
f617787
Merge branch 'master' into idist
vfdev-5 May 30, 2020
a8f03e8
Merge branch 'master' into idist
vfdev-5 May 31, 2020
b41cf6d
Fixes failing tests on multi-GPUs
vfdev-5 May 31, 2020
222cb60
autopep8 fix
May 31, 2020
b3b9aff
Remove useless barriers (#1085)
sdesrozis May 31, 2020
44f4c63
Fixes failing TPU with fork mp
vfdev-5 May 31, 2020
8989e5e
Merge branch 'master' into idist
vfdev-5 May 31, 2020
f4ee4f9
Applied review suggestions
vfdev-5 May 31, 2020
669ef8a
autopep8 fix
May 31, 2020
[WIP] create from context for Native Torch Dist
vfdev-5 committed May 12, 2020
commit 7fcadca046a84051679efb99ac580eae9432e8fc
29 changes: 13 additions & 16 deletions ignite/distributed/comp_models/native.py
@@ -55,9 +55,11 @@ def __init__(self, backend=None, timeout=None, **kwargs):

def _create_from_backend(self, backend, timeout=None, **kwargs):
self.setup_env_vars()

self._local_rank = int(os.environ["LOCAL_RANK"])
# for debug purposes
self._master_port = int(os.environ["MASTER_PORT"])
self._master_addr = os.environ["MASTER_ADDR"]
self._local_rank = int(os.environ["LOCAL_RANK"])

init_pg_kwargs = {}
if timeout is not None:
@@ -70,28 +72,23 @@ def _create_from_backend(self, backend, timeout=None, **kwargs):
if backend == "nccl":
torch.cuda.device(self._local_rank)

self._ntasks_per_node = self._compute_ntasks_per_node()
self._nnodes = self.get_world_size() // self.get_ntasks_per_node()
self._node = self.get_rank() // self._ntasks_per_node
self._setup_attrs()

def _init_from_context(self):

raise NotImplementedError("")
if "LOCAL_RANK" not in os.environ:
raise RuntimeError(
"Can not initialize native dist model without local rank information. "
"Please, set `os.environ['LOCAL_RANK']` with correct local rank index"
)

self._local_rank = int(os.environ["LOCAL_RANK"])
# for debug purposes
self._master_port = None
self._master_addr = None
self._setup_attrs()

# THIS COULD HELP TO GET master addr/port if TCPStore is used.
# HOWEVER, user can use FileStore or any other store.
# try:
# store = dist.distributed_c10d._get_default_store()
# if isinstance(store, torch.distributed.TCPStore):
# self._master_port = None
# self._master_addr = None
# except AttributeError:
# pass

self._local_rank = 0 # self.get_rank() % self._ntasks_per_node
def _setup_attrs(self):
self._ntasks_per_node = self._compute_ntasks_per_node()
self._nnodes = self.get_world_size() // self._ntasks_per_node
self._node = self.get_rank() // self._ntasks_per_node
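The `_init_from_context` change above is the core of this commit: instead of raising `NotImplementedError`, the model now attaches to a process group the user has already initialized, requiring only `LOCAL_RANK` in the environment (master address/port stay unset, since an arbitrary store may have been used). A minimal sketch of that flow, mirroring the new context test in the file below — names come from this diff, the rest is illustrative:

```python
# Sketch of the "create from context" flow exercised by the new context tests.
# Uses a single-process gloo group purely for illustration.
import os

import torch.distributed as dist
from ignite.distributed.comp_models.native import _NativeDistModel

# No process group yet -> nothing to attach to.
assert _NativeDistModel.create_from_context() is None

# The user, not ignite, initializes the process group...
dist.init_process_group("gloo", init_method="tcp://0.0.0.0:2222", world_size=1, rank=0)
# ...and must expose the local rank, which cannot be recovered from the group itself.
os.environ["LOCAL_RANK"] = "0"

model = _NativeDistModel.create_from_context()
assert model.get_world_size() == 1 and model.get_ntasks_per_node() == 1

dist.destroy_process_group()
```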
49 changes: 26 additions & 23 deletions tests/ignite/distributed/comp_models/test_native.py
@@ -8,7 +8,7 @@


@pytest.mark.distributed
def test__dist_model():
def test__native_dist_model():
available_backends = _NativeDistModel.available_backends

if dist.is_nccl_available():
@@ -29,7 +29,7 @@ def test__dist_model():

@pytest.mark.distributed
@pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
def test__dist_model_create_from_backend_bad_config():
def test__native_dist_model_create_from_backend_bad_config():
import os
from datetime import timedelta

@@ -55,7 +55,7 @@ def _assert_model(model, true_conf):
assert model.get_ntasks_per_node() == true_conf["ntasks_per_node"]


def _test__dist_model_create_from_backend_no_dist(backend, true_device):
def _test__native_dist_model_create_from_backend_no_dist(backend, true_device):
from datetime import timedelta

model = _NativeDistModel.create_from_backend(backend=backend, timeout=timedelta(seconds=20))
@@ -79,7 +79,7 @@ def _test__dist_model_create_from_backend_no_dist(backend, true_device):
model.finalize()


def _test__dist_model_create_from_backend_dist(local_rank, rank, world_size, backend, true_device):
def _test__native_dist_model_create_from_backend_dist(local_rank, rank, world_size, backend, true_device):
import os
from datetime import timedelta

@@ -109,13 +109,16 @@ def _test__dist_model_create_from_backend_dist(local_rank, rank, world_size, bac
del os.environ["RANK"]


def _test__dist_model_create_from_context_no_dist(true_backend, true_device):
def _test__native_dist_model_create_from_context_no_dist(true_backend, true_device):

assert _NativeDistModel.create_from_context() is None

dist.init_process_group(true_backend, "tcp://0.0.0.0:2222", world_size=1, rank=0)
dist.barrier()

import os
os.environ["LOCAL_RANK"] = "0"

model = _NativeDistModel.create_from_context()

assert dist.is_available() and dist.is_initialized()
@@ -137,7 +140,7 @@ def _test__dist_model_create_from_context_no_dist(true_backend, true_device):
dist.destroy_process_group()


def _test__dist_model_create_from_context_dist(local_rank, rank, world_size, true_backend, true_device):
def _test__native_dist_model_create_from_context_dist(local_rank, rank, world_size, true_backend, true_device):

dist.init_process_group(true_backend, "tcp://0.0.0.0:2222", world_size=world_size, rank=rank)
dist.barrier()
@@ -165,30 +168,30 @@ def _test__dist_model_create_from_context_dist(local_rank, rank, world_size, tru

@pytest.mark.distributed
@pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
def test__dist_model_create_no_dist_gloo():
_test__dist_model_create_from_backend_no_dist("gloo", "cpu")
# _test__dist_model_create_from_context_no_dist("gloo", "cpu")
def test__native_dist_model_create_no_dist_gloo():
_test__native_dist_model_create_from_backend_no_dist("gloo", "cpu")
_test__native_dist_model_create_from_context_no_dist("gloo", "cpu")


@pytest.mark.distributed
@pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
def test__dist_model_create_no_dist_nccl():
_test__dist_model_create_from_backend_no_dist("nccl", "cuda:0")
# _test__dist_model_create_from_context_no_dist("nccl", "cuda:0")
def test__native_dist_model_create_no_dist_nccl():
_test__native_dist_model_create_from_backend_no_dist("nccl", "cuda:0")
_test__native_dist_model_create_from_context_no_dist("nccl", "cuda:0")


@pytest.mark.distributed
def test__dist_model_create_dist_gloo(local_rank, world_size):
_test__dist_model_create_from_backend_dist(local_rank, local_rank, world_size, "gloo", "cpu")
# _test__dist_model_create_from_context_dist(local_rank, local_rank, world_size, "gloo", "cpu")
def test__native_dist_model_create_dist_gloo(local_rank, world_size):
_test__native_dist_model_create_from_backend_dist(local_rank, local_rank, world_size, "gloo", "cpu")
# _test__native_dist_model_create_from_context_dist(local_rank, local_rank, world_size, "gloo", "cpu")


@pytest.mark.distributed
@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
def test__dist_model_create_dist_nccl(local_rank, world_size):
_test__dist_model_create_from_backend_dist(local_rank, local_rank, world_size, "nccl", "cuda:{}".format(local_rank))
# _test__dist_model_create_from_context_dist(
def test__native_dist_model_create_dist_nccl(local_rank, world_size):
_test__native_dist_model_create_from_backend_dist(local_rank, local_rank, world_size, "nccl", "cuda:{}".format(local_rank))
# _test__native_dist_model_create_from_context_dist(
# local_rank, local_rank, world_size, "nccl", "cuda:{}".format(local_rank)
# )

@@ -209,7 +212,7 @@ def _test_dist_spawn_fn(local_rank, backend, world_size, device):
assert _model.device() == device


def _test__dist_model_spawn(backend, num_workers_per_machine, device):
def _test__native_dist_model_spawn(backend, num_workers_per_machine, device):
_NativeDistModel.spawn(
_test_dist_spawn_fn,
args=(backend, num_workers_per_machine, device),
@@ -220,12 +223,12 @@ def _test__dist_model_spawn(backend, num_workers_per_machine, device):

@pytest.mark.distributed
@pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
def test__dist_model_spawn_gloo():
_test__dist_model_spawn("gloo", num_workers_per_machine=4, device="cpu")
def test__native_dist_model_spawn_gloo():
_test__native_dist_model_spawn("gloo", num_workers_per_machine=4, device="cpu")


@pytest.mark.distributed
@pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
def test__dist_model_spawn_nccl():
_test__dist_model_spawn("nccl", num_workers_per_machine=torch.cuda.device_count(), device="cuda")
def test__native_dist_model_spawn_nccl():
_test__native_dist_model_spawn("nccl", num_workers_per_machine=torch.cuda.device_count(), device="cuda")
6 changes: 6 additions & 0 deletions tests/ignite/distributed/test_utils.py
@@ -168,6 +168,12 @@ def _test_sync_model(cls):
assert isinstance(_model, cls), "{} vs {}".format(type(_model), cls)


def test__sync_model_no_dist():
from ignite.distributed.comp_models import _SerialModel

_test_sync_model(_SerialModel)


@pytest.mark.tpu
@pytest.mark.skipif("NUM_TPU_WORKERS" in os.environ, reason="Skip if NUM_TPU_WORKERS is in env vars")
@pytest.mark.skipif(not has_xla_support, reason="Skip if no PyTorch XLA package")
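The new `test__sync_model_no_dist` covers the serial fallback: with no distributed context at all, syncing leaves a `_SerialModel` in place. In user-facing terms this is the single-process "world" — a hedged sketch, not taken from the diff:

```python
# Minimal sketch of the serial fallback checked by test__sync_model_no_dist:
# without any process group, idist behaves like a single-process world.
import ignite.distributed as idist

assert idist.backend() is None      # no distributed backend configured
assert idist.get_world_size() == 1
assert idist.get_rank() == 0
idist.barrier()                     # effectively a no-op in the serial case
```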