metrics_impl fix 2 gpu hvd tests and ensure consistent detaching #1280

Merged
merged 51 commits on Sep 11, 2020
Changes from 1 commit
Commits (51)
9f7daa1
update accuracy to accumulate _num_correct in a tensor on the right d…
n2cholas Aug 7, 2020
a87f93d
update loss metric to accumulate _sum in a tensor on the right device
n2cholas Aug 7, 2020
30b2e19
update mae metric to accumulate in a tensor on the right device
n2cholas Aug 7, 2020
a3e237c
update mpd metric to accumulate in a tensor on the right device
n2cholas Aug 7, 2020
7100176
update mse metric to accumulate in a tensor on the right device
n2cholas Aug 7, 2020
3228a0a
update top k accuracy metric to accumulate in a tensor on the right …
n2cholas Aug 7, 2020
412551e
update precision and recall metrics to accumulate in tensors on the r…
n2cholas Aug 8, 2020
4c4a76c
.....
n2cholas Aug 8, 2020
b1e6956
black formatting
n2cholas Aug 8, 2020
b081e92
reverted run*.sh
n2cholas Aug 10, 2020
a343c35
change all metrics default device to cpu except running_average
n2cholas Aug 16, 2020
8548601
Update ignite/metrics/precision.py
n2cholas Aug 16, 2020
b84226b
remove Optional type from metric devices since default is cpu
n2cholas Aug 16, 2020
685c23b
add comment explaining lack of detach in accuracy metrics
n2cholas Aug 16, 2020
0b4337d
update docstrings and docs
n2cholas Aug 17, 2020
b2fa213
Update ignite/metrics/accumulation.py
n2cholas Aug 17, 2020
90e0e9a
Update ignite/metrics/accumulation.py
n2cholas Aug 17, 2020
6c1fda4
Update ignite/metrics/accumulation.py
n2cholas Aug 17, 2020
c510e10
Update ignite/metrics/accuracy.py
n2cholas Aug 17, 2020
d5d4854
Update ignite/metrics/fbeta.py
n2cholas Aug 17, 2020
39515b7
Update ignite/metrics/loss.py
n2cholas Aug 17, 2020
3c49871
Update ignite/metrics/metric.py
n2cholas Aug 17, 2020
6de10dd
Update ignite/metrics/precision.py
n2cholas Aug 17, 2020
eca0bc3
Update ignite/metrics/recall.py
n2cholas Aug 17, 2020
ad7082e
add comment explaining lack of detach in metrics docs
n2cholas Aug 17, 2020
c057d52
Merge remote-tracking branch 'pytorch-ignite/metrics_impl' into metri…
n2cholas Aug 17, 2020
90b5b85
support device argument for running_average
n2cholas Aug 17, 2020
3481da1
update support for device argument for accumulation
n2cholas Aug 18, 2020
d340bb7
fix and improve device tests for metrics
n2cholas Aug 18, 2020
4824e24
fix and improve device tests for metrics
n2cholas Aug 18, 2020
1361866
fix TPU tests
n2cholas Aug 18, 2020
556262b
Apply suggestions from code review
vfdev-5 Aug 18, 2020
489620b
Apply suggestions from code review
vfdev-5 Aug 18, 2020
566e9bc
Merge branch 'metrics_impl' of https://github.com/pytorch/ignite into…
Aug 31, 2020
6edd30d
detach tensors earlier in update
Aug 31, 2020
375f91e
remove redundant to() call
Aug 31, 2020
960449c
ensure metrics aren't created on XLA devices
n2cholas Sep 7, 2020
96128fd
Merge branch 'metrics_impl' of https://github.com/pytorch/ignite into…
n2cholas Sep 7, 2020
d192a8f
Fixed isort
vfdev-5 Sep 8, 2020
23d72c0
move xla check to Metric.__init__ instead of individual metrics
n2cholas Sep 9, 2020
edc74a4
update xla tests
n2cholas Sep 9, 2020
2c6b7d2
replace deleted callable check
n2cholas Sep 9, 2020
6e64f37
remove redundant precision and recall __init__
n2cholas Sep 9, 2020
2828e74
replace precision/recall __init__ for docs rendering
n2cholas Sep 9, 2020
bcc3cbb
add support for metrics_lambda with components on diff devices
n2cholas Sep 9, 2020
bb257de
Merge branch 'metrics_impl' of https://github.com/pytorch/ignite into…
n2cholas Sep 9, 2020
2f7e542
fix epoch_metric xla test
n2cholas Sep 9, 2020
0d1ba42
Merge branch 'metrics_impl' of https://github.com/pytorch/ignite into…
n2cholas Sep 11, 2020
ebefd4b
detach output consistently for all metrics
n2cholas Sep 11, 2020
4ce863d
fix horovod two gpu tests
n2cholas Sep 11, 2020
8a94b8f
make confusion matrix detach like other metrics
n2cholas Sep 11, 2020
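The commit messages above revolve around two changes: every metric now accumulates its state in a detached tensor placed on the metric's configured device (rather than a Python number or a tensor that might keep an autograd graph alive), and metrics refuse to be created directly on an XLA device. A minimal sketch of the accumulation pattern, with hypothetical names rather than ignite's actual implementation:

import torch

class SumOfAbsoluteErrorsSketch:
    # Sketch: accumulate metric state in a detached tensor on a chosen device.

    def __init__(self, device="cpu"):
        self._device = torch.device(device)
        self.reset()

    def reset(self):
        # State lives in a tensor on the metric's device, not in a Python float.
        self._sum = torch.tensor(0.0, device=self._device)
        self._num_examples = 0

    def update(self, output):
        y_pred, y = output
        # Detach as early as possible so the accumulator never keeps a graph alive.
        y_pred, y = y_pred.detach(), y.detach()
        self._sum += (y_pred - y).abs().sum().to(self._device)
        self._num_examples += y.shape[0]

    def compute(self):
        return self._sum.item() / self._num_examples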
update xla tests
n2cholas committed Sep 9, 2020
commit edc74a49b147e50422cb394a0068eb7e8b188a66
14 changes: 6 additions & 8 deletions tests/ignite/metrics/test_accumulation.py
@@ -370,7 +370,10 @@ def _geom_mean(y_true):

def _test_distrib_accumulator_device(device):

for metric_device in [torch.device("cpu"), idist.device()]:
metric_devices = [torch.device("cpu")]
if device.type != "xla":
metric_devices.append(device)
for metric_device in metric_devices:

m = VariableAccumulation(lambda a, x: x, device=metric_device)
assert m._device == metric_device
@@ -384,11 +387,6 @@ def _test_distrib_accumulator_device(device):
)


def _test_creating_on_xla_fails(device):
with pytest.raises(ValueError, match=r"Cannot create metric on an XLA device. Use device='cpu' instead."):
VariableAccumulation(lambda a, x: x, device=device)


@pytest.mark.distributed
@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
@@ -462,7 +460,7 @@ def test_distrib_single_device_xla():
_test_distrib_average(device)
_test_distrib_geom_average(device)
_test_distrib_integration(device)
_test_creating_on_xla_fails(device)
_test_distrib_accumulator_device(device)


def _test_distrib_xla_nprocs(index):
@@ -471,7 +469,7 @@ def _test_distrib_xla_nprocs(index):
_test_distrib_average(device)
_test_distrib_geom_average(device)
_test_distrib_integration(device)
_test_creating_on_xla_fails(device)
_test_distrib_accumulator_device(device)


@pytest.mark.tpu
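Because the rendered diff above drops the +/- markers and indentation, here is roughly how the updated helper in test_accumulation.py reads after this commit (reconstructed from the visible hunk; the collapsed assertions are omitted):

def _test_distrib_accumulator_device(device):

    metric_devices = [torch.device("cpu")]
    if device.type != "xla":
        metric_devices.append(device)
    for metric_device in metric_devices:

        m = VariableAccumulation(lambda a, x: x, device=metric_device)
        assert m._device == metric_device
        # ... remaining device assertions are hidden in the collapsed part of the diff

The old _test_creating_on_xla_fails helper is deleted from this file, and the two XLA test entry points call _test_distrib_accumulator_device instead.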
14 changes: 6 additions & 8 deletions tests/ignite/metrics/test_accuracy.py
@@ -813,7 +813,10 @@ def update(engine, i):

def _test_distrib_accumulator_device(device):

for metric_device in [torch.device("cpu"), idist.device()]:
metric_devices = [torch.device("cpu")]
if device.type != "xla":
metric_devices.append(device)
for metric_device in metric_devices:

acc = Accuracy(device=metric_device)
assert acc._device == metric_device
@@ -830,11 +833,6 @@ def _test_distrib_accumulator_device(device):
)


def _test_creating_on_xla_fails(device):
with pytest.raises(ValueError, match=r"Cannot create metric on an XLA device. Use device='cpu' instead."):
Accuracy(device=device)


@pytest.mark.distributed
@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
@@ -901,15 +899,15 @@ def test_distrib_single_device_xla():
_test_distrib_multilabel_input_NHW(device)
_test_distrib_integration_multiclass(device)
_test_distrib_integration_multilabel(device)
_test_creating_on_xla_fails(device)
_test_distrib_accumulator_device(device)


def _test_distrib_xla_nprocs(index):
device = idist.device()
_test_distrib_multilabel_input_NHW(device)
_test_distrib_integration_multiclass(device)
_test_distrib_integration_multilabel(device)
_test_creating_on_xla_fails(device)
_test_distrib_accumulator_device(device)


@pytest.mark.tpu
18 changes: 11 additions & 7 deletions tests/ignite/metrics/test_confusion_matrix.py
@@ -611,12 +611,16 @@ def _test(metric_device):
assert np.all(true_res == res)

_test("cpu")
_test(idist.device())
if device.type != "xla":
_test(idist.device())


def _test_distrib_accumulator_device(device):

for metric_device in [torch.device("cpu"), idist.device()]:
metric_devices = [torch.device("cpu")]
if device.type != "xla":
metric_devices.append(device)
for metric_device in metric_devices:

cm = ConfusionMatrix(num_classes=3, device=metric_device)
assert cm._device == metric_device
@@ -638,7 +642,7 @@ def _test_distrib_accumulator_device(device):
@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
def test_distrib_gpu(local_rank, distributed_context_single_node_nccl):

device = "cuda:{}".format(local_rank)
device = torch.device("cuda:{}".format(local_rank))
_test_distrib_multiclass_images(device)
_test_distrib_accumulator_device(device)

@@ -647,7 +651,7 @@ def test_distrib_gpu(local_rank, distributed_context_single_node_nccl):
@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
def test_distrib_cpu(distributed_context_single_node_gloo):

device = "cpu"
device = torch.device("cpu")
_test_distrib_multiclass_images(device)
_test_distrib_accumulator_device(device)

@@ -657,7 +661,7 @@ def test_distrib_cpu(distributed_context_single_node_gloo):
@pytest.mark.skipif("WORLD_SIZE" in os.environ, reason="Skip if launched as multiproc")
def test_distrib_hvd(gloo_hvd_executor):

device = "cpu" if not torch.cuda.is_available() else "cuda"
device = torch.device("cpu" if not torch.cuda.is_available() else "cuda")
nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count()

gloo_hvd_executor(_test_distrib_multiclass_images, (device,), np=nproc, do_init=True)
@@ -668,7 +672,7 @@ def test_distrib_hvd(gloo_hvd_executor):
@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
@pytest.mark.skipif("MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
def test_multinode_distrib_cpu(distributed_context_multi_node_gloo):
device = "cpu"
device = torch.device("cpu")
_test_distrib_multiclass_images(device)
_test_distrib_accumulator_device(device)

@@ -677,7 +681,7 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo):
@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
@pytest.mark.skipif("GPU_MULTINODE_DISTRIB" not in os.environ, reason="Skip if not multi-node distributed")
def test_multinode_distrib_gpu(distributed_context_multi_node_nccl):
device = "cuda:{}".format(distributed_context_multi_node_nccl["local_rank"])
device = torch.device("cuda:{}".format(distributed_context_multi_node_nccl["local_rank"]))
_test_distrib_multiclass_images(device)
_test_distrib_accumulator_device(device)

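The test_confusion_matrix.py changes above mostly convert the device fixtures from plain strings to torch.device objects: the shared helpers now branch on device.type, which only exists on torch.device. An illustrative snippet (not part of the diff):

import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
assert device.type in ("cuda", "cpu")  # torch.device exposes a .type attribute

# A bare string such as "cuda:0" has no .type attribute, so the helpers'
# `device.type != "xla"` check requires a real torch.device, not a string.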
26 changes: 12 additions & 14 deletions tests/ignite/metrics/test_loss.py
@@ -111,9 +111,12 @@ def _test(metric_device):
_test(device)


def _test_distrib_sum_device(device):
def _test_distrib_accumulator_device(device):

for metric_device in [torch.device("cpu"), idist.device()]:
metric_devices = [torch.device("cpu")]
if device.type != "xla":
metric_devices.append(device)
for metric_device in metric_devices:
loss = Loss(nll_loss, device=metric_device)
assert loss._device == metric_device
assert loss._sum.device == metric_device, "{}:{} vs {}:{}".format(
@@ -139,19 +142,14 @@ def test_sum_detached():
assert not loss._sum.requires_grad


def _test_creating_on_xla_fails(device):
with pytest.raises(ValueError, match=r"Cannot create metric on an XLA device. Use device='cpu' instead."):
Loss(nll_loss, device=device)


@pytest.mark.distributed
@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
def test_distrib_gpu(local_rank, distributed_context_single_node_nccl):

device = torch.device("cuda:{}".format(local_rank))
_test_distrib_compute_on_criterion(device)
_test_distrib_sum_device(device)
_test_distrib_accumulator_device(device)


@pytest.mark.distributed
@@ -160,7 +158,7 @@ def test_distrib_cpu(distributed_context_single_node_gloo):

device = torch.device("cpu")
_test_distrib_compute_on_criterion(device)
_test_distrib_sum_device(device)
_test_distrib_accumulator_device(device)


@pytest.mark.distributed
@@ -172,7 +170,7 @@ def test_distrib_hvd(gloo_hvd_executor):
nproc = 4 if not torch.cuda.is_available() else torch.cuda.device_count()

gloo_hvd_executor(_test_distrib_compute_on_criterion, (device,), np=nproc, do_init=True)
gloo_hvd_executor(_test_distrib_sum_device, (device,), np=nproc, do_init=True)
gloo_hvd_executor(_test_distrib_accumulator_device, (device,), np=nproc, do_init=True)


@pytest.mark.multinode_distributed
@@ -181,7 +179,7 @@ def test_distrib_hvd(gloo_hvd_executor):
def test_multinode_distrib_cpu(distributed_context_multi_node_gloo):
device = torch.device("cpu")
_test_distrib_compute_on_criterion(device)
_test_distrib_sum_device(device)
_test_distrib_accumulator_device(device)


@pytest.mark.multinode_distributed
@@ -190,7 +188,7 @@ def test_multinode_distrib_cpu(distributed_context_multi_node_gloo):
def test_multinode_distrib_gpu(distributed_context_multi_node_nccl):
device = torch.device("cuda:{}".format(distributed_context_multi_node_nccl["local_rank"]))
_test_distrib_compute_on_criterion(device)
_test_distrib_sum_device(device)
_test_distrib_accumulator_device(device)


@pytest.mark.tpu
@@ -199,13 +197,13 @@ def test_multinode_distrib_gpu(distributed_context_multi_node_nccl):
def test_distrib_single_device_xla():
device = idist.device()
_test_distrib_compute_on_criterion(device)
_test_creating_on_xla_fails(device)
_test_distrib_accumulator_device(device)


def _test_distrib_xla_nprocs(index):
device = idist.device()
_test_distrib_compute_on_criterion(device)
_test_creating_on_xla_fails(device)
_test_distrib_accumulator_device(device)


@pytest.mark.tpu
14 changes: 6 additions & 8 deletions tests/ignite/metrics/test_mean_absolute_error.py
@@ -72,7 +72,10 @@ def _test(metric_device):

def _test_distrib_accumulator_device(device):

for metric_device in [torch.device("cpu"), idist.device()]:
metric_devices = [torch.device("cpu")]
if device.type != "xla":
metric_devices.append(device)
for metric_device in metric_devices:
mae = MeanAbsoluteError(device=metric_device)
assert mae._device == metric_device
assert mae._sum_of_absolute_errors.device == metric_device, "{}:{} vs {}:{}".format(
@@ -103,11 +106,6 @@ def test_accumulator_detached():
assert not mae._sum_of_absolute_errors.requires_grad


def _test_creating_on_xla_fails(device):
with pytest.raises(ValueError, match=r"Cannot create metric on an XLA device. Use device='cpu' instead."):
MeanAbsoluteError(device=device)


@pytest.mark.distributed
@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
@@ -161,13 +159,13 @@ def test_multinode_distrib_gpu(distributed_context_multi_node_nccl):
def test_distrib_single_device_xla():
device = idist.device()
_test_distrib_integration(device)
_test_creating_on_xla_fails(device)
_test_distrib_accumulator_device(device)


def _test_distrib_xla_nprocs(index):
device = idist.device()
_test_distrib_integration(device)
_test_creating_on_xla_fails(device)
_test_distrib_accumulator_device(device)


@pytest.mark.tpu
14 changes: 6 additions & 8 deletions tests/ignite/metrics/test_mean_pairwise_distance.py
@@ -85,7 +85,10 @@ def _test(metric_device):

def _test_distrib_accumulator_device(device):

for metric_device in [torch.device("cpu"), idist.device()]:
metric_devices = [torch.device("cpu")]
if device.type != "xla":
metric_devices.append(device)
for metric_device in metric_devices:

mpd = MeanPairwiseDistance(device=metric_device)
assert mpd._device == metric_device
@@ -112,11 +115,6 @@ def test_accumulator_detached():
assert not mpd._sum_of_distances.requires_grad


def _test_creating_on_xla_fails(device):
with pytest.raises(ValueError, match=r"Cannot create metric on an XLA device. Use device='cpu' instead."):
MeanPairwiseDistance(device=device)


@pytest.mark.distributed
@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
@@ -170,13 +168,13 @@ def test_multinode_distrib_gpu(distributed_context_multi_node_nccl):
def test_distrib_single_device_xla():
device = idist.device()
_test_distrib_integration(device)
_test_creating_on_xla_fails(device)
_test_distrib_accumulator_device(device)


def _test_distrib_xla_nprocs(index):
device = idist.device()
_test_distrib_integration(device)
_test_creating_on_xla_fails(device)
_test_distrib_accumulator_device(device)


@pytest.mark.tpu
14 changes: 6 additions & 8 deletions tests/ignite/metrics/test_mean_squared_error.py
@@ -72,7 +72,10 @@ def _test(metric_device):

def _test_distrib_accumulator_device(device):

for metric_device in [torch.device("cpu"), idist.device()]:
metric_devices = [torch.device("cpu")]
if device.type != "xla":
metric_devices.append(device)
for metric_device in metric_devices:

device = torch.device(device)
mse = MeanSquaredError(device=metric_device)
@@ -105,11 +108,6 @@ def test_accumulator_detached():
assert not mse._sum_of_squared_errors.requires_grad


def _test_creating_on_xla_fails(device):
with pytest.raises(ValueError, match=r"Cannot create metric on an XLA device. Use device='cpu' instead."):
MeanSquaredError(device=device)


@pytest.mark.distributed
@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
@@ -164,13 +162,13 @@ def test_multinode_distrib_gpu(distributed_context_multi_node_nccl):
def test_distrib_single_device_xla():
device = idist.device()
_test_distrib_integration(device, tol=1e-4)
_test_creating_on_xla_fails(device)
_test_distrib_accumulator_device(device)


def _test_distrib_xla_nprocs(index):
device = idist.device()
_test_distrib_integration(device, tol=1e-4)
_test_creating_on_xla_fails(device)
_test_distrib_accumulator_device(device)


@pytest.mark.tpu
10 changes: 9 additions & 1 deletion tests/ignite/metrics/test_metric.py
@@ -568,13 +568,19 @@ def update(self, output):
self.a += 10.0
self.b -= 5.0

m = DummyMetric(device=device)
metric_device = device if torch.device(device).type != "xla" else "cpu"
m = DummyMetric(device=metric_device)
m.update(None)
m.compute()
# check if can call compute multiple times without all reduce invocation
m.compute()


def _test_creating_on_xla_fails(device):
with pytest.raises(ValueError, match=r"Cannot create metric on an XLA device. Use device='cpu' instead."):
DummyMetric2(device=device)


@pytest.mark.distributed
@pytest.mark.skipif(not idist.has_native_dist_support, reason="Skip if no native dist support")
@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="Skip if no GPU")
@@ -625,11 +631,13 @@ def test_multinode_distrib_gpu(distributed_context_multi_node_nccl):
def test_distrib_single_device_xla():
device = idist.device()
_test_distrib_sync_all_reduce_decorator(device)
_test_creating_on_xla_fails(device)


def _test_distrib_xla_nprocs(index):
device = idist.device()
_test_distrib_sync_all_reduce_decorator(device)
_test_creating_on_xla_fails(device)


@pytest.mark.tpu
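With this commit the XLA-creation check is exercised only from test_metric.py, via the new _test_creating_on_xla_fails above, matching the earlier commit that moved the guard from the individual metrics into Metric.__init__. A minimal sketch of what such a guard could look like, assumed rather than copied from ignite's source:

import torch

class Metric:
    # Sketch of only the device-validation part of the base constructor.
    def __init__(self, device="cpu"):
        device = torch.device(device)
        if device.type == "xla":
            raise ValueError("Cannot create metric on an XLA device. Use device='cpu' instead.")
        self._device = device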