[DCP][BE] Move DCP._state_dict_utils out from DCP (pytorch#115523)
DCP._state_dict_utils is also used by FSDP, which can sometimes cause a circular import. Move it out of DCP to avoid the circular import.

Differential Revision: [D52022440](https://our.internmc.facebook.com/intern/diff/D52022440/)

Pull Request resolved: pytorch#115523
Approved by: https://github.com/wz337
fegin authored and pytorchmergebot committed Dec 13, 2023
1 parent 1500379 commit cc28f61
Showing 9 changed files with 21 additions and 19 deletions.
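
For context: the moved helpers (`_gather_state_dict`, `_all_gather_sharded_tensor`, `_offload_state_dict_to_cpu`) now live directly under `torch.distributed`, so FSDP code can import them without pulling in `torch.distributed.checkpoint`, which itself imports FSDP types. Below is a minimal, hypothetical sketch of why hoisting a shared helper breaks such a cycle; the package names are illustrative only, not the real PyTorch layout.

```python
# Hypothetical layout, for illustration only -- not the actual PyTorch tree.
#
# Before: the shared helper lives inside the checkpoint subpackage.
#   pkg/checkpoint/_helpers.py    defines gather_state_dict()
#   pkg/checkpoint/state_dict.py  imports pkg.fsdp                 (checkpoint -> fsdp)
#   pkg/fsdp/optim_utils.py       imports pkg.checkpoint._helpers  (fsdp -> checkpoint)
#   => importing either subpackage can trigger a circular import.
#
# After: the helper is hoisted to the shared parent package.
#   pkg/_helpers.py               defines gather_state_dict()
#   pkg/checkpoint/state_dict.py  imports pkg._helpers and pkg.fsdp
#   pkg/fsdp/optim_utils.py       imports pkg._helpers             (no cycle)

# pkg/_helpers.py -- toy stand-in for the shared helper module
from typing import Any, Dict


def gather_state_dict(state_dict: Dict[str, Any]) -> Dict[str, Any]:
    """Toy stand-in for _gather_state_dict(): return a shallow copy."""
    return dict(state_dict)
```

Both subpackages then depend only on the parent-level module, which depends on neither of them.
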
@@ -9,7 +9,7 @@
 import torch.distributed as dist
 import torch.nn as nn
 from torch.distributed._composable import fully_shard
-from torch.distributed.checkpoint._state_dict_utils import _gather_state_dict
+from torch.distributed._state_dict_utils import _gather_state_dict
 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, StateDictType
 from torch.distributed.fsdp.api import ShardingStrategy
 from torch.distributed.fsdp.wrap import ModuleWrapPolicy
@@ -3,8 +3,8 @@
 import torch.distributed.checkpoint as dist_cp
 from torch.distributed._shard.sharded_tensor import ShardedTensor
 
+from torch.distributed._state_dict_utils import _all_gather_sharded_tensor
 from torch.distributed._tensor import DTensor, init_device_mesh, Replicate
-from torch.distributed.checkpoint._state_dict_utils import _all_gather_sharded_tensor
 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
 from torch.distributed.fsdp.fully_sharded_data_parallel import StateDictType
 
6 changes: 3 additions & 3 deletions test/distributed/checkpoint/test_state_dict_utils.py
@@ -4,12 +4,12 @@
 import torch.distributed as dist
 import torch.distributed._functional_collectives as funcol
 
-from torch.distributed._tensor import DTensor
-from torch.distributed._tensor.placement_types import Shard
-from torch.distributed.checkpoint._state_dict_utils import (
+from torch.distributed._state_dict_utils import (
     _gather_state_dict,
     _offload_state_dict_to_cpu,
 )
+from torch.distributed._tensor import DTensor
+from torch.distributed._tensor.placement_types import Shard
 from torch.testing._internal.common_utils import run_tests
 from torch.testing._internal.distributed._tensor.common_dtensor import (
     DTensorTestBase,
2 changes: 1 addition & 1 deletion test/distributed/fsdp/test_fsdp_optim_state.py
@@ -10,11 +10,11 @@
 import torch.nn as nn
 from torch import distributed as dist
 from torch.distributed._shard.sharded_tensor import ShardedTensor
+from torch.distributed._state_dict_utils import _gather_state_dict
 from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
     _CHECKPOINT_WRAPPED_MODULE,
     apply_activation_checkpointing,
 )
-from torch.distributed.checkpoint._state_dict_utils import _gather_state_dict
 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
 from torch.distributed.fsdp.api import ShardingStrategy
 from torch.distributed.fsdp.fully_sharded_data_parallel import (
8 changes: 4 additions & 4 deletions test/distributed/fsdp/test_fsdp_state_dict.py
@@ -16,15 +16,15 @@
     Shard,
     ShardedTensor,
 )
+from torch.distributed._state_dict_utils import (
+    _all_gather_sharded_tensor,
+    _gather_state_dict,
+)
 from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
     apply_activation_checkpointing,
     checkpoint_wrapper,
     CheckpointImpl,
 )
-from torch.distributed.checkpoint._state_dict_utils import (
-    _all_gather_sharded_tensor,
-    _gather_state_dict,
-)
 from torch.distributed.fsdp import (
     CPUOffload,
     FullStateDictConfig,
@@ -1,16 +1,18 @@
 import math
-from typing import Any, Callable, Dict, Optional, Tuple
+from typing import Any, Callable, Dict, Optional, Tuple, TYPE_CHECKING
 
 import torch
 import torch.distributed as dist
 import torch.nn.functional as F
-from torch.distributed import distributed_c10d
-from torch.distributed._shard.sharded_tensor import ShardedTensor
-from torch.distributed._tensor import DTensor, Replicate
+
+if dist.is_available() or TYPE_CHECKING:
+    from torch.distributed import distributed_c10d
+    from torch.distributed._shard.sharded_tensor import ShardedTensor
+    from torch.distributed._tensor import DTensor, Replicate
 
 
 def _all_gather_sharded_tensor(
-    sharded_tensor: ShardedTensor,
+    sharded_tensor: "ShardedTensor",
     pg: Optional[dist.ProcessGroup] = None,
     device: Optional[torch.device] = None,
 ) -> torch.Tensor:
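
The hunk above also wraps the distributed-only imports in `if dist.is_available() or TYPE_CHECKING:` and quotes the `ShardedTensor` annotation. Here is a minimal sketch of that guard pattern with stand-in names (nothing below is PyTorch code): the guarded import runs only when the optional feature is present, `TYPE_CHECKING` keeps the names visible to static type checkers, and the string annotation avoids evaluating a possibly missing name at runtime.

```python
# Sketch of the import-guard pattern, with hypothetical stand-ins for the real names.
from typing import TYPE_CHECKING


def feature_available() -> bool:
    # Stand-in for dist.is_available(); pretend the optional backend is absent.
    return False


if feature_available() or TYPE_CHECKING:
    # Stand-in for the guarded torch.distributed imports.
    from decimal import Decimal as OptionalTensorType


def describe(value: "OptionalTensorType") -> str:
    # The quoted annotation is resolved only by type checkers, so this module
    # imports and runs even when the guarded import above was skipped.
    return type(value).__name__


if __name__ == "__main__":
    # At runtime the guarded import was skipped, yet the module loaded fine
    # because the annotation above is a string and is never evaluated here.
    print("OptionalTensorType" in globals())  # False on this minimal "build"
```
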
4 changes: 2 additions & 2 deletions torch/distributed/checkpoint/state_dict.py
@@ -21,11 +21,11 @@
 import torch.distributed as dist
 import torch.nn as nn
 from torch.distributed._shard.sharded_tensor import ShardedTensor
-from torch.distributed._tensor import DTensor
-from torch.distributed.checkpoint._state_dict_utils import (
+from torch.distributed._state_dict_utils import (
     _gather_state_dict,
     _offload_state_dict_to_cpu,
 )
+from torch.distributed._tensor import DTensor
 from torch.distributed.fsdp import (
     FullOptimStateDictConfig,
     FullStateDictConfig,
2 changes: 1 addition & 1 deletion torch/distributed/fsdp/_optim_utils.py
@@ -25,8 +25,8 @@
 import torch.distributed.fsdp._traversal_utils as traversal_utils
 import torch.nn as nn
 from torch.distributed._shard.sharded_tensor import ShardedTensor
+from torch.distributed._state_dict_utils import _gather_state_dict
 from torch.distributed._tensor import DTensor, Replicate
-from torch.distributed.checkpoint._state_dict_utils import _gather_state_dict
 from torch.distributed.distributed_c10d import _get_pg_default_device
 from torch.distributed.fsdp._common_utils import (
     _apply_to_modules,
2 changes: 1 addition & 1 deletion torch/testing/_internal/distributed/common_state_dict.py
@@ -8,8 +8,8 @@
 import torch.nn as nn
 
 from torch.distributed._sharded_tensor import ShardedTensor
+from torch.distributed._state_dict_utils import _gather_state_dict
 from torch.distributed._tensor import DTensor
-from torch.distributed.checkpoint._state_dict_utils import _gather_state_dict
 from torch.distributed.checkpoint.state_dict import (
     PG,
     set_state_dict,
