src/accelerate/utils/dataclasses.py (+13 -5)
@@ -30,7 +30,7 @@

 import torch

-from .constants import FSDP_AUTO_WRAP_POLICY, FSDP_BACKWARD_PREFETCH, FSDP_STATE_DICT_TYPE
+from .constants import FSDP_AUTO_WRAP_POLICY, FSDP_BACKWARD_PREFETCH, FSDP_SHARDING_STRATEGY, FSDP_STATE_DICT_TYPE
 from .environment import str_to_bool
 from .imports import is_cuda_available, is_npu_available, is_xpu_available
 from .versions import compare_versions
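The import now pulls in FSDP_SHARDING_STRATEGY alongside the other FSDP constants, which points at the plugin resolving the sharding strategy from a list of named strategies rather than only from a bare integer. The sketch below is illustrative only, not the actual implementation: it assumes FSDP_SHARDING_STRATEGY is a list of strategy names whose 1-based position matches torch's ShardingStrategy enum, and that the value comes from an FSDP_SHARDING_STRATEGY environment variable.

import os

from torch.distributed.fsdp import ShardingStrategy

# Assumed to mirror accelerate.utils.constants.FSDP_SHARDING_STRATEGY.
FSDP_SHARDING_STRATEGY = ["FULL_SHARD", "SHARD_GRAD_OP", "NO_SHARD", "HYBRID_SHARD", "HYBRID_SHARD_ZERO2"]

raw = os.environ.get("FSDP_SHARDING_STRATEGY", "1")
if raw.isdigit():
    # Numeric values map 1-based onto ShardingStrategy (1 == FULL_SHARD).
    sharding_strategy = ShardingStrategy(int(raw))
else:
    # Names are looked up in the constant, preserving the same 1-based mapping.
    sharding_strategy = ShardingStrategy(FSDP_SHARDING_STRATEGY.index(raw) + 1)

print(sharding_strategy)  # e.g. ShardingStrategy.FULL_SHARD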
@@ -439,6 +439,7 @@ class CustomDtype(enum.Enum):
     r"""
     An enum that contains multiple custom dtypes that can be used for `infer_auto_device_map`.
     """
+
     FP8 = "fp8"
     INT4 = "int4"
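Per the docstring, these CustomDtype values exist so that infer_auto_device_map can budget quantized weights at their real (sub-16-bit) size when placing modules. A minimal sketch of that usage, assuming the special_dtypes argument and the toy module names below; real code would pass a large pretrained model instead:

import torch
from torch import nn

from accelerate import infer_auto_device_map
from accelerate.utils import CustomDtype

# Toy model; parameter names ("0.weight", "2.weight", ...) come from nn.Sequential.
model = nn.Sequential(nn.Linear(1024, 4096), nn.ReLU(), nn.Linear(4096, 1024))

device_map = infer_auto_device_map(
    model,
    max_memory={0: "200MB", "cpu": "2GiB"},
    # Budget the second Linear's weight as 4-bit instead of float16.
    special_dtypes={"2.weight": CustomDtype.INT4},
    dtype=torch.float16,
)
print(device_map)  # maps module names ("0", "1", "2") to devices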
@@ -918,7 +919,7 @@ class FullyShardedDataParallelPlugin:
         },
     )
     limit_all_gathers: bool = field(
-        default=False,
+        default=True,
         metadata={
             "help": "If False, then FSDP allows the CPU thread to schedule all-gathers "
             "without any extra synchronization. If True, then FSDP explicitly synchronizes the CPU thread to prevent "
@@ -929,9 +930,10 @@ class FullyShardedDataParallelPlugin:
     use_orig_params: bool = field(
         default=True,
         metadata={
-            "help": "If True, allows non-uniform `requires_grad` during init, which means support for interspersed frozen and trainable paramteres. "
+            "help": "If `True`, allows non-uniform `requires_grad` during init, which means support for interspersed frozen and trainable paramteres. "
             "Useful in cases such as parameter-efficient fine-tuning. "
-            "Please refer this [blog](https://dev-discuss.pytorch.org/t/rethinking-pytorch-fully-sharded-data-parallel-fsdp-from-first-principles/1019)"
+            "Please refer this [blog](https://dev-discuss.pytorch.org/t/rethinking-pytorch-fully-sharded-data-parallel-fsdp-from-first-principles/1019). "
+            "This also enables to have different optimizer param groups. This should be `True` when creating optimizer object before preparing/wrapping the model with FSDP."