facebookresearch · rohan-varma · Jul 7, 2023 · Jul 7, 2023
diff --git a/examples/flava/native/train.py b/examples/flava/native/train.py
@@ -117,7 +117,11 @@ def __init__(self, config: DictConfig):
  else torch.float16
  )
 
- self.scaler = ShardedGradScaler() if config.training.enable_amp else None
+ self.scaler = (
+ ShardedGradScaler()
+ if config.training.enable_amp and self.half_dtype == torch.float16
+ else None
+ )
 
  def log(
  self,
@@ -144,25 +148,6 @@ def create_model(self) -> torch.nn.Module:
  f"size: {get_model_size_gb(model):.3} GB"
  )
 
- if self.config.training.activation_checkpointing:
- check_fn = lambda submodule: isinstance(submodule, TransformerEncoderLayer)
- checkpoint_impl = CheckpointImpl.REENTRANT
-
- # DDP gradient hooks have compatibility issues with REENTRANT autograd
- if strategy == "ddp":
- checkpoint_impl = CheckpointImpl.NO_REENTRANT
-
- checkpoint_wrapper_fn = partial(
- checkpoint_wrapper,
- offload_to_cpu=False,
- checkpoint_impl=checkpoint_impl,
- )
- apply_activation_checkpointing(
- model,
- checkpoint_wrapper_fn=checkpoint_wrapper_fn,
- check_fn=check_fn,
- )
-
  if strategy == "ddp":
  # TODO do we have to do this in FSDP too? see https://github.com/pytorch/pytorch/issues/75478
  model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
@@ -206,6 +191,20 @@ def create_model(self) -> torch.nn.Module:
 
  print0(f"after FSDP {torch.cuda.memory_allocated()/1024**3:.3} GB")
 
+ if self.config.training.activation_checkpointing:
+ check_fn = lambda submodule: isinstance(submodule, TransformerEncoderLayer)
+ checkpoint_impl = CheckpointImpl.NO_REENTRANT
+
+ checkpoint_wrapper_fn = partial(
+ checkpoint_wrapper,
+ checkpoint_impl=checkpoint_impl,
+ )
+ apply_activation_checkpointing(
+ model,
+ checkpoint_wrapper_fn=checkpoint_wrapper_fn,
+ check_fn=check_fn,
+ )
+
  else:
  raise ValueError(f"unknown strategy: {strategy}")