o
    oi                  
   @   s  d dl Z d dlZd dlmZmZ d dlmZ d dlmZ d dl	m
Z
 d dlmZmZmZmZmZmZmZmZmZmZmZmZmZ d dlZd dlmZ d dlmZ d d	lmZ d d
l m!Z! d dl"m#Z# d dl$m%Z%m&Z& d dl'm(Z( d dl)m*Z*m+Z+m,Z, d dl-m.Z. d dl/m0Z0 d dl1m2Z2 d dl3m4Z4 d dl5m6Z6 d dl7m8Z8m9Z9m:Z:m;Z;m<Z< d dl=m>Z>m?Z?m@Z@mAZAmBZB d dl=mCZD d dlEmFZFmGZG d dlHmIZI d dlJmKZKmLZLmMZMmNZN d dlOmPZPmZmQZQ d dlRmSZS d dlTmUZUmVZV erd dlWmXZX d dlYmZZZm[Z[m\Z\ d dl]m^Z^ eeee!  ee!e_e`ge_f e^f Zaee\ed  f Zbd!Zcejdd"eed#d$ G d%d& d&e4e;Zfd'eeee! eee!  f  d(ed) d*efd+d,Zgd-ed) d.ed*efd/d0Zhd1e!d2ed*dfd3d4ZiG d5d6 d6e:Zjd7eee_d8f  d*d8fd9d:Zkd;d<d.ed*d=fd>d?Zld@e#d*e_fdAdBZmd1e!d*edC fdDdEZn	Fd[d1e!dGe`dHe_d*edC fdIdJZodKe
d*e_fdLdMZpdKe
d*e_fdNdOZqd1erd*e%e! fdPdQZsd1ejtj!dRejud*dfdSdTZvdUeewef dKe
d*dfdVdWZxdXeewef dKe
d*dfdYdZZydS )\    N)	ExitStacknullcontext)	timedelta)partial)Path)TYPE_CHECKINGAnyCallableContextManagerDict	GeneratorListLiteralOptionalSetTupleTypeUnion)RequirementCache)rank_zero_only)Tensor)Module)	Optimizer)	TypeGuardoverride)Accelerator)CheckpointIOClusterEnvironment	Precision)default_pg_timeout)FSDPPrecision)_SubprocessScriptLauncher)ParallelStrategy)_StrategyRegistry)
TBroadcast_apply_filter_BackwardSyncControl_Sharded!_validate_keys_for_strict_loading)ReduceOp_distributed_is_initialized-_get_default_process_group_backend_for_device_init_dist_connection_sync_ddp_if_availablegroup)_TORCH_GREATER_EQUAL_2_2_TORCH_GREATER_EQUAL_2_3)&_has_meta_device_parameters_or_buffers)_METADATA_FILENAME
_lazy_load_materialize_tensors_move_state_into)rank_zero_deprecationr   rank_zero_warn)
reset_seed)_PATH	_Stateful)
DeviceMesh)
CPUOffloadMixedPrecisionShardingStrategyModuleWrapPolicy)
FULL_SHARDSHARD_GRAD_OPNO_SHARDHYBRID_SHARD)fsdpfsdp_cpu_offloadignorez.*FSDP.state_dict_type.*)categorymessagec                !       s&  e Zd ZdZdddddeddddddddfdee deeej	  dee
 dee d	ee d
ee deeddf ded ded deeee eee  f  ded ddded deeee df  deddf  fddZeedefddZejededdfd dZeedej	fd!d"Zedefd#d$Zejd%eddfd&d$Zedefd'd(Zeedeeef fd)d*Z edee fd+d,Z!eded fd-d.Z"eede#fd/d0Z$e$jedee# ddfd1d0Z$edmd2d3Z%edm fd4d5Z&ed6ed7ee' deeee' f fd8d9Z(ed6edefd:d;Z)ed<e'de'f fd=d>Z*ed6eddfd?d@Z+edndAee de,fdBdCZ-ede,fdDdEZ.e	FdodGe/dHee dIeee0ef  de/fdJdKZ1edLededdfdMdNZ2edpdPe3dQede3fdRdSZ4e	T	Udqd6ed<e'dVee5ef dWee5ef dXede/fdYdZZ6e		drd[e7d\eeeee'ef f d]ee d^eeee8eegef f  ddf
d_d`Z9e		Udsd[e7d\eeee'eeeee'ef f f  daedeeef fdbdcZ:e;edde<ddfdedfZ=dmdgdhZ>defdidjZ?dmdkdlZ@  ZAS )tFSDPStrategyae  Strategy for Fully Sharded Data Parallel provided by torch.distributed.

    Fully Sharded Training shards the entire model across all available GPUs, allowing you to scale model
    size, whilst using efficient communication to reduce overhead. In practice, this means we can remain
    at parity with PyTorch DDP, whilst scaling our model sizes dramatically. The technique is similar
    to ZeRO-Stage 3.

    For more information check out
    `this blogpost <https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api>`__.

    Defaults have been set and options have been exposed, but may require configuration
    based on your level of memory/speed efficiency. We suggest having a look at
    `this tutorial <https://pytorch.org/tutorials/intermediate/FSDP_tutorial.html>`__ for more information.

    Arguments:
        cpu_offload: See ``cpu_offload`` parameter in :class:`torch.distributed.fsdp.FullyShardedDataParallel`.
        mixed_precision: See ``mixed_precision`` parameter in :class:`torch.distributed.fsdp.FullyShardedDataParallel`.
        auto_wrap_policy: Same as ``auto_wrap_policy`` parameter in
            :class:`torch.distributed.fsdp.FullyShardedDataParallel`. For convenience, this also accepts a set of the
            layer classes to wrap.
        activation_checkpointing: Deprecated. Use ``activation_checkpointing_policy``.
        activation_checkpointing_policy: Same as ``auto_wrap_policy`` parameter in
            :class:`torch.distributed.fsdp.FullyShardedDataParallel` but used when selecting the modules for which you
            want to enable activation checkpointing. Enabling this can free up a significant amount of memory at the
            cost of speed since activations in these layers need to be recomputed during backpropagation. For
            convenience, this also accepts a set of the layer classes to wrap.
        sharding_strategy: Select whether to shard model parameters, gradients, optimizer states, or a combination of
            them. Available values are:

            - ``"FULL_SHARD"``: Shards model parameters, gradients, and optimizer states (default).
            - ``"SHARD_GRAD_OP"``: Shards gradients and optimizer states only. Model parameters get replicated.
            - ``"NO_SHARD"``: No sharding (identical to regular DDP).
            - ``"HYBRID_SHARD"``: Shards model parameters, gradients, and optimizer states within a single machine, but
              replicates across machines. See also the `device_mesh` parameter below.

            Also accepts a :class:`torch.distributed.fsdp.ShardingStrategy` enum value.

        device_mesh: A tuple `(replication size, sharding size)` that defines over how many devices to shard and
            replicate the model. The product of the two numbers must equal the world size. Only valid in combination
            with the `HYBRID_SHARD` sharding strategy.

        state_dict_type: The format in which the state of the model and optimizers gets saved into the checkpoint.

            - ``"full"``: The full weights and optimizer states get assembled on rank 0 and saved to a single file.
            - ``"sharded"``: Each rank saves its shard of weights and optimizer states to a file. The checkpoint is
              a folder with as many files as the world size.

        \**kwargs: See available parameters in :class:`torch.distributed.fsdp.FullyShardedDataParallel`.

    NrB   shardedacceleratorparallel_devicescluster_environment	precisionprocess_group_backendtimeoutcpu_offloadr=   mixed_precisionr>   auto_wrap_policy_POLICYactivation_checkpointingactivation_checkpointing_policysharding_strategy_SHARDING_STRATEGYstate_dict_type)fullrL   device_meshr<   kwargsreturnc                    s   t  j||||d d| _|| _|| _t | _t|	|| _| j	dd |d ur3t
s.td|| jd< t|
|| _|| _t|| j| _t|| _|| _d S )N)rM   rN   rO   rP      use_orig_paramsTz=The `device_mesh` argument is only supported in torch >= 2.2.r]   )super__init__
_num_nodes_process_group_backend_timeout_FSDPBackwardSyncControl_backward_sync_control_auto_wrap_policy_kwargs_fsdp_kwargs
setdefaultr0   
ValueError _activation_checkpointing_kwargs_state_dict_type_init_sharding_strategyrY   _init_cpu_offloadrS   rT   )selfrM   rN   rO   rP   rQ   rR   rS   rT   rU   rW   rX   rY   r[   r]   r^   	__class__ T/home/ubuntu/.local/lib/python3.10/site-packages/lightning/fabric/strategies/fsdp.pyrc      s.   


zFSDPStrategy.__init__c                 C      t dt| j d)NThe `z3` does not use the `CheckpointIO` plugin interface.NotImplementedErrortype__name__rq   rt   rt   ru   checkpoint_io      zFSDPStrategy.checkpoint_ioioc                 C   rv   )Nrw   z3` does not support setting a `CheckpointIO` plugin.rx   )rq   r   rt   rt   ru   r}      r~   c                 C   s   | j d usJ | j | j S N)rN   
local_rankr|   rt   rt   ru   root_device   s   zFSDPStrategy.root_devicec                 C      | j S r   rd   r|   rt   rt   ru   	num_nodes      zFSDPStrategy.num_nodesr   c                 C   s
   || _ d S r   r   )rq   r   rt   rt   ru   r      s   
c                 C   s   | j d ur
t| j S dS )Nr   )rN   lenr|   rt   rt   ru   num_processes   s   zFSDPStrategy.num_processesc                 C   s   | j | j | jdS )N)num_replicasrank)r   r   global_rankr|   rt   rt   ru   distributed_sampler_kwargs   s   z'FSDPStrategy.distributed_sampler_kwargsc                 C   r   r   )re   r|   rt   rt   ru   rQ      r   z"FSDPStrategy.process_group_backendc                 C   s&   | j r| j S | j}t|tr|jS d S r   )rT   rP   
isinstancer    mixed_precision_configrq   pluginrt   rt   ru   r      s   
z#FSDPStrategy.mixed_precision_configc                 C   s(   | j }|d urt|tsJ |S tdS )Nz32-true)
_precisionr   r    r   rt   rt   ru   rP      s
   zFSDPStrategy.precisionc                 C   s*   |d urt |tstd| || _d S )NzGThe FSDP strategy can only work with the `FSDPPrecision` plugin, found )r   r    	TypeErrorr   )rq   rP   rt   rt   ru   rP      s   
c                 C   s2   | j d usJ | j jst| j | j| j| _d S d S r   )rO   creates_processes_externallyr!   r   r   	_launcherr|   rt   rt   ru   _configure_launcher   s   z FSDPStrategy._configure_launcherc                    sN   t    |   t| jdtr%ddlm} |d| jd | jd< d S d S )Nr]   r   )init_device_meshcuda)	rb   setup_environment_setup_distributedr   rj   gettupletorch.distributed.device_meshr   )rq   r   rr   rt   ru   r     s   
zFSDPStrategy.setup_environmentmodule
optimizersc                 C   s<   | j d}|du rtdt| j d| |}||fS )zWraps the model into a :class:`~torch.distributed.fsdp.fully_sharded_data_parallel.FullyShardedDataParallel`
        module and sets `use_orig_params=True` to keep the reference to the original parameters in the optimizer.ra   Fz	You set `z(use_orig_params=False)` but this is not supported when setting the model and optimizer up jointly. Either set it to `True` or set the objects up in this order: Create the model, call `setup_module`, create the optimizer, call `setup_optimizer`.)rj   r   rl   rz   r{   setup_module)rq   r   r   ra   rt   rt   ru   setup_module_and_optimizers  s   
z(FSDPStrategy.setup_module_and_optimizersc                    s   ddl m  t fdd| D r)t|rtd d| jv r(td | jd= n d
|| j| j| j	| j
jd| j}t|| j
 t|| j |S )z|Wraps the model into a :class:`~torch.distributed.fsdp.fully_sharded_data_parallel.FullyShardedDataParallel`
        module.r   FullyShardedDataParallelc                 3       | ]}t | V  qd S r   r   .0modr   rt   ru   	<genexpr>#      z,FSDPStrategy.setup_module.<locals>.<genexpr>zYThe model is already wrapped in `FSDP` but there are still parameters on the meta device.rU   z_A FSDP `auto_wrap_policy` is set, but the model is already wrapped. The policy will be ignored.)r   rS   rT   rY   	device_idNrt   )torch.distributed.fsdpr   anymodulesr2   r8   rj   rS   r   rY   r   index_move_torchmetrics_to_device_setup_activation_checkpointingrm   rq   r   rt   r   ru   r     s0   
	zFSDPStrategy.setup_module	optimizerc                    s,   | j drt |S t|std|S )aI  Set up an optimizer for a model wrapped with FSDP.

        This setup method doesn't modify the optimizer or wrap the optimizer. The only thing it currently does is verify
        that the optimizer was created after the model was wrapped with :meth:`setup_module` with a reference to the
        flattened parameters.

        ra   zThe optimizer does not seem to reference any FSDP parameters. HINT: Make sure to create the optimizer after setting up the model.)rj   r   rb   setup_optimizer_optimizer_has_flat_paramsrl   )rq   r   rr   rt   ru   r   ?  s   	zFSDPStrategy.setup_optimizerc                 C   s   d S r   rt   r   rt   rt   ru   module_to_deviceR  s   zFSDPStrategy.module_to_device
empty_initc                 C   sD   | j  }|  }t }|r|td || || |S )Nmeta)rP   module_init_contextmodule_sharded_contextr   enter_contexttorchdevice)rq   r   precision_init_ctxmodule_sharded_ctxstackrt   rt   ru   r   V  s   


z FSDPStrategy.module_init_contextc                 C   s>   ddl m} ddlm} |d|| j| j| j| jjd| j	S )Nr   r   )enable_wrap)wrapper_clsrS   rT   rY   r   rt   )
2torch.distributed.fsdp.fully_sharded_data_parallelr   torch.distributed.fsdp.wrapr   rS   r   rY   r   r   rj   )rq   r   r   rt   rt   ru   r   d  s   z#FSDPStrategy.module_sharded_contextmeantensorr/   	reduce_opc                 C   s   t |trt|||dS |S )N)r   )r   r   r-   )rq   r   r/   r   rt   rt   ru   
all_reducer  s   
zFSDPStrategy.all_reduceargsc                 O   s>   t  sd S tj dkrtjj| jjgd d S tj  d S )Nnccl)
device_ids)r*   r   distributedget_backendbarrierr   r   )rq   r   r^   rt   rt   ru   r   z  s
   zFSDPStrategy.barrierr   objsrcc                 C   s,   t  s|S |g}tjj||tjd |d S )Nr.   r   )r*   r   r   broadcast_object_list_groupWORLD)rq   r   r   rt   rt   ru   	broadcast  s
   zFSDPStrategy.broadcast       @Tmax_norm	norm_typeerror_if_nonfinitec                 C   sP   ddl m} t||stdt| j d|jj d| j| |j	||dS )zClip gradients by norm.r   r   zFGradient clipping with FSDP is only possible if the module passed to `zE.clip_gradients_norm` is wrapped in `FullyShardedDataParallel`. Got: .)r   r   )
r   r   r   r   rz   r{   rs   rP   unscale_gradientsclip_grad_norm_)rq   r   r   r   r   r   r   rt   rt   ru   clip_gradients_norm  s   

z FSDPStrategy.clip_gradients_normpathstatestorage_optionsfilterc                 C   s  |durt d|dur| jdkrtdt| |}| r0| jdkr0t|s0td| ddlm	} d	d
 |
 D }t|dkrItdt|dkrStd|d }| jdkr| rd|  |jddd t|}i }	i }
|A | D ]4\}}t|tr| }|	}nt|tr|||}|	}nt|tr| n|}|
}t||pi || qzW d   n1 sw   Y  t|	| | jdkrt|
|t  dS dS | jdkrFt|rt| t || j!d}i }|? | D ]2\}}t|tr| }nt|tr|||}nt|tr| n|}t||p!i || qW d   n	1 s1w   Y  | jdkrDt|| dS dS td| j )a  Save model, optimizer, and other state to a checkpoint on disk.

        If the state-dict-type is ``'full'``, the checkpoint will be written to a single file containing the weights,
        optimizer state and other metadata. If the state-dict-type is ``'sharded'``, the checkpoint gets saved as a
        directory containing one file per process, with model- and optimizer shards stored per file. Additionally, it
        creates a metadata file `meta.pt` with the rest of the user's state (only saved from rank 0).

        Nz`FSDPStrategy.save_checkpoint(..., storage_options=...)` is not supported because `FSDPStrategy` does not use the `CheckpointIO`.rL   zVFSDP doesn't support loading sharded filtered checkpoints, so saving them is disabled.r\   z/The checkpoint path exists and is a directory: r   r   c                 S   s   g | ]}t |r|qS rt   _has_fsdp_modules)r   r   rt   rt   ru   
<listcomp>  s    z0FSDPStrategy.save_checkpoint.<locals>.<listcomp>a  Could not find a FSDP model in the provided checkpoint state. Please provide the model as part of the state like so: `save_checkpoint(..., state={'model': model, ...})`. Make sure you set up the model (and optimizers if any) through the strategy before saving the checkpoint.r`   zFound multiple FSDP models in the given state. Saving checkpoints with FSDP is currently limited to a single model per checkpoint. To save multiple models, call the save method for each model separately with a different path.T)parentsexist_ok)
world_sizezUnknown state_dict_type: )"r   rn   ry   r   r   is_dir_is_sharded_checkpointIsADirectoryErrorr   r   valuesr   rl   is_fileunlinkmkdir_get_sharded_state_dict_contextitemsr   r   
state_dictr   optim_state_dictr;   r%   _distributed_checkpoint_saver   r   saver3   shutilrmtree_get_full_state_dict_contextr   )rq   r   r   r   r   FSDPr   r   state_dict_ctxconverted_statemetadatakeyr   	convertedtarget_dict
full_statert   rt   ru   save_checkpoint  s   







zFSDPStrategy.save_checkpointstrictc              	   C   s   |s
t d|dt| |}t|tr'ddlm} |||| j|d i S t|tr0t	dddl
m} ddlm} d	d
 | D }t|dkrOt ddd
 | D }t|dkrbt dt| d \}	}
t|rt|
}|J |	|
 i}t|| |
j||	 |d |rddlm} ||d}| D ]\}}|||	 ||d}|j|| |
|d}|| qW d   n1 sw   Y  t|t }| |  |  }t|| |d |D ]}||vrq||||< q|S t|rvt |}ddlm!}m"} |||	|
| j|d t|tri S t#|}| D ]3\}}t$|
| jdd ||||
}|j||
|d}|| W d   n	1 sQw   Y  q$| |  |  }t|| |d t%|||d |S t dt&|d)zOLoad the contents from a checkpoint and restore the state of the given objects.z,Got FSDPStrategy.load_checkpoint(..., state=z) but a state with at least  a model instance to reload is required. Pass it in like so: FSDPStrategy.load_checkpoint(..., state={'model': model, ...})r   ) _load_raw_module_state_from_path)r   r   r   z`Loading a single optimizer object from a checkpoint is not supported yet with the FSDP strategy.)!load_sharded_optimizer_state_dictr   c                 S   s   i | ]\}}t |r||qS rt   r   )r   r   r   rt   rt   ru   
<dictcomp>%  s    z0FSDPStrategy.load_checkpoint.<locals>.<dictcomp>a  Could not find a FSDP model in the provided checkpoint state. Please provide the model as part of the state like so: `load_checkpoint(..., state={'model': model, ...})`. Make sure you set up the model (and optimizers if any) through the strategy before loading the checkpoint.c                 S   s    i | ]\}}t |tr||qS rt   )r   r   )r   r   optimrt   rt   ru   r  ,  s     r`   zFound multiple FSDP models in the given state. Loading checkpoints with FSDP is currently limited to a single model per checkpoint. To load multiple models, call the load method for each model separately with a different path.)r   FileSystemReaderr   )model_state_dictoptimizer_keystorage_reader)r   modelr  N)_load_raw_module_state _rekey_optimizer_state_if_neededF)r   
rank0_only)sourcedestinationkeysz	The path z does not point to a valid checkpoint. Make sure the path points to either a directory with FSDP checkpoint shards, or a single file with a full checkpoint.)'rl   r   r   r   r   *lightning.fabric.strategies.model_parallelr   r   r   ry   &torch.distributed.checkpoint.optimizerr  r   r   r   r   listr   r   r   _distributed_checkpoint_loadload_state_dicttorch.distributed.checkpointr  optim_state_dict_to_loadr   loadr3   r  r(   pop_is_full_checkpointr4   r  r  r5   r   r6   str)rq   r   r   r   r   r  r   r   r   
module_keyr   r   module_stater  reader	optim_keyr  optim_stateflattened_osdr   requested_metadata_keysr   
checkpointr  r  temp_state_dictr   rt   rt   ru   load_checkpoint  s   





	zFSDPStrategy.load_checkpointstrategy_registryc                 C   s4   t j sd S |jd| dd |jd| ddd d S )NrF   z+Fully Sharded Data Parallel (FSDP) training)descriptionrG   zQFully Sharded Data Parallel (FSDP) training with Full Sharding and CPU OffloadingT)r'  rS   )r   r   is_availableregister)clsr&  rt   rt   ru   register_strategies  s   

z FSDPStrategy.register_strategiesc                 C   s>   t   |   |  | _| jd usJ t| j| j| jd d S )N)rR   )r9   _set_world_ranks_get_process_group_backendre   rO   r,   rf   r|   rt   rt   ru   r     s
   
zFSDPStrategy._setup_distributedc                 C   s   | j pt| jS r   )re   r+   r   r|   rt   rt   ru   r-    s   z'FSDPStrategy._get_process_group_backendc                 C   sJ   | j d ur| j | j| j | j  | j | j| j  | j t_	t
_	d S r   )rO   set_global_rank	node_rankr   r   set_world_sizer   r   r   r   utils_rank_zero_onlyr|   rt   rt   ru   r,    s   
zFSDPStrategy._set_world_ranks)r_   Nr   )Nr   )r   )r   T)NN)NT)Br{   
__module____qualname____doc__r   r   r   r   r   r   r   r   r  r   r   boolr   r   r   r   intr   rc   propertyr   r   r}   setterr   r   r   r   r   rQ   r   r    rP   r   r   r   r   r   r   r   r
   r   r   r   r)   r   r   r$   r   floatr   r:   r	   r   r%  classmethodr#   r+  r   r-  r,  __classcell__rt   rt   rr   ru   rK   [   sF   5	
.
!

c"
 
rK   rW   rX   rV   r_   c                    s   | d u r
|d u r
i S | d ur|d urt d| d ur<t| tr$t|  n| f td|  dt  d d fddiS t|trFt|i S d|iS )	NzeYou cannot set both `activation_checkpointing` and `activation_checkpointing_policy`. Use the latter.z'`FSDPStrategy(activation_checkpointing=zD)` is deprecated, use `FSDPStrategy(activation_checkpointing_policy=z)` instead.check_fnc                    s
   t |  S r   r   )	submoduleclassesrt   ru   <lambda>  s   
 z2_activation_checkpointing_kwargs.<locals>.<lambda>rU   )rl   r   r  r   r7   setri   )rW   rX   rt   r>  ru   rm     s&   



rm   policyr^   c                 C   s6   | d u r|S t | trddlm} || } | |d< |S )Nr   r@   rU   )r   rA  r   rA   )rB  r^   rA   rt   rt   ru   ri     s   
ri   r   activation_checkpointing_kwargsc                    sx   |sd S ddl m  t fdd|  D rtd d S ddl m}m}m} ts0t	||j
d}|| fd|i| d S )	Nr   CheckpointWrapperc                 3   r   r   r   r   rD  rt   ru   r     r   z2_setup_activation_checkpointing.<locals>.<genexpr>ztFSDP checkpointing is configured, but the model already contains checkpointed layers. Checkpointing will be ignored.)CheckpointImplapply_activation_checkpointingcheckpoint_wrapper)checkpoint_implcheckpoint_wrapper_fn);torch.distributed.algorithms._checkpoint.checkpoint_wrapperrE  r   r   r8   rF  rG  rH  r0   r   NO_REENTRANT)r   rC  rF  rG  rH  rt   rD  ru   r     s   r   c                   @   s&   e Zd ZedededefddZdS )rg   r   enabledr_   c                 C   sH   |st  S ddlm} t||s tdt| j d|jj d| S )zuBlocks gradient synchronization inside the :class:`~torch.distributed.fsdp.FullyShardedDataParallel`
        wrapper.r   r   zABlocking backward sync is only possible if the module passed to `zB.no_backward_sync` is wrapped in `FullyShardedDataParallel`. Got: r   )	r   r   r   r   r   rz   r{   rs   no_sync)rq   r   rM  r   rt   rt   ru   no_backward_sync  s   
z)_FSDPBackwardSyncControl.no_backward_syncN)r{   r2  r3  r   r   r5  r
   rO  rt   rt   rt   ru   rg     s    rg   rS   r=   c                 C   s(   ddl m} t| |r| S |t| dS )Nr   )r=   )offload_params)r   r=   r   r5  )rS   r=   rt   rt   ru   rp     s   rp   rY   rZ   r?   c                 C   s   ddl m} |dd ur|dd urtdt| tr#||   n| }d|jv rC|dd u rC|dd u rC|dd u rCtd|S )	Nr   )r?   process_groupr]   znThe arguments `FSDPStrategy(process_group=..., device_mesh=...)` are mutually exclusive.Pass only one of them.HYBRIDrU   zThe hybrid sharding strategy requires you to pass at least one of the parameters: `auto_wrap_policy`, `process_group` tuple, or `device_mesh`.)	r   r?   r   rl   r   r  uppernameRuntimeError)rY   r^   r?   strategyrt   rt   ru   ro     s   
ro   r   c                 C   s   t dd | jD S )Nc                 s   s*    | ]}|d  D ]	}t |ddV  qqdS )params_fsdp_flattenedFN)getattr)r   r/   paramrt   rt   ru   r     s    z-_optimizer_has_flat_params.<locals>.<genexpr>)r   param_groups)r   rt   rt   ru   r     s   r   )NNNc                 C   sL   ddl m} ddlm}m}m} |dd}|dd}|j| |j||d}|S )Nr   r   )ShardedOptimStateDictConfigShardedStateDictConfigStateDictTypeT)offload_to_cpur   r[   state_dict_configoptim_state_dict_config)r   r   torch.distributed.fsdp.apir\  r]  r^  r[   SHARDED_STATE_DICT)r   r   r\  r]  r^  ra  rb  state_dict_type_contextrt   rt   ru   r      s   

r   Tr   r  c           
      C   sX   ddl m}m} ddl m} ddlm} |d|d}|d|d}|j| |j||d}	|	S )Nr   )FullStateDictConfigr^  r   )FullOptimStateDictConfigT)r_  r  r`  )r   rf  r^  r   rc  rg  r[   FULL_STATE_DICT)
r   r   r  rf  r^  r   rg  ra  rb  re  rt   rt   ru   r   /  s   r   r   c                 C   s   |   o	| t  S )z]A heuristic check to determine whether the path points to a directory with checkpoint shards.)r   r3   r   r  rt   rt   ru   r   B  s   r   c                 C   s   |   S r   )r   r  rt   rt   ru   r  G  s   r  c                    s0   ddl m  t| tot fdd|  D S )Nr   r   c                 3   r   r   r   r   mr   rt   ru   r   N  r   z$_has_fsdp_modules.<locals>.<genexpr>)r   r   r   r   r   r   )r   rt   r   ru   r   K  s   $r   r   c                    sB   t dsd S ddlm   fdd|  D D ]}|| qd S )Ntorchmetricsr   Metricc                 3   s    | ]
}t | r|V  qd S r   r   ri  rl  rt   ru   r   Y  s    z/_move_torchmetrics_to_device.<locals>.<genexpr>)r   rk  rm  r   to)r   r   metricrt   rl  ru   r   Q  s   r   r   c                 C   sd   t rddlm} || |d d S ddlm} trddlm} nddlm} ||dd}|| | d S )Nr   )r   checkpoint_id)FileSystemWriter)save_state_dictT)r   single_file_per_rank)r1   r  r   rr  r0   rs  )r   r   r   rr  writerrt   rt   ru   r   ]  s   r   r  c                 C   sb   t rddlm} || |d d S ddlm} trddlm} nddlm} ||d}|| | d S )Nr   )r  rp  r  )r  r  )r1   r  r  r  r0   r  )r  r   r  r  r  rt   rt   ru   r  p  s   
r  )T)zr   warnings
contextlibr   r   datetimer   	functoolsr   pathlibr   typingr   r   r	   r
   r   r   r   r   r   r   r   r   r   r    lightning_utilities.core.importsr   "lightning_utilities.core.rank_zeror   r1  r   torch.nnr   torch.optimr   typing_extensionsr   r   lightning.fabric.acceleratorsr   lightning.fabric.pluginsr   r   r   5lightning.fabric.plugins.collectives.torch_collectiver   'lightning.fabric.plugins.precision.fsdpr    7lightning.fabric.strategies.launchers.subprocess_scriptr!   $lightning.fabric.strategies.parallelr"   $lightning.fabric.strategies.registryr#   $lightning.fabric.strategies.strategyr$   r%   r&   r'   r(   &lightning.fabric.utilities.distributedr)   r*   r+   r,   r-   r/   r   "lightning.fabric.utilities.importsr0   r1   lightning.fabric.utilities.initr2   lightning.fabric.utilities.loadr3   r4   r5   r6   $lightning.fabric.utilities.rank_zeror7   r8   lightning.fabric.utilities.seedr9    lightning.fabric.utilities.typesr:   r;   r   r<   r   r=   r>   r?   r   rA   r5  r6  rV   rZ   _FSDP_ALIASESfilterwarningsFutureWarningrK   rm   ri   r   rg   rp   ro   r   r   r   r   r  objectr   nnr   r   r  r   r  rt   rt   rt   ru   <module>   s   <$    V

"