o
    "i                     @   s<  U d dl Z d dlZd dlZd dlZd dlmZmZmZ d dlm	Z	 d dl
mZmZmZmZmZmZmZmZmZmZmZmZ d dlZd dlmZ d dlmZ d dlmZ d dlm Z m!Z!m"Z"m#Z#m$Z$ d dl%m&Z& d dl'm(Z( d d	l)m*Z*m+Z+m,Z-m.Z.m/Z/m0Z0m1Z1m2Z2 d d
l3m4Z4m5Z5 d dl6m7Z7 d dl8m9Z: d dl;m<Z< g dZ=dZ>dZ?dZ@dZAeeB ZCee&eejDeEeFeBf ZGeeGeeG eeG eeBdf f ZHeeBeHf ZIeeI ZJeeBeeIeJf f ZKeL ZMee eNd< e jOdd ZPeG dd dZQeG dd deQZRejSdd		dadejTdeBdeUd eUd!eCf
d"d#ZVG d$d% d%ZWd&d' ZXddd(dejTd)eejYjZd*f d+eUd,eeejT  d-eeQ d!eRfd.d/Z[d0eeBeHf d1eKd2eRd!dfd3d4Z\d5eejTejYjZf d6eBd!efd7d8Z]d9eeBef d2eRd!eeBef fd:d;Z^dejTd2eRd!eeBeHf fd<d=Z_dejTd9eeBeHf d2eRd!e7fd>d?Z`d@ejYjZd!dfdAdBZad9eKd!eeBeHf fdCdDZbd@ejYjZd9eeBeHf d2eRd!eKfdEdFZcdejTdGeejYjZd*f d2eRd!eKfdHdIZddejTd@ejYjZd1eKd2eRd!eKf
dJdKZedejTdGeejYjZd*f d9eKd2eRd!df
dLdMZfddd(dejTd,eeejT  d-eeQ d!eeBeHf fdNdOZgddd(dejTdGeejYjZeejYjZ f d,eeejT  d-eeQ d!eKf
dPdQZhddd(dejTdGeejYjZeejYjZ f d,eeejT  d-eeQ d!eeeBeHf eKf f
dRdSZidejTd9eeejTeeBeHf f eeBeHf f d!eeBeHf fdTdUZjddVdejTd0eeBeHf d-eeQ d!e7fdWdXZkddVdejTdGeejYjZeejYjZ f d1eKd-eeQ d!df
dYdZZlddVdejTdGeejYjZeejYjZ f d0eeBeHf d1eKd-eeQ d!e7fd[d\ZmeddVdejTd-eeQ d!dfd]d^ZneddVdejTdGeejYjZd*f d-eeQ d!dfd_d`ZodS )b    N)asdict	dataclassfield)chain)AnyCallablecastDict	GeneratorIterableListno_type_checkOptionalSetTupleUnion)ShardedTensor)_broadcast_state_dict_flatten_state_dict_gather_state_dict_offload_state_dict_to_cpu_unflatten_state_dict)DTensor)_CHECKPOINT_PREFIX)FullOptimStateDictConfigFullStateDictConfigFullyShardedDataParallelOptimStateDictConfigShardedOptimStateDictConfigShardedStateDictConfigStateDictConfigStateDictType)._get_module_fsdp_state_if_fully_sharded_moduleFSDP_WRAPPED_MODULE)_IncompatibleKeys)DistributedDataParallel)tree_map_only)FQNS_TPrimitiveType	ValueTypeDictValueTypeListDictValueTypeOptimizerStateTypeStateDictOptionsget_model_state_dictget_optimizer_state_dictget_state_dictset_model_state_dictset_optimizer_state_dictset_state_dict_flat_paramparam_groupsparamsstater)   _patched_state_dictc                  c   s@    t  } t   zd V  W | rt   d S d S | rt   w w N)gc	isenableddisableenable)
is_enabled r?   e/home/ubuntu/SoloSpeech/.venv/lib/python3.10/site-packages/torch/distributed/checkpoint/state_dict.py_gc_context[   s   
rA   c                   @   sf   e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed< dZ
eed< dZeed	< dZeed
< dS )r-   ap  
    This dataclass specifies how get_state_dict/set_state_dict will work.

    - ``full_state_dict``: if this is set to True, all the tensors in the
      returned state_dict will be gathered. No ShardedTensor and DTensor
      will be in the returned state_dict.

    - ``cpu_offload``: offload all the tensors to cpu. To prevent CPU OOM, if
      ``full_state_dict`` is also true, then only the rank0 will get the
      state_dict and all other ranks will get empty state_dict.

    - ``ignore_frozen_params``: if the value is True, the returned state_dict
      won't contain any frozen parameters -- the ``requires_grad`` is False.
      The default value is False.

    - ``keep_submodule_prefixes`` (deprecated): when ``submodules`` is not None, this option
      indicates whether to keep the submodule prefixes from the state_dict keys.
      or example, if the submodule is ``module.pretrain`` and the full FQN of
      the parameter is ``pretrain.layer1.weight`` of the param. When this option
      is True, the parameter's key in the returned state_dict will be
      ``pretrain.layer1.weight``. If the options is False, the key will be
      ``layer1.weight``.
      Note that if ``keep_submodule_prefixes`` is False, there may be conflicted
      FQNs, hence there should be only one submodule in ``submodules``.

    - ``strict``: the ``strict`` option when ``set_state_dict`` calls
      model.load_state_dict().

    - ``broadcast_from_rank0``: when the option is True, rank0 should receive a
       full state_dict and will broadcast the tensors in the state_dict/
       optim_state_dict one by one to other ranks. Other ranks will receive
       the tensors and shard according to the local shards in the model and
       optimizer. ``full_state_dict`` must be set to True when using this option.
       This option currently only supports DTensor, not the legacy ShardedTensor.
    Ffull_state_dictcpu_offloadignore_frozen_paramsTkeep_submodule_prefixesstrictbroadcast_from_rank0flatten_optimizer_state_dictN)__name__
__module____qualname____doc__rB   bool__annotations__rC   rD   rE   rF   rG   rH   r?   r?   r?   r@   r-   f   s   
 $r-   c                   @   s   e Zd ZU eedZeeee	j
f eee	j
f f ed< eedZeeee	j
f eee	j
f f ed< eedZee ed< dZeed< dZeed< ejZeed< eedZeej ed	< d
S )_StateDictInfo)default_factoryfqn_param_mappingshared_params_mappingsubmodule_prefixesThandle_modelhandle_optimfsdp_contextfsdp_modulesN)rI   rJ   rK   r   dictrQ   r	   r   strtorchTensorr'   rN   rR   setrS   r   rT   rM   rU   
contextlibnullcontextrV   r   listrW   r   nnModuler?   r?   r?   r@   rO      s   
 rO   )maxsizeTmodelnameskip_ddp_prefixskip_compiler_prefixreturnc           
         sx  | td}d|vr|hS |d}g }| }t|D ]\}}t|tr4|dks)J |j}|s3|| qt|trz|t	|d k rf||d  t
krfd| t|t
}	 rZ  d  fdd|	jD   S t|t}|tkry|| t||}qt|tjjjr|dksJ |j}|s|| q|| |tjjjkr|t	|d krtdqt||}qd| tdhS )	a  
    This API is used to convert the name of a parameter to the FQNs. For FSDP
    without `use_orig_params`, the name of FlatParameter can be mapped to
    multiple original parameters. As a result, the return type of this function
    is `Set[str]`.

    Args:
        module (nn.Module): the root model.
        name (str): the name
        skip_ddp_prefix (bool): whether to skip DDP's `module` prefix

    Returns:
        The canonical FQNs based on the model traversal.
     .module   c                    s   h | ]}  | qS r?   r?   .0fqnprefixr?   r@   	<setcomp>       z_get_fqns.<locals>.<setcomp>	_orig_modz-Expect `_extra_state` to be the last obj name)replacer   split	enumerate
isinstanceDDPrj   appendFSDPlen_FLAT_PARAMjoingetattr_fqnsr#   rZ   _dynamo
eval_frameOptimizedModulers   r`   modules_EXTRA_STATE_KEY_SUFFIXRuntimeError)
rc   rd   re   rf   	obj_namesfqn_obj_namescurr_objicurr_obj_name
flat_paramr?   ro   r@   	_get_fqns   sL   



 







r   c                   @   s   e Zd ZdS )_EXTRA_STATEN)rI   rJ   rK   r?   r?   r?   r@   r      s    r   c                 #   s:    t  dtjdtdtf fdd  | dE d H  d S )Nrj   curr_fqnrg   c                 3   s     |  |r| dnd}|  D ]\}}|v rq| | } ||E d H  qt| jdd| jddD ]\}}|| jv rBq8| | }||fV  q8t| jdtj	j
tj	j
krl| tjjj }|t fV  d S d S )Nri   rh   F)recurseget_extra_state)addnamed_childrenr   named_buffersnamed_parameters_non_persistent_buffers_setr~   	__class__r`   ra   r   r   rj   r   r   )rj   r   rd   	submodulenew_fqnobjr   visited_modulesr?   r@   r      s*   

z+_iterate_valid_model_state.<locals>.recurserh   )r\   r`   ra   rY   r
   )rc   r?   r   r@   _iterate_valid_model_state   s   r   )
submodulesoptionsoptims.
optim_onlyr   r   c                C   s&  |rt dt |r|std|pt }i }i }t| D ]@\}}t|tr'qt| |}	|	|d}
|
durIt
tt || |	 || ||< n|	 ||< |	D ]}
t|ts\|||
< qQqt| D ]\}}|D ]
}
t
tj|||
< qjqdt }|rt|}|  D ]"\}}||vrqt| |}	t|	dksJ d|dd |	D  q|jr|jstdt| }|r|jrt|j|jd	}t|j|jp|jd	}tj}nt |jd
}t!|jd
}tj"}t#j$dd }t%j&|| |||d}nt#j'}t(di t)|||||t
t*t+j, || t|dkdS )zW
    Verify the model and options passed by the user and generates _StateDictInfo.
    zGetting submodules only model/optim state_dict is deprecated and will be removed in 2.5. This feature can be achieved by manually filtering out the state_dict returned from get_state_dict.z;Optimizers are not passed in but optim_only is set to True.Nrk   z)Submodule FQN should only have 1 instancec                 s   s    | ]}| d V  qdS )ri   Nr?   rl   r?   r?   r@   	<genexpr>A  s    z"_verify_options.<locals>.<genexpr>z?full_state_dict must be True when broadcast_from_rank0 is True.)offload_to_cpu
rank0_only)r   c              	   s   sx    t  - tj| |||d d V  W d    n1 sw   Y  W d    d S W d    d S 1 s5w   Y  d S )Nrj   state_dict_typestate_dict_configoptim_state_dict_config)warningscatch_warningsrz   r   r   r?   r?   r@   $fsdp_state_dict_type_without_warning_  s   
"z=_verify_options.<locals>.fsdp_state_dict_type_without_warningr   r   )rQ   rR   rS   rV   rW   rT   rU   r?   )-r   warnFutureWarningr   r-   r   rw   r   r   getr   r   rY   updatecopyr_   itemsrZ   r[   r\   named_modulesr{   rG   rB   
ValueErrorrz   rW   r   rC   r   r!   FULL_STATE_DICTr   r   SHARDED_STATE_DICTr]   contextmanager	functoolspartialr^   rO   r   r   r`   ra   )rc   r   r   r   r   rQ   rR   rd   paramfqnsrn   param_fqns_rS   rj   rW   r   r   r   r   rV   r?   r?   r@   _verify_options  s   









r   model_state_dictoptim_state_dictinfoc                 C   s   |j D ]}t|}|d usJ dq|jr3| s3|js3|js3|jr#|js3|jr3|js3t	dt
 d|jrH|sH|jr>|jsH|jsHt	d| |  D ]}t|v r\t	| dt dqLd S )Nz)Expected a fsdp_state with a fsdp module.z}The option indicates that model state_dict is required to save or load, but model state_dict is empty.rank = dist.get_rank()=ri   zgThe option indicates that model state_dict is required to save, or load but optim state_dict is empty. z
 contains z6. This can happen if the model is not the root module.)rW   r"   rT   rS   rD   rC   rB   rF   rG   r   distget_rankrU   keysr|   )r   r   r   rj   
fsdp_statekeyr?   r?   r@   _verify_state_dict  sZ   
	r   r   apic                 C   s,   t | |}|tv rtjt | j|| d}|S )N)self)r~   r8   r   r   r   )r   r   callr?   r?   r@   _state_dict_fn  s   
r   
state_dictc                 C   sB   |j r|jrtj st nd}t| |j|dS |jrt| S | S )N)r   )rC   
ranks_only)rB   rC   rZ   distributedis_initializedtupler   r   )r   r   r   r?   r?   r@   _maybe_full_or_cpu_state_dict  s   r   c                 C   s  |j si S |  t| d }W d    n1 sw   Y  t| D ]:}t| |}t|dks8J ||ftt|}||kr_dt	fdd}|||sXt
d| d| ||||< q%|jri }| D ]&}|jD ] }||svqn|jr|| ||< qn|t|d  }	|| ||	< qnqi|}|jr|  D ]\}}
|
jrqt| |}|D ]}|| qqt| D ]\}}t|r|jr|| qt||S )Nr   rk   rg   c                 S   s   t |t | kr
dS |d}| d}d}t|D ]&\}}||| kr9|d7 }|t |kr8|t |d k  S q|dv r>q dS dS )NFri   r   rk   )rj   rs   T)r{   ru   rv   )r   rn   	fqn_split	key_splitfqn_idxkey_idxkey_namer?   r?   r@   verify  s   

z%_get_model_state_dict.<locals>.verifyzAn unexpected key, z, exists. FQN is )rT   rV   r   r_   r   r   r{   nextiterrM   r   poprS   
startswithrE   rD   r   requires_gradr   rZ   	is_tensoris_metar   )rc   r   r   r   r   rn   r   new_state_dictrp   r   r   pr?   r?   r@   _get_model_state_dict  sP   







r   c                 C   s\  |j r|s|jsti i S i }t| D ]3\}}t| |}t| |ddd}t||D ]\}}	|jr6t dkrA||	krA||||	< |||	< q)q|jrd }
|	 D ]\}}t
|rn| dkrn|
d u rg|j}
qP|
|jksnJ qP|
d usuJ t|||
|jd |	 D ]\}}|||< q|  ttt| d||jdW  d    S 1 sw   Y  d S )NF)re   rf   r   )devicerF   load_state_dict)r   rF   )rT   rG   r$   r   r   zipr   r   r   r   rZ   r   dimr   r   rF   rV   r   r   )rc   r   r   local_state_dictr   valuer   fqns_with_prefixrn   fqn_with_prefixr   local_stater?   r?   r@   _load_model_state_dict  sL   





$r   optimc                 C   s   | j rdS | jD ]}|t D ]}|jdurtd|jr"t||_qqg }| jD ]}d|v r:||d  d|d< q)| j	dd | jD ]}d|v rQ|
d|d< qD| jdd dS )	zH
    Initialize optim states by calling the step() with zero grads.
    Na  state_dict can only be used if the optimizer states are initialized (usually after one step() with gradients) or gradients are None. For the later case, state_dict will fake the gradients as zero to initialize the optimizer states. However, the gradients are not None.lrg        )closurer   T)set_to_none)r7   r5   _PARAMSgradr   r   rZ   
zeros_likery   stepr   	zero_grad)r   param_groupr   lrsr?   r?   r@   _init_optim_state<  s0   



r   c           	   
   C   s   dd }i }t t| t  D ] \}}t t| D ]\}}|| ||t d| d| < qqt t| t D ]&}|t}t tt	 |D ]}| D ]\}}||t d| d| < qKqEq7|S )aI  
    This API flattens the optimizer state_dict to support optimizer resharding for
    MPMD, e.g., pipeline parallelism.

    Without the API, the original optimizer state_dict looks like:
    {
        "state": {
            "layer1.weight": {
                "step": 10, "exp_avg": SomeTensor, "exp_avg_sq": SomeTensor
            },
            "layer2.weight": {
                "step": 10, "exp_avg": SomeTensor, "exp_avg_sq": SomeTensor
            },
        },
        "param_group": [
            {
                "lr": 0.0,
                "betas": (0.9, 0.95), ...,
                "params": ["layer1.weight", "layer2.weight"]
            }
        ]
    }

    With this API, the optimizer state_dict looks like:
    {
        "state.layer1.weight.step": 10,
        "state.layer2.weight.step": 10,
        "state.layer1.weight.exp_avg": SomeTensor,
        "state.layer2.weight.exp_avg": SomeTensor,
        "state.layer1.weight.exp_avg_sq": SomeTensor,
        "state.layer2.weight.exp_avg_sq": SomeTensor,
        "param_group.layer1.weight.lr" : 0.1,
        "param_group.layer2.weight.lr" : 0.1,
        "param_group.layer1.weight.betas" : (0.9, 0.95),
        "param_group.layer2.weight.betas" : (0.9, 0.95),
    }

    Note that if any of the value is a container, like the betas in the example,
    this API won't flattent it.
    c                 S   s*   t | tjttfstdt|  dd S )NzUFlattening optimizer state_dict only supports tensor, int, float states now. Type is ri   )rw   rZ   r[   intfloatNotImplementedErrortype)vr?   r?   r@   _raise_if_type_not_supported  s   z?_flatten_optim_state_dict.<locals>._raise_if_type_not_supportedri   )
r   r*   _STATEr   r+   _PGr   r   r   rY   )	r   r   retrn   r7   kr   r   r   r?   r?   r@   _flatten_optim_state_dictb  s   *
r   c                 C   s\  i }g }t |t|i}| jD ]}|tg i |t D ]A}|j| D ]9}|d t }	t|	ts0J |	| |js9q!i ||< | j	| 
 D ]}
|t  d| d|
  tt|| |
< qDq!qttt |d t d }|
 D ]=}|tkrtqm|t d| d|  }||d vr||d |< qm|d | |krtd| d| d| d|d |  d	qmq|S )z
    This API unflattens the state_dict generated by _flatten_optim_state_dict().
    See the docstring of _flatten_optim_state_dict() for more detail.
    ri   r   zaAll the parameters in the same parameter group should have the same saved param_group value. But z is z while other(s) is )r   r   r5   ry   r   rQ   rw   r_   r   r7   r   r   r*   r   rY   r   )r   r   r   r7   pg_state
return_osdr   r   rn   r6   
state_namefirst_param_fqnr   r   r?   r?   r@   _unflatten_optim_state_dict  sR   	


r  
optimizersc              	      s  |j si S ti tg i}|D ]}t| t|d }|jrm|  t| ||}W d    n1 s2w   Y  |s:qt	|t 
 D ]}d|v rW|t ||t |dd< qB|t D ]}dd |t D }||t< q\nut	tdd |jD }tt|tt|}	i  |  D ](\}
}t| |
}t|d	ksJ tt|}||	vrq|	| }| |< | |< qt	|t 
 D ]}
 |
 }|t |
|t |< q|t D ]} fd
d|t D |t< q|sqtt|t |t  tt|t |t  q|jr	ttt |}t!||S )Nr   rs   
_orig_mod.rh   c                 S   s   g | ]}| d dqS )r  rh   rt   rm   r   r?   r?   r@   
<listcomp>  rr   z)_get_optim_state_dict.<locals>.<listcomp>c                 s   s    | ]}|t  V  qd S r9   )r   )rm   gr?   r?   r@   r     s    z(_get_optim_state_dict.<locals>.<genexpr>rk   c                    s   g | ]} | qS r?   r?   )rm   pidfqn_pid_mappingr?   r@   r    s    )"rU   r   r   r   r   rW   rV   rz   r   r_   r   r   rt   r   r   from_iterabler5   rX   r   ranger{   r   r   r   r   r   r*   r   r+   extendrH   r,   r   r   )rc   r  r   r   r   osdr   r  r6   param_pid_mappingr   r   r   rn   r  groupr?   r  r@   _get_optim_state_dict  s\   




r  c              	   C   s  i }g }t |t|i}i }tdd tt|t   D r|S |jD ]}|tg i |t D ]q}	|j	|	 D ]i}
|
|j
v rXd}tt|t D ]}|
ttt |t v rVd} nqEnd}|s]q5|d t }t|tsjJ ||
 |	jr}tt|t  |
 ||
< tt|t D ]}|
ttt |t v rt|t d |t|< qq5q.q!tt|t D ]#}|t|d}|dkrq| D ]\}}|tkrq||| |< qq|S )a  
    Extract the corresponding optim state_dict from ``optim_state_dict`` for
    ``optim`` and return the result optim state_dict.

    Args:
        model (nn.Module): the root model.
        optim (torch.optim.Optimizer): the optimizer.
        optim_state_dict (Dict[str, ValueType]): the superset optim state_dict that
            contains the optim state_dict of ``optim``.
        info (_StateDictInfo): state dict information.

    Returns:
        The optim state_dict of ``optim``.
    c                 s   s    | ]}t |tV  qd S r9   )rw   r   r
  r?   r?   r@   r   (  s    

z*_split_optim_state_dict.<locals>.<genexpr>FTr  rk   )r   r   allr   r*   r   r5   ry   r   rQ   rR   r+   r   rY   rw   r_   r   r{   idr   r   )rc   r   r   r   r7   r  r  
pg_mappingr   r   rn   	in_paramsloaded_param_groupr6   idxr   r   r?   r?   r@   _split_optim_state_dict  sb   


r  c              	      s6  |j sd S |D ]}t| |r*t|v rt| |||}nt|ttttf ||}ni }|j	r| 
 D ]d\}}t| |}t| |dd}	||	krHq3t|dksPJ | |	 |t D ]}
ttttf |
}fdd|t D }||t< q\tt|t }t| D ]}|v r||||< qq3|  t| ||}W d    n1 sw   Y  n\|jrd|_t| |f|}d|_d   fdd}ttj||} d usJ t|\}}t|\}}t|| d	 | D ]}||vr
||v sJ || ||< || ||< qt ||}t!|d
|d qd S )NF)rf   rk   c                    s   g | ]}|  qS r?   r	  )rm   r   )rn   fqn_with_compilerr?   r@   r  |  s    z*_load_optim_state_dict.<locals>.<listcomp>Tc                    s4   |   dkr d u r| j | S  | jkrtd| S )Nr   zDevice mismatch)r   r   r   )tr   r?   r@   _device  s   
z'_load_optim_state_dict.<locals>._devicer   r   r   )"rU   r   r   r  r  r   r	   rY   r)   rW   r   r   r{   r   r   r   r   r*   r_   r   rt   rV   rz   optim_state_dict_to_loadrG   rB   r  r&   rZ   r[   r   r   r   r   )rc   r  r   r   r   r   original_fqn_r   fqns_with_compilerr  valr6   	osd_stater   r   r!  flatten_osdosd_mappingflatten_local_osdlocal_osd_mapping	optim_keyr?   )r   rn   r  r@   _load_optim_state_dictV  s   



	
r.  c                C   sX   t   t| t d||d}t| |}t|i | |W  d   S 1 s%w   Y  dS )aH  
    Return the model state_dict of ``model``.

    See ``get_state_dict`` for the detail usage.

    Args:
        model (nn.Module): the nn.Module to the model.
        submodules (deprecated): Optional[Set[nn.Module]]: only return the model parameters
            that belong to the submodules.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be returned. See
            `StateDictOptions` for the details.

    Returns:
        The state_dict for ``model``.

    :rtype: typing.Dict[str, ValueType]
    Fr   r   r   N)rA   r   r   r   r   )rc   r   r   r   r   r?   r?   r@   r.     s   
$r.   c                C   st   t  - t|tjjr|fnt|}t| |d||d}t| ||}ti || |W  d   S 1 s3w   Y  dS )a  
    Return the combined state_dict for optimizers.

    See ``get_state_dict`` for the detail usage.

    Args:
        model (nn.Module): the nn.Module to the model.
        optimizers (Union[None, Optimizer, Iterable[Optimizer]]):
            The optimizers that are used to optimize ``model``.
        submodules (deprecated): Optional[Set[nn.Module]]: only return the model parameters
            that belong to the submodules.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be returned. See
            `StateDictOptions` for the details.

    Returns:
        The state_dict for ``optimizers``.

    :rtype: OptimizerStateType
    Tr/  N)	rA   rw   rZ   r   	Optimizerr   r   r  r   )rc   r  r   r   r   r   r?   r?   r@   r/     s    $r/   c                C   s   t  4 t|tjjr|fnt|}t| |d||d}t| |}t| ||}t	||| ||fW  d   S 1 s:w   Y  dS )a  
    Return the model state_dict and optimizers state_dict.

    ``get_state_dict`` can process any module that is parallelized by PyTorch
    FSDP/fully_shard, DDP/replicate, tensor_parallel/parallelize_module, and any
    combination of these parallelisms. The main functions of ``get_state_dict``
    are: 1.) returning a model and optimizer state_dict that can be resharded
    with a different number of trainers and/or different parallelisms.
    2.) hiding the parallelism-specific state_dict APIs. Users don't have to call
    these APIs.
    3.) sanity checking the result state_dict.

    The keys of the result state dictionary are the canonical FQNs (Fully
    Qualified Names).  A canonical FQN refers to the FQN based on a parameter's
    position in an nn.Module hierarchy. More specifically, a canonical FQN to a
    parameter is the FQN returned by ``module.named_parameters()`` or
    ``module.named_buffers()`` when the module is not distributed by any
    parallelisms. Since the optimizer internally uses parameter IDs to represent
    a parameter, there will be a conversion from the parameter IDs to the
    canonical FQNs when calling this API.

    ``get_state_dict`` can also process a module that is not parallelized. In
    such a case, ``get_state_dict`` only performs one function -- converting the
    optimizer parameter IDs to the canonical FQNs.

    Example:
        >>> # xdoctest: +SKIP
        >>> import torch
        >>> from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
        >>> from torch.nn.parallel import DistributedDataParallel as DDP
        >>> from torch.distributed.checkpoint.state_dict import get_state_dict

        >>> fsdp_model = FSDP(copy.deepcopy(model))
        >>> fsdp_optim = torch.optim.Adam(model.parameters(), lr=1e-3)
        >>> ddp_model = DDP(copy.deepcopy(model))
        >>> ddp_optim = torch.optim.Adam(model.parameters(), lr=1e-3)


        >>> ddp_state_dict, ddp_optim_state_dict = get_state_dict(ddp_model, ddp_optim)
        >>> fsdp_state_dict, fsdp_optim_state_dict = get_state_dict(fsdp_model, fsdp_optim)

        >>> # if we simply call ddp_model.state_dict() and fsdp_model.state_dict(),
        >>> # the asserts will fail.
        >>> assert ddp_state_dict == fsdp_state_dict
        >>> assert ddp_optim_state == fsdp_optim_state_dict


    Args:
        model (nn.Module): the nn.Module to the model.
        optimizers (Union[None, Optimizer, Iterable[Optimizer]]):
            The optimizers that are used to optimize ``model``.
        submodules (deprecated): Optional[Set[nn.Module]]: only return the model parameters
            that belong to the submodules.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be returned. See
            `StateDictOptions` for the details.

    Returns:
        ``Tuple`` that contain model state_dict and optimizer state_dict.

    :rtype: typing.Tuple[typing.Dict[str, ValueType], OptimizerStateType]
    Fr/  N)
rA   rw   rZ   r   r0  r   r   r   r  r   )rc   r  r   r   r   r   r   r?   r?   r@   r0     s"   F
$r0   c           	         s   |si S t tt| tjretdt t	t
tjt
ttf f |}i }| D ]8\}}|  D ]/\}}||kr;q2t| |}t|dksJJ dtt| d | fdd| D  q2q*|S t	t
ttf |S )NzPassing model_state_dict as a ``Dict[nn.Module, Dict[str, Any]]``is deprecated and will be removed in 2.5. If you need this feature, please preprocessing the model_state_dict to achieve the same functionality.rk   z/FQNs for a submodule should only have 1 elementri   c                    s   i | ]	\}} | |qS r?   r?   )rm   subfqnr   ro   r?   r@   
<dictcomp>u  s    z/_unflatten_model_state_dict.<locals>.<dictcomp>)rw   r   r   r   r`   ra   r   r   r   r   r	   rY   r)   r   r   r   r{   r   )	rc   r   cast_state_dictr   r   sub_state_dictrd   mr   r?   ro   r@   _unflatten_model_state_dict[  s,   

r6  )r   c                C   s^   t | |}t  t| t d|d}t|i | t| ||W  d   S 1 s(w   Y  dS )a=  Load the model state_dict.

    The counterpart of ``get_model_state_dict`` to set the state_dict to the
    model. See ``set_state_dict`` for the detail usage.

    Args:
        model (nn.Module): the nn.Module to the model.
        model_state_dict: (Dict[str, ValueType]):
           the model state_dict to load. If the key of the ``model_state_dict``
           is nn.Module, the key is a submodule of ``model`` and the value should
           be the state_dict of the submodule. When loading the state_dict,
           the prefix of the submodule will be append to the state_dict.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be loaded. See
            `StateDictOptions` for the details.

    Returns:
        ``NamedTuple`` with ``missing_keys`` and ``unexpected_keys`` fields:
            * **missing_keys** is a list of str containing the missing keys
            * **unexpected_keys** is a list of str containing the unexpected keys

    :type model_state_dict: typing.Dict[str, ValueType]
    Fr   r   N)r6  rA   r   r   r   r   )rc   r   r   r   r?   r?   r@   r1   |  s   
$r1   c                C   sr   t  , t|tjjr|fnt|}t| |d|d}ti || t| ||| W d   dS 1 s2w   Y  dS )a  Load the optimizers state_dict.

    The counterpart of ``get_optimizer_state_dict`` to set the state_dict to the
    optimizers. See ``set_state_dict`` for the detail usage.

    Args:
        model (nn.Module): the nn.Module to the model.
        optimizers (Union[Optimizer, Iterable[Optimizer]]):
            The optimizers that are used to optimize ``model``.
        optim_state_dict: OptimizerStateType:
            the optimizer state_dict to load.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be loaded. See
            `StateDictOptions` for the details.

    Returns:
        None

    :type optim_state_dict: typing.OptimizerStateType
    Tr7  N)	rA   rw   rZ   r   r0  r   r   r   r.  )rc   r  r   r   r   r?   r?   r@   r2     s   "r2   c                C   s   t | |}t 2 t|tjjr|fnt|}t| || |d}t||| t	| ||| t
| ||W  d   S 1 s=w   Y  dS )a4  Load the model state_dict and optimizers state_dict.

    The counterpart of ``get_state_dict`` to set the state_dict to the model and
    optimizers.  The given ``model_state_dict`` and ``optim_state_dict`` do not
    have to be returned by ``get_state_dict`` but must meet the following
    requirements: 1) all FQNs are canonical FQNs as defined in ``get_state_dict``,
    2) if a tensor is sharded, it must be either a ShardedTensor or DTensor,
    3) optimizer state_dict cannot contain the parameter IDs; the keys should be
    the canonical FQNs.

    Args:
        model (nn.Module): the nn.Module to the model.
        optimizers (Union[Optimizer, Iterable[Optimizer]]):
            The optimizers that are used to optimize ``model``.
        model_state_dict: (Union[Dict[nn.Module, Dict[str, ValueType]], Dict[str, ValueType]]):
           the model state_dict to load. If the key of the ``model_state_dict``
           is nn.Module, the key is a submodule of ``model`` and the value should
           be the state_dict of the submodule. When loading the state_dict,
           the prefix of the submodule will be append to the state_dict.
        optim_state_dict: OptimizerStateType:
            the optimizer state_dict to load.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be loaded. See
            `StateDictOptions` for the details.

    Returns:
        ``NamedTuple`` with ``missing_keys`` and ``unexpected_keys`` fields:
            * **missing_keys** is a list of str containing the missing keys of the model state_dict.
            * **unexpected_keys** is a list of str containing the unexpected keys of the model state_dict.

    :type model_state_dict: typing.Dict[str, ValueType]
    :type optim_state_dict: typing.OptimizerStateType
    r7  N)r6  rA   rw   rZ   r   r0  r   r   r   r.  r   )rc   r  r   r   r   r   r?   r?   r@   r3     s   *

$r3   c                   sj   t jt| |dfdd}|| _t jt| |d dtttf f fdd}|| _t	
| t	
| dS )a  Patch the ``state_dict`` and ``load_state_dict`` attributes of ``model``.

    Patch the ``state_dict`` and ``load_state_dict`` attributes of ``model`` to
    be a partial function to call ``get_state_dict`` and ``set_state_dict``.

    Example:
        from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
        from torch.distributed.checkpoint.state_dict import patch_model_state_dict

        model = fsdp(model)
        patch_model_state_dict(model)

    Args:
        model (nn.Module): the nn.Module to the model.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be loaded. See
            `StateDictOptions` for the details.
    Returns:
        None
    )rc   r   c                           S r9   r?   r?   _state_dict_callr?   r@   state_dict_call)     z0_patch_model_state_dict.<locals>.state_dict_callr   c                        | d d S )N)r   r?   r"  _load_state_dict_callr?   r@   load_state_dict_call4     z5_patch_model_state_dict.<locals>.load_state_dict_callN)r   r   r.   r   r1   r	   rY   r   r   r8   r   )rc   r   r;  r@  r?   r?  r:  r@   _patch_model_state_dict  s    
rC  c                   s   t jt| ||dfdd}t jt| ||d dtttf f fdd}t| t| t	|t
jjr9|fnt|}|D ]}||_||_q?dS )a  Patch the ``state_dict`` and ``load_state_dict`` attributes of ``optimizers``.

    Patch the ``state_dict`` and ``load_state_dict`` attributes of ``optimizers`` to
    be a partial function to call ``get_state_dict`` and ``set_state_dict``.

    Note that if there are multiple optimizers, all of the optimizers will be patched.
    So users only need to call one of the state_dict() to get the full result.

    Example:
        from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
        from torch.distributed.checkpoint.state_dict import patch_model_state_dict

        model = fsdp(model)
        patch_model_state_dict(model)

    Args:
        model (nn.Module): the nn.Module to the model.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be loaded. See
            `StateDictOptions` for the details.
    Returns:
        None
    )rc   r  r   c                      r8  r9   r?   r?   r9  r?   r@   r;  e  r<  z4_patch_optimizer_state_dict.<locals>.state_dict_callr   c                    r=  )N)r   r?   r"  r>  r?   r@   r@  o  rA  z9_patch_optimizer_state_dict.<locals>.load_state_dict_callN)r   r   r/   r2   r	   rY   r   r8   r   rw   rZ   r   r0  r   r   r   )rc   r  r   r;  r@  r   r?   rB  r@   _patch_optimizer_state_dict?  s0   

rD  )TT)pr]   r   r:   r   dataclassesr   r   r   	itertoolsr   typingr   r   r   r	   r
   r   r   r   r   r   r   r   rZ   torch.distributedr   r   torch.nnr`   'torch.distributed._shard.sharded_tensorr   #torch.distributed._state_dict_utilsr   r   r   r   r   torch.distributed._tensorr   ;torch.distributed.algorithms._checkpoint.checkpoint_wrapperr   torch.distributed.fsdpr   r   r   rz   r   r   r   r    r!   $torch.distributed.fsdp._common_utilsr"   r#   torch.nn.modules.moduler$   torch.nn.parallelr%   rx   torch.utils._pytreer&   __all__r|   r   r   r   rY   r'   r[   r   r   r(   r)   r*   r+   r,   r\   r8   rN   r   rA   r-   rO   	lru_cachera   rM   r   r   r   r   r0  r   r   r   r   r   r   r   r   r  r  r  r.  r.   r/   r0   r6  r1   r2   r3   rC  rD  r?   r?   r?   r@   <module>   s  
8(


.
>$


$-




C

.&@

-
?
H
]

)
1
Y$

%

,
-

>6