o
    "’×iÉÉ  ã                   @   s<  U d dl Z d dlZd dlZd dlZd dlmZmZmZ d dlm	Z	 d dl
mZmZmZmZmZmZmZmZmZmZmZmZ d dlZd dlmZ d dlmZ d dlmZ d dlm Z m!Z!m"Z"m#Z#m$Z$ d dl%m&Z& d dl'm(Z( d d	l)m*Z*m+Z+m,Z-m.Z.m/Z/m0Z0m1Z1m2Z2 d d
l3m4Z4m5Z5 d dl6m7Z7 d dl8m9Z: d dl;m<Z< g d¢Z=dZ>dZ?dZ@dZAeeB ZCee&eejDeEeFeBf ZGeeGeeG eeG eeBdf f ZHeeBeHf ZIeeI ZJeeBeeIeJf f ZKeLƒ ZMee eNd< e jOdd„ ƒZPeG dd„ dƒƒZQeG dd„ deQƒƒZRejSdd		dadejTdeBdeUd eUd!eCf
d"d#„ƒZVG d$d%„ d%ƒZWd&d'„ ZXddd(œdejTd)eejYjZd*f d+eUd,eeejT  d-eeQ d!eRfd.d/„Z[d0eeBeHf d1eKd2eRd!dfd3d4„Z\d5eejTejYjZf d6eBd!efd7d8„Z]d9eeBef d2eRd!eeBef fd:d;„Z^dejTd2eRd!eeBeHf fd<d=„Z_dejTd9eeBeHf d2eRd!e7fd>d?„Z`d@ejYjZd!dfdAdB„Zad9eKd!eeBeHf fdCdD„Zbd@ejYjZd9eeBeHf d2eRd!eKfdEdF„ZcdejTdGeejYjZd*f d2eRd!eKfdHdI„ZddejTd@ejYjZd1eKd2eRd!eKf
dJdK„ZedejTdGeejYjZd*f d9eKd2eRd!df
dLdM„Zfddd(œdejTd,eeejT  d-eeQ d!eeBeHf fdNdO„Zgddd(œdejTdGeejYjZeejYjZ f d,eeejT  d-eeQ d!eKf
dPdQ„Zhddd(œdejTdGeejYjZeejYjZ f d,eeejT  d-eeQ d!eeeBeHf eKf f
dRdS„ZidejTd9eeejTeeBeHf f eeBeHf f d!eeBeHf fdTdU„ZjddVœdejTd0eeBeHf d-eeQ d!e7fdWdX„ZkddVœdejTdGeejYjZeejYjZ f d1eKd-eeQ d!df
dYdZ„ZlddVœdejTdGeejYjZeejYjZ f d0eeBeHf d1eKd-eeQ d!e7fd[d\„ZmeddVœdejTd-eeQ d!dfd]d^„ƒZneddVœdejTdGeejYjZd*f d-eeQ d!dfd_d`„ƒZodS )bé    N)ÚasdictÚ	dataclassÚfield)Úchain)ÚAnyÚCallableÚcastÚDictÚ	GeneratorÚIterableÚListÚno_type_checkÚOptionalÚSetÚTupleÚUnion)ÚShardedTensor)Ú_broadcast_state_dictÚ_flatten_state_dictÚ_gather_state_dictÚ_offload_state_dict_to_cpuÚ_unflatten_state_dict)ÚDTensor)Ú_CHECKPOINT_PREFIX)ÚFullOptimStateDictConfigÚFullStateDictConfigÚFullyShardedDataParallelÚOptimStateDictConfigÚShardedOptimStateDictConfigÚShardedStateDictConfigÚStateDictConfigÚStateDictType)Ú._get_module_fsdp_state_if_fully_sharded_moduleÚFSDP_WRAPPED_MODULE)Ú_IncompatibleKeys)ÚDistributedDataParallel)Útree_map_only)ÚFQNS_TÚPrimitiveTypeÚ	ValueTypeÚDictValueTypeÚListDictValueTypeÚOptimizerStateTypeÚStateDictOptionsÚget_model_state_dictÚget_optimizer_state_dictÚget_state_dictÚset_model_state_dictÚset_optimizer_state_dictÚset_state_dictÚ_flat_paramÚparam_groupsÚparamsÚstater)   Ú_patched_state_dictc                  c   s@    t  ¡ } t  ¡  zd V  W | rt  ¡  d S d S | rt  ¡  w w ©N)ÚgcÚ	isenabledÚdisableÚenable)Ú
is_enabled© r?   úe/home/ubuntu/SoloSpeech/.venv/lib/python3.10/site-packages/torch/distributed/checkpoint/state_dict.pyÚ_gc_context[   s   €ÿ
ÿrA   c                   @   sf   e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed< dZ
eed< dZeed	< dZeed
< dS )r-   ap  
    This dataclass specifies how get_state_dict/set_state_dict will work.

    - ``full_state_dict``: if this is set to True, all the tensors in the
      returned state_dict will be gathered. No ShardedTensor and DTensor
      will be in the returned state_dict.

    - ``cpu_offload``: offload all the tensors to cpu. To prevent CPU OOM, if
      ``full_state_dict`` is also true, then only the rank0 will get the
      state_dict and all other ranks will get empty state_dict.

    - ``ignore_frozen_params``: if the value is True, the returned state_dict
      won't contain any frozen parameters -- the ``requires_grad`` is False.
      The default value is False.

    - ``keep_submodule_prefixes`` (deprecated): when ``submodules`` is not None, this option
      indicates whether to keep the submodule prefixes from the state_dict keys.
      or example, if the submodule is ``module.pretrain`` and the full FQN of
      the parameter is ``pretrain.layer1.weight`` of the param. When this option
      is True, the parameter's key in the returned state_dict will be
      ``pretrain.layer1.weight``. If the options is False, the key will be
      ``layer1.weight``.
      Note that if ``keep_submodule_prefixes`` is False, there may be conflicted
      FQNs, hence there should be only one submodule in ``submodules``.

    - ``strict``: the ``strict`` option when ``set_state_dict`` calls
      model.load_state_dict().

    - ``broadcast_from_rank0``: when the option is True, rank0 should receive a
       full state_dict and will broadcast the tensors in the state_dict/
       optim_state_dict one by one to other ranks. Other ranks will receive
       the tensors and shard according to the local shards in the model and
       optimizer. ``full_state_dict`` must be set to True when using this option.
       This option currently only supports DTensor, not the legacy ShardedTensor.
    FÚfull_state_dictÚcpu_offloadÚignore_frozen_paramsTÚkeep_submodule_prefixesÚstrictÚbroadcast_from_rank0Úflatten_optimizer_state_dictN)Ú__name__Ú
__module__Ú__qualname__Ú__doc__rB   ÚboolÚ__annotations__rC   rD   rE   rF   rG   rH   r?   r?   r?   r@   r-   f   s   
 $r-   c                   @   s¾   e Zd ZU eedZeeee	j
f eee	j
f f ed< eedZeeee	j
f eee	j
f f ed< eedZee ed< dZeed< dZeed< ejZeed< eedZeej ed	< d
S )Ú_StateDictInfo)Údefault_factoryÚfqn_param_mappingÚshared_params_mappingÚsubmodule_prefixesTÚhandle_modelÚhandle_optimÚfsdp_contextÚfsdp_modulesN)rI   rJ   rK   r   ÚdictrQ   r	   r   ÚstrÚtorchÚTensorr'   rN   rR   ÚsetrS   r   rT   rM   rU   Ú
contextlibÚnullcontextrV   r   ÚlistrW   r   ÚnnÚModuler?   r?   r?   r@   rO   •   s   
 þÿþÿrO   )ÚmaxsizeTÚmodelÚnameÚskip_ddp_prefixÚskip_compiler_prefixÚreturnc           
         sx  |  td¡}d|vr|hS | d¡}g }| }t|ƒD ]—\}}t|tƒr4|dks)J ‚|j}|s3| |¡ qt|tƒrz|t	|ƒd k rf||d  t
krfd |¡‰ t|t
ƒ}	ˆ rZˆ › d‰ ‡ fdd„|	jD ƒ  S t|tƒ}|tkry| |¡ t||ƒ}qt|tjjjƒr“|dksˆJ ‚|j}|s’| |¡ q| |¡ |tjjjkr¬|t	|ƒd kr«tdƒ‚qt||ƒ}qd |¡  td¡hS )	aá  
    This API is used to convert the name of a parameter to the FQNs. For FSDP
    without `use_orig_params`, the name of FlatParameter can be mapped to
    multiple original parameters. As a result, the return type of this function
    is `Set[str]`.

    Args:
        module (nn.Module): the root model.
        name (str): the name
        skip_ddp_prefix (bool): whether to skip DDP's `module` prefix

    Returns:
        The canonical FQNs based on the model traversal.
    Ú Ú.Úmoduleé   c                    s   h | ]}ˆ › |› ’qS r?   r?   ©Ú.0Úfqn©Úprefixr?   r@   Ú	<setcomp>Î   ó    z_get_fqns.<locals>.<setcomp>Ú	_orig_modz-Expect `_extra_state` to be the last obj name)Úreplacer   ÚsplitÚ	enumerateÚ
isinstanceÚDDPrj   ÚappendÚFSDPÚlenÚ_FLAT_PARAMÚjoinÚgetattrÚ_fqnsr#   rZ   Ú_dynamoÚ
eval_frameÚOptimizedModulers   r`   ÚmodulesÚ_EXTRA_STATE_KEY_SUFFIXÚRuntimeError)
rc   rd   re   rf   Ú	obj_namesÚfqn_obj_namesÚcurr_objÚiÚcurr_obj_nameÚ
flat_paramr?   ro   r@   Ú	_get_fqns¤   sL   


€
 




€
€
ÿrŒ   c                   @   s   e Zd ZdS )Ú_EXTRA_STATEN)rI   rJ   rK   r?   r?   r?   r@   r   ã   s    r   c                 #   s:    t ƒ ‰dtjdtdtf‡ ‡fdd„‰ ˆ | dƒE d H  d S )Nrj   Úcurr_fqnrg   c                 3   sÜ    ˆ  | ¡ |r|› dnd}|  ¡ D ]\}}|ˆv rq|› |› }ˆ ||ƒE d H  qt| jdd| jddƒD ]\}}|| jv rBq8|› |› }||fV  q8t| jdtj	j
ƒtj	j
krl|› tjjj› }|tƒ fV  d S d S )Nri   rh   F)ÚrecurseÚget_extra_state)ÚaddÚnamed_childrenr   Únamed_buffersÚnamed_parametersÚ_non_persistent_buffers_setr~   Ú	__class__r`   ra   r   rƒ   rj   r„   r   )rj   rŽ   rd   Ú	submoduleÚnew_fqnÚobj©r   Úvisited_modulesr?   r@   r   ê   s*   €
ÿ
ÿüz+_iterate_valid_model_state.<locals>.recurserh   )r\   r`   ra   rY   r
   )rc   r?   rš   r@   Ú_iterate_valid_model_stateç   s   €rœ   )Ú
submodulesÚoptionsÚoptims.Ú
optim_onlyr   rž   c                C   s&  |rt  dt¡ |r|stdƒ‚|ptƒ }i }i }t| ƒD ]@\}}t|tƒr'qt| |ƒ}	| 	|d¡}
|
durIt
tt || ƒ |	¡ || ||< n|	 ¡ ||< |	D ]}
t|tƒs\|||
< qQqt| ¡ ƒD ]\}}|D ]
}
t
tj|ƒ||
< qjqdtƒ }|r¦t|ƒ}|  ¡ D ]"\}}||vrŒqƒt| |ƒ}	t|	ƒdks›J dƒ‚| dd„ |	D ƒ¡ qƒ|jr°|js°tdƒ‚t | ¡}|rò|jrÑt|j|jd	}t|j|jpÊ|jd	}tj}nt |jd
}t!|jd
}tj"}t#j$dd„ ƒ}t%j&|| |||d}nt#j'}t(di t)|ƒ¤||||t
t*t+j, |ƒ| t|ƒdkdœ¤ŽS )zW
    Verify the model and options passed by the user and generates _StateDictInfo.
    z¼Getting submodules only model/optim state_dict is deprecated and will be removed in 2.5. This feature can be achieved by manually filtering out the state_dict returned from get_state_dict.z;Optimizers are not passed in but optim_only is set to True.Nrk   z)Submodule FQN should only have 1 instancec                 s   s    | ]}|› d V  qdS )ri   Nr?   rl   r?   r?   r@   Ú	<genexpr>A  s   € z"_verify_options.<locals>.<genexpr>z?full_state_dict must be True when broadcast_from_rank0 is True.)Úoffload_to_cpuÚ
rank0_only)r¢   c              	   s   sx    t  ¡ - tj| |||d d V  W d   ƒ n1 sw   Y  W d   ƒ d S W d   ƒ d S 1 s5w   Y  d S )N©rj   Ústate_dict_typeÚstate_dict_configÚoptim_state_dict_config)ÚwarningsÚcatch_warningsrz   r¥   r¤   r?   r?   r@   Ú$fsdp_state_dict_type_without_warning_  s   €
üúÿ"ÿz=_verify_options.<locals>.fsdp_state_dict_type_without_warningr¤   r   )rQ   rR   rS   rV   rW   rT   rU   r?   )-r¨   ÚwarnÚFutureWarningr…   r-   rœ   rw   r   rŒ   Úgetr   r   rY   ÚupdateÚcopyr_   ÚitemsrZ   r[   r\   Únamed_modulesr{   rG   rB   Ú
ValueErrorrz   rW   r   rC   r   r!   ÚFULL_STATE_DICTr   r   ÚSHARDED_STATE_DICTr]   ÚcontextmanagerÚ	functoolsÚpartialr^   rO   r   r   r`   ra   )rc   rŸ   r    r   rž   rQ   rR   rd   ÚparamÚfqnsrn   Úparam_Úfqns_rS   rj   rW   r¦   r§   r¥   rª   rV   r?   r?   r@   Ú_verify_options  s¨   üÿ
þþ


€þÿ
ÿ
ÿ
þÿÿ
ûÿ

ør¼   Úmodel_state_dictÚoptim_state_dictÚinfoc                 C   s¾   |j D ]}t|ƒ}|d usJ dƒ‚q|jr3| s3|js3|js3|jr#|js3|jr3|js3t	dt
 ¡ ›dƒ‚|jrH|sH|jr>|jsH|jsHt	d|› ƒ‚|  ¡ D ]}t|v r\t	|› dt› dƒ‚qLd S )Nz)Expected a fsdp_state with a fsdp module.z}The option indicates that model state_dict is required to save or load, but model state_dict is empty.rank = dist.get_rank()=ri   zgThe option indicates that model state_dict is required to save, or load but optim state_dict is empty. z
 contains z6. This can happen if the model is not the root module.)rW   r"   rT   rS   rD   rC   rB   rF   rG   r…   ÚdistÚget_rankrU   Úkeysr|   )r½   r¾   r¿   rj   Ú
fsdp_stateÚkeyr?   r?   r@   Ú_verify_state_dict…  sZ   
ÿþýüûûúù	þÿÿþþýÿÿÿÿÿrÅ   r™   Úapic                 C   s,   t | |ƒ}|tv rtjt | j|ƒ| d}|S )N)Úself)r~   r8   r¶   r·   r–   )r™   rÆ   Úcallr?   r?   r@   Ú_state_dict_fn²  s   
rÉ   Ú
state_dictc                 C   sB   |j r|jrtj ¡ stƒ nd}t| |j|dS |jrt| ƒS | S )N)r   )rC   Ú
ranks_only)rB   rC   rZ   ÚdistributedÚis_initializedÚtupler   r   )rÊ   r¿   rË   r?   r?   r@   Ú_maybe_full_or_cpu_state_dict¹  s   ÿÿýÿrÏ   c                 C   sœ  |j si S | ¡  t| dƒƒ }W d   ƒ n1 sw   Y  t| ¡ ƒD ]:}t| |ƒ}t|ƒdks8J ||fƒ‚tt|ƒƒ}||kr_dt	fdd„}|||ƒsXt
d|› d|› ƒ‚| |¡||< q%|jr’i }| ¡ D ]&}|jD ] }| |¡svqn|jr€|| ||< qn|t|ƒd … }	|| ||	< qnqi|}|jr±|  ¡ D ]\}}
|
jr¡q™t| |ƒ}|D ]}| |¡ q¨q™t| ¡ ƒD ]\}}t |¡rÈ|jrÈ| |¡ q·t||ƒS )NrÊ   rk   rg   c                 S   s†   t |ƒt | ƒkr
dS | d¡}|  d¡}d}t|ƒD ]&\}}||| kr9|d7 }|t |ƒkr8|t |ƒd k  S q|dv r>q dS dS )NFri   r   rk   )rj   rs   T)r{   ru   rv   )rÄ   rn   Ú	fqn_splitÚ	key_splitÚfqn_idxÚkey_idxÚkey_namer?   r?   r@   ÚverifyÜ  s   

ÿz%_get_model_state_dict.<locals>.verifyzAn unexpected key, z, exists. FQN is )rT   rV   rÉ   r_   rÂ   rŒ   r{   ÚnextÚiterrM   r…   ÚpoprS   Ú
startswithrE   rD   r”   Úrequires_gradr°   rZ   Ú	is_tensorÚis_metarÏ   )rc   r¿   rÊ   rÄ   r¹   rn   rÕ   Únew_state_dictrp   r˜   r¸   Úpr?   r?   r@   Ú_get_model_state_dictË  sP   
ÿ

€

ù
ÿ
€
rß   c                 C   s\  |j r|s|jsti i ƒS i }t| ƒD ]3\}}t| |ƒ}t| |ddd}t||ƒD ]\}}	|jr6t ¡ dkrA||	krA| |¡||	< |||	< q)q|jr‹d }
| 	¡ D ]\}}t
 |¡rn| ¡ dkrn|
d u rg|j}
qP|
|jksnJ ‚qP|
d usuJ ‚t|||
|jd | 	¡ D ]\}}|||< q‚| ¡  ttt| dƒ||jdƒW  d   ƒ S 1 s§w   Y  d S )NF)re   rf   r   )ÚdevicerF   Úload_state_dict)rÊ   rF   )rT   rG   r$   rœ   rŒ   ÚziprÀ   rÁ   rØ   r°   rZ   rÛ   Údimrà   r   rF   rV   r   rÉ   )rc   rÊ   r¿   Úlocal_state_dictrÄ   Úvaluer¹   Úfqns_with_prefixrn   Úfqn_with_prefixrà   Úlocal_stater?   r?   r@   Ú_load_model_state_dict  sL   

ÿÿ
û€
ÿ

ÿþ$ÿré   Úoptimc                 C   s´   | j rdS | jD ]}|t D ]}|jdurtdƒ‚|jr"t |¡|_qqg }| jD ]}d|v r:| |d ¡ d|d< q)| j	dd | jD ]}d|v rQ| 
d¡|d< qD| jdd dS )	zH
    Initialize optim states by calling the step() with zero grads.
    Na  state_dict can only be used if the optimizer states are initialized (usually after one step() with gradients) or gradients are None. For the later case, state_dict will fake the gradients as zero to initialize the optimizer states. However, the gradients are not None.Úlrg        )Úclosurer   T)Úset_to_none)r7   r5   Ú_PARAMSÚgradr…   rÚ   rZ   Ú
zeros_likery   ÚsteprØ   Ú	zero_grad)rê   Úparam_groupr¸   Úlrsr?   r?   r@   Ú_init_optim_state<  s0   

ÿ€õ
€
€rõ   c           	   
   C   sÀ   dd„ }i }t t| t ƒ ¡ D ] \}}t t|ƒ ¡ D ]\}}||ƒ ||t› d|› d|› < qqt t| t ƒD ]&}| t¡}t tt	 |ƒD ]}| ¡ D ]\}}||t› d|› d|› < qKqEq7|S )aI  
    This API flattens the optimizer state_dict to support optimizer resharding for
    MPMD, e.g., pipeline parallelism.

    Without the API, the original optimizer state_dict looks like:
    {
        "state": {
            "layer1.weight": {
                "step": 10, "exp_avg": SomeTensor, "exp_avg_sq": SomeTensor
            },
            "layer2.weight": {
                "step": 10, "exp_avg": SomeTensor, "exp_avg_sq": SomeTensor
            },
        },
        "param_group": [
            {
                "lr": 0.0,
                "betas": (0.9, 0.95), ...,
                "params": ["layer1.weight", "layer2.weight"]
            }
        ]
    }

    With this API, the optimizer state_dict looks like:
    {
        "state.layer1.weight.step": 10,
        "state.layer2.weight.step": 10,
        "state.layer1.weight.exp_avg": SomeTensor,
        "state.layer2.weight.exp_avg": SomeTensor,
        "state.layer1.weight.exp_avg_sq": SomeTensor,
        "state.layer2.weight.exp_avg_sq": SomeTensor,
        "param_group.layer1.weight.lr" : 0.1,
        "param_group.layer2.weight.lr" : 0.1,
        "param_group.layer1.weight.betas" : (0.9, 0.95),
        "param_group.layer2.weight.betas" : (0.9, 0.95),
    }

    Note that if any of the value is a container, like the betas in the example,
    this API won't flattent it.
    c                 S   s*   t | tjttfƒstdt| ƒ› dƒ‚d S )NzUFlattening optimizer state_dict only supports tensor, int, float states now. Type is ri   )rw   rZ   r[   ÚintÚfloatÚNotImplementedErrorÚtype)Úvr?   r?   r@   Ú_raise_if_type_not_supportedŒ  s   þÿÿz?_flatten_optim_state_dict.<locals>._raise_if_type_not_supportedri   )
r   r*   Ú_STATEr°   r+   Ú_PGrØ   rî   r   rY   )	rÊ   rû   Úretrn   r7   Úkrú   ró   r¹   r?   r?   r@   Ú_flatten_optim_state_dictb  s   *þ
ÿÿr   c                 C   s\  i }g }t |t|i}| jD ]ž}| tg i¡ |t D ]A}|j| D ]9}|d t }	t|	tƒs0J ‚|	 |¡ |js9q!i ||< | j	|  
¡ D ]}
|t › d|› d|
›  tt|| ƒ|
< qDq!qttt |d t ƒd }| 
¡ D ]=}|tkrtqm|t› d|› d|›  }||d vr||d |< qm|d | |krªtd|› d|› d|› d|d | › d	ƒ‚qmq|S )zœ
    This API unflattens the state_dict generated by _flatten_optim_state_dict().
    See the docstring of _flatten_optim_state_dict() for more detail.
    éÿÿÿÿri   r   zaAll the parameters in the same parameter group should have the same saved param_group value. But z is z while other(s) is )rü   rý   r5   ry   rî   rQ   rw   r_   rÚ   r7   rÂ   r   r*   r   rY   r…   )rê   rÊ   r¿   r7   Úpg_stateÚ
return_osdró   r¸   rn   r6   Ú
state_nameÚfirst_param_fqnrÿ   rå   r?   r?   r@   Ú_unflatten_optim_state_dict¢  sR   	

ÿÿùÿÿþ
þÿÿúr  Ú
optimizersc              	      s  |j si S ti tg i}|D ]ð}t|ƒ t|dƒƒ }|jrm| ¡  t | ||¡}W d   ƒ n1 s2w   Y  |s:qt	|t  
¡ ƒD ]}d|v rW|t  |¡|t | dd¡< qB|t D ]}dd„ |t D ƒ}||t< q\nut	t dd„ |jD ƒ¡ƒ}tt|tt|ƒƒƒƒ}	i ‰ |  ¡ D ](\}
}t| |
ƒ}t|ƒd	ksœJ ‚tt|ƒƒ}||	vr§q‹|	| }|ˆ |< |ˆ |< q‹t	|t  
¡ ƒD ]}
ˆ |
 }|t  |
¡|t |< q¼|t D ]}‡ fd
d„|t D ƒ|t< qÒ|såqtt|t ƒ |t ¡ tt|t ƒ |t ¡ q|jr	ttt |ƒƒ}t!||ƒS )NrÊ   rs   ú
_orig_mod.rh   c                 S   s   g | ]}|  d d¡‘qS )r  rh   ©rt   ©rm   rÿ   r?   r?   r@   Ú
<listcomp>é  rr   z)_get_optim_state_dict.<locals>.<listcomp>c                 s   s    | ]}|t  V  qd S r9   )rî   )rm   Úgr?   r?   r@   r¡   ì  s   € z(_get_optim_state_dict.<locals>.<genexpr>rk   c                    s   g | ]}ˆ | ‘qS r?   r?   )rm   Úpid©Úfqn_pid_mappingr?   r@   r  þ  s    )"rU   rü   rý   rõ   rÉ   rW   rV   rz   r¾   r_   rÂ   rØ   rt   rî   r   Úfrom_iterabler5   rX   râ   Úranger{   r”   rŒ   rÖ   r×   r   r*   r®   r+   ÚextendrH   r,   r   rÏ   )rc   r  r¿   r¾   rê   Úosdrÿ   r  r6   Úparam_pid_mappingrÄ   r¸   r¹   rn   r  Úgroupr?   r  r@   Ú_get_optim_state_dictÏ  s\   
ÿ€
þ

ÿ
r  c              	   C   sœ  i }g }t |t|i}i }tdd„ tt|t  ƒ ¡ D ƒƒr|S |jD ]}| tg i¡ |t D ]q}	|j	|	 D ]i}
|
|j
v rXd}tt|t ƒD ]}|
ttt |t ƒv rVd} nqEnd}|s]q5|d t }t|tƒsjJ ‚| |
¡ |	jr}tt|t  ƒ|
 ||
< tt|t ƒD ]}|
ttt |t ƒv rt|t ƒd |t|ƒ< q„q5q.q!tt|t ƒD ]#}| t|ƒd¡}|dkr·q¨| ¡ D ]\}}|tkrÄq»||| |< q»q¨|S )að  
    Extract the corresponding optim state_dict from ``optim_state_dict`` for
    ``optim`` and return the result optim state_dict.

    Args:
        model (nn.Module): the root model.
        optim (torch.optim.Optimizer): the optimizer.
        optim_state_dict (Dict[str, ValueType]): the superset optim state_dict that
            contains the optim state_dict of ``optim``.
        info (_StateDictInfo): state dict information.

    Returns:
        The optim state_dict of ``optim``.
    c                 s   s    | ]}t |tƒV  qd S r9   )rw   rö   r
  r?   r?   r@   r¡   (  s   € 

ÿz*_split_optim_state_dict.<locals>.<genexpr>FTr  rk   )rü   rý   Úallr   r*   rÂ   r5   ry   rî   rQ   rR   r+   r   rY   rw   r_   rÚ   r{   Úidr­   r°   )rc   rê   r¾   r¿   r7   r  r  Ú
pg_mappingró   r¸   rn   Ú	in_paramsÚloaded_param_groupr6   ÚidxrÄ   rå   r?   r?   r@   Ú_split_optim_state_dict  sb   ÿ

ÿþ€
ÿ€üíÿür  c              	      s6  |j sd S |D ]}t|ƒ |r*t|v rt| |||ƒ}nt|ttttf |ƒ|ƒ}ni }|j	r´|  
¡ D ]d\}}t| |ƒ}t| |dd}	||	krHq3t|ƒdksPJ ‚| ¡ ‰|	 ¡ ‰|t D ]}
ttttf |
ƒ}‡‡fdd„|t D ƒ}||t< q\tt|t ƒ}t| ¡ ƒD ]}ˆ|v r–| |¡|| ˆˆ¡< q…q3| ¡  t | ||¡}W d   ƒ n1 s®w   Y  n\|jrd|_t| |f|ƒ}d|_d ‰ ‡ fdd„}ttj||ƒ}ˆ d usÚJ ‚t|ƒ\}}t|ƒ\}}t||ˆ d	 | ¡ D ]}||vr
||v sþJ ‚|| ||< || ||< qñt ||ƒ}t!|d
ƒ|d qd S )NF)rf   rk   c                    s   g | ]}|  ˆ ˆ¡‘qS r?   r	  )rm   rÄ   )rn   Úfqn_with_compilerr?   r@   r  |  s    ÿz*_load_optim_state_dict.<locals>.<listcomp>Tc                    s4   |   ¡ dkrˆ d u r| j‰ | S ˆ | jkrtdƒ‚| S )Nr   zDevice mismatch)rã   rà   r²   )Út©rà   r?   r@   Ú_device  s   
þz'_load_optim_state_dict.<locals>._devicer   rá   ©rÊ   )"rU   rõ   rü   r  r  r   r	   rY   r)   rW   r”   rŒ   r{   rØ   rý   r   rî   r*   r_   rÂ   rt   rV   rz   Úoptim_state_dict_to_loadrG   rB   r  r&   rZ   r[   r   r   r   rÉ   )rc   r  rÊ   r¿   rê   r¾   Úoriginal_fqnÚ_r¹   Úfqns_with_compilerr  Úvalr6   Ú	osd_staterÿ   rä   r!  Úflatten_osdÚosd_mappingÚflatten_local_osdÚlocal_osd_mappingÚ	optim_keyr?   )rà   rn   r  r@   Ú_load_optim_state_dictV  s€   
ÿÿ
ÿÿ
€þ
ÿÿ€	
€ÿ²r.  c                C   sX   t ƒ  t| tƒ d||d}t| |ƒ}t|i |ƒ |W  d  ƒ S 1 s%w   Y  dS )aH  
    Return the model state_dict of ``model``.

    See ``get_state_dict`` for the detail usage.

    Args:
        model (nn.Module): the nn.Module to the model.
        submodules (deprecated): Optional[Set[nn.Module]]: only return the model parameters
            that belong to the submodules.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be returned. See
            `StateDictOptions` for the details.

    Returns:
        The state_dict for ``model``.

    :rtype: typing.Dict[str, ValueType]
    F©r    r   rž   N)rA   r¼   rÎ   rß   rÅ   )rc   r   rž   r¿   r½   r?   r?   r@   r.   °  s   û
$ör.   c                C   st   t ƒ - t|tjjƒr|fnt|ƒ}t| |d||d}t| ||ƒ}ti ||ƒ |W  d  ƒ S 1 s3w   Y  dS )aË  
    Return the combined state_dict for optimizers.

    See ``get_state_dict`` for the detail usage.

    Args:
        model (nn.Module): the nn.Module to the model.
        optimizers (Union[None, Optimizer, Iterable[Optimizer]]):
            The optimizers that are used to optimize ``model``.
        submodules (deprecated): Optional[Set[nn.Module]]: only return the model parameters
            that belong to the submodules.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be returned. See
            `StateDictOptions` for the details.

    Returns:
        The state_dict for ``optimizers``.

    :rtype: OptimizerStateType
    Tr/  N)	rA   rw   rZ   rê   Ú	OptimizerrÎ   r¼   r  rÅ   )rc   r  r   rž   r¿   r¾   r?   r?   r@   r/   Õ  s    ÿýû$ñr/   c                C   s‚   t ƒ 4 t|tjjƒr|fnt|ƒ}t| |d||d}t| |ƒ}t| ||ƒ}t	|||ƒ ||fW  d  ƒ S 1 s:w   Y  dS )aÖ  
    Return the model state_dict and optimizers state_dict.

    ``get_state_dict`` can process any module that is parallelized by PyTorch
    FSDP/fully_shard, DDP/replicate, tensor_parallel/parallelize_module, and any
    combination of these parallelisms. The main functions of ``get_state_dict``
    are: 1.) returning a model and optimizer state_dict that can be resharded
    with a different number of trainers and/or different parallelisms.
    2.) hiding the parallelism-specific state_dict APIs. Users don't have to call
    these APIs.
    3.) sanity checking the result state_dict.

    The keys of the result state dictionary are the canonical FQNs (Fully
    Qualified Names).  A canonical FQN refers to the FQN based on a parameter's
    position in an nn.Module hierarchy. More specifically, a canonical FQN to a
    parameter is the FQN returned by ``module.named_parameters()`` or
    ``module.named_buffers()`` when the module is not distributed by any
    parallelisms. Since the optimizer internally uses parameter IDs to represent
    a parameter, there will be a conversion from the parameter IDs to the
    canonical FQNs when calling this API.

    ``get_state_dict`` can also process a module that is not parallelized. In
    such a case, ``get_state_dict`` only performs one function -- converting the
    optimizer parameter IDs to the canonical FQNs.

    Example:
        >>> # xdoctest: +SKIP
        >>> import torch
        >>> from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
        >>> from torch.nn.parallel import DistributedDataParallel as DDP
        >>> from torch.distributed.checkpoint.state_dict import get_state_dict

        >>> fsdp_model = FSDP(copy.deepcopy(model))
        >>> fsdp_optim = torch.optim.Adam(model.parameters(), lr=1e-3)
        >>> ddp_model = DDP(copy.deepcopy(model))
        >>> ddp_optim = torch.optim.Adam(model.parameters(), lr=1e-3)


        >>> ddp_state_dict, ddp_optim_state_dict = get_state_dict(ddp_model, ddp_optim)
        >>> fsdp_state_dict, fsdp_optim_state_dict = get_state_dict(fsdp_model, fsdp_optim)

        >>> # if we simply call ddp_model.state_dict() and fsdp_model.state_dict(),
        >>> # the asserts will fail.
        >>> assert ddp_state_dict == fsdp_state_dict
        >>> assert ddp_optim_state == fsdp_optim_state_dict


    Args:
        model (nn.Module): the nn.Module to the model.
        optimizers (Union[None, Optimizer, Iterable[Optimizer]]):
            The optimizers that are used to optimize ``model``.
        submodules (deprecated): Optional[Set[nn.Module]]: only return the model parameters
            that belong to the submodules.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be returned. See
            `StateDictOptions` for the details.

    Returns:
        ``Tuple`` that contain model state_dict and optimizer state_dict.

    :rtype: typing.Tuple[typing.Dict[str, ValueType], OptimizerStateType]
    Fr/  N)
rA   rw   rZ   rê   r0  rÎ   r¼   rß   r  rÅ   )rc   r  r   rž   r¿   r½   r¾   r?   r?   r@   r0     s"   Fÿýû
$ðr0   c           	         sÜ   |si S t tt| ¡ ƒƒtjƒret dt¡ t	t
tjt
ttf f |ƒ}i }| ¡ D ]8\}}|  ¡ D ]/\}}||kr;q2t| |ƒ}t|ƒdksJJ dƒ‚tt|ƒƒ› d‰ | ‡ fdd„| ¡ D ƒ¡ q2q*|S t	t
ttf |ƒS )NzÑPassing model_state_dict as a ``Dict[nn.Module, Dict[str, Any]]``is deprecated and will be removed in 2.5. If you need this feature, please preprocessing the model_state_dict to achieve the same functionality.rk   z/FQNs for a submodule should only have 1 elementri   c                    s   i | ]	\}}ˆ | |“qS r?   r?   )rm   Úsubfqnrå   ro   r?   r@   Ú
<dictcomp>u  s    z/_unflatten_model_state_dict.<locals>.<dictcomp>)rw   rÖ   r×   rÂ   r`   ra   r¨   r«   r¬   r   r	   rY   r)   r°   r±   rŒ   r{   r®   )	rc   rÊ   Úcast_state_dictrÝ   r—   Úsub_state_dictrd   Úmr¹   r?   ro   r@   Ú_unflatten_model_state_dict[  s,   û
ÿù
r6  )rž   c                C   s^   t | |ƒ}tƒ  t| tƒ d|d}t|i |ƒ t| ||ƒW  d  ƒ S 1 s(w   Y  dS )a=  Load the model state_dict.

    The counterpart of ``get_model_state_dict`` to set the state_dict to the
    model. See ``set_state_dict`` for the detail usage.

    Args:
        model (nn.Module): the nn.Module to the model.
        model_state_dict: (Dict[str, ValueType]):
           the model state_dict to load. If the key of the ``model_state_dict``
           is nn.Module, the key is a submodule of ``model`` and the value should
           be the state_dict of the submodule. When loading the state_dict,
           the prefix of the submodule will be append to the state_dict.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be loaded. See
            `StateDictOptions` for the details.

    Returns:
        ``NamedTuple`` with ``missing_keys`` and ``unexpected_keys`` fields:
            * **missing_keys** is a list of str containing the missing keys
            * **unexpected_keys** is a list of str containing the unexpected keys

    :type model_state_dict: typing.Dict[str, ValueType]
    F©r    rž   N)r6  rA   r¼   rÎ   rÅ   ré   )rc   r½   rž   r¿   r?   r?   r@   r1   |  s   ÿ
$ür1   c                C   sr   t ƒ , t|tjjƒr|fnt|ƒ}t| |d|d}ti ||ƒ t| |||ƒ W d  ƒ dS 1 s2w   Y  dS )aÚ  Load the optimizers state_dict.

    The counterpart of ``get_optimizer_state_dict`` to set the state_dict to the
    optimizers. See ``set_state_dict`` for the detail usage.

    Args:
        model (nn.Module): the nn.Module to the model.
        optimizers (Union[Optimizer, Iterable[Optimizer]]):
            The optimizers that are used to optimize ``model``.
        optim_state_dict: OptimizerStateType:
            the optimizer state_dict to load.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be loaded. See
            `StateDictOptions` for the details.

    Returns:
        None

    :type optim_state_dict: typing.OptimizerStateType
    Tr7  N)	rA   rw   rZ   rê   r0  rÎ   r¼   rÅ   r.  )rc   r  r¾   rž   r¿   r?   r?   r@   r2   £  s   ÿý"÷r2   c                C   sˆ   t | |ƒ}tƒ 2 t|tjjƒr|fnt|ƒ}t| || |d}t|||ƒ t	| |||ƒ t
| ||ƒW  d  ƒ S 1 s=w   Y  dS )a4  Load the model state_dict and optimizers state_dict.

    The counterpart of ``get_state_dict`` to set the state_dict to the model and
    optimizers.  The given ``model_state_dict`` and ``optim_state_dict`` do not
    have to be returned by ``get_state_dict`` but must meet the following
    requirements: 1) all FQNs are canonical FQNs as defined in ``get_state_dict``,
    2) if a tensor is sharded, it must be either a ShardedTensor or DTensor,
    3) optimizer state_dict cannot contain the parameter IDs; the keys should be
    the canonical FQNs.

    Args:
        model (nn.Module): the nn.Module to the model.
        optimizers (Union[Optimizer, Iterable[Optimizer]]):
            The optimizers that are used to optimize ``model``.
        model_state_dict: (Union[Dict[nn.Module, Dict[str, ValueType]], Dict[str, ValueType]]):
           the model state_dict to load. If the key of the ``model_state_dict``
           is nn.Module, the key is a submodule of ``model`` and the value should
           be the state_dict of the submodule. When loading the state_dict,
           the prefix of the submodule will be append to the state_dict.
        optim_state_dict: OptimizerStateType:
            the optimizer state_dict to load.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be loaded. See
            `StateDictOptions` for the details.

    Returns:
        ``NamedTuple`` with ``missing_keys`` and ``unexpected_keys`` fields:
            * **missing_keys** is a list of str containing the missing keys of the model state_dict.
            * **unexpected_keys** is a list of str containing the unexpected keys of the model state_dict.

    :type model_state_dict: typing.Dict[str, ValueType]
    :type optim_state_dict: typing.OptimizerStateType
    r7  N)r6  rA   rw   rZ   rê   r0  rÎ   r¼   rÅ   r.  ré   )rc   r  r½   r¾   rž   r¿   r?   r?   r@   r3   Ê  s   *ÿÿý
ÿ
$ôr3   c                   sj   t jt| |d‰‡fdd„}|| _t jt| |d‰ dtttf f‡ fdd„}|| _t	 
|¡ t	 
|¡ dS )aó  Patch the ``state_dict`` and ``load_state_dict`` attributes of ``model``.

    Patch the ``state_dict`` and ``load_state_dict`` attributes of ``model`` to
    be a partial function to call ``get_state_dict`` and ``set_state_dict``.

    Example:
        from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
        from torch.distributed.checkpoint.state_dict import patch_model_state_dict

        model = fsdp(model)
        patch_model_state_dict(model)

    Args:
        model (nn.Module): the nn.Module to the model.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be loaded. See
            `StateDictOptions` for the details.
    Returns:
        None
    )rc   rž   c                      ó   ˆ ƒ S r9   r?   r?   ©Ú_state_dict_callr?   r@   Ústate_dict_call)  ó   z0_patch_model_state_dict.<locals>.state_dict_callrÊ   c                    ó   ˆ | d d S )N)r½   r?   r"  ©Ú_load_state_dict_callr?   r@   Úload_state_dict_call4  ó   z5_patch_model_state_dict.<locals>.load_state_dict_callN)r¶   r·   r.   rÊ   r1   r	   rY   r   rá   r8   r‘   )rc   rž   r;  r@  r?   ©r?  r:  r@   Ú_patch_model_state_dict  s    ýý
rC  c                   s”   t jt| ||d‰‡fdd„}t jt| ||d‰ dtttf f‡ fdd„}t |¡ t |¡ t	|t
jjƒr9|fnt|ƒ}|D ]}||_||_q?dS )a¤  Patch the ``state_dict`` and ``load_state_dict`` attributes of ``optimizers``.

    Patch the ``state_dict`` and ``load_state_dict`` attributes of ``optimizers`` to
    be a partial function to call ``get_state_dict`` and ``set_state_dict``.

    Note that if there are multiple optimizers, all of the optimizers will be patched.
    So users only need to call one of the state_dict() to get the full result.

    Example:
        from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
        from torch.distributed.checkpoint.state_dict import patch_model_state_dict

        model = fsdp(model)
        patch_model_state_dict(model)

    Args:
        model (nn.Module): the nn.Module to the model.
        options (StateDictOptions): the options to control how
            model state_dict and optimizer state_dict should be loaded. See
            `StateDictOptions` for the details.
    Returns:
        None
    )rc   r  rž   c                      r8  r9   r?   r?   r9  r?   r@   r;  e  r<  z4_patch_optimizer_state_dict.<locals>.state_dict_callrÊ   c                    r=  )N)r¾   r?   r"  r>  r?   r@   r@  o  rA  z9_patch_optimizer_state_dict.<locals>.load_state_dict_callN)r¶   r·   r/   r2   r	   rY   r   r8   r‘   rw   rZ   rê   r0  rÎ   rÊ   rá   )rc   r  rž   r;  r@  rê   r?   rB  r@   Ú_patch_optimizer_state_dict?  s0   üü

ÿýþrD  )TT)pr]   r¶   r:   r¨   Údataclassesr   r   r   Ú	itertoolsr   Útypingr   r   r   r	   r
   r   r   r   r   r   r   r   rZ   Útorch.distributedrÌ   rÀ   Útorch.nnr`   Ú'torch.distributed._shard.sharded_tensorr   Ú#torch.distributed._state_dict_utilsr   r   r   r   r   Útorch.distributed._tensorr   Ú;torch.distributed.algorithms._checkpoint.checkpoint_wrapperr   Útorch.distributed.fsdpr   r   r   rz   r   r   r   r    r!   Ú$torch.distributed.fsdp._common_utilsr"   r#   Útorch.nn.modules.moduler$   Útorch.nn.parallelr%   rx   Útorch.utils._pytreer&   Ú__all__r|   rý   rî   rü   rY   r'   r[   rö   r÷   r(   r)   r*   r+   r,   r\   r8   rN   rµ   rA   r-   rO   Ú	lru_cachera   rM   rŒ   r   rœ   rê   r0  r¼   rÅ   rÉ   rÏ   rß   ré   rõ   r   r  r  r  r.  r.   r/   r0   r6  r1   r2   r3   rC  rD  r?   r?   r?   r@   Ú<module>   sê  
8(
ÿ

.
üÿþýüû>$úÿþýûú
ù
ÿþý
ü$-
ÿÿ

þÿÿ

þCÿ
þý
ü.&@ÿ
þý
ü-ÿþý
ü?ÿþýü
ûHÿþýü
û]üÿýü

û)ûÿþüû
ú1ûÿþüû
úYÿ$þ

ý%üÿ
þü
û,ûÿþýû
ú-úÿþ
üûú
ù>ýÿýü6üÿýüû