o
    }oim                     @   sR  d dl Z d dlZd dlmZ d dlmZmZmZmZm	Z	m
Z
mZmZmZmZ d dlZd dlZd dlmZ d dlmZ d dlmZ edejdZed	ejdZed
edef dZeG dd dZG dd dZejg g fdeejef dedeeef de
e	eegef   de	defddZdd Z G dd dee Z!de	e dedej"fddZ#ed eeeedf eeef f d!eeeedf f deege!e f fd"d#Z$ed eeeedf eeef f d!eeeedf f d$ede!e fd%d#Z$	d)d eeeedf eeef f d!eeeedf f d$e
e fd&d#Z$G d'd( d(Z%dS )*    N)	dataclass)
AnyCallableDictGenericListOptionalTupleTypeVarUnionoverload)nn)extract_dtypes)loggingSourceModuleT)boundTargetModuleTF.c                   @   s6   e Zd ZU dZejed< eed< ejed< eed< dS )TransformCTXz Transform Data class Definition.sourcesource_statetargettarget_stateN)__name__
__module____qualname____doc__r   Module__annotations__dict r    r    K/home/ubuntu/.local/lib/python3.10/site-packages/nemo/lightning/io/state.pyr       s   
 

r   c                   @   s*   e Zd ZdZd	ddZdd Zdd ZdS )
_ModelStatezc
    Helper class for used for to modify state dict of a source model during model conversion.
    Nc                 C   s   || _ || _d S N)_state_dictconfig)self
state_dictr%   r    r    r!   __init__/   s   
z_ModelState.__init__c                 C   s   | j S r#   )r$   )r&   r    r    r!   r'   3   s   z_ModelState.state_dictc              
   C   sR   | j  D ]!\}}|j|krtd| d|j d| d ||| j |< qd S )NzConverting z from z (source model) to z (target model))r$   itemsdtyper   warningto)r&   r*   kvr    r    r!   r,   7   s
   
 z_ModelState.tor#   )r   r   r   r   r(   r'   r,   r    r    r    r!   r"   *   s
    
r"   r   r   mapping
transformsstate_dict_ignored_entriesreturnc              	      s  ddl m} | }t| drt| j|r| j}|}t|dr&t|j|r&|j}t| }| }	t|| ||	d}
|	 D ]\}}t
d| d|  t|||
}
q>|D ]}t
d|j d|j  ||
}
qWi }| D ]:\}}||	v r|	| }|jj|jkrtd| d	|j d
|j tj||jd||< |	| qqtd| d qq|	 D ]+\}}||}}d|v r|ddd D ]}t||}q|dd }||| qi }| D ]3\}}||	v r|j|	| jkrtd| d|j d|	| j tj|	| dd||< |	| q|	 D ].\}}||}}d|v rA|ddd D ]}t||}q1|dd }||| qttdd |	 } fdd|D }t|dkrlt d| d	 g }| D ]\}}|j!r|"| qs|rJ | d|t| ksJ ddd |	 D  dd d t| 	 D  t|drt|j|r||_|S |S )!a}  
    Applies a series of transformations to adapt the state dictionary of a source module to
    match the structure of a target module's state dictionary.

    This function renames keys according to a provided mapping and modifies values using a list
    of transformation functions. Each transformation function typically is decorated
    with `io.state_transform`.

    Args:
        source (nn.Module): The source module from which parameters and buffers are taken.
        target (TargetModuleT): The target module to which parameters and buffers are adapted.
        mapping (Dict[str, str]): Key-value pairs where each key from the source state dictionary
            is mapped to a corresponding key in the target state dictionary.
        transforms (Optional[List[Callable[[TransformCTX], TransformCTX]]]): A list of functions
            that modify the `TransformCTX` object. If None, no transformations beyond key renaming
            are applied. Defaults to None.
        state_dict_ignored_entries: List of entries to ignore in _target.state_dict(). There are cases
            where multiple entries in model's state_dict point to one entry in model's named_parameter.
            E.g., model has multiple pointers pointing to one shared parameters (`encoder.embed_tokens.weight`,
            `decoder.embed_tokens.weight` and `shared.weight` all points to `shared.weight
            in T5 Huggingface implementation.). In these cases, ignore redundant entries.

    Returns
    -------
        TargetModuleT: The modified target module with its state dictionary adjusted according to
        the specified mappings and transformations.

    Raises
    ------
        ValueError: If there's a mismatch in shape between corresponding source and target parameters
            or buffers.
        RuntimeError: If the target state dictionary contains keys that are not present in the source
            state dictionary after all transformations.

    Examples
    --------
        >>> source_module = nn.Linear(10, 5)
        >>> target_module = nn.Linear(10, 5)
        >>> mapping = {'weight': 'weights', 'bias': 'biases'}
        @io.state_transform(
            source_key="weight",
            target_key="weights"
        )
        def scale_weights(ctx):
            ctx.target_state['weights'] = ctx.source_state['weight'] * 2
            return ctx
        >>> transformed_target = apply_transforms(
        ...     source_module, target_module, mapping, [scale_weights]
        ... )
        >>> print(transformed_target.state_dict()['weights'])

    See Also
    --------
        - `TransformCTX`: For more details on the context object used in transformations.
        - `StateDictTransform`: For creating complex transformations.

    Note:
        This function is particularly useful when adapting models from different frameworks or
        when consolidating models with different architectural changes.
    r   )MegatronModulemodule)r   r   r   r   zMapping z -> zTransforming zShape mismatch for parameter z: target shape z vs converted source shape )requires_gradzUnexpected key: z  not in checkpoint but in model..NzShape mismatch for buffer z:  vs Fc                 S   s   | d uo	|  d S )N_extra_state)endswithxr    r    r!   <lambda>       z"apply_transforms.<locals>.<lambda>c                    s   g | ]}| vr|qS r    r    .0keyr1   r    r!   
<listcomp>       z$apply_transforms.<locals>.<listcomp>zAdditional keys: z  in checkpoint but not in model.z
There are meta tensors in the model after conversion.Did you forget to include these parameters in the mapping or transforms in `convert_state`?zCdtype mismatch between source and target state dicts. Left side is c                 S       i | ]\}}|t jkr||qS r    torchbfloat16r@   r-   r.   r    r    r!   
<dictcomp>        z$apply_transforms.<locals>.<dictcomp>z, Right side is c                 S   rE   r    rF   rI   r    r    r!   rJ      rK   )# megatron.core.transformer.moduler3   hasattr
isinstancer4   r   named_parametersr'   r   r)   r   debugStateDictTransform
source_key
target_keydatashape
ValueErrorr   	Parameterr5   popprintsplitgetattrregister_parameternamed_buffersregister_bufferlistfilterkeyslenRuntimeErroris_metaappend)r   r   r/   r0   r1   r3   _source_targettarget_orig_dtypesr   ctxrA   val	transform_paramsnameparamtarget_param_module_keypart_buffersbufferra   meta_tensor_keysr    rB   r!   apply_transforms?   s   D


"



rv   c                 C   s   | S r#   r    )inpr    r    r!   _default_transform   s   rx   c                	   @   st   e Zd ZdZefdeeeedf eeef f deeeedf f de	fddZ
ded	efd
dZdefddZdS )rQ   a  
    A transformation class for state dictionaries, allowing for flexible key matching and
    transformation of values between source and target state dictionaries.

    Attributes
    ----------
        source_key: A string, tuple of strings, or a dictionary specifying the keys in the source
            state dictionary to match. Wildcards (*) are supported.
        target_key: A string or tuple of strings specifying the keys in the target state dictionary
            to match. Wildcards (*) are supported.
        transform: A callable that performs the transformation on matched keys' values.

    Examples
    --------
        >>> def example_transform(ctx, *args):
        ...     return sum(args)
        >>> transform = StateDictTransform(
        ...     source_key="model.layers.*.self_attn.*_proj.weight",
        ...     target_key="decoder.layers.*.self_attention.linear_qkv.weight",
        ...     transform=example_transform
        ... )
    rR   .rS   rk   c                 C   s   || _ || _|| _d S r#   )rR   rS   rk   )r&   rR   rS   rk   r    r    r!   r(   
  s   
zStateDictTransform.__init__ri   r2   c                    s.  | j | j}|j|j }tt| jj}|	dd  t
ttfrt
tr4fddt|D }n} fdd| D tt| |}ttfdd|}fdd|D }|jd	kre|n| gg}t||  D ]<}	t
|	d	 trd
d |	D }	t|	 D ]}
| j|fi tt| fdd|
d d D ||
d < qtd|	 qq|S t  }t| t|}|jdkr|td krtd t
|trt|}|jdkr|td krtd| nt
|trtdfdd|D }tj|dd}|j|jk}tdd t| jj D }|rt |D ]\}}z|| }W n t!yM } zt"d|d| |d }~ww |rf fdd|D }| j|g|R  ||< n7t
|tro|gnt|}t#|t#|krtd| d|  fddt||D }| j|fi |||< td|d| q)|S t |D ]c\}}|| }t$|rĈ | gn fdd|D }|r| j|g|R  }ndd t||D }| j|fi |}t
|tr|||< nt|D ]\}}|||| < qtd |d| q|S )!Nri   c                    s   i | ]	\}}| | qS r    r    )r@   irn   )rR   r    r!   rJ         z/StateDictTransform.__call__.<locals>.<dictcomp>c                    s$   i | ]\}}|t t  |qS r    )_match_keysr_   ra   rI   source_dictr    r!   rJ   !  s   $ c                    s   |  v S r#   r    r;   source_matches_dictr    r!   r=   #      z-StateDictTransform.__call__.<locals>.<lambda>c                    s0   g | ]} | j d kr | n |  gqS )r   )ndimitem)r@   r.   r~   r    r!   rC   $  s    "z/StateDictTransform.__call__.<locals>.<listcomp>r   c                 S   s   g | ]}|gqS r    r    r@   r<   r    r    r!   rC   ,  s    c                       g | ]} | qS r    r    r   r|   r    r!   rC   /  r>   r7   z'Matched (transform)! layer_names_group=   z!No matches found for source key: z!No matches found for target key: z2Target key must be a string or a tuple of strings.c                    s   g | ]}t  |qS r    )r{   r?   )target_keysr    r!   rC   A      )axisc                 s       | ]	}|j |jkV  qd S r#   kindVAR_POSITIONALr@   rn   r    r    r!   	<genexpr>F  s    
z.StateDictTransform.__call__.<locals>.<genexpr>z7Enountered IndexError during transform.
source_matches=z
target_matches=c                    r   r    r    r@   r-   r|   r    r!   rC   R  r>   z)Mismatch between source and target keys: r8   c                    s   i | ]	\}}| | qS r    r    )r@   rn   r-   r|   r    r!   rJ   [  rz   z%Matched (multi source)! target_match=z source_match=c                    r   r    r    r   r|   r    r!   rC   d  r>   c                 S   s   i | ]\}}||qS r    r    )r@   rn   rj   r    r    r!   rJ   i  r   z&Matched (single source)! target_match=)%rR   rS   r   r   r   inspect	signaturerk   
parametersrX   rN   tuple	enumerater)   r{   r_   ra   r`   r   r   zipstrcall_transformr   rP   sizenparrayrV   stackanyvaluesndenumerate
IndexErrorerrorrb   isscalar)r&   ri   rS   target_dict	fn_paramssource_key_dicttarget_matchesparam_namessource_matcheslayer_names_grouplayer_namessource_keys_matchesmultiple_sourcesaccepts_var_argstarget_indextarget_matchsource_matchesource_values_source_match_listkwargssource_indexoutputsry   tr    )r}   rR   r   r   r!   __call__  s   

"B




zStateDictTransform.__call__c                 O   s   t | jj}tdd |D }t|t| }tdd | D }|s4||kr4td| d| dd|v rD| j|g|R i |S | j|i |S )	z4Perform transform and check if the given args valid.c                 S   s   g | ]}|d vr|qS ))r&   ri   r    )r@   pr    r    r!   rC   x  rD   z5StateDictTransform.call_transform.<locals>.<listcomp>c                 s   r   r#   r   r   r    r    r!   r   z  s    z4StateDictTransform.call_transform.<locals>.<genexpr>z	Expected z4 arguments for the transformation function, but got r6   ri   )r   r   rk   r   rb   r   r   rV   )r&   ri   argsr   func_paramsexpected_num_argsprovided_num_argsr   r    r    r!   r   u  s   z!StateDictTransform.call_transformN)r   r   r   r   rx   r   r   r	   r   r   r(   r   r   r   r    r    r    r!   rQ      s    

arQ   ra   patternc                    s  d}d}g }|t |k rS|||d  dkr$|d7 }|d |d7 }n)|| dkr8|d7 }|d |d7 }n|| d	krC|d
7 }n||| 7 }|d7 }|t |k std| d }t |}dd t|D  tdd | D ]!}||}|rt| D ]\}}	|	 | vr | |	 qqptt  D ]} | j	dd d qdd  D }
t  dkrdg}
t
j|
td}tdd | D ]}||}|r߇ fddt| D }||t|< q|S )N r      z**z(.+)*z([^.]+)r   r6   z\.^$c                 S   s   g | ]}g qS r    r    )r@   _r    r    r!   rC     s    z_match_keys.<locals>.<listcomp>c                 S      | d uS r#   r    r;   r    r    r!   r=     r   z_match_keys.<locals>.<lambda>c                 S   s   |   rt| S | S r#   )isdigitintr;   r    r    r!   r=     r>   )rA   c                 S   s   g | ]}t |qS r    )rb   )r@   matchesr    r    r!   rC     r>   )r*   c                 S   r   r#   r    r;   r    r    r!   r=     r   c                    s   g | ]\}} |  |qS r    )index)r@   ry   groupwildcard_matchesr    r!   rC     s    )rb   re   recompileranger`   matchr   groupssortr   emptyobjectr   )ra   r   escaped_patternry   wildcard_positionsregex_patternnum_wildcardsrA   r   r   rU   output_arrayindicesr    r   r!   r{     sP   






r{   rR   rS   c                 C      d S r#   r    rR   rS   r    r    r!   state_transform  s   r   fnc                 C   r   r#   r    )rR   rS   r   r    r    r!   r     s   c                    s(   dt f fdd}|du r|S ||S )a  
    A decorator for creating StateDictTransform instances with specified source and target keys,
    and a transformation function. This allows for concise definition of state dictionary
    transformations.

    Args:
        source_key: A string, tuple of strings, or a dictionary specifying the keys in the source
            state dictionary to match. Wildcards (*) are supported.
        target_key: A string or tuple of strings specifying the keys in the target state dictionary
            to match. Wildcards (*) are supported.
        fn: An optional callable that performs the transformation on matched keys' values. If not
            provided, the decorator can be used to wrap a function definition.

    Returns
    -------
        A StateDictTransform instance if `fn` is provided, otherwise returns a decorator that
        takes a function and returns a StateDictTransform instance.

    Examples
    --------
        >>> @state_transform(
        ...     source_key="model.layers.*.self_attn.*_proj.weight",
        ...     target_key="decoder.layers.*.self_attention.linear_qkv.weight"
        ... )
        ... def sum_transform(ctx, *args):
        ...     return sum(args)
    r2   c                    s   t  | S r#   rQ   )r   r   r    r!   wrapper  s   z state_transform.<locals>.wrapperNr   )rR   rS   r   r   r    r   r!   r     s   !c                	   @   s   e Zd ZdZededejfddZededejfddZ	eded	ejd
ejdejfddZ
ededejdejdejfddZedejdejfddZedejfddZedejfddZedejfddZededejfd d!Zd"S )#TransformFnszM
    A collection of common functions used in state dict transformation.
    ri   
linear_qkvc                    s   | j j}|j}|j}||  |j}|d|  }|||dg}|d}t fddt	|D }t
 | d }	t
 d | d }
|| d| }||	 d| }||
 d| }|||fS )z{
        Split interleave-concatenated qkv to q, k, v

        Example: export layer linear_qkv to HF {q|k|v}_proj
        r   r7   c                    ,   g | ]}t  d  |  d  |   qS r   rG   aranger@   ry   heads_per_groupr    r!   rC         z*TransformFns.split_qkv.<locals>.<listcomp>r   )r   r%   num_attention_headsnum_query_groupskv_channelsreshaper   rG   catr   r   cpu)ri   r   megatron_confighead_numr   	head_sizeqkv_total_dimhidden_sizeq_slicek_slicev_sliceq_projk_projv_projr    r   r!   	split_qkv  s&   


zTransformFns.split_qkvqkv_biasc                    s   | j j}|j}|j}||  |j}|d|  }|||g}t fddt|D }t	 | d }t	 d | d }	|| d
 }
|| d
 }||	 d
 }|
||fS )z
        Split interleave-concatenated qkv bias to separate q, k, v bias

        Example: export layer linear_qkv bias to HF {q|k|v}_proj bias
        r   c                    r   r   r   r   r   r    r!   rC   .  r   z/TransformFns.split_qkv_bias.<locals>.<listcomp>r   r7   )r   r%   r   r   r   r   rG   r   r   r   r   )ri   r   r   r   r   r   r   r   r   r   q_biask_biasv_biasr    r   r!   split_qkv_bias  s$   

zTransformFns.split_qkv_biasqr-   r.   c                 C   s  | j j}|j}|j}|| }|j}|j}	| }
||	f|
dd  }||	f|
dd  }|j| }|j| }|j| }g }t|D ]<}|	||| |d | ddddf  |	|||d ddddf  |	|||d ddddf  qAt
|}|jdksJ |j|jd |d | ksJ |j|jd |	ksJ |j|jd |
d ksJ |j||	|d|   |g}|S )z|
        Merge q, k, v to interleave-concatenated qkv.

        Example: import HF {q|k|v}_proj to layer linear_qkv
        r   N   r   r   )r   r%   r   r   r   r   r   viewr   re   rG   r   r   rU   r   )ri   r  r-   r.   r   r   r   r   r   r   old_tensor_shapenew_q_tensor_shapenew_kv_tensor_shapeqkv_weights_lry   qkv_weightsr    r    r!   	merge_qkv<  s0   


,$&
 zTransformFns.merge_qkvqbkbvbc                 C   s   | j j}|j}|j}|| }|j}||f}	||f}
|j|	 }|j|
 }|j|
 }td|f|}t	|D ]9}t
|||| |d | ddf f}t
||||d ddf f}t
||||d ddf f}q6|||d|   g}|S )z
        Merge q, k, v bias to interleave-concatenated qkv bias.

        Example: import HF {q|k|v}_proj bias to layer linear_qkv bias
        r   r   Nr   )r   r%   r   r   r   r  rG   r   type_asr   r   r   )ri   r  r  r  r   r   r   r   r   r  r  r   ry   r    r    r!   merge_qkv_biasa  s(   


*"$zTransformFns.merge_qkv_biasgateupc                 C   s   t j| |fddS )z}
        Merge gate and up proj into concatenated fc1

        Example: import HF {gate|up}_proj to layer linear_fc1
        r   dim)rG   r   )r  r  r    r    r!   	merge_fc1  s   zTransformFns.merge_fc1
linear_fc1c                 C   s   t j| ddd\}}||fS )z{
        Split concatenated fc1 to gate and up proj

        Example: export layer linear_fc1 to HF {gate|up}_proj
        r   r   r  )rG   chunk)r  	gate_projup_projr    r    r!   	split_fc1  s   zTransformFns.split_fc1rn   c                 C   s   | | fS )z
        Duplicate the source parameter to two target parameters

        Example: export Performant LoRA linear_fc1.adapter.linear_in to HF {gate|up}_proj.lora_A
        r    rn   r    r    r!   
duplicate2  s   zTransformFns.duplicate2c                 C   s
   | | | fS )z
        Duplicate the source parameter to three target parameters

        Example: export Performant LoRA linear_qkv.adapter.linear_in to HF {q|k|v}_proj.lora_A
        r    r  r    r    r!   
duplicate3  s   
zTransformFns.duplicate3	embeddingc                 C   s   | j j}|d|jddf S )z
        Prune the embedding size to vocab size

        Example: export embedding/output layer to HF with non-padded vocab size
        N)r   r%   
vocab_size)ri   r  r   r    r    r!   prune_padding  s   zTransformFns.prune_paddingN)r   r   r   r   staticmethodr   rG   Tensorr   r  r
  r  r  r  r  r  r  r    r    r    r!   r     s(    ""$" 	r   r#   )&r   r   dataclassesr   typingr   r   r   r   r   r   r	   r
   r   r   numpyr   rG   r   nemo.lightning.pytorch.utilsr   
nemo.utilsr   r   r   r   r   r   r"   no_gradr   rv   rx   rQ   ndarrayr{   r   r   r    r    r    r!   <module>   s~   0	
 / 7
*