# Utilities for building LoRA adapters on top of Megatron-style parallel linear layers.
# (Source recovered from a byte-compiled copy of the module; details are reconstructed best-effort.)

import math
import re
from importlib.metadata import version
from typing import Optional

import packaging.version
import torch
from megatron.core import ModelParallelConfig, parallel_state
from megatron.core.dist_checkpointing.mapping import ShardedStateDict
from megatron.core.tensor_parallel import ColumnParallelLinear, RowParallelLinear
from megatron.core.tensor_parallel.mappings import (
    gather_from_sequence_parallel_region,
    scatter_to_sequence_parallel_region,
)
from megatron.core.transformer.mlp import apply_swiglu_sharded_factory
from torch import nn

from nemo.collections.common.parts.adapter_modules import AdapterModuleUtil
from nemo.collections.common.parts.utils import activation_registry
from nemo.core.classes.mixins import adapter_mixin_strategies
from nemo.utils.import_utils import safe_import_from

# Transformer Engine and Apex layers are optional dependencies; import them defensively.
TEColumnParallelLinear, HAVE_TE_COL_LINEAR = safe_import_from(
    "megatron.core.extensions.transformer_engine", "TEColumnParallelLinear"
)
TELayerNormColumnParallelLinear, HAVE_TE_LN_COL_LINEAR = safe_import_from(
    "megatron.core.extensions.transformer_engine", "TELayerNormColumnParallelLinear"
)
TEColumnParallelGroupedLinear, HAVE_TE_COL_GRP_LINEAR = safe_import_from(
    "megatron.core.extensions.transformer_engine", "TEColumnParallelGroupedLinear"
)
TERowParallelLinear, HAVE_TE_ROW_LINEAR = safe_import_from(
    "megatron.core.extensions.transformer_engine", "TERowParallelLinear"
)
TERowParallelGroupedLinear, HAVE_TE_ROW_GRP_LINEAR = safe_import_from(
    "megatron.core.extensions.transformer_engine", "TERowParallelGroupedLinear"
)
TELinear, HAVE_TE_LINEAR = safe_import_from(
    "megatron.core.extensions.transformer_engine", "TELinear"
)
HAVE_TE = all(
    (
        HAVE_TE_COL_LINEAR,
        HAVE_TE_LN_COL_LINEAR,
        HAVE_TE_ROW_LINEAR,
        HAVE_TE_LINEAR,
        HAVE_TE_COL_GRP_LINEAR,
        HAVE_TE_ROW_GRP_LINEAR,
    )
)
MixedFusedLayerNorm, HAVE_APEX = safe_import_from(
    "apex.normalization.fused_layer_norm", "MixedFusedLayerNorm"
)

# Column- and row-parallel TE layer families used for dispatch below.
TECL = (TEColumnParallelLinear, TELayerNormColumnParallelLinear, TEColumnParallelGroupedLinear)
TERL = (TERowParallelLinear, TERowParallelGroupedLinear)


def get_adapter_attributes_from_linear(m: nn.Module):
    """
    Return input_is_parallel, in_features, out_feature attributes based on implementation of the base layer.
    """
    disable_sequence_parallel_comm = not m.config.sequence_parallel
    base_linear_is_parallel = True

    if HAVE_TE and any(isinstance(m, te_column_parallel) for te_column_parallel in TECL):
        input_is_parallel = False
        # m.in_features and m.out_features are already divided by tp_size,
        # but the values passed to the adapter are not.
        tp_size = parallel_state.get_tensor_model_parallel_world_size()
        in_features = m.in_features
        out_features = m.out_features * tp_size

        if isinstance(m, TELayerNormColumnParallelLinear):
            # LoRA is applied after the layernorm, so the layernorm output must be returned.
            m.return_layernorm_output = True
            # Perf optimization for LoRA + sequence parallelism.
            if hasattr(m, "ub_overlap_ag"):
                ub_overlap_ag = m.ub_overlap_ag
            elif hasattr(m, "ub_overlap_ag_fprop"):
                ub_overlap_ag = m.ub_overlap_ag_fprop
            else:
                ub_overlap_ag = False
            if m.config.sequence_parallel and not ub_overlap_ag:
                m.return_layernorm_output_gathered = True
                te_version = packaging.version.Version(version("transformer-engine"))
                if te_version >= packaging.version.Version("1.5.0dev") and (
                    not getattr(m.config, "tp_comm_overlap", False)
                    or getattr(m.config, "tp_comm_overlap_disable_qkv", False)
                ):
                    # The layernorm output is already gathered over the sequence dimension,
                    # so the adapter does not need its own sequence-parallel communication.
                    disable_sequence_parallel_comm = True
    elif HAVE_TE and any(isinstance(m, te_row_parallel) for te_row_parallel in TERL):
        input_is_parallel = True
        tp_size = parallel_state.get_tensor_model_parallel_world_size()
        in_features = m.in_features * tp_size
        out_features = m.out_features
    elif HAVE_TE and isinstance(m, TELinear):
        input_is_parallel = False
        in_features = m.in_features
        out_features = m.out_features
        base_linear_is_parallel = False
    elif isinstance(m, ColumnParallelLinear):
        input_is_parallel = False
        in_features = m.input_size
        out_features = m.output_size
    elif isinstance(m, RowParallelLinear):
        input_is_parallel = True
        in_features = m.input_size
        out_features = m.output_size
    else:
        raise NotImplementedError(f"Layer type is unrecognized for LoRA: {type(m)}")

    return input_is_parallel, in_features, out_features, disable_sequence_parallel_comm, base_linear_is_parallel
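
# Illustrative sketch (not part of the original module): how the attributes returned above are
# typically fed into ParallelLinearAdapter (defined later in this file) when wiring LoRA onto a
# base parallel linear layer. ``base_layer`` and the chosen ``dim`` are hypothetical placeholders.
#
#   input_is_parallel, in_features, out_features, disable_sp, base_is_parallel = (
#       get_adapter_attributes_from_linear(base_layer)
#   )
#   adapter = ParallelLinearAdapter(
#       in_features,
#       out_features,
#       dim=16,
#       base_linear_name="linear_qkv",
#       input_is_parallel=input_is_parallel,
#       model_parallel_config=base_layer.config,
#       disable_sequence_parallel_comm=disable_sp,
#       base_linear_is_parallel=base_is_parallel,
#   )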


def is_expert_linear(fqn):
    """
    Return whether the current base module is an expert linear module.
    See ParallelLinearAdapter.is_expert for usage details.
    """
    return re.match(r'.*mlp\..*experts.*\.linear_fc[1-2]$', fqn) is not None
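
# Illustrative examples (hypothetical module names, not taken from this file):
#   is_expert_linear("decoder.layers.3.mlp.experts.local_experts.0.linear_fc1")  -> True
#   is_expert_linear("decoder.layers.3.mlp.linear_fc1")                          -> False (dense MLP, no experts)
#   is_expert_linear("decoder.layers.3.self_attention.linear_qkv")               -> False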


def wildcard_match(pattern, key):
    """
    Return whether the pattern (target module to add LoRA) matches the key (model weight name).

    Example:
    --------
        >>> wildcard_match("*.layers.0.*.linear_qkv", "decoder.layers.0.self_attention.linear_qkv")
        True
        >>> wildcard_match("*.layers.0.*.linear_qkv", "decoder.layers.1.self_attention.linear_qkv")
        False
    """
    if key is None:
        return None
    regex_pattern = re.compile("^" + pattern.replace("*", "(.*)") + "$")
    match = regex_pattern.match(key)
    return match is not None


def init_method_normal(sigma):
    """Init method based on N(0, sigma)."""

    def init_(tensor):
        return nn.init.normal_(tensor, mean=0.0, std=sigma)

    return init_


def init_method_kaiming_uniform(val):
    """Init method based on kaiming uniform with negative-slope parameter `val`."""

    def init_(tensor):
        return nn.init.kaiming_uniform_(tensor, a=val)

    return init_


def init_method_const(val):
    """Init method that fills the tensor with the constant `val`."""

    def init_(tensor):
        return nn.init.constant_(tensor, val)

    return init_


def pad_seq_to_mult(x, mult):
    """Pad the first (sequence) dimension of `x` up to a multiple of `mult`; return (x, pad_len)."""
    if x.shape[0] % mult == 0:
        return x, 0
    pad_len = mult - (x.shape[0] % mult)
    with torch.no_grad():
        # Pad at the end of the sequence dimension.
        x = torch.nn.functional.pad(x, (0, 0, 0, pad_len))
    return x, pad_len


def unpad_seq_to_mult(x, pad_len):
    """Remove the `pad_len` trailing positions that were added by `pad_seq_to_mult`."""
    if pad_len <= 0:
        return x
    with torch.no_grad():
        return x[:-pad_len, :]


class _All2AllHp2Sp(torch.autograd.Function):
    """
    All-2-All from Hidden Parallel to Sequence Parallel
    This is a temporary workaround and can be updated in the future
    TODO: Move the functionality to MCore
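
    In other words (assuming the usual [seq, batch, hidden] activation layout): the forward pass
    takes a tensor whose hidden dimension is split across tensor-parallel ranks and returns one
    whose sequence dimension is split instead, using a single all-to-all over the TP group.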
    """

    @staticmethod
    def forward(ctx, input_):
        world_size = parallel_state.get_tensor_model_parallel_world_size()
        group = parallel_state.get_tensor_model_parallel_group()
        send_list = list(input_.chunk(world_size, dim=0))
        send_list = [tensor.contiguous() for tensor in send_list]
        receive_list = [torch.empty_like(send_list[0]) for _ in range(world_size)]
        torch.distributed.all_to_all(receive_list, send_list, group=group)
        x = torch.cat(receive_list, dim=-1)
        return x

    @staticmethod
    def backward(ctx, grad_output):
        world_size = parallel_state.get_tensor_model_parallel_world_size()
        group = parallel_state.get_tensor_model_parallel_group()
        send_list = list(grad_output.chunk(world_size, dim=-1))
        send_list = [tensor.contiguous() for tensor in send_list]
        receive_list = [torch.empty_like(send_list[0]) for _ in range(world_size)]
        torch.distributed.all_to_all(receive_list, send_list, group=group)
        x = torch.cat(receive_list, dim=0)
        return x


def all2all_hp2sp(input_):
    """Autograd-friendly wrapper around the hidden-parallel -> sequence-parallel all-to-all."""
    return _All2AllHp2Sp.apply(input_)


class ParallelLinearAdapter(nn.Module, AdapterModuleUtil):
    """Low-rank (LoRA-style) adapter built from a pair of Megatron parallel linear layers."""

    def __init__(
        self,
        in_features: int,
        out_features: int,
        dim: int,
        base_linear_name: str,
        activation: str = 'swish',
        column_init_method: str = 'xavier',
        row_init_method: str = 'zero',
        input_is_parallel: bool = False,
        dropout: float = 0.0,
        model_parallel_config: Optional[ModelParallelConfig] = None,
        alpha: float | None = None,
        dropout_position: str = 'post',
        a2a_experimental: bool = False,
        is_expert: bool = False,
        disable_sequence_parallel_comm: bool = True,
        dropout_recompute: bool = False,
        base_linear_is_parallel: bool = True,
        **kwargs,
    ):
        super().__init__()
        self.base_linear_name = base_linear_name
        self.activation = activation_registry[activation]()
        self.dim = dim
        self.alpha = alpha if alpha is not None else self.dim
        self.input_is_parallel = input_is_parallel
        self.dropout_position = dropout_position
        self.use_a2a = a2a_experimental
        self.is_expert = is_expert

        # Fall back to a default config if none is provided.
        if model_parallel_config is None:
            model_parallel_config = ModelParallelConfig()
        _sequence_parallel = model_parallel_config.sequence_parallel
        # Sequence parallelism is handled explicitly around the adapter, not inside its linears.
        model_parallel_config.sequence_parallel = False
        self.config = model_parallel_config

        if input_is_parallel:
            self.linear_in = RowParallelLinear(
                in_features, dim, config=model_parallel_config, input_is_parallel=True,
                skip_bias_add=True, bias=False, init_method=self._get_init_fn(column_init_method),
            )
        else:
            self.linear_in = ColumnParallelLinear(
                in_features, dim, config=model_parallel_config, bias=False, gather_output=True,
                init_method=self._get_init_fn(column_init_method), disable_grad_reduce=_sequence_parallel,
            )

        # Gather the second linear's output unless the experimental all-to-all path
        # (or a non-parallel base layer) makes the gather unnecessary.
        lin_out_gather_output = True if input_is_parallel else False
        if self.use_a2a and input_is_parallel and _sequence_parallel:
            lin_out_gather_output = False
        if not base_linear_is_parallel:
            lin_out_gather_output = True

        self.linear_out = ColumnParallelLinear(
            dim, out_features, config=model_parallel_config, bias=False,
            gather_output=lin_out_gather_output, init_method=self._get_init_fn(row_init_method),
        )

        if dropout > 0.0:
            if dropout_recompute:
                import thunder

                self.dropout = thunder.jit(nn.Dropout(dropout))
            else:
                self.dropout = nn.Dropout(dropout)
        else:
            self.dropout = None

        # Cast adapter weights to the training precision of the base model.
        if model_parallel_config.bf16:
            self.bfloat16()
        elif model_parallel_config.fp16:
            self.half()

        # Set up the adapter strategy.
        self.setup_adapter_strategy(adapter_mixin_strategies.ReturnResultAdapterStrategy())

        # Revert the config change in case it is read elsewhere.
        model_parallel_config.sequence_parallel = _sequence_parallel
        self.disable_sequence_parallel_comm = disable_sequence_parallel_comm
        if not _sequence_parallel:
            self.disable_sequence_parallel_comm = True
        if not base_linear_is_parallel:
            self.disable_sequence_parallel_comm = True

    def _get_init_fn(self, init_method: str):
        if init_method == 'xavier':
            init_fn = nn.init.xavier_normal_
        elif init_method == 'normal':
            init_fn = init_method_normal(0.2)
        elif init_method == 'kaiming':
            init_fn = init_method_kaiming_uniform(math.sqrt(5))
        elif init_method == 'zero':
            init_fn = init_method_const(0.0)
        else:
            raise NotImplementedError("out_init_method should be zero, normal, kaiming or xavier")
        return init_fn

    def forward(self, x):
        if self.dropout is not None and self.dropout_position == 'pre':
            x = self.dropout(x)

        pad_len = 0
        if self.is_expert:
            x, pad_len = pad_seq_to_mult(x, self.config.tensor_model_parallel_size)

        if not self.disable_sequence_parallel_comm and not self.input_is_parallel and not self.is_expert:
            # Under sequence parallelism the input is scattered along the sequence dimension,
            # so gather it before the adapter's first linear layer.
            x = gather_from_sequence_parallel_region(x)

        if self.config.cpu_offloading and self.config.cpu_offloading_activations:
            x.activation_offloading = True

        x, _ = self.linear_in(x)  # parallel linears return (output, bias); the bias is ignored
        x = self.activation(x)

        if self.config.cpu_offloading and self.config.cpu_offloading_activations:
            x.activation_offloading = True

        x, _ = self.linear_out(x)

        if not self.disable_sequence_parallel_comm and self.input_is_parallel and not self.is_expert:
            # Return to the sequence-parallel layout expected by the next layer, either with the
            # experimental all-to-all (hidden parallel -> sequence parallel) or a plain scatter.
            if self.use_a2a:
                x = all2all_hp2sp(x)
            else:
                x = scatter_to_sequence_parallel_region(x)

        if self.dropout is not None and self.dropout_position == 'post':
            x = self.dropout(x)

        # LoRA scaling factor alpha / rank.
        x = x * (self.alpha / self.dim)

        if pad_len > 0:
            # Remove the MoE padding added above.
            x = unpad_seq_to_mult(x, pad_len)

        return x

    def sharded_state_dict(
        self,
        prefix: str = '',
        sharded_offsets: tuple = (),
        metadata: Optional[dict] = None,
    ) -> ShardedStateDict:
        """
        Sharded state dict for LoRA adapter. Special treatment is given to the linear_fc1 adapter
        since TP is sharded separately for the two logical matrices (gate and up)
        """
        sharded_state_dict = {}
        linear_in_sd = self.linear_in.sharded_state_dict(f"{prefix}linear_in.", sharded_offsets, metadata)
        linear_out_sd = self.linear_out.sharded_state_dict(f"{prefix}linear_out.", sharded_offsets, metadata)

        if 'linear_fc1' in self.base_linear_name:
            for k, v in linear_out_sd.items():
                if k in (f'{prefix}linear_out.weight', f'{prefix}linear_out.bias'):
                    linear_out_sd[k] = apply_swiglu_sharded_factory(v, sharded_offsets)

        sharded_state_dict.update(linear_in_sd)
        sharded_state_dict.update(linear_out_sd)
        return sharded_state_dict