o
    wiF                     @   s  d dl mZ d dlmZ d dlmZ zd dlmZmZm	Z	m
Z
mZmZ W n eefy7   d Z Z	 ZZY nw d dlmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZ ddd	d
ddddZ dddddZ!d*ddZ"G dd dZ#G dd de#Z$G dd de#Z%G dd de#Z&G dd  d e&Z'G d!d" d"e#Z(G d#d$ d$e#Z)G d%d& d&e#Z*G d'd( d(e#Z+e*e(e)e&e%e'e$dde+d)
Z,dS )+    )Dict)
DictConfig)logging)MCoreGPTEmbeddingMixinMCoreMLPMixinMCoreSelfAttentionMixinMCoreSequentialMLPMixinMCoreTransformerBlockMixinMCoreTransformerLayerMixinN)AdapterNameInfusedAdapterConfigLora4HtoHAdapterConfigLoraDenseAttentionAdapterConfigLoraHto4HAdapterConfigLoraKQVAdapterConfigLoraKQVAdapterWeightTyingConfigLoraMoe4HtoHAdapterConfigLoraMoeHto4HAdapterConfigLoraUnfusedHto4HAdapterConfigLoraUnfusedKQVAdapterConfigMLPHeadAdapterConfigMLPInfusedAdapterConfigParallelLinearAdapterConfig&ParallelLinearAdapterWeightTyingConfigPromptEncoderAdapterConfigattention_qkvattention_densemlp_fc1mlp_fc2	attentionmlpall)
qkv_moduledense_modulehto4h_module4htoh_moduler   r    r!   
linear_qkvlinear_proj
linear_fc1
linear_fc2)r   r   r   r   r   c                 C   s   |  d|}g }|D ]n}|td kr-td |vr|td  td |vr,|td  q
|td krNtd |vr@|td  td |vrM|td  q
|td krotd td td td fD ]}||vrm|| qbq
||vrx|| q
|S )	Ntarget_modulesr   r"   r#   r    r$   r%   r!   )getPEFT_MODULE_MAPappend)lora_cfgdefaultoriginal_target_modulesr+   module
sub_module r4   c/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/nlp/parts/peft_config.pyget_target_modulesF   s:   
	
r6   c                   @   s.   e Zd ZdedefddZdd Zdd Zd	S )

PEFTConfigpeft_cfgname_key_to_cfgc                 C   s&   || _ |dd | _|dd| _d S )Nlayer_selectionweight_tyingF)r9   r,   r:   r;   )selfr8   r9   r4   r4   r5   __init__g   s
   
zPEFTConfig.__init__c                 C   s   | j S )Nr9   )r<   r4   r4   r5   get_config_dicto   s   zPEFTConfig.get_config_dictc                 C   sB   | dd d u r|j|j dksJ d|j|j }|S |j}|S )Nkv_channelsr   zKhidden_size must be divisible by num_attention_heads if kv_channels is None)r,   hidden_sizenum_attention_headsr@   )r<   cfgr@   r4   r4   r5   _calculate_kv_channelsr   s   z!PEFTConfig._calculate_kv_channelsN)__name__
__module____qualname__r   r   r=   r?   rD   r4   r4   r4   r5   r7   e   s    r7   c                          e Zd Z fddZ  ZS )SelectivePEFTConfigc                    s*   |j j}t j|i d |dg | _d S )Nr>   tunable_base_param_names)peftselective_tuningsuperr=   r,   rJ   )r<   rC   selective_cfg	__class__r4   r5   r=   ~   s   zSelectivePEFTConfig.__init__rE   rF   rG   r=   __classcell__r4   r4   rO   r5   rI   }       rI   c                       rH   )MLPHeadPEFTConfigc                    sR   |j |jjjd}tdi |}tj|i}tjdtfgi| _t	 
|jj| d S )N)in_featuresout_featuresdecoderr4   )rA   rK   mlp_head_tuningrV   r   r   MLP_HEAD_ADAPTERr	   name_key_to_mcore_mixinsrM   r=   )r<   rC   config_argsmlp_head_cfgr9   rO   r4   r5   r=      s   zMLPHeadPEFTConfig.__init__rQ   r4   r4   rO   r5   rT      rS   rT   c                       s(   e Zd Z fddZ	dddZ  ZS )LoraPEFTConfigc              
      s  |j j}| |}||j }|d|j}|d u r|j}|d| |  }|dddv }t|}i }	i }
|D ],}|td kru|ddd	krZtj}t	}| j
|||j||||d
}ntj}t}| 
|||j||}||	|< dtfg|
|< q6|td kr| 
||||jt}||	tj< dtfg|
tj< q6|td kr|r|jd n|j}|dd rtj}t}n|ddd	krtj}t}ntj}t}| 
|||j||}||	|< |tjkrdtfg|
|< tt|jD ]}|
| d| tf qq6dtfg|
|< q6|td krP|dd rtj}t}ntj }t!}| 
|||j|j|}||	|< |tjkrHdtfg|
|< tt|jD ]}|
| d| tf q7q6dtfg|
|< q6t"#d| dt$t%   t&d q6|
| _'t( )||	 d S )Nnum_query_groups   
activationgelu)z
fast-gegluzfast-swigluz
fast-reglur"   variantnemo	canonicalr^   r@   self_attentionr#   r$   num_moe_expertszmlp.expertszmlp.experts.local_experts.r    r%   z#Unrecognized target_module string: z.
The possible options are:    )*rK   lora_tuningrD   rB   r,   r6   r-   r   LORA_UNFUSED_KQV_ADAPTERr   _create_lora_configrA   LORA_KQV_ADAPTERr   r   r   LORA_DENSE_ATTENTION_ADAPTERffn_hidden_sizeLORA_MOE_Hto4H_ADAPTERr   LORA_UNFUSED_Hto4H_ADAPTERr   LORA_Hto4H_ADAPTERr   r   rangeintrg   r.   r   LORA_MOE_4HtoH_ADAPTERr   LORA_4HtoH_ADAPTERr   r   errorlistvaluesexitrZ   rM   r=   )r<   rC   r/   r@   projection_sizer^   qkv_projection_sizefast_glu_activationr+   r9   rZ   r2   _adapter_name_adapter_cfg_clsadapter_cfghto4h_projection_sizeirO   r4   r5   r=      s   







zLoraPEFTConfig.__init__Nc                 C   sF  |||j d d d|dd|ddd|j|d|j |dd	|d
dd}|tkrI|d us3J d|d us;J d|||d |d n|ttfv rW|d|ji |j	r|dd }	|	d u rgd}
n)|	dkro|j
}
n!|	dkrxd| }
n|	dkr|j }
n|	dkr|j }
ntd|	 d||j|
|	d |di |}|S )Nidentitycolumn_init_methodnormalrow_init_methodzeroFalphadropout_positionposta2a_experimental)rU   rV   dimnorm_position	norm_typer`   r   r   gather_outputdropoutr   r   r   z4num_query_groups must be provided for canonical Loraz/kv_channels must be provided for canonical Lorare   rV   rg   position_embedding_strategyr   addbiasadd   concat	mlpconcatz$Unknown position embedding strategy z for tied weightsnum_position_embeddingsdim_position_embeddingsr   r4   )adapter_dimr,   adapter_dropoutr   updatepopr   r   rg   r;   rA   RuntimeError
num_layers)r<   rC   r/   rU   rV   adapter_cfg_clsr^   r@   r[   r   r   r   r4   r4   r5   rk      sX   





z"LoraPEFTConfig._create_lora_config)NN)rE   rF   rG   r=   rk   rR   r4   r4   rO   r5   r]      s    ir]   c                   @   s   e Zd ZdS )QLoraPEFTConfigN)rE   rF   rG   r4   r4   r4   r5   r   4  s    r   c                       rH   )IA3PEFTConfigc                    s   t |j|j d}| |}|d|j}|| }t||j d}tj|tj	|tj
|i}tjdtfgtj	dtfgtj
dtfgi| _t |jj| d S )N)rU   r^   rf   r    )r   rn   tensor_model_parallel_sizerD   r,   rB   r   r   KEY_INFUSEDVALUE_INFUSEDMLP_INFUSEDr   r   rZ   rM   r=   rK   
ia3_tuning)r<   rC   mlp_infused_adapter_cfgr@   r^   kv_projection_sizeinfused_adapter_cfgr9   rO   r4   r5   r=   9  s    

zIA3PEFTConfig.__init__rQ   r4   r4   rO   r5   r   8  rS   r   c                       rH   )PtuningPEFTConfigc                    sh   t |jjj|jjj|jjj|jjj|j}tj	|i}tj	dt
fgi| _|jjj| _t |jj| d S )N	embedding)r   rK   p_tuningvirtual_tokensbottleneck_dimembedding_diminit_stdrA   r   PTUNING_ADAPTERr   rZ   rM   r=   )r<   rC   r   r9   rO   r4   r5   r=   R  s   
zPtuningPEFTConfig.__init__rQ   r4   r4   rO   r5   r   Q  rS   r   c                       rH   )CanonicalAdaptersPEFTConfigc              
      s   |j j}|j|j|j|dd|dd|dd|dd|jd	}|jr?||jd
 |j|dd d t	di |}nt
di |}tj|tj|i}tjdtfgtjdtfgi| _t || d S )Nr   prer   mixedfusedlayernormr   xavierr   r   )rU   rV   r   r   r   r   r   r   r_   r   r    r4   )rK   adapter_tuningrA   r   r,   r   r;   r   r   r   r   r   PRE_ATTN_ADAPTERPOST_ATTN_ADAPTERr
   rZ   rM   r=   )r<   rC   adapter_tuning_cfgr[   r   r9   rO   r4   r5   r=   b  s4   




z$CanonicalAdaptersPEFTConfig.__init__rQ   r4   r4   rO   r5   r   a  rS   r   c                       rH   )SDLoraPEFTConfigc                    sf   |j j}d d |jd d d|dd|ddd|j|jd}tjtdi |i}d | _	t
 || d S )	Nr   r   r   r   r   F)rU   rV   r   r   r   r`   r   r   r   r   network_alphar4   )rK   ri   r   r,   r   r   r   PARALLEL_LINEAR_ADAPTERr   rZ   rM   r=   )r<   rC   r/   r[   r9   rO   r4   r5   r=     s    

zSDLoraPEFTConfig.__init__rQ   r4   r4   rO   r5   r     rS   r   )
adapteria3ptuningloramlp_headqlora	selectivenoneNsdlora)r*   )-typingr   	omegaconfr   
nemo.utilsr   Bnemo.collections.nlp.modules.common.megatron.adapters.mcore_mixinsr   r   r   r   r	   r
   ImportErrorModuleNotFoundErrorGnemo.collections.nlp.modules.common.megatron.adapters.parallel_adaptersr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r-   LORA_CONFIG_TO_MCORE_MAPr6   r7   rI   rT   r]   r   r   r   r   r   PEFT_CONFIG_MAPr4   r4   r4   r5   <module>   sX   $H
 "'
