o
    wib                     @   s  d dl Z d dlmZ d dlmZ d dlmZmZmZm	Z	m
Z
 d dlZd dlmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZmZmZmZ d dlmZmZ d dl m!Z! d dl"m#Z#m$Z$ d dl%m&Z& d dl'm(Z( d dlm)Z)m*Z* d dl+m,Z, d dl-m.Z.m/Z/ d dl0m1Z1 d dl2m3Z3m4Z4m5Z5 d dl6m7Z7 d dl8m9Z9 d dl:m;Z; erd dl<m=Z= d dl>m?Z? d dl@mAZA e;dd\ZBZCe;dd\ZDZCe;dd\ZEZCdd d!efd"d#ZFeG d$d% d%e.ZGeG d&d' d'eGZHeG d(d) d)eGZIeG d*d+ d+eGZJG d,d- d-e/ZKe4LeKd.G d/d0 d0e4jMd1eKf ZNe4OeKd.G d2d3 d3e4jMeKd1f ZPG d4d5 d5eZQG d6d7 d7eBZRG d8d9 d9eZSG d:d; d;ej*jTZUd<ej)d=e	eV fd>d?ZWd@dA ZXg dBZYdS )C    N)	dataclass)Path)TYPE_CHECKING	AnnotatedCallableOptionalUnion)parallel_statetensor_parallel)get_bias_dropout_add)FusedScaleMaskSoftmax)PackedSeqParams)ColumnParallelLinear)MegatronModule
ModuleSpecTransformerConfigTransformerLayerTransformerLayerSubmodules)SelfAttentionSelfAttentionSubmodules)AttnMaskType)MLPMLPSubmodules)attention_mask_func)divide)Tensornn)openai_gelu)	GPTConfigGPTModel)Config)OptimizerModuleioteardown)TransformFns)dtype_from_hf)safe_import_from)GemmaForCausalLMAutoTokenizer)TokenizerSpecz+megatron.core.extensions.transformer_engineTERowParallelLinearTENormTELayerNormColumnParallelLinearconfigr   returnc                 C   sB   t ttt tdtjitttt	ddt
t tttt	ddt
ddS ) attn_mask_type)
linear_qkvcore_attentionlinear_proj)moduleparams
submodules)
linear_fc1
linear_fc2)r5   r7   )self_attentionself_attn_bdamlpmlp_bda)r   r   r   r   r   causalr   r-   Gemma2DotProductAttentionTERowParallelLinearLayerNormr   r   r   r.    rB   b/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/llm/gpt/model/gemma2.pygemma2_layer_spec?   s.   	rD   c                   @   s  e Zd ZU dZdZeed< eZe	ed< dZ
eed< dZeed< d	Zeed
< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< d	Zeed< eZeee	dgef f ed< dZ eed < d!Z!eed"< d#Z"eed$< d%S )&Gemma2ConfigzGemma2 basic configRMSNormnormalizationactivation_funcTgated_linear_unitropeposition_embedding_typeFadd_bias_lineari    
seq_length   kv_channels        attention_dropouthidden_dropout#share_embeddings_and_output_weightslayernorm_zero_centered_gammagư>layernorm_epsiloni'  rotary_base)i   r   window_sizei  
vocab_sizegradient_accumulation_fusionr   transformer_layer_spec   query_pre_attn_scalarg      I@attn_logit_softcappingg      >@final_logit_softcappingN)#__name__
__module____qualname____doc__rG   str__annotations__r   rH   r   rI   boolrK   rL   rM   intrO   rQ   floatrR   rS   rT   rU   rV   rW   tuplerX   rY   rD   rZ   r   r   r\   r]   r^   rB   rB   rB   rC   rE   ^   s,   
 rE   c                   @   Z   e Zd ZU dZdZeed< dZeed< dZeed< dZ	eed	< d
Z
eed< dZeed< dS )Gemma2Config2BzGemma2 2B config   
num_layersi 	  hidden_size   num_attention_heads   num_query_groupsi $  ffn_hidden_sizerN   r\   Nr_   r`   ra   rb   rl   rf   rd   rm   ro   rq   rr   r\   rB   rB   rB   rC   rj   {      
 rj   c                   @   ri   )Gemma2Config9BzGemma2 9B config*   rl   i   rm      ro   rn   rq   i 8  rr   rN   r\   Nrs   rB   rB   rB   rC   ru      rt   ru   c                   @   ri   )Gemma2Config27BzGemma2 27B config.   rl   i   rm       ro   rw   rq   i   rr      r\   Nrs   rB   rB   rB   rC   rx      rt   rx   c                       sr   e Zd ZdZ				ddeee ee f dee ded dee	e
jge
jf  f fdd	Z fd
dZ  ZS )Gemma2Modelr0   Nr.   optim	tokenizerr*   model_transformc                    s   t  j|pt |||d d S )N)r}   r~   r   )super__init__rE   )selfr.   r}   r~   r   	__class__rB   rC   r      s   zGemma2Model.__init__c                    sR   ddl m} t   tjddr|| jjt tj	ddr'|| jj
t d S d S )Nr   )extend_instanceF)ignore_virtual)#nemo.collections.common.parts.utilsr   r   configure_modelr	   is_pipeline_first_stager5   	embeddingEmbeddingScalingMixinis_pipeline_last_stageoutput_layerGemma2OutputLayer)r   r   r   rB   rC   r      s   
zGemma2Model.configure_model)NNNN)r_   r`   ra   rb   r   r   rE   r    r!   r   r   Moduler   r   __classcell__rB   rB   r   rC   r|      s     	r|   hfc                   @   sX   e Zd ZdZdefddZdedefddZdd	 Ze	dddZ
e	defddZdS )HFGemmaImporterr0   r/   c                 C   s   t | j| jdS )N)r~   )r|   r.   r~   r   rB   rB   rC   init   s   zHFGemmaImporter.initoutput_pathc                 C   sh   ddl m} |jt| dd}|  }| |}| || | || td|  t	|| ~~|S )Nr   )Gemma2ForCausalLMauto)torch_dtypez/Converted Gemma2 model to Nemo, model saved to )
transformersr   from_pretrainedrc   r   
nemo_setupconvert_state	nemo_saveprintr#   )r   r   r   sourcetargettrainerrB   rB   rC   apply   s   

zHFGemmaImporter.applyc              	   C   L   ddddddddd	}t jd
dtjdt jddtjdg}t j||||dS )r0    embedding.word_embeddings.weight2decoder.layers.*.self_attention.linear_proj.weight&decoder.layers.*.mlp.linear_fc2.weight<decoder.layers.*.self_attention.linear_qkv.layer_norm_weight1decoder.layers.*.mlp.linear_fc1.layer_norm_weight5decoder.layers.*.mlp.linear_fc2.post_layernorm.weightAdecoder.layers.*.self_attention.linear_proj.post_layernorm.weightdecoder.final_layernorm.weight)model.embed_tokens.weight&model.layers.*.self_attn.o_proj.weight#model.layers.*.mlp.down_proj.weight%model.layers.*.input_layernorm.weight/model.layers.*.pre_feedforward_layernorm.weight0model.layers.*.post_feedforward_layernorm.weight.model.layers.*.post_attention_layernorm.weightmodel.norm.weightz&model.layers.*.self_attn.q_proj.weightz&model.layers.*.self_attn.k_proj.weightz&model.layers.*.self_attn.v_proj.weight1decoder.layers.*.self_attention.linear_qkv.weight
source_key
target_keyfnz#model.layers.*.mlp.gate_proj.weightz!model.layers.*.mlp.up_proj.weight&decoder.layers.*.mlp.linear_fc1.weightmapping
transforms)r"   state_transformr$   	merge_qkv	merge_fc1apply_transformsr   r   r   r   r   rB   rB   rC   r      s*   	zHFGemmaImporter.convert_stater)   c                 C   s   ddl m} || t| S )r0   r   r(   )=nemo.collections.common.tokenizers.huggingface.auto_tokenizerr)   save_hf_tokenizer_assetsrc   )r   r)   rB   rB   rC   r~      s   zHFGemmaImporter.tokenizerc                 C   s   ddl m} ddl m} |t| }|t| }dd }tdi d|jd|jd|jd	|j	d
|j
d|jd|jd|jd|jd|jd|jd|jdfddd||jd|jdddt|tjkdt|tjkdt|d|}|S )r0   r   )GemmaConfig)GenerationConfigc                 S   s(   d}| | dkr|d }| | dks|S )N   r      rB   )rX   baserB   rB   rC   make_vocab_size_divisible_by  s
   z<HFGemmaImporter.config.<locals>.make_vocab_size_divisible_byrl   rm   rr   ro   init_method_stdrU   rq   rV   r\   r]   r^   rW   rI   Tr   rX   rS   fp16bf16params_dtypegeneration_configNrB   )r   r   r   r   rc   rE   num_hidden_layersrm   intermediate_sizero   initializer_rangerms_norm_epsnum_key_value_heads
rope_thetar\   r]   r^   sliding_windowrX   r%   torchfloat16bfloat16)r   HFGemmaConfigr   r   r   r   outputrB   rB   rC   r.      s^   	


zHFGemmaImporter.configN)r/   r)   )r_   r`   ra   rb   r|   r   r   r   r   propertyr~   rE   r.   rB   rB   rB   rC   r      s    #r   r'   c                   @   sN   e Zd ZdZdddZdedefddZd	d
 Zedd Z	edddZ
dS )HFGemmaExporterr0   r/   r'   c                 C   sN   ddl m} ddlm} |  || jW  d    S 1 s w   Y  d S )Nr   )AutoModelForCausalLM)no_init_weights)r   r   transformers.modeling_utilsr   from_configr.   )r   r   r   rB   rB   rC   r   &  s
   
$zHFGemmaExporter.initr   c                 C   sH   |   }| t| \}}| ||}| }|| | j| |S r0   )r   	nemo_loadrc   r   cpusave_pretrainedr~   )r   r   r   r   _rB   rB   rC   r   -  s   
zHFGemmaExporter.applyc              	   C   r   )r0   r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   )r"   r   r$   	split_qkv	split_fc1r   r   rB   rB   rC   r   :  s*   	zHFGemmaExporter.convert_statec                 C   s   t t| jjjS r   )r"   load_contextrc   modelr~   r   rB   rB   rC   r~   ^  s   zHFGemmaExporter.tokenizerrE   c                 C   sz   t jt| dd}ddlm} |dg|j|j|j|j|j	dur"|j	n|j|j |j
|j|j|j| jj|j|j|j|jdS )r0   zmodel.config)subpathr   )rE   r   N)architecturesr   rm   r   ro   head_dimmax_position_embeddingsr   r   r   rX   r   r\   r]   r^   )r"   r   rc   r   rE   rl   rm   rr   ro   rO   rM   r   rU   rq   r~   rX   rV   r\   r]   r^   )r   r   r   rB   rB   rC   r.   d  s*   

zHFGemmaExporter.configN)r/   r'   )r/   rE   )r_   r`   ra   rb   r   r   r   r   r   r~   r.   rB   rB   rB   rC   r   "  s    
$
r   c                       sb   e Zd ZdZ	ddededededef
 fdd	Z			dd
e
de
de
de
dedefddZ  ZS )r?   a  
    Region where selective activation recomputation is applied.
    This region is memory intensive but less compute intensive which
    makes activation checkpointing more efficient for LLMs (20B+).
    See Reducing Activation Recomputation in Large Transformer Models:
    https://arxiv.org/abs/2205.05198 for more details.

    We use the following notation:
     h: hidden size
     n: number of attention heads
     p: number of tensor model parallel partitions
     b: batch size
     s: sequence length
    Nr.   layer_numberr1   attention_typerQ   c           
   	      s(  t  j|d || _| jjdksJ dtd|| _d | _| jd dkr(|j| _|| _|| _| jj	| jj
 }t }t||| _t||j
| _t| jj
|| _t| jj|| _d }	t|j| _| jjrn| j}	|  j|	9  _t| jj| jj| j| jjt| jj|	d| _tj !|d u r| jj"n|| _"d S )NrA      z?Context parallelism is only supported by TEDotProductAttention!r   r   )input_in_fp16input_in_bf16r1   scaled_masked_softmax_fusion	mask_funcsoftmax_in_fp32scale)#r   r   r.   context_parallel_sizemaxr   rW   r1   r   rO   ro   r	   $get_tensor_model_parallel_world_sizer   hidden_size_per_partitionhidden_size_per_attention_head!num_attention_heads_per_partitionrq   num_query_groups_per_partitionmathsqrtr\   norm_factorapply_query_key_layer_scalingr   r   r   masked_softmax_fusionr   attention_softmax_in_fp32scale_mask_softmaxr   r   DropoutrQ   )
r   r.   r   r1   r   rQ   kwargsprojection_size
world_sizecoeffr   rB   rC   r     sD   	
z"Gemma2DotProductAttention.__init__querykeyvalueattention_maskpacked_seq_paramsc                 K   s`  |du sJ d| j | j dkr&|j| j | j dd}|j| j | j dd}|d|d|d|df}||d |d |d  d}||d |d |d  d}t |d |d  |d |d f|j	d	}	t
j|	|dd|ddddd
d| j d}
t|
| jj}
|
j| }|dur| jdurt|d|d| j}| ||}| jjst   | |}W d   n1 sw   Y  n| |}|d|d|d|df}||d|d |d  d}||d |d  |d d}t
||dd}|j| }|dddd }| dd | jf }|j| }|S )zForward.
        Modified from mcore.transformer.dot_product_attention to support Gemma2-specific
        final_logit_softcapping.
        NzaPacked sequence is not supported by DotProductAttention.Please use TEDotProductAttention instead.r   r   )dimr      mpurP   g      ?)betaalpha)r   r   repeat_interleavesizereshapeviewr	   get_global_memory_buffer
get_tensordtyper   baddbmm	transposer  logit_softcappingr.   r]   rW   get_swar  sequence_parallelr
   get_cuda_rng_trackerforkrQ   bmmpermute
contiguousr   )r   r  r  r  r  r1   r  r  output_sizematmul_input_buffermatmul_resultattention_scoresattention_probscontextnew_context_shaperB   rB   rC   forward  sj   



 

z!Gemma2DotProductAttention.forward)N)NN)r_   r`   ra   rb   r   rf   r   rc   rg   r   r   r   r0  r   rB   rB   r   rC   r?     s:    Ar?   c                       s:   e Zd ZdZdededef fddZ fddZ  ZS )	r@   z=Modified From TERowParallelLinear with an additional Post-LN.
input_sizer)  r.   c                   s*   t  j||fd|i| t||| _d S )Nr.   )r   r   r,   post_layernorm)r   r1  r)  r.   r  r   rB   rC   r   R  s   z%TERowParallelLinearLayerNorm.__init__c                    s   t  |\}}| ||fS )z)Forward with additional Post LN on output)r   r0  r2  )r   xr   biasr   rB   rC   r0  b  s   z$TERowParallelLinearLayerNorm.forward)	r_   r`   ra   rb   rf   r   r   r0  r   rB   rB   r   rC   r@   O  s    r@   c                           e Zd ZdZ fddZ  ZS )r   z:Extends from ColumnParallelLinear with logit soft capping.c                    s,   t  j|i |\}}t|| jj}||fS )z Forward with logit soft capping.)r   r0  r!  r.   r^   )r   argsr  r   r4  r   rB   rC   r0  k  s   zGemma2OutputLayer.forwardr_   r`   ra   rb   r0  r   rB   rB   r   rC   r   h  s    r   c                       r5  )r   z
    A mixin class for scaling embeddings in Megatron GPT.
    The scaling is applied only if the configuration (accessible via `self.config`)
    includes `apply_embedding_scaling` set to True.
    c                    s.   t  jdi |}|tj| jjd |jd S )z
        Forward pass that scales the output embeddings from the `forward` method of
        the superclass by the square root of the hidden size specified in the configuration.
        g      ?)r  NrB   )r   r0  r   tensorr.   rm   r  )r   r  
embeddingsr   rB   rC   r0  y  s   zEmbeddingScalingMixin.forwardr7  rB   rB   r   rC   r   r  s    r   logitsr   c                 C   s   |s| S |t | |  S )zIPrevents logits from growing excessively by scaling them to a fixed range)r   tanh)r:  r   rB   rB   rC   r!    s   r!  c                 C   sR   t j| |t jdd}t j|||  |d  d}t j|||  |d  d}| }|S )zECreate the equivalent attention mask fro SWA in [seq_q, seq_kv] shapecuda)r  devicer   )diagonalr   )r   onesre   triutril)seq_qseq_kvwmmumlrB   rB   rC   r"    s
   r"  )rE   rj   ru   rx   r|   )Zr   dataclassesr   pathlibr   typingr   r   r   r   r   r   megatron.corer	   r
   (megatron.core.fusions.fused_bias_dropoutr   #megatron.core.fusions.fused_softmaxr   megatron.core.packed_seq_paramsr   megatron.core.tensor_parallelr   megatron.core.transformerr   r   r   r   r   #megatron.core.transformer.attentionr   r   megatron.core.transformer.enumsr   megatron.core.transformer.mlpr   r   megatron.core.transformer.utilsr   megatron.core.utilsr   r   r   "nemo.collections.llm.fn.activationr   #nemo.collections.llm.gpt.model.baser   r   nemo.collections.llm.utilsr    nemo.lightningr!   r"   r#   nemo.lightning.io.stater$   nemo.lightning.pytorch.utilsr%   nemo.utils.import_utilsr&   r   r'   r   r)   1nemo.collections.common.tokenizers.tokenizer_specr*   r+   r   r,   r-   rD   rE   rj   ru   rx   r|   model_importerModelConnectorr   model_exporterr   r?   r@   r   r   r   rg   r!  r"  __all__rB   rB   rB   rC   <module>   sn   

j` M

