o
    wi\                     @   s  d Z ddlZddlZddlmZ ddlmZ ddlmZ ddl	m
Z
mZmZmZmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZmZ ddlm Z m!Z! ddl"m#Z#m$Z$ ddl%m&Z&m'Z' ddl(m)Z)m*Z* ddl+m,Z, ddl-m.Z.m/Z/ ddl0m1Z1 ddl2m3Z3 ddl4m5Z5m6Z6m7Z7 ddl8m9Z9 ddl:m;Z; ddl<m=Z= e
rddl>m/Z? ddl@mAZA ddlBmCZC ddlDmEZE e=dd\ZFZGe=dd\ZHZGe=dd \ZIZGe=dd!\ZJZG	 d"e.d#efd$d%ZKeG d&d' d'e.ZLeG d(d) d)eLZMeG d*d+ d+eLZNeG d,d- d-eLZOeG d.d/ d/eLZPG d0d1 d1e/ZQG d2d3 d3eZRG d4d5 d5eZSd6eTd7eeTeTf d#eUfd8d9ZVG d:d; d;e ZWG d<d= d=eJZXe6YeQd>G d?d@ d@e6jZdAeQf Z[e6\eQd>G dBdC dCe6jZeQdAf Z]g dDZ^dS )EzGemma3 language model    N)	dataclass)	lru_cache)Path)TYPE_CHECKING	AnnotatedCallableOptionalTupleUnion)get_bias_dropout_add)BaseInferenceContext)LanguageModelEmbedding)RotaryEmbedding)PackedSeqParams)
ModuleSpecTransformerConfigTransformerLayerTransformerLayerSubmodules)SelfAttentionSelfAttentionSubmodules)AttnBackendAttnMaskType)MLPMLPSubmodules)Tensornn)openai_gelu)	GPTConfigGPTModel)TERowParallelLinearLayerNorm)Config)OptimizerModuleioteardown)TransformFns)dtype_from_hf)safe_import_from)r   Gemma3ForCausalLMAutoTokenizer)TokenizerSpecz+megatron.core.extensions.transformer_engineTERowParallelLinearTENormTELayerNormColumnParallelLinearTEDotProductAttentionconfigreturnc                 C   sF   t ttt tdtjitttt	t	t
ddtt tttt
ddtddS )zGemma3 custom layer spec.attn_mask_type)
linear_qkvcore_attentionq_layernormk_layernormlinear_proj)moduleparams
submodules)
linear_fc1
linear_fc2)r8   r:   )self_attentionself_attn_bdamlpmlp_bda)r   r   r   Gemma3SelfAttentionr   causalr   r.   Gemma3TEDotProductAttentionr-   r   r   r   r   )r0    rD   b/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/llm/gpt/model/gemma3.pygemma3_layer_spec^   s2   rF   c                       sH  e Zd ZU dZdZeed< dZeed< dZe	ed< dZ
eed	< d
Zeed< dZe	ed< d
Zeed< dZeed< dZeed< dZeed< dZeed< dZeed< ejZeed< d
Zeed< dZeed< eZeed< dZeed< dZeed< dZ eed< e!Z"e#e$ed ge$f f ed!< d
Z%eed"< 	#	#	#d)d$e&e d%d&f fd'd(Z'  Z(S )*Gemma3ConfigzGemma3 basic configi   
seq_lengthi@  
vocab_sizeropeposition_embedding_type)'  @B rotary_baseT#share_embeddings_and_output_weightsRMSNormnormalizationlayernorm_zero_centered_gammagư>layernorm_epsilon   window_size)      interleaved_attn_patterng        attention_dropouthidden_dropoutattention_backendgated_linear_unitFadd_bias_linearactivation_funcis_vision_languageflash_decodegradient_accumulation_fusionr   transformer_layer_spec#scatter_embedding_sequence_parallelNvp_stager1   MCoreGPTModelc                    s   | j dkr	tdt| dddu r|du sJ d| j\}}|| _t j||||d}||f| _t|drCt| | j| j	| j
| jd|_t| jd	| j| j|d
| j| j|d	|_t|ds`t|drd|  |S )z7Configure and instantiate a megatron-core Gemma3 model.rW   z3Context Parallel is not supported for Gemma3 model.$virtual_pipeline_model_parallel_sizeNzKVirtual pipeline model parallel size is not yet supported for Gemma3 model.)rd   	embedding)r0   rI   max_sequence_lengthrK   scatter_to_sequence_parallel      ?F)	kv_channelsrotary_percentrotary_interleavedseq_len_interpolation_factorrN   rope_scalingrope_scaling_factoruse_cpu_initializationrotary_base_localoutput_layer)context_parallel_size
ValueErrorgetattrrN   superconfigure_modelhasattrGemma3LanguageModelEmbeddingrI   rH   rK   rc   rg   Gemma3RotaryEmbeddingrk   rm   rn   rp   rq   rotary_pos_emb!setup_embeddings_and_output_layer)self	tokenizerpre_processpost_processrd   rr   rotary_base_globalmodel	__class__rD   rE   rx      s>   



zGemma3Config.configure_model)NNN))__name__
__module____qualname____doc__rH   int__annotations__rI   rK   strrN   tuplerO   boolrQ   rR   rS   floatrU   rX   rY   rZ   r   flashr[   r\   r]   r   r^   r   r_   r`   ra   rF   rb   r
   r   rc   r   rx   __classcell__rD   rD   r   rE   rG   {   s>   
 rG   c                   @   s   e Zd ZU dZdZeed< dZeed< dZ	eed< dZ
eed	< d
Zeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dS )Gemma3Config1BzGemma3 1B configFr_      
num_layersi  hidden_size   num_attention_headsrW   num_query_groups   rk   i   ffn_hidden_sizerT   rU   rj   rp   i   rH   Tbf16i   rI   N)r   r   r   r   r_   r   r   r   r   r   r   r   rk   r   rU   rp   r   rH   r   rI   rD   rD   rD   rE   r      s   
 r   c                   @   ~   e Zd ZU dZdZeed< dZeed< dZ	eed< dZ
eed	< d
Zeed< dZeed< dZeed< dZeed< dZeed< dS )Gemma3Config4BzGemma3 4B configTr_   "   r   i 
  r      r   r   r   r   rk   i (  r      rU          @rp   Nr   r   r   r   r_   r   r   r   r   r   r   r   rk   r   rU   rp   r   rD   rD   rD   rE   r         
 r   c                   @   r   )Gemma3Config12BzGemma3 12B configTr_   0   r   i   r      r   r   r   r   rk   i <  r   r   rU   r   rp   Nr   rD   rD   rD   rE   r      r   r   c                   @   s   e Zd ZU dZdZeed< dZeed< dZ	eed< dZ
eed	< d
Zeed< dZeed< ded Zeed< dZeed< dZeed< dZeed< dS )Gemma3Config27BzGemma3 27B configTr_   >   r   i   r       r   r   r      rk   rj      softmax_scalei T  r   r   rU   r   rp   N)r   r   r   r   r_   r   r   r   r   r   r   r   rk   mathsqrtr   r   rU   rp   r   rD   rD   rD   rE   r      s   
 r   c                       sp   e Zd ZdZddddg fdeee ee f dee ded dee	e
jge
jf  dee f
 fd	d
Z  ZS )Gemma3ModelzGemma3 base modelNr0   optimr   r+   model_transformmodel_context_managersc                    s    t  j|pt ||||d d S )N)r   r   r   r   )rw   __init__rG   )r~   r0   r   r   r   r   r   rD   rE   r     s   
zGemma3Model.__init__)r   r   r   r   r   r   rG   r    r!   r   r   Modulelistr   r   rD   rD   r   rE   r     s$    r   c                	       s4   e Zd ZdZd	dedededef fddZ  ZS )
rz   zQGemma3 language token embedding.

    Adds a normalization to the embedding.
    N	input_idsposition_idstokentype_idsr1   c                    s$   t  |||}|| jjd  }|S )z!Calculate embedding and normalizeg      ?)rw   forwardr0   r   )r~   r   r   r   
embeddingsr   rD   rE   r   (  s   z$Gemma3LanguageModelEmbedding.forwardN)r   r   r   r   r   r   r   r   rD   rD   r   rE   rz   "  s    (rz   c                
       sf   e Zd ZdZ				ddededed	ef fd
dZeddddededede	f fddZ
  ZS )r{   zpGemma3 position rope embedding.

    Calculates rope embeddings for both local and global attention layers.
    Fr   rM   rL   ro   rp   rN   rr   c                    sL   |du sJ t  jd||d| |  j|  _td||d|| _d S )NF)ro   rN   rD   )rw   r   inv_freqr   
rope_local)r~   ro   rp   rN   rr   kwargsr   rD   rE   r   5  s   
zGemma3RotaryEmbedding.__init__r   )maxsizer   max_seq_lenoffset
packed_seqr1   c                    s(   t  |||}| j|||}||fS )z#Get global and local rope embedding)rw   r   r   )r~   r   r   r   rope_globalr   r   rD   rE   r   P  s   zGemma3RotaryEmbedding.forward)Fr   rM   rL   )r   F)r   r   r   r   r   r   r   r   r   r   r   r   rD   rD   r   rE   r{   /  s"    *r{   layer_numberlayer_patternc                 C   s   t |}| | dkS )Nr   )sum)r   r   pattern_sizerD   rD   rE   _is_local_attn_layerX  s   r   c                       s   e Zd ZdZ								ddddededee dee deeeeeef f  d	ee d
ee dee dee	 dee
 dee deeef f fddZ  ZS )rA   zyGemma3 self attention.

    Uses local rope embedding for local layers,
    global rope embedding for global layers.
    N)inference_paramshidden_statesattention_maskkey_value_statesinference_contextr|   rotary_pos_cosrotary_pos_sinattention_biaspacked_seq_paramssequence_len_offsetr   r1   c                   sf   t |tsJ |du r|du sJ t| j| jjr|d }n|d }t j|||||||||	|
|dS )z>Switch to either local or global rope embedding before forwardNr   rW   )r   r   r   r   r|   r   r   r   r   r   r   )
isinstancer   r   r   r0   rX   rw   r   )r~   r   r   r   r   r|   r   r   r   r   r   r   final_rotary_pos_embr   rD   rE   r   g  s$   
zGemma3SelfAttention.forward)NNNNNNNN)r   r   r   r   r   r   r   r
   r	   r   r   r   r   rD   rD   r   rE   rA   `  sJ    
	

rA   c                       s>   e Zd ZdZ	d
dededededee	 f
 fdd	Z
  ZS )rC   zGemma3 core attention.

    Switches between global and local sliding window attention
    based on the layer_number and pre-defined layer pattern.
    Nr0   r   r2   attention_typerY   c                    sX   t |}t||jr|jdf|_nd |_|jrtj}t j	d|||||d| d S )Nr   )r0   r   r2   r   rY   rD   )
copydeepcopyr   rX   rU   r_   r   	arbitraryrw   r   )r~   r0   r   r2   r   rY   r   r   rD   rE   r     s   


z$Gemma3TEDotProductAttention.__init__r   )r   r   r   r   r   r   r   r   r   r   r   r   rD   rD   r   rE   rC     s    rC   hfc                   @   sX   e Zd ZdZdefddZdedefddZdd	 Ze	dddZ
e	defddZdS )HFGemma3ImporterzGemma3 Huggingface importerr1   c                 C   s   t | j| jdS )N)r   )r   r0   r   r~   rD   rD   rE   init  s   zHFGemma3Importer.initoutput_pathc                 C   sh   ddl m} |jt| dd}|  }| |}| || | || td|  t	|| ~~|S )Nr   r'   auto)torch_dtypez,Converted HF Gemma3 model to Nemo, saved to )
transformersr(   from_pretrainedr   r   
nemo_setupconvert_state	nemo_saveprintr#   )r~   r   r(   sourcetargettrainerrD   rD   rE   apply  s   

zHFGemma3Importer.applyc                 C   P   ddddddddd	d
d
}t jddtjdt jddtjdg}t j||||dS )N embedding.word_embeddings.weight<decoder.layers.*.self_attention.linear_qkv.layer_norm_weight2decoder.layers.*.self_attention.q_layernorm.weight2decoder.layers.*.self_attention.k_layernorm.weight2decoder.layers.*.self_attention.linear_proj.weightAdecoder.layers.*.self_attention.linear_proj.post_layernorm.weight1decoder.layers.*.mlp.linear_fc1.layer_norm_weight&decoder.layers.*.mlp.linear_fc2.weight5decoder.layers.*.mlp.linear_fc2.post_layernorm.weightdecoder.final_layernorm.weight)
model.embed_tokens.weight%model.layers.*.input_layernorm.weight&model.layers.*.self_attn.q_norm.weight&model.layers.*.self_attn.k_norm.weight&model.layers.*.self_attn.o_proj.weight.model.layers.*.post_attention_layernorm.weight/model.layers.*.pre_feedforward_layernorm.weight#model.layers.*.mlp.down_proj.weight0model.layers.*.post_feedforward_layernorm.weightmodel.norm.weightz&model.layers.*.self_attn.q_proj.weightz&model.layers.*.self_attn.k_proj.weightz&model.layers.*.self_attn.v_proj.weight1decoder.layers.*.self_attention.linear_qkv.weight
source_key
target_keyfnz#model.layers.*.mlp.gate_proj.weightz!model.layers.*.mlp.up_proj.weight&decoder.layers.*.mlp.linear_fc1.weightmapping
transforms)r"   state_transformr$   	merge_qkv	merge_fc1apply_transformsr~   r   r   r  r	  rD   rD   rE   r     s.   	zHFGemma3Importer.convert_stater*   c                 C   s   ddl m} |t| S )Nr   r)   )=nemo.collections.common.tokenizers.huggingface.auto_tokenizerr*   r   )r~   r*   rD   rD   rE   r     s   zHFGemma3Importer.tokenizerc                 C   s   ddl m} ddl m} t| }||}|j}||}|jdkr&t }n"|jdkr/t }n|jdkr8t	 }n|jdkrAt
 }ntd| t||_|j|_||_|S )	Nr   )rG   )GenerationConfigr   r   r   r   zUnrecognized import model: )r   rG   r  r   r   text_confignum_hidden_layersr   r   r   r   ru   r%   params_dtypeinitializer_rangeinit_method_stdgeneration_config)r~   HFGemma3Configr  namer   source_textr  outputrD   rD   rE   r0     s&   






zHFGemma3Importer.configN)r1   r*   )r   r   r   r   r   r   r   r   r   propertyr   rG   r0   rD   rD   rD   rE   r     s    )r   r(   c                   @   sL   e Zd ZdZdddZdedefddZd	d
 Zedd Z	edd Z
dS )HFGemma3ExporterzExport Gemma3 to HF formatr1   r(   c                 C   sN   ddl m} ddlm} |  || jW  d    S 1 s w   Y  d S )Nr   r'   )no_init_weights)r   r(   transformers.modeling_utilsr  _from_configr0   )r~   r(   r  rD   rD   rE   r     s
   
$zHFGemma3Exporter.initr   c                 C   sH   |   }| t| \}}| ||}| }|| | j| |S r   )r   	nemo_loadr   r   cpusave_pretrainedr   )r~   r   r   r   _rD   rD   rE   r   $  s   
zHFGemma3Exporter.applyc                 C   r   )Nr   r   r   r   r   r   r   r   r   r   )
r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  )r"   r
  r$   	split_qkv	split_fc1r  r  rD   rD   rE   r   0  s.   	zHFGemma3Exporter.convert_statec                 C   s   t t| jjjS r   )r"   load_contextr   r   r   r   rD   rD   rE   r   Y  s   zHFGemma3Exporter.tokenizerc                 C   s   t jt| dd}ddlm} |dg|j|j|j|j|j	d|j
|j|j|j|j|jd |jd d}|jd	kr<d
|_|S |j|_|S )Nzmodel.config)subpathr   )Gemma3TextConfigr(   gelu_pytorch_tanhrW   )architecturesr  r   intermediate_sizer   head_dimhidden_activationmax_position_embeddingsr  rms_norm_epsnum_key_value_headsrI   
rope_thetarope_local_base_freqr   r   )r"   r&  r   r   r(  r   r   r   r   rk   rH   r  rS   r   rI   rN   query_pre_attn_scalarr,  )r~   r   HFGemma3TextConfigr  rD   rD   rE   r0   ^  s.   
zHFGemma3Exporter.configN)r1   r(   )r   r   r   r   r   r   r   r   r  r   r0   rD   rD   rD   rE   r    s    
)
r  )rG   r   r   r   r   r   )_r   r   r   dataclassesr   	functoolsr   pathlibr   typingr   r   r   r   r	   r
   (megatron.core.fusions.fused_bias_dropoutr    megatron.core.inference.contextsr   ?megatron.core.models.common.embeddings.language_model_embeddingr   ;megatron.core.models.common.embeddings.rotary_pos_embeddingr   megatron.core.packed_seq_paramsr   megatron.core.transformerr   r   r   r   #megatron.core.transformer.attentionr   r   megatron.core.transformer.enumsr   r   megatron.core.transformer.mlpr   r   torchr   r   "nemo.collections.llm.fn.activationr   #nemo.collections.llm.gpt.model.baser   r   %nemo.collections.llm.gpt.model.gemma2r   nemo.collections.llm.utilsr    nemo.lightningr!   r"   r#   nemo.lightning.io.stater$   nemo.lightning.pytorch.utilsr%   nemo.utils.import_utilsr&   "megatron.core.models.gpt.gpt_modelre   r   r(   r  r*   1nemo.collections.common.tokenizers.tokenizer_specr+   r,   r#  r-   r.   r/   rF   rG   r   r   r   r   r   rz   r{   r   r   r   rA   rC   model_importerModelConnectorr   model_exporterr  __all__rD   rD   rD   rE   <module>   s     R)

-
'
db