o
    wiZU                     @   s   d dl mZmZ d dlmZ d dlmZ d dlZd dlm	Z	m
Z
mZmZmZ d dlmZmZ d dlmZmZ d dlmZmZmZ d d	lmZmZ d d
lmZ erVd dlmZ eG dd deZeG dd deZ eG dd deZ!e"edG dd dej#def Z$e%edG dd dej#edf Z&e'dej(fddZ)ej*ddddej(fddZ+ej*ddddej(fd d!Z,ej*d"d#ddej(fd$d%Z-ej*d#d"ddej(fd&d'Z.ej*d(d)ddej(fd*d+Z/ej*d)d(ddej(fd,d-Z0dS ).    )	dataclassfield)Path)TYPE_CHECKINGN)Gemma3ConfigGemma3Config1BGemma3Config4BGemma3Config12BGemma3Config27B)Gemma3VLConfigGemma3VLModel)!Gemma3VLMultimodalProjectorConfigGemma3VLVisionConfig)
export_qkvexport_qkv_bias
import_qkv)ioteardown)TransformFnsAutoTokenizerc                   @   T   e Zd ZU dZedd dZeed< edd dZe	ed< edd dZ
eed	< d
S )Gemma3VLConfig4BzGemma3 VL config 4Bc                   C      t  S N)r    r   r   i/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/vlm/gemma3vl/model/gemma3vl.py<lambda>*       zGemma3VLConfig4B.<lambda>default_factorylanguage_transformer_configc                   C   r   r   r   r   r   r   r   r   +   r   vision_transformer_configc                   C      t dddS )N  i 
  
input_sizehidden_sizer   r   r   r   r   r   -       vision_projection_configN__name__
__module____qualname____doc__r   r!   r   __annotations__r#   r   r+   r   r   r   r   r   r   &      
 r   c                   @   r   )Gemma3VLConfig12BzGemma3 VL config 12Bc                   C   r   r   )r	   r   r   r   r   r   5   r   zGemma3VLConfig12B.<lambda>r   r!   c                   C   r   r   r"   r   r   r   r   r   6   r   r#   c                   C   r$   )Nr%   i   r&   r)   r   r   r   r   r   8   r*   r+   Nr,   r   r   r   r   r3   1   r2   r3   c                   @   r   )Gemma3VLConfig27BzGemma3 VL config 27Bc                   C   r   r   )r
   r   r   r   r   r   @   r   zGemma3VLConfig27B.<lambda>r   r!   c                   C   r   r   r"   r   r   r   r   r   A   r   r#   c                   C   r$   )Nr%   i   r&   r)   r   r   r   r   r   C   r*   r+   Nr,   r   r   r   r   r4   <   r2   r4   hfc                   @   sX   e Zd ZdZdefddZdedefddZdd	 Ze	dddZ
e	defddZdS )Gemma3VLImporterzGemma3 VL model HF importerreturnc                 C   s   t | j| jdS )N)	tokenizer)r   configr8   selfr   r   r   initK   s   zGemma3VLImporter.initoutput_pathc                 C   sd   ddl m} |t| }|  }| |}| || | || td|  t	|| ~~|S )Nr   Gemma3Modelz.Converted HF Gemma3VL model to NeMo, saved to )
transformersr?   from_pretrainedstrr<   
nemo_setupconvert_state	nemo_saveprintr   )r;   r=   r?   sourcetargettrainerr   r   r   applyN   s   

zGemma3VLImporter.applyc                 C      i ddddddddd	d
dddddddddddddddddddddd d!d"d#d$d%d&d'd(d)d*d+d,	}t tttjd-d.td/tjd0d1tjd/g}tj||||d2S )3N;vision_tower.vision_model.embeddings.patch_embedding.weightvision_model.conv1.weight9vision_tower.vision_model.embeddings.patch_embedding.biasvision_model.conv1.bias>vision_tower.vision_model.embeddings.position_embedding.weight'vision_model.position_embeddings.weight/vision_tower.vision_model.post_layernorm.weightvision_model.ln_post.weight-vision_tower.vision_model.post_layernorm.biasvision_model.ln_post.bias=vision_tower.vision_model.encoder.layers.*.layer_norm1.weightIvision_model.decoder.layers.*.self_attention.linear_qkv.layer_norm_weight;vision_tower.vision_model.encoder.layers.*.layer_norm1.biasGvision_model.decoder.layers.*.self_attention.linear_qkv.layer_norm_biasDvision_tower.vision_model.encoder.layers.*.self_attn.out_proj.weight?vision_model.decoder.layers.*.self_attention.linear_proj.weightBvision_tower.vision_model.encoder.layers.*.self_attn.out_proj.bias=vision_model.decoder.layers.*.self_attention.linear_proj.bias=vision_tower.vision_model.encoder.layers.*.layer_norm2.weight>vision_model.decoder.layers.*.mlp.linear_fc1.layer_norm_weight;vision_tower.vision_model.encoder.layers.*.layer_norm2.bias<vision_model.decoder.layers.*.mlp.linear_fc1.layer_norm_bias9vision_tower.vision_model.encoder.layers.*.mlp.fc1.weight3vision_model.decoder.layers.*.mlp.linear_fc1.weight7vision_tower.vision_model.encoder.layers.*.mlp.fc1.bias1vision_model.decoder.layers.*.mlp.linear_fc1.bias9vision_tower.vision_model.encoder.layers.*.mlp.fc2.weight3vision_model.decoder.layers.*.mlp.linear_fc2.weight7vision_tower.vision_model.encoder.layers.*.mlp.fc2.bias1vision_model.decoder.layers.*.mlp.linear_fc2.bias-multi_modal_projector.mm_soft_emb_norm.weight+vision_projection.mm_soft_embed_norm.weight"language_model.embed_tokens.weight/language_model.embedding.word_embeddings.weightKlanguage_model.decoder.layers.*.self_attention.linear_qkv.layer_norm_weightAlanguage_model.decoder.layers.*.self_attention.q_layernorm.weightAlanguage_model.decoder.layers.*.self_attention.k_layernorm.weightAlanguage_model.decoder.layers.*.self_attention.linear_proj.weightPlanguage_model.decoder.layers.*.self_attention.linear_proj.post_layernorm.weight@language_model.decoder.layers.*.mlp.linear_fc1.layer_norm_weight5language_model.decoder.layers.*.mlp.linear_fc2.weightDlanguage_model.decoder.layers.*.mlp.linear_fc2.post_layernorm.weight-language_model.decoder.final_layernorm.weight)	.language_model.layers.*.input_layernorm.weight/language_model.layers.*.self_attn.q_norm.weight/language_model.layers.*.self_attn.k_norm.weight/language_model.layers.*.self_attn.o_proj.weight7language_model.layers.*.post_attention_layernorm.weight8language_model.layers.*.pre_feedforward_layernorm.weight,language_model.layers.*.mlp.down_proj.weight9language_model.layers.*.post_feedforward_layernorm.weightlanguage_model.norm.weight0multi_modal_projector.mm_input_projection_weightvision_projection.proj.weight
source_key
target_keyfnz,language_model.layers.*.mlp.gate_proj.weightz*language_model.layers.*.mlp.up_proj.weight5language_model.decoder.layers.*.mlp.linear_fc1.weightmapping
transforms)	_import_vision_qkv_import_vision_qkv_bias_import_language_qkvr   state_transform_vision_projector_permuter   	merge_fc1apply_transformsr;   rG   rH   r   r   r   r   r   rD   _   sx   	
$zGemma3VLImporter.convert_stater   c                 C   s   ddl m} |t| S )Nr   r   )=nemo.collections.common.tokenizers.huggingface.auto_tokenizerr   rB   )r;   r   r   r   r   r8      s   zGemma3VLImporter.tokenizerc           
      C   s   ddl m} t| }||}|j}|j}|jdkrt }n"|jdkr't }n|jdkr0t	 }n|jdkr9t
 }ntd| t }t|j|jd}t|||d	}	|	S )
Nr   r      "   0   >   zUnrecognized import model: r&   )r!   r#   r+   )r@   r   rB   rA   text_configvision_confignum_hidden_layersr   r   r	   r
   
ValueErrorr   r   r(   r   )
r;   HFGemma3ConfignamerG   source_textsource_visionr!   r#   r+   outputr   r   r   r9      s2   




zGemma3VLImporter.configN)r7   r   )r-   r.   r/   r0   r   r<   r   rJ   rD   propertyr8   r   r9   r   r   r   r   r6   G   s    9r6   r?   c                   @   sJ   e Zd ZdZdd ZdedefddZdd	 Zed
d Z	edd Z
dS )Gemma3VLExporterzExport Gemma3 VL to HFc                 C   sN   ddl m} ddlm} |  || jW  d    S 1 s w   Y  d S )Nr   r>   )no_init_weights)r@   r?   transformers.modeling_utilsr   _from_configr9   )r;   r?   r   r   r   r   r<      s
   
$zGemma3VLExporter.initr=   r7   c                 C   sH   |   }| t| \}}| ||}| }|| | j| d S r   )r<   	nemo_loadrB   rD   cpusave_pretrainedr8   )r;   r=   rH   rG   _r   r   r   rJ      s   
zGemma3VLExporter.applyc                 C   rK   )3NrM   rL   rO   rN   rQ   rP   rS   rR   rU   rT   rW   rV   rY   rX   r[   rZ   r]   r\   r_   r^   ra   r`   rc   rb   re   rd   rg   rf   ri   rh   rk   rj   rm   rl   rw   rx   ry   rz   r{   r|   r}   r~   r   )	rn   ro   rp   rq   rr   rs   rt   ru   rv   r   r   r   r   r   r   )	_export_vision_qkv_export_vision_qkv_bias_export_language_qkvr   r   r   r   	split_fc1r   r   r   r   r   rD      sx   	
%zGemma3VLExporter.convert_statec                 C   s   t t| jjjS r   )r   load_contextrB   modelr8   r:   r   r   r   r8     s   zGemma3VLExporter.tokenizerc           
      C   s   t jt| dd}|j}|j}ddlm} ddlm} ddlm} |dg|j	|j
|j|j|jd|j|j|j|j|j|jd	 |jd d
}|j	dkrMd|_n|j|_||j
|j|j|j|j	|jdd}|||d}	|	S )Nzmodel.config)subpathr   r   )Gemma3TextConfig)SiglipVisionConfigGemma3ForCausalLMgelu_pytorch_tanh   )architecturesr   r(   intermediate_sizenum_attention_headshead_dimhidden_activationmax_position_embeddingsinitializer_rangerms_norm_epsnum_key_value_heads
vocab_size
rope_thetarope_local_base_freqr      F)r(   
image_sizer   r   r   
patch_sizevision_use_head)r   r   )r   r   rB   r!   r#   r@   r   r   r   
num_layersr(   ffn_hidden_sizer   kv_channels
seq_lengthinit_method_stdlayernorm_epsilonnum_query_groupsr   rotary_basequery_pre_attn_scalarr   img_h	patch_dim)
r;   rG   r   r   r   HFGemma3TextConfigHFGemma3VisionConfigoutput_textoutput_visionr   r   r   r   r9     sH   

zGemma3VLExporter.configN)r-   r.   r/   r0   r<   r   rJ   rD   r   r8   r9   r   r   r   r   r      s    
:
r   ctxc                 C   s   t |dS )N)r   r   )torchpermute)r   xr   r   r   r   C  s   r   )z/language_model.layers.*.self_attn.q_proj.weightz/language_model.layers.*.self_attn.k_proj.weightz/language_model.layers.*.self_attn.v_proj.weightz@language_model.decoder.layers.*.self_attention.linear_qkv.weight)r   r   c              
   C   2   | j jj}t||||j|j|j|j |j|jdS Nhead_numr   heads_per_groupr(   	head_size)rH   r9   r!   r   r   r   r(   r   r   qkvmegatron_configr   r   r   r   H     


r   c                 C   s.   | j jj}t||j|j|j|j |j|jdS r   )rH   r9   r   r   r   r   r(   r   r   qkv	hf_configr   r   r   r   _  s   


r   )zBvision_tower.vision_model.encoder.layers.*.self_attn.q_proj.weightzBvision_tower.vision_model.encoder.layers.*.self_attn.k_proj.weightzBvision_tower.vision_model.encoder.layers.*.self_attn.v_proj.weightz>vision_model.decoder.layers.*.self_attention.linear_qkv.weightc              
   C   r   r   )rH   r9   r#   r   r   r   r(   r   r   r   r   r   r   t  r   r   c                 C   s,   | j jj}t||j|jd|j|j|j dS )Nr   r   )rH   r9   r   r   r   r(   r   r   r   r   r     s   


r   )z@vision_tower.vision_model.encoder.layers.*.self_attn.q_proj.biasz@vision_tower.vision_model.encoder.layers.*.self_attn.k_proj.biasz@vision_tower.vision_model.encoder.layers.*.self_attn.v_proj.biasz<vision_model.decoder.layers.*.self_attention.linear_qkv.biasc              
   C   sH   | j jj}t|d|d|d|j|j|j|j d|jddS )Nr   r   )	rH   r9   r#   r   	unsqueezer   r   r   squeeze)r   q_biask_biasv_biasr   r   r   r   r     s   


	r   c                 C   s(   | j jj}t||j|jd|j|j dS )Nr   )r   r   r   r   )rH   r9   r   r   r   r(   )r   qkv_biasr   r   r   r   r     s   


r   )1dataclassesr   r   pathlibr   typingr   r   %nemo.collections.llm.gpt.model.gemma3r   r   r   r	   r
   (nemo.collections.vlm.gemma3vl.model.baser   r   *nemo.collections.vlm.gemma3vl.model.visionr   r   %nemo.collections.vlm.neva.model.llavar   r   r   nemo.lightningr   r   nemo.lightning.io.stater   r   r   r   r3   r4   model_importerModelConnectorr6   model_exporterr   staticmethodTransformCTXr   r   r   r   r   r   r   r   r   r   r   r   <module>   sn   




y 