o
    }oiJ                     @   sl   d dl Z d dlmZmZ d dlmZmZ d dlmZ	 d dl
mZ dd Zdd	 Zd
d Zdd Zdd ZdS )    N)AutoProcessorMllamaConfig)MllamaTextConfigMllamaVisionConfig)	lightning)vlmc                 C   sL  |j }|j}|jp
|}|jp|| }|| }| d||} tj|||f| jd}tj|||f| jd}tj|||f| jd}	d}
t|D ]V}| |
|
| ddddf ||| |d | ddddf< |
|7 }
| |
ddddf ||ddddf< |
d7 }
| |
ddddf |	|ddddf< |
d7 }
qDd|fd|fd|	fgS )	z*Split attention qkv from nemo to hf formatdevicer   N   q_projk_projv_proj	hidden_sizenum_attention_headsnum_query_groupskv_channelsreshapetorchemptyr
   range)
qkv_weightmodel_configr   head_numr   	head_sizeheads_per_groupq_weightk_weightv_weight	qkv_indexi r"   T/home/ubuntu/.local/lib/python3.10/site-packages/nemo/export/multimodal/converter.pysplit_qkv_weight   s(   
&((
r$   c           
      C   s   |j }|j}|jp
|}|jp|| }| d||} tj|||f| jd}tj|||f| jd}d}t|D ]2}	| |ddddf ||	ddddf< |d7 }| |ddddf ||	ddddf< |d7 }q5d|fd|fgS )z0Split cross attention qkv from nemo to hf formatr   r	   r   Nr   r   r   r   )
	kv_weightr   r   r   r   r   r   r   kv_indexr!   r"   r"   r#   split_kv_weight2   s   
((
r'   c                 C   s(   t j| ddd} d| d fd| d fgS )zSplit linear fc to gate   r   )axis	gate_projup_projr   )r   chunk)gate_weightr"   r"   r#   split_gate_weightF   s   r.   c                 C   s   t | j| j| j| j| jdd}dd t||jD }t	|j
|j|j ||j|j|j|j|jddddd	d
g ddd}t||ddS )z'Convert nemo mllama config to hf configbfloat16)num_hidden_layersr   attention_heads
image_sizemax_num_tilestorch_dtypec                 S   s   g | ]\}}|| qS r"   r"   ).0r!   xr"   r"   r#   
<listcomp>X   s    z)convert_mllama_config.<locals>.<listcomp>g       @g      @g      ?i    llama3)factorhigh_freq_factorlow_freq_factor original_max_position_embeddings	rope_type)i i i	 )
rope_thetar0   cross_attention_layersr   intermediate_sizer   num_key_value_heads
vocab_sizerope_scalingeos_token_idr4   )r4   )r   
num_layersr   r   vision_chunk_sizevision_max_num_chunks	enumerate_init_fusion_schedulenum_cross_attention_layersr   rotary_baseffn_hidden_sizer   rB   r   )source_visionsource_textvision_configr?   text_configr"   r"   r#   convert_mllama_configM   s<   	
rQ   c                    s  t |}tjdddd}tjddd|tjddddd	}| }|j}tj	t
 |d
}|j}|j}	|j}
|| |}|jjjj}| }~d}d| dfd| dfd| dfd| dfd| dfd| dfd| dfd| dfd| dfd| dfd | d!fd"| d#fd$d%d&d'g}t|	jD ]|}|d(| d)| d*| d+fd(| d,| d*| d,fd(| d-| d*| d-fd(| d.| d*| d/fd(| d0| d*| d1fd(| d2| d*| d3fd(| d4| d*| d5fd(| d6| d*| d7fd(| d8| d*| d9fg	 qt|	jD ]}|d:| d)| d;| d+fd:| d<| d;| d<fd:| d=| d;| d=fd:| d,| d;| d,fd:| d-| d;| d-fd:| d.| d;| d/fd:| d0| d;| d1fd:| d2| d;| d3fd:| d4| d;| d5fd:| d6| d;| d7fd:| d8| d;| d9fg q|
j|
j  |
j|
j d>tD ]}|d?  d  }|d?  d  d@kr_|  d? }|dA| dB dC| dDfdA| dE dC| dFfdA| dG dC| dHfdA| d- dC| dIfdA| dJ dC| dKfdA| d0 dC| dLfdA| dM dC| d9fdA| dN dC| d<fdA| dO dC| d=fg	 q|| d }|dA| d) dP| d+fdA| d0 dP| dLfdA| dM dP| d9fdA| d- dP| dQfg qi }|D ]\}}|| ||< qdRdS }dTdU } fdVdW} fdXdY}dZd[ }||||	 ||| ||||
 ||| ||| |t|	|
fS )\z/Convert nemo mllama to hf state dict and configr   F)tensor_model_parallel_sizeckpt_load_optimizerckpt_save_optimizeri  gpuz
bf16-mixed)	precision2   )devices	max_stepsacceleratorstrategypluginsval_check_intervallimit_val_batches)	tokenizerzvision_model.vision_encoderzvision_model.class_embeddingz.class_embeddingz1vision_model.gated_positional_embedding.embeddingz.positional_embeddingz=vision_model.gated_positional_embedding.tile_embedding.weightz'.gated_tile_positional_embedding.weightz,vision_model.gated_positional_embedding.gatez .gated_positional_embedding_gatez vision_model.layernorm_post.biasz.ln_post.biasz"vision_model.layernorm_post.weightz.ln_post.weightzvision_model.layernorm_pre.biasz.ln_pre.biasz!vision_model.layernorm_pre.weightz.ln_pre.weightz<vision_model.post_tile_positional_embedding.embedding.weightz%.post_tile_pos_embed.embedding.weightz0vision_model.post_tile_positional_embedding.gatez.post_tile_pos_embed.gatez;vision_model.pre_tile_positional_embedding.embedding.weightz$.pre_tile_pos_embed.embedding.weightz/vision_model.pre_tile_positional_embedding.gatez.pre_tile_pos_embed.gate)zmulti_modal_projector.biasz+vision_model.vision_projection.encoder.bias)zmulti_modal_projector.weightz-vision_model.vision_projection.encoder.weight)z language_model.model.norm.weightz-language_model.decoder.final_layernorm.weight)zlanguage_model.lm_head.weightz"language_model.output_layer.weight vision_model.transformer.layers.z.self_attn.o_proj.weightz.transformer.layers.z".self_attention.linear_proj.weightz.input_layernorm.biasz.input_layernorm.weightz.post_attention_layernorm.biasz.pre_mlp_layernorm.biasz .post_attention_layernorm.weightz.pre_mlp_layernorm.weightz.mlp.fc1.biasz.mlp.linear_fc1.biasz.mlp.fc1.weight.mlp.linear_fc1.weightz.mlp.fc2.biasz.mlp.linear_fc2.biasz.mlp.fc2.weightz.mlp.linear_fc2.weight'vision_model.global_transformer.layers.z.global_transformer.layers.z
.gate_attnz	.gate_ffnzlanguage_model.decoder   r   language_model.model.layers.z.cross_attn.o_proj.weight.xattn_layers.z#.cross_attention.linear_proj.weightz.cross_attn.q_proj.weightz .cross_attention.linear_q.weightz.cross_attn.k_norm.weightz#.cross_attention.k_layernorm.weightz+.cross_attention.linear_q.layer_norm_weightz.cross_attn.q_norm.weightz#.cross_attention.q_layernorm.weightz!.mlp.linear_fc1.layer_norm_weightz.mlp.down_proj.weightz.cross_attn_attn_gatez.cross_attn_mlp_gate.layers.z,.self_attention.linear_qkv.layer_norm_weightc           	      S   s   |j }i }t|jD ]%}| d| d }t||D ]\}}d| d| d}|d|||< qq
t|jD ]%}| d| d }t||D ]\}}d| d| d}|d|||< qDq5|S )	Nz/vision_model.vision_encoder.transformer.layers.!.self_attention.linear_qkv.weightr`   .self_attn..weightr   z6vision_model.vision_encoder.global_transformer.layers.rb   )r   r   rE   r$   r   num_global_layers)	
state_dictvision_model_configr   new_state_dictr!   qkv_weightsnameweightnew_keyr"   r"   r#   convert_vision_qkv_weightO  s&   

z<convert_mllama_nemo_to_hf.<locals>.convert_vision_qkv_weightc                 S   s"   | d }d| |jd dddiS )Nz0vision_model.vision_encoder.conv1._linear.weightz#vision_model.patch_embedding.weightr   rc      )r   shape)rk   conv1_weightr"   r"   r#   convert_patch_embedingg  s   z9convert_mllama_nemo_to_hf.<locals>.convert_patch_embedingc                    s   |j }i }tD ]k}|d  d  }|d  d  dkrI|  d }|  d| d }t||D ]\}}	d| d| d}
|	d	|||
< q2q	|| d }|  d
| d }t||D ]\}}	d| d| d}
|	d	|||
< q^q	|S )Nrc   r   r   re   z!.cross_attention.linear_kv.weightrd   z.cross_attn.ri   r   rf   rg   rh   )r   r   r'   r   r$   )rk   language_model_configr   rm   r!   	cross_numxattn_index
kv_weightsro   rp   rq   
attn_indexrn   cross_attention_frequencyprefixtoal_num_layerr"   r#   convert_language_qkv_weightk  s$   z>convert_mllama_nemo_to_hf.<locals>.convert_language_qkv_weightc           
         s   i }t D ]K}|d  d  }|d  d  dkr+|  d }|  d| d }n|| d }|  d| d }t|D ]\}}d| d| d	}	|||	< q?q|S )
Nrc   r   r   re   ra   rf   rd   z.mlp.ri   )r   r.   )
rk   rm   r!   rx   ry   r-   r{   ro   rp   rq   r|   r"   r#   convert_gate  s   
z/convert_mllama_nemo_to_hf.<locals>.convert_gatec                 S   s&   | d }| d }dt j||fddiS )Nz/language_model.embedding.word_embeddings.weightz)language_model.learnable_embedding.weightz(language_model.model.embed_tokens.weightr   )dim)r   cat)rk   word_embeddingslearnable_embeddingr"   r"   r#   convert_embedding  s   z4convert_mllama_nemo_to_hf.<locals>.convert_embedding)r   from_pretrainednlMegatronStrategyTrainerMegatronMixedPrecision	to_fabricr_   r   MLlamaModelMLlamaConfig11BInstructconfigrl   rw   
load_modelmodulerk   r   rE   extendrj   rJ   updaterQ   )checkpoint_pathprocessor_name	processorr[   trainerfabricr_   modelr   rl   rw   rk   vkey_mapr!   rx   ry   r{   rm   rq   old_keyrr   rv   r   r   r   r"   r|   r#   convert_mllama_nemo_to_hfr   sh  











)










1








)



r   )r   transformersr   r   /transformers.models.mllama.configuration_mllamar   r   nemor   r   nemo.collectionsr   r$   r'   r.   rQ   r   r"   r"   r"   r#   <module>   s   %