o
    }oiD                     @   s  d dl mZmZ d dlmZ d dlmZmZmZm	Z	m
Z
 d dlZd dlmZ d dlmZ d dlmZmZmZ d dlmZ d d	lmZmZ d d
lmZmZ d dlmZmZmZ erdd dlm Z  d dl!m"Z" eG dd deZ#eG dd de#Z$eG dd de#Z%G dd deZ&e'e&dG dd dej(de&f Z)dd Z*dd Z+dej,fdd Z-ej.d!d"d#d$ej/fd%d&Z0ej.d'd(d#d$ej/fd)d*Z1ej.d+d,d#d$ej/fd-d.Z2ej.d/d0d#d$ej/fd1d2Z3ej.d3d4d#d5d6 Z4dS )7    )	dataclassfield)Path)TYPE_CHECKING	AnnotatedCallableOptionalUnionN)TransformerConfig)nn)Llama2Config7BLlama2Config13BLlamaConfig)Config)
NevaConfig	NevaModel)HFCLIPVisionConfigMultimodalProjectorConfig)OptimizerModuleioteardownAutoTokenizer)TokenizerSpecc                   @   s*   e Zd ZU dZdZeed< dZeed< dS )LlavaConfigzLlava Model Base ConfigTdrop_vision_class_tokenfreeze_vision_modelN)__name__
__module____qualname____doc__r   bool__annotations__r    r#   r#   Y/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/vlm/neva/model/llava.pyr   (   s   
 r   c                   @   h   e Zd ZU dZddlmZ edd dZee	d< edd dZ
eeef e	d	< ed
d dZee	d< dS )Llava15Config7BzLlava v1.5 Config 7Br   PretrainedConfigc                   C      t  S N)r   r#   r#   r#   r$   <lambda>6       zLlava15Config7B.<lambda>default_factorylanguage_transformer_configc                   C   
   t ddS N!openai/clip-vit-large-patch14-336pretrained_model_name_or_pathr   r#   r#   r#   r$   r+   8      
 vision_transformer_configc                   C      t ddddS )N      
input_sizehidden_sizeffn_hidden_sizer   r#   r#   r#   r$   r+   ;       vision_projection_configNr   r   r   r    transformersr(   r   r/   r
   r"   r7   r	   rA   r#   r#   r#   r$   r&   0      
 r&   c                   @   r%   )Llava15Config13BzLlava v1.5 Config 13Br   r'   c                   C   r)   r*   )r   r#   r#   r#   r$   r+   E   r,   zLlava15Config13B.<lambda>r-   r/   c                   C   r0   r1   r5   r#   r#   r#   r$   r+   G   r6   r7   c                   C   r8   )Nr9   i   r;   r?   r#   r#   r#   r$   r+   J   r@   rA   NrB   r#   r#   r#   r$   rE   ?   rD   rE   c                       sf   e Zd ZdZ				d
deee ee f dee ded dee	e
jge
jf  f fdd	Z  ZS )
LlavaModelzLlava Model NeMo WrapperNconfigoptim	tokenizerr   model_transformc                    s   t  j|pt |||d d S )N)rH   rI   rJ   )super__init__r   )selfrG   rH   rI   rJ   	__class__r#   r$   rL   Q   s   zLlavaModel.__init__)NNNN)r   r   r   r    r   r   r   r   r   r   r   ModulerL   __classcell__r#   r#   rN   r$   rF   N   s    rF   hfc                   @   sZ   e Zd ZdZdefddZdedefddZdd	d
Ze	dddZ
e	defddZdS )HFLlavaImporterzLlava Model HF Importerreturnc                 C   s   t | j| jdS )N)rI   )rF   rG   rI   )rM   r#   r#   r$   init_   s   zHFLlavaImporter.initoutput_pathc                 C   sr   ddl m} |t| }|  }| |}| || td|  | || td|  t	|| ~~|S )Nr   )LlavaForConditionalGenerationz)Converted Llava model to Nemo, saving to zConverted Llava model saved to )
rC   rW   from_pretrainedstrrU   
nemo_setupconvert_stateprint	nemo_saver   )rM   rV   rW   sourcetargettrainerr#   r#   r$   applyc   s   

zHFLlavaImporter.applyFc                 C   s   dddddddd}d	|j   v r|d	d
ddd nd|j   v r2|ddddd ntd|r?|ddi d|j   v rP|ddi n"d|j   v rn|ddddddddd d!d"d#d$d%d& ntd'tj|||ttt	t
tgd(S ))Nz/language_model.embedding.word_embeddings.weightzAlanguage_model.decoder.layers.*.self_attention.linear_proj.weightz5language_model.decoder.layers.*.mlp.linear_fc2.weightzKlanguage_model.decoder.layers.*.self_attention.linear_qkv.layer_norm_weightz@language_model.decoder.layers.*.mlp.linear_fc1.layer_norm_weightz-language_model.decoder.final_layernorm.weightz"language_model.output_layer.weight)z(language_model.model.embed_tokens.weightz5language_model.model.layers.*.self_attn.o_proj.weightz2language_model.model.layers.*.mlp.down_proj.weightz4language_model.model.layers.*.input_layernorm.weightz=language_model.model.layers.*.post_attention_layernorm.weightz language_model.model.norm.weightzlanguage_model.lm_head.weightz+vision_projection.encoder.linear_fc1.weightz)vision_projection.encoder.linear_fc1.biasz+vision_projection.encoder.linear_fc2.weightz)vision_projection.encoder.linear_fc2.bias)z%multi_modal_projector.linear_1.weightz#multi_modal_projector.linear_1.biasz%multi_modal_projector.linear_2.weightz#multi_modal_projector.linear_2.biaszvision_projection.0.weightzvision_projection.0.biaszvision_projection.2.weightzvision_projection.2.biasz%Unable to map vision projection keys.image_newlinez4vision_model.vision_model.embeddings.class_embeddingzvision_tower.vision_model.**zvision_model.vision_model.**vision_model.class_tokenzvision_model.conv1.weightz'vision_model.position_embeddings.weightzIvision_model.decoder.layers.*.self_attention.linear_qkv.layer_norm_weightzGvision_model.decoder.layers.*.self_attention.linear_qkv.layer_norm_biasz>vision_model.decoder.layers.*.mlp.linear_fc1.layer_norm_weightz<vision_model.decoder.layers.*.mlp.linear_fc1.layer_norm_biasz?vision_model.decoder.layers.*.self_attention.linear_proj.weightz=vision_model.decoder.layers.*.self_attention.linear_proj.biasz3vision_model.decoder.layers.*.mlp.linear_fc1.weightz1vision_model.decoder.layers.*.mlp.linear_fc1.biasz3vision_model.decoder.layers.*.mlp.linear_fc2.weightz1vision_model.decoder.layers.*.mlp.linear_fc2.biaszvision_model.ln_pre.weightzvision_model.ln_pre.bias)z;vision_tower.vision_model.embeddings.patch_embedding.weightz>vision_tower.vision_model.embeddings.position_embedding.weightz=vision_tower.vision_model.encoder.layers.*.layer_norm1.weightz;vision_tower.vision_model.encoder.layers.*.layer_norm1.biasz=vision_tower.vision_model.encoder.layers.*.layer_norm2.weightz;vision_tower.vision_model.encoder.layers.*.layer_norm2.biaszDvision_tower.vision_model.encoder.layers.*.self_attn.out_proj.weightzBvision_tower.vision_model.encoder.layers.*.self_attn.out_proj.biasz9vision_tower.vision_model.encoder.layers.*.mlp.fc1.weightz7vision_tower.vision_model.encoder.layers.*.mlp.fc1.biasz9vision_tower.vision_model.encoder.layers.*.mlp.fc2.weightz7vision_tower.vision_model.encoder.layers.*.mlp.fc2.biasz-vision_tower.vision_model.pre_layrnorm.weightz+vision_tower.vision_model.pre_layrnorm.biasz"Unable to map vision encoder keys.)mapping
transforms)module
state_dictkeysupdateKeyErrorr   apply_transforms_import_language_qkv_import_vision_qkv_import_vision_qkv_bias_import_cls_token_import_linear_fc1)rM   r^   r_   rb   rd   r#   r#   r$   r[   v   s|   		zHFLlavaImporter.convert_stater   c                 C   s   ddl m} |t| S )Nr   r   )=nemo.collections.common.tokenizers.huggingface.auto_tokenizerr   rY   )rM   r   r#   r#   r$   rI      s   zHFLlavaImporter.tokenizerc           	      C   s   ddl m} |t| }|j}dd }t|j|j|j|j	|j
|j|j|jd||jdd}tdd	}td
ddd}t||||jd}|S )Nr   )r   c                 S   s(   d}| | dkr|d }| | dks|S )N   r      r#   )
vocab_sizebaser#   r#   r$   make_vocab_size_divisible_by   s
   z<HFLlavaImporter.config.<locals>.make_vocab_size_divisible_byTF)
num_layersr=   r>   num_attention_headsinit_method_stdlayernorm_epsilonnum_query_groupsrotary_basegated_linear_unitrv   #share_embeddings_and_output_weightsr2   r3   r9   r:   r;   )r/   r7   rA   vision_feature_layer)rC   r   rX   rY   text_configr   num_hidden_layersr=   intermediate_sizerx   initializer_rangerms_norm_epsnum_key_value_heads
rope_thetart   r   r   r   )	rM   HFLlavaConfigr^   text_conifgrv   r/   r7   rA   outputr#   r#   r$   rG      s8   zHFLlavaImporter.configN)F)rT   r   )r   r   r   r    rF   rU   r   ra   r[   propertyrI   r   rG   r#   r#   r#   r$   rS   [   s    
KrS   rW   c                 C   sd  |   }||f|dd   }	||f|dd   }
| j|	 } |j|
 }|j|
 }g }t|D ]<}|| || |d | d d d d f  ||||d d d d d f  ||||d d d d d f  q-t|}|jdksyJ |j|jd |d | ksJ |j|jd |ksJ |j|jd |d ksJ |j|||d|   |g}|S )N      r   rs   )	sizeviewrangeappendtorchcatndimshapereshape)qkvhead_numr{   heads_per_groupr=   	head_sizeold_tensor_shapenew_q_tensor_shapenew_kv_tensor_shapeqkv_weights_liqkv_weightsr#   r#   r$   
import_qkv   s$   


,$&
 r   c                    s   |d|  }|  ||dg} | d}t fddt|D }t | d }t d | d }	| |  d| }
| |  d| }| |	  d| }|
||fS )Nrs   c                    ,   g | ]}t  d  |  d  |   qS rs   r   arange.0r   r   r#   r$   
<listcomp>      zexport_qkv.<locals>.<listcomp>r   )r   r   r   r   r   r   cpu)
linear_qkvr   r{   r   r=   r   qkv_total_dimq_slicek_slicev_sliceq_projk_projv_projr#   r   r$   
export_qkv  s   


r   qkv_biasc                    s   |d|  }|  ||g} t fddt|D }t | d }t d | d }| |  d }	| |  d }
| |  d }|	|
|fS )z
    Split interleave-concatenated qkv bias to separate q, k, v bias

    Example: export layer linear_qkv bias to HF {q|k|v}_proj bias
    rs   c                    r   r   r   r   r   r#   r$   r   -  r   z#export_qkv_bias.<locals>.<listcomp>r   r   )r   r   r   r   r   r   )r   r   r{   r   r   r   r   r   r   q_biask_biasv_biasr#   r   r$   export_qkv_bias#  s   

r   )z5language_model.model.layers.*.self_attn.q_proj.weightz5language_model.model.layers.*.self_attn.k_proj.weightz5language_model.model.layers.*.self_attn.v_proj.weightz@language_model.decoder.layers.*.self_attention.linear_qkv.weight)
source_key
target_keyctxc              
   C   2   | j jj}t||||j|j|j|j |j|jdS Nr   r{   r   r=   r   )r_   rG   r/   r   rx   r{   r=   kv_channelsr   r   r   r   megatron_configr#   r#   r$   rl   <     


rl   )zBvision_tower.vision_model.encoder.layers.*.self_attn.q_proj.weightzBvision_tower.vision_model.encoder.layers.*.self_attn.k_proj.weightzBvision_tower.vision_model.encoder.layers.*.self_attn.v_proj.weightz>vision_model.decoder.layers.*.self_attention.linear_qkv.weightc              
   C   r   r   )r_   rG   r7   r   rx   r{   r=   r   r   r#   r#   r$   rm   S  r   rm   )z@vision_tower.vision_model.encoder.layers.*.self_attn.q_proj.biasz@vision_tower.vision_model.encoder.layers.*.self_attn.k_proj.biasz@vision_tower.vision_model.encoder.layers.*.self_attn.v_proj.biasz<vision_model.decoder.layers.*.self_attention.linear_qkv.biasc              
   C   sH   | j jj}t|d|d|d|j|j|j|j d|jddS )Nr   r   r   )	r_   rG   r7   r   	unsqueezerx   r{   r   squeeze)r   r   r   r   r   r#   r#   r$   rn   j  s   


	rn   )z4vision_tower.vision_model.embeddings.class_embeddingrc   c                 C   s   | dddS )Nr   r   )r   )r   	cls_tokenr#   r#   r$   ro     s   ro   )z2language_model.model.layers.*.mlp.gate_proj.weightz0language_model.model.layers.*.mlp.up_proj.weightz5language_model.decoder.layers.*.mlp.linear_fc1.weightc                 C   s   t j| |fddS )Nr   )axis)r   r   )downgater#   r#   r$   rp     s   	rp   )5dataclassesr   r   pathlibr   typingr   r   r   r   r	   r   ,megatron.core.transformer.transformer_configr
   r   nemo.collections.llmr   r   r   nemo.collections.llm.utilsr   $nemo.collections.vlm.neva.model.baser   r    nemo.collections.vlm.vision.baser   r   nemo.lightningr   r   r   rq   r   1nemo.collections.common.tokenizers.tokenizer_specr   r   r&   rE   rF   model_importerModelConnectorrS   r   r   Tensorr   state_transformTransformCTXrl   rm   rn   ro   rp   r#   r#   r#   r$   <module>   sh   
 