o
    wi                     @   s  d dl mZmZ d dlmZ d dlmZmZmZm	Z	 d dl
Z
d dlZd dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dlmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z& d dl'm(Z(m)Z) d dl*mZm+Z+mZm,Z, d dl-m.Z. d dl/m0Z0 d dl1m2Z2m3Z3 d dl4m5Z5 d dl6m7Z7 d dl8m9Z9 erd dl:m;Z; eG dd deZ<eG dd deZ=eG dd deZ>eG dd deZ?eG d d! d!eZ@eG d"d# d#eZAeG d$d% d%eZBe2Ce+d&G d'd( d(e2jDd)e+f ZEe2Fe+d&G d*d+ d+e2jDe+d)f ZGd,d- ZHe2jId.d/d0d1e2jJfd2d3ZKe2jId4d5d0d1e2jJfd6d7ZLe2jId8d9d0d1e2jJfd:d;ZMe2jId<d=d0d1e2jJfd>d?ZNe2jId@dAd0d1e2jJfdBdCZOe2jIdDdEd0dFdG ZPe2jIdHdId0dJdK ZQe2jIdLdMd0dNdO ZRdPdQ Z(dRe
jSfdSdTZ)e2jId/dUd0d1e2jJfdVdWZTe2jId5dXd0d1e2jJfdYdZZUe2jId9d8d0d1e2jJfd[d\ZVe2jId=d<d0d1e2jJfd]d^ZWe2jIdAd_d0d1e2jJfd`daZXe2jIdEdDd0dbdc ZYe2jIdIdHd0ddde ZZe2jIdMdLd0dfdg Z[dS )h    )	dataclassfield)Path)TYPE_CHECKINGDictTupleUnionN)TransformerConfig)
AutoConfig)AutoModelForImageTextToText)Qwen2_5_VLConfig)Qwen2VLConfig)Qwen2VLForConditionalGeneration)Qwen2_5_VLVisionConfigQwen2VLVisionConfig)TokenizerSpec)Qwen2ConfigQwen2Config1P5BQwen2Config7BQwen2Config72BQwen25Config3BQwen25Config7BQwen25Config32BQwen25Config72B)
export_qkvexport_qkv_bias)r   Qwen2VLModelr   Qwen25VLVisionConfigMultimodalProjectorConfig)load_distributed_model_weights)ioteardown)_ModelState)dtype_from_hf)loggingAutoTokenizerc                   @   h   e Zd ZU dZddlmZ edd dZee	d< edd dZ
eeef e	d	< ed
d dZee	d< dS )Qwen2VLConfig2BzQwen2VL Config 2Br   PretrainedConfigc                   C   s
   t ddS )NT)#share_embeddings_and_output_weights)r    r.   r.   g/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/vlm/qwen2vl/model/qwen2vl.py<lambda>D   s   
 zQwen2VLConfig2B.<lambda>default_factorylanguage_transformer_configc                   C      t dddS N       )
num_layersnum_attention_headsr   r.   r.   r.   r/   r0   G       vision_transformer_configc                   C      t ddddS )N   i   
input_sizehidden_sizeffn_hidden_sizer   r.   r.   r.   r/   r0   J       vision_projection_configN__name__
__module____qualname____doc__transformersr,   r   r3   r	   __annotations__r;   r   rC   r.   r.   r.   r/   r*   =   s   
 r*   c                   @   r)   )Qwen2VLConfig7BzQwen2VL Config 7Br   r+   c                   C      t  S N)r   r.   r.   r.   r/   r0   T       zQwen2VLConfig7B.<lambda>r1   r3   c                   C   r4   r5   r   r.   r.   r.   r/   r0   V   r:   r;   c                   C   r<   )Nr=      r>   r   r.   r.   r.   r/   r0   Y   rB   rC   NrD   r.   r.   r.   r/   rK   N      
 rK   c                   @   r)   )Qwen2VLConfig72BzQwen2VL Config 72Br   r+   c                   C   rL   rM   )r   r.   r.   r.   r/   r0   c   rN   zQwen2VLConfig72B.<lambda>r1   r3   c                   C   r4   r5   r   r.   r.   r.   r/   r0   e   r:   r;   c                   C   r<   )Nr=       r>   r   r.   r.   r.   r/   r0   h   rB   rC   NrD   r.   r.   r.   r/   rQ   ]   rP   rQ   c                   @   r)   )Qwen25VLConfig3BzQwen2.5VL Config 3Br   r+   c                   C   rL   rM   )r   r.   r.   r.   r/   r0   r   rN   zQwen25VLConfig3B.<lambda>r1   r3   c                   C   r4   r5   r   r.   r.   r.   r/   r0   t   r:   r;   c                   C      t dddddS )N	mcore_mlpr=   i   projector_typer?   r@   rA   r   r.   r.   r.   r/   r0   w       rC   NrD   r.   r.   r.   r/   rS   l   rP   rS   c                   @   r)   )Qwen25VLConfig7BzQwen2.5VL Config 7Br   r+   c                   C   rL   rM   )r   r.   r.   r.   r/   r0      rN   zQwen25VLConfig7B.<lambda>r1   r3   c                   C   r4   r5   rT   r.   r.   r.   r/   r0      r:   r;   c                   C   rU   )NrV   r=   rO   rW   r   r.   r.   r.   r/   r0      rY   rC   NrD   r.   r.   r.   r/   rZ   }   rP   rZ   c                   @   r)   )Qwen25VLConfig32BzQwen2.5VL Config 32Br   r+   c                   C   rL   rM   )r   r.   r.   r.   r/   r0      rN   zQwen25VLConfig32B.<lambda>r1   r3   c                   C      t ddddS Nr6   r7   i  )r8   r9   rA   rT   r.   r.   r.   r/   r0      rB   r;   c                   C   s   t dddddS )NrV   r=   rW   r   r.   r.   r.   r/   r0      rY   rC   NrD   r.   r.   r.   r/   r[      rP   r[   c                   @   r)   )Qwen25VLConfig72BzQwen2.5VL Config 72Br   r+   c                   C   rL   rM   )r   r.   r.   r.   r/   r0      rN   zQwen25VLConfig72B.<lambda>r1   r3   c                   C   r\   r]   rT   r.   r.   r.   r/   r0      rB   r;   c                   C   rU   )NrV   r=   rR   rW   r   r.   r.   r.   r/   r0      rY   rC   NrD   r.   r.   r.   r/   r^      rP   r^   hfc                   @   sX   e Zd ZdZdefddZdedefddZdd	 Ze	dddZ
e	defddZdS )HFQwen2VLImporterzQwen2VL Model HF Importerreturnc                 C   s   t | jd| jdS )Nzqwen2-vl)model_version	tokenizer)r   configrc   selfr.   r.   r/   init   s   zHFQwen2VLImporter.initoutput_pathc                 C   s   t jt| dd}tjt| dd}|jdk| _|  }| |}|t	|}|t	|}| 
|| td|  | || td|  t|| ~~|S )NTtrust_remote_code
qwen2_5_vlz+Converted Qwen2VL model to Nemo, saving to !Converted Qwen2VL model saved to )r   from_pretrainedstrHFAutoConfig
model_typeis_v2_5rg   
nemo_setuptor%   convert_stateprint	nemo_saver#   )rf   rh   source	hf_configtargettrainerr.   r.   r/   apply   s   

zHFQwen2VLImporter.applyc                 C   s   ddddddddd	d
dddd}|j jjs|ddi | jr)|dddd n|ddddddd d|j  v rI|ddddd nd|j  v r]|ddd d!d ntd"t	t
tttg}| jrq|ttg7 }tj||||d#S )$Nvision_model.conv1.weightIvision_model.decoder.layers.*.self_attention.linear_qkv.layer_norm_weightGvision_model.decoder.layers.*.self_attention.linear_qkv.layer_norm_bias>vision_model.decoder.layers.*.mlp.linear_fc1.layer_norm_weight<vision_model.decoder.layers.*.mlp.linear_fc1.layer_norm_bias?vision_model.decoder.layers.*.self_attention.linear_proj.weight=vision_model.decoder.layers.*.self_attention.linear_proj.bias/language_model.embedding.word_embeddings.weightAlanguage_model.decoder.layers.*.self_attention.linear_proj.weight5language_model.decoder.layers.*.mlp.linear_fc2.weightKlanguage_model.decoder.layers.*.self_attention.linear_qkv.layer_norm_weight@language_model.decoder.layers.*.mlp.linear_fc1.layer_norm_weight-language_model.decoder.final_layernorm.weight)visual.patch_embed.proj.weightvisual.blocks.*.norm1.weightvisual.blocks.*.norm1.biasvisual.blocks.*.norm2.weightvisual.blocks.*.norm2.bias visual.blocks.*.attn.proj.weightvisual.blocks.*.attn.proj.biasmodel.embed_tokens.weight&model.layers.*.self_attn.o_proj.weight#model.layers.*.mlp.down_proj.weight%model.layers.*.input_layernorm.weight.model.layers.*.post_attention_layernorm.weightmodel.norm.weightlm_head.weight"language_model.output_layer.weight3vision_model.decoder.layers.*.mlp.linear_fc2.weight1vision_model.decoder.layers.*.mlp.linear_fc2.bias+vision_model.decoder.final_layernorm.weight)$visual.blocks.*.mlp.down_proj.weight"visual.blocks.*.mlp.down_proj.biasvisual.merger.ln_q.weight3vision_model.decoder.layers.*.mlp.linear_fc1.weight1vision_model.decoder.layers.*.mlp.linear_fc1.bias)vision_model.decoder.final_layernorm.bias)visual.blocks.*.mlp.fc1.weightvisual.blocks.*.mlp.fc1.biasvisual.blocks.*.mlp.fc2.weightvisual.blocks.*.mlp.fc2.biasr   visual.merger.ln_q.bias+vision_projection.encoder.linear_fc1.weight)vision_projection.encoder.linear_fc1.bias+vision_projection.encoder.linear_fc2.weight)vision_projection.encoder.linear_fc2.bias)visual.merger.mlp.0.weightvisual.merger.mlp.0.biasvisual.merger.mlp.2.weightvisual.merger.mlp.2.biasvision_projection.0.weightvision_projection.0.biasvision_projection.2.weightvision_projection.2.bias%Unable to map vision projection keys.mapping
transforms)rd   r3   r-   updaterq   module
state_dictkeysKeyError_import_language_qkv_import_language_qkv_bias_import_vision_qkv_import_vision_qkv_bias_import_linear_fc1 _import_vision_linear_fc1_weight_import_vision_linear_fc1_biasr"   apply_transforms)rf   rw   ry   r   r   r.   r.   r/   rt      s   

	zHFQwen2VLImporter.convert_stater(   c                 C   s   ddl m} |t| S )Nr   r'   )=nemo.collections.common.tokenizers.huggingface.auto_tokenizerr(   rn   )rf   r(   r.   r.   r/   rc     s   zHFQwen2VLImporter.tokenizerc                 C   s  ddl m} |tj|dkrtdtj dtjt| dd}|jdk}d	d
 }|}t	|j
|j|j|j|j|j|j|jd||j|j|jt|tjkt|tjkt|d}|j}|rt|jt|tjkt|tjkt|d}|j|jd  }	t|	|j|	dt|tjkt|tjkt|d}
n3tt|tjkt|tjkt|d}|j|jd  }	t|	|j|	dt|tjkt|tjkt|d}
t|||
dt|tjkt|tjkt|d}|S )Nr   Version4.51.3#Current version of transformers is ),Please lower the version to be <= 4.51.3Tri   rk   c                 S   s(   d}| | dkr|d }| | dks|S )N   r      r.   )
vocab_sizebaser.   r.   r/   make_vocab_size_divisible_by2  s
   z>HFQwen2VLImporter.config.<locals>.make_vocab_size_divisible_by)r8   r@   rA   r9   init_method_stdlayernorm_epsilonnum_query_groupsrotary_basegated_linear_unitr   r-   r   fp16bf16params_dtype)rA   r   r   r   r   rV   )r?   r@   rA   rX   r   r   r   )r   r   r   )r3   r;   rC   vision_feature_layerr   r   r   ) packaging.versionr   rI   __version__
ValueErrorro   rm   rn   rp   r   num_hidden_layersr@   intermediate_sizer9   initializer_rangerms_norm_epsnum_key_value_heads
rope_thetar   tie_word_embeddingsr%   torchfloat16bfloat16vision_configr   spatial_merge_sizer    out_hidden_sizer   	embed_dimr   )rf   r   rx   rq   r   text_configr3   r   r;   merge_hidden_sizerC   outputr.   r.   r/   rd   #  s   



zHFQwen2VLImporter.configN)ra   r(   )rE   rF   rG   rH   r   rg   r   r{   rt   propertyrc   r   rd   r.   r.   r.   r/   r`      s    Pr`   r   c                   @   sp   e Zd ZdZejfdddZdedefddZd	d
 Z	e
dddZdedeeef fddZe
dddZdS )HFQwen2VLExportera   
    Exporter class for converting NeMo Qwen2VL model to HuggingFace format.

    Inherits:
        io.ModelConnector: Connector interface to handle setup, save, and load using the Lightning framework.

    Methods:
        init: Initializes a new HuggingFace Qwen2VL model instance.
        apply: Converts the NeMo model to HuggingFace format and saves it.
        convert_state: Maps and transforms the state dictionary from NeMo to HuggingFace format.
        config: Generates and returns the HuggingFace Qwen2VL config for the model.
    ra   r   c                 C   sF   ddl m} |  tj| j|dW  d   S 1 sw   Y  dS )a0  
        Initializes a HuggingFace Qwen2VLForConditionalGeneration model.

        Args:
            dtype: The data type to use for the model (default: torch.bfloat16)

        Returns:
            Qwen2VLForConditionalGeneration: A HuggingFace Qwen2VL model initialized with the configuration.
        r   )no_init_weights)torch_dtypeN)transformers.modeling_utilsr   r   from_configrd   )rf   dtyper   r.   r.   r/   rg     s   
$zHFQwen2VLExporter.initrh   c                 C   s   t d | | \}}t d t d |  }t d | |||}| }|| z	| jj| W n tyF   t 	d Y nw t
d|  |S )a3  
        Converts the NeMo Qwen2VL model to HuggingFace format and saves it to the specified path.

        Args:
            output_path (Path): The path where the converted HuggingFace model will be saved.

        Returns:
            Path: The output path where the HuggingFace model was saved.
        z9Loading Qwen2VL NeMo checkpoint. This may take a while...zQwen2VL NeMo checkpoint loaded.zInitializing the HF model..zStart Converting the model..zFailed to save tokenizerrl   )r&   info	ckpt_loadrg   rt   cpusave_pretrainedrc   	Exceptionwarningru   )rf   rh   rw   source_configry   r.   r.   r/   r{     s    





zHFQwen2VLExporter.applyc                 C   s   ddddddddd	d
dddd}|j jr|ddi n|ddi | jr0|dddd n|ddddddd d|  v rO|dddd d! nd"|  v rb|dddd d# ntd$ttt	t
tg}| jrv|ttg7 }tj||||d%S )&a  
        Maps and transforms the state dictionary from NeMo to HuggingFace format.

        Args:
            source: The source NeMo model.
            target: The target HuggingFace model.

        Returns:
            The target HuggingFace model with the converted state.
        r   r   r   r   r   r   r   r   r   r   r   r   r   )r|   r}   r~   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   r   r   r   )r   r   r   r   r   )r   r   r   r   r   r   )r3   r-   r   rq   r   r   r   _export_language_qkv_export_language_qkv_bias_export_vision_qkv_export_vision_qkv_bias_export_linear_fc1 _export_vision_linear_fc1_weight_export_vision_linear_fc1_biasr"   r   )rf   rw   ry   r   r   r   r.   r.   r/   rt     s   	
	zHFQwen2VLExporter.convert_stater   c                 C   s   t jt| ddjS )z~
        Gets the tokenizer from the loaded model context.

        Returns:
            The tokenizer specification.
        modelsubpath)r"   load_contextrn   rc   re   r.   r.   r/   rc     s   zHFQwen2VLExporter.tokenizerpathc              
   C   s   t jt| dd}|d }i }|jj}|jj}t|d }|D ]>\}}	d|v r)q |dd}
d|
v rZ|		d	|ksA|		d	|krZt
|		d	D ]}|	| ||
dd
t| < qH|	||
< q t|}||fS )a  
        This function loads the state dict directly from a distributed checkpoint, and modify the state dict
        so that it is consistent with the key names you would get from loading the checkpoint into a model.
        This is a more memory-efficient method to obtain a state dict without initializing the nemo model.

        Args:
            path (Path): The path from which the model will be loaded.

        Returns
        -------
            Tuple[Dict, Dict]: The loaded state dict and the yaml config dict.
        model.configr  weightsT_extra_statezmodule. layersr   zlayers.)r"   r  rn   r3   r8   r;   r!   itemsreplacesizeranger$   )rf   r  rd   dist_ckpt_folderr   langauge_layersvision_layersdistributed_model_weightskvnew_kirw   r.   r.   r/   r   !  s    $ 
zHFQwen2VLExporter.ckpt_loadHFQwen2VLConfigc                 C   s4  ddl m} |tj|dkrtdtj dtjt| dd}|j}|j	}|j
}t|do3|jd	k| _| jrtdBi d
|jd|jd|jd|jdddt|j|j d|jddd|jd|jd|jd|jd|jdg ddddddd }tdBi d |d!|jd|jd"|jd#|jd$d%d&|jd|jd'|jd(|jd)|jd*|j d+d,g d-d.d/|j!ddd0d1d2d3d4d5d6d7d8d9d:d;d<d=}|S t"|j|j|jd>t|j|j |jd|j|j|j|j|jd?dd@ }t#|j|j|j|j|j|j|j|j|j!|j |ddAS )Cz
        Generates the configuration for the HuggingFace Qwen2VL model based on the NeMo model.

        Returns:
            HFQwen2VLConfig: A configuration object for the HuggingFace Qwen2VL model.
        r   r   r   r   r   r  r  fullatt_block_indexesNdepthr   r@   r   
hidden_actsilu	mlp_ratio	num_headsin_channels   
patch_sizer   spatial_patch_sizetemporal_patch_sizer   )            tokens_per_secondr   rp   rk   r   r   r   r   r   r9   max_window_layersF   max_position_embeddingsr   r   r   r   rope_scalingmrope)r7      r-  )typemrope_sectionr   bos_token_idi[P eos_token_idi]P vision_start_token_ididP vision_end_token_idieP vision_token_idifP image_token_idigP video_token_idi  
quick_geluqwen2_vl)r  r   r@   r  r  r  r  r   r   r!  r"  r   rp   r   )r   r@   r   r9   r   r   r   r   r   r   r   r   r.   )$r   r   rI   r   r   r"   r  rn   r3   r;   rC   hasattrr  rq   HFQwen25VLVisionConfigr8   r   r@   intrA   r9   	patch_dimr   r!  r"  r   to_dictHFQwen25VLConfig
seq_lengthr   r   r   r   r-   HFQwen2VLVisionConfigr  )rf   r   rw   language_configvision_model_configrC   r   rx   r.   r.   r/   rd   B  s   
	
	
zHFQwen2VLExporter.configN)ra   r   )ra   r   )ra   r  )rE   rF   rG   rH   r   r   rg   r   r{   rt   r   rc   r   r   r   rd   r.   r.   r.   r/   r   }  s    _	!r   c                 C   sd  |   }||f|dd   }	||f|dd   }
| j|	 } |j|
 }|j|
 }g }t|D ]<}|| || |d | d d d d f  ||||d d d d d f  ||||d d d d d f  q-t|}|jdksyJ |j|jd |d | ksJ |j|jd |ksJ |j|jd |d ksJ |j|||d|   |g}|S )N   r  r   r   )	r  viewr  appendr   catndimshapereshape)qr  r  head_numr   heads_per_groupr@   	head_sizeold_tensor_shapenew_q_tensor_shapenew_kv_tensor_shapeqkv_weights_lr  qkv_weightsr.   r.   r/   
import_qkv  s$   


,$&
 rS  )visual.blocks.*.attn.qkv.weightz>vision_model.decoder.layers.*.self_attention.linear_qkv.weight)
source_key
target_keyctxc              
   C   s   | j jj}t|jd d }||jksJ |d |d d f }|||d d d f }||d d d d f }t||||j|j|j|j |j|j	dS )Nr   r  r   rK  r   rL  r@   rM  )
ry   rd   r;   r;  rH  r@   rS  r9   r   kv_channels)rW  hf_qkv_weightsmegatron_configslicerJ  r  r  r.   r.   r/   r     s    

r   )visual.blocks.*.attn.qkv.biasz<vision_model.decoder.layers.*.self_attention.linear_qkv.biasc              
   C   s   | j jj}t|jd d }||jksJ |d | }|||d  }||d d  }t|d|d|d|j|j	|j|j	 d|j
ddS )Nr   r  r   r   rC  rX  )ry   rd   r;   r;  rH  r@   rS  	unsqueezer9   r   rY  squeeze)rW  hf_qkv_biasr[  r\  q_biask_biasv_biasr.   r.   r/   r     s$   

	r   )z&model.layers.*.self_attn.q_proj.weightz&model.layers.*.self_attn.k_proj.weightz&model.layers.*.self_attn.v_proj.weightz@language_model.decoder.layers.*.self_attention.linear_qkv.weightc              
   C   s2   | j jj}t||||j|j|j|j |j|jdS NrX  )ry   rd   r3   rS  r9   r   r@   rY  )rW  rJ  r  r  r[  r.   r.   r/   r      s   


r   )z$model.layers.*.self_attn.q_proj.biasz$model.layers.*.self_attn.k_proj.biasz$model.layers.*.self_attn.v_proj.biasz>language_model.decoder.layers.*.self_attention.linear_qkv.biasc              
   C   sH   | j jj}t|d|d|d|j|j|j|j d|jddS )Nr   rC  rX  )	ry   rd   r3   rS  r^  r9   r   rY  r_  )rW  ra  rb  rc  r[  r.   r.   r/   r     s   


	r   )'vision_model.embeddings.class_embeddingzvision_model.class_tokenc                 C   s   | dddS )NrC  r   )rI  rW  	cls_tokenr.   r.   r/   _import_cls_token.  s   rh  )z#model.layers.*.mlp.gate_proj.weightz!model.layers.*.mlp.up_proj.weightz5language_model.decoder.layers.*.mlp.linear_fc1.weightc                 C      t j| |fddS Nr   axisr   rF  downgater.   r.   r/   r   7  s   	r   )z$visual.blocks.*.mlp.gate_proj.weightz"visual.blocks.*.mlp.up_proj.weightr   c                 C   ri  rj  rm  rn  r.   r.   r/   r   C     r   )z"visual.blocks.*.mlp.gate_proj.biasz visual.blocks.*.mlp.up_proj.biasr   c                 C   ri  rj  rm  rn  r.   r.   r/   r   L  rq  r   c                    s   |d|  }|  ||dg} | d}t fddt|D }t | d }t d | d }	| |  d| }
| |  d| }| |	  d| }|
||fS )Nr   r   c                    ,   g | ]}t  d  |  d  |   qS r   r   arange.0r  rL  r.   r/   
<listcomp>\      zexport_qkv.<locals>.<listcomp>rC  )rI  r  r   rF  r  ru  r   )
linear_qkvrK  r   rL  r@   rM  qkv_total_dimq_slicek_slicev_sliceq_projk_projv_projr.   rx  r/   r   U  s   


r   qkv_biasc                    s   |d|  }|  ||g} t fddt|D }t | d }t d | d }| |  d }	| |  d }
| |  d }|	|
|fS )z
    Split interleave-concatenated qkv bias to separate q, k, v bias

    Example: export layer linear_qkv bias to HF {q|k|v}_proj bias
    r   c                    rr  rs  rt  rv  rx  r.   r/   ry  u  rz  z#export_qkv_bias.<locals>.<listcomp>rC  r   )rI  r   rF  r  ru  r   )r  rK  r   rL  rM  r|  r}  r~  r  ra  rb  rc  r.   rx  r/   r   k  s   

r   rT  c              	   C   sP   | j jj}|jdkr|jn|j}tjt||j	|j	|j	|j	 |||j	 dddS )Nr8  rX  r   rk  )
ry   rd   r   rp   r   r@   r   rF  r   r  )rW  qkvrx   r@   r.   r.   r/   r     s   

r   r]  c                 C   sN   | j jj}|jdkr|jn|j}tjt||j	|j	|j	|j	 ||j	 dddS )Nr8  rK  r   rL  rM  r   rk  )
ry   rd   r   rp   r   r@   r   rF  r   r  )rW  r  rx   r@   r.   r.   r/   r     s   

r   c                 C   s2   | j j}t||j|j|j|j |j|j|j dS rd  )ry   rd   r   r9   r   r@   )rW  r  rx   r.   r.   r/   r     s   


r   c                 C   s.   | j j}t||j|j|j|j |j|j dS )Nr  )ry   rd   r   r9   r   r@   )rW  r  rx   r.   r.   r/   r     s   


r   re  c                 C   s   |  S rM   )r_  rf  r.   r.   r/   _export_cls_token  s   r  c                 C      t j| ddd\}}||fS Nr   r   )dimr   chunk)
linear_fc1	gate_projup_projr.   r.   r/   r        	r   c                 C   r  r  r  )vision_fc1_weightr  r  r.   r.   r/   r     r  r   c                 C   r  r  r  )vision_fc1_biasr  r  r.   r.   r/   r     r  r   )\dataclassesr   r   pathlibr   typingr   r   r   r   r   rI   ,megatron.core.transformer.transformer_configr	   r
   ro   r   r   r>  r   r  r   7transformers.models.qwen2_5_vl.configuration_qwen2_5_vlr   r:  3transformers.models.qwen2_vl.configuration_qwen2_vlr   r@  1nemo.collections.common.tokenizers.tokenizer_specr   nemo.collections.llmr   r   r   r   r   r   r   r   %nemo.collections.vlm.neva.model.llavar   r   'nemo.collections.vlm.qwen2vl.model.baser   r   nemo.collections.vlm.visionr    .nemo.export.trt_llm.nemo_ckpt_loader.nemo_filer!   nemo.lightningr"   r#   nemo.lightning.io.stater$   nemo.lightning.pytorch.utilsr%   
nemo.utilsr&   r   r(   r*   rK   rQ   rS   rZ   r[   r^   model_importerModelConnectorr`   model_exporterr   rS  state_transformTransformCTXr   r   r   r   rh  r   r   r   Tensorr   r   r   r   r  r   r   r   r.   r.   r.   r/   <module>   s  (

 
M  5




