o
    }oiz                     @   s  d dl mZmZ d dlmZ d dlmZmZmZm	Z	 d dl
Z
d dlZd dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZmZmZ d dlmZmZ d dlmZmZmZ d dl m!Z! d dl"m#Z# d dl$m%Z%m&Z& d dl'm(Z( d dl)m*Z* d dl+m,Z, erd dl-m.Z. eG dd deZ/eG dd deZ0eG dd deZ1e%2edG dd de%j3def Z4e%5edG dd de%j3edf Z6d d! Z7e%j8d"d#d$d%e%j9fd&d'Z:e%j8d(d)d$d%e%j9fd*d+Z;e%j8d,d-d$d%e%j9fd.d/Z<e%j8d0d1d$d%e%j9fd2d3Z=e%j8d4d5d$d%e%j9fd6d7Z>e%j8d8d9d$d:d; Z?e%j8d#d<d$d%e%j9fd=d>Z@e%j8d)d?d$d%e%j9fd@dAZAe%j8d-d,d$d%e%j9fdBdCZBe%j8d1d0d$d%e%j9fdDdEZCe%j8d5dFd$d%e%j9fdGdHZDe%j8d9d8d$dIdJ ZEdS )K    )	dataclassfield)Path)TYPE_CHECKINGDictTupleUnionN)TransformerConfigQwen2VLConfig)Qwen2VLForConditionalGenerationQwen2VLVisionConfig)TokenizerSpec)Qwen2ConfigQwen2Config1P5BQwen2Config7BQwen2Config72B)
export_qkvexport_qkv_bias)r   Qwen2VLModelr   MultimodalProjectorConfig)load_distributed_model_weights)ioteardown)_ModelState)dtype_from_hf)loggingAutoTokenizerc                   @   h   e Zd ZU dZddlmZ edd dZee	d< edd dZ
eeef e	d	< ed
d dZee	d< dS )Qwen2VLConfig2BzQwen2VL Config 2Br   PretrainedConfigc                   C   s
   t ddS )NT)#share_embeddings_and_output_weights)r    r&   r&   ^/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/vlm/qwen2vl/model/qwen2vl.py<lambda>2   s   
 zQwen2VLConfig2B.<lambda>default_factorylanguage_transformer_configc                   C      t dddS N       )
num_layersnum_attention_headsr   r&   r&   r&   r'   r(   5       vision_transformer_configc                   C      t ddddS )N   i   
input_sizehidden_sizeffn_hidden_sizer   r&   r&   r&   r'   r(   8       vision_projection_configN__name__
__module____qualname____doc__transformersr$   r   r+   r	   __annotations__r3   r   r;   r&   r&   r&   r'   r"   +   s   
 r"   c                   @   r!   )Qwen2VLConfig7BzQwen2VL Config 7Br   r#   c                   C      t  S N)r   r&   r&   r&   r'   r(   B       zQwen2VLConfig7B.<lambda>r)   r+   c                   C   r,   r-   r   r&   r&   r&   r'   r(   D   r2   r3   c                   C   r4   )Nr5   i   r6   r   r&   r&   r&   r'   r(   G   r:   r;   Nr<   r&   r&   r&   r'   rC   <      
 rC   c                   @   r!   )Qwen2VLConfig72BzQwen2VL Config 72Br   r#   c                   C   rD   rE   )r   r&   r&   r&   r'   r(   Q   rF   zQwen2VLConfig72B.<lambda>r)   r+   c                   C   r,   r-   r   r&   r&   r&   r'   r(   S   r2   r3   c                   C   r4   )Nr5   i    r6   r   r&   r&   r&   r'   r(   V   r:   r;   Nr<   r&   r&   r&   r'   rH   K   rG   rH   hfc                   @   sX   e Zd ZdZdefddZdedefddZdd	 Ze	dddZ
e	defddZdS )HFQwen2VLImporterzQwen2VL Model HF Importerreturnc                 C   s   t | j| jdS )N)	tokenizer)r   configrL   selfr&   r&   r'   init^   s   zHFQwen2VLImporter.initoutput_pathc                 C   s   t t| }|  }| |}|t|j}|t|j}| || t	d|  | 
|| t	d|  t|| ~~|S )Nz+Converted Qwen2VL model to Nemo, saving to !Converted Qwen2VL model saved to )r   from_pretrainedstrrP   
nemo_setuptor   rM   convert_stateprint	nemo_saver   )rO   rQ   sourcetargettrainerr&   r&   r'   applyb   s   

zHFQwen2VLImporter.applyc              	   C   s   i ddddddddd	d
dddddddddddddddddddddd d!d"d#d$d%}|j jjsF|d&d'i d(|j  v rZ|d(d)d*d+d, nd-|j  v rn|d-d.d/d0d, ntd1tj	|||t
ttttgd2S )3Nvisual.patch_embed.proj.weightvision_model.conv1.weightvisual.blocks.*.norm1.weightIvision_model.decoder.layers.*.self_attention.linear_qkv.layer_norm_weightvisual.blocks.*.norm1.biasGvision_model.decoder.layers.*.self_attention.linear_qkv.layer_norm_biasvisual.blocks.*.norm2.weight>vision_model.decoder.layers.*.mlp.linear_fc1.layer_norm_weightvisual.blocks.*.norm2.bias<vision_model.decoder.layers.*.mlp.linear_fc1.layer_norm_bias visual.blocks.*.attn.proj.weight?vision_model.decoder.layers.*.self_attention.linear_proj.weightvisual.blocks.*.attn.proj.bias=vision_model.decoder.layers.*.self_attention.linear_proj.biasvisual.blocks.*.mlp.fc1.weight3vision_model.decoder.layers.*.mlp.linear_fc1.weightvisual.blocks.*.mlp.fc1.bias1vision_model.decoder.layers.*.mlp.linear_fc1.biasvisual.blocks.*.mlp.fc2.weight3vision_model.decoder.layers.*.mlp.linear_fc2.weightvisual.blocks.*.mlp.fc2.bias1vision_model.decoder.layers.*.mlp.linear_fc2.biasvisual.merger.ln_q.weight+vision_model.decoder.final_layernorm.weightvisual.merger.ln_q.bias)vision_model.decoder.final_layernorm.biasmodel.embed_tokens.weight/language_model.embedding.word_embeddings.weight&model.layers.*.self_attn.o_proj.weightAlanguage_model.decoder.layers.*.self_attention.linear_proj.weight#model.layers.*.mlp.down_proj.weight5language_model.decoder.layers.*.mlp.linear_fc2.weight%model.layers.*.input_layernorm.weightKlanguage_model.decoder.layers.*.self_attention.linear_qkv.layer_norm_weight@language_model.decoder.layers.*.mlp.linear_fc1.layer_norm_weight-language_model.decoder.final_layernorm.weight).model.layers.*.post_attention_layernorm.weightmodel.norm.weightlm_head.weight"language_model.output_layer.weight+vision_projection.encoder.linear_fc1.weight)vision_projection.encoder.linear_fc1.bias+vision_projection.encoder.linear_fc2.weight)vision_projection.encoder.linear_fc2.bias)visual.merger.mlp.0.weightvisual.merger.mlp.0.biasvisual.merger.mlp.2.weightvisual.merger.mlp.2.biasvision_projection.0.weightvision_projection.0.biasvision_projection.2.weightvision_projection.2.bias%Unable to map vision projection keys.mapping
transforms)rM   r+   r%   updatemodule
state_dictkeysKeyErrorr   apply_transforms_import_language_qkv_import_language_qkv_bias_import_vision_qkv_import_vision_qkv_bias_import_linear_fc1)rO   rZ   r[   r   r&   r&   r'   rW   w   s   	

	zHFQwen2VLImporter.convert_stater    c                 C   s   ddl m} |t| S )Nr   r   )=nemo.collections.common.tokenizers.huggingface.auto_tokenizerr    rT   )rO   r    r&   r&   r'   rL      s   zHFQwen2VLImporter.tokenizerc                 C   sL  ddl m} |tj|dkrtdtj dddlm} |t| }dd }|}t|j	|j
|j|j|j|j|j|jd	||j|j|jt|tjkt|tjkt|d
}|j}tt|tjkt|tjkt|d}|j|jd  }	t|	|j
|	dt|tjkt|tjkt|d}
t|||
dt|tjkt|tjkt|d}|S )Nr   )Versionz4.51.3z#Current version of transformers is z),Please lower the version to be <= 4.51.3r
   c                 S   s(   d}| | dkr|d }| | dks|S )N   r      r&   )
vocab_sizebaser&   r&   r'   make_vocab_size_divisible_by   s
   z>HFQwen2VLImporter.config.<locals>.make_vocab_size_divisible_byT)r0   r8   r9   r1   init_method_stdlayernorm_epsilonnum_query_groupsrotary_basegated_linear_unitr   r%   r   fp16bf16params_dtype)r   r   r   r   	mcore_mlp)r7   r8   r9   projector_typer   r   r   )r+   r3   r;   vision_feature_layerr   r   r   )packaging.versionr   rA   __version__
ValueErrorr   rS   rT   r   num_hidden_layersr8   intermediate_sizer1   initializer_rangerms_norm_epsnum_key_value_heads
rope_thetar   tie_word_embeddingsr   torchfloat16bfloat16vision_configr   	embed_dimspatial_merge_sizer   )rO   r   HFQwen2VLConfig	hf_configr   text_configr+   r   r3   merge_hidden_sizer;   outputr&   r&   r'   rM      sh   

zHFQwen2VLImporter.configN)rK   r    )r=   r>   r?   r@   r   rP   r   r]   rW   propertyrL   r   rM   r&   r&   r&   r'   rJ   Z   s    =rJ   r   c                   @   sp   e Zd ZdZejfdddZdedefddZd	d
 Z	e
dddZdedeeef fddZe
dddZdS )HFQwen2VLExportera   
    Exporter class for converting NeMo Qwen2VL model to HuggingFace format.

    Inherits:
        io.ModelConnector: Connector interface to handle setup, save, and load using the Lightning framework.

    Methods:
        init: Initializes a new HuggingFace Qwen2VL model instance.
        apply: Converts the NeMo model to HuggingFace format and saves it.
        convert_state: Maps and transforms the state dictionary from NeMo to HuggingFace format.
        config: Generates and returns the HuggingFace Qwen2VL config for the model.
    rK   r   c                 C   sF   ddl m} |  tj| j|dW  d   S 1 sw   Y  dS )a0  
        Initializes a HuggingFace Qwen2VLForConditionalGeneration model.

        Args:
            dtype: The data type to use for the model (default: torch.bfloat16)

        Returns:
            Qwen2VLForConditionalGeneration: A HuggingFace Qwen2VL model initialized with the configuration.
        r   )no_init_weights)torch_dtypeN)transformers.modeling_utilsr   r   _from_configrM   )rO   dtyper   r&   r&   r'   rP     s   
$zHFQwen2VLExporter.initrQ   c                 C   s   t d | | \}}t d t d |  }t d | |||}| }|| z	| jj| W n tyF   t 	d Y nw t
d|  |S )a3  
        Converts the NeMo Qwen2VL model to HuggingFace format and saves it to the specified path.

        Args:
            output_path (Path): The path where the converted HuggingFace model will be saved.

        Returns:
            Path: The output path where the HuggingFace model was saved.
        z9Loading Qwen2VL NeMo checkpoint. This may take a while...zQwen2VL NeMo checkpoint loaded.zInitializing the HF model..zStart Converting the model..zFailed to save tokenizerrR   )r   info	ckpt_loadrP   rW   cpusave_pretrainedrL   	ExceptionwarningrX   )rO   rQ   rZ   source_configr[   r&   r&   r'   r]   $  s    





zHFQwen2VLExporter.applyc              	   C   s
  i ddddddddd	d
dddddddddddddddddddddd d!d"d#d$d%}|j jrF|dd&i n|d'd&i d(|  v r`|d)d*d+d,d- nd.|  v rs|d)d*d+d,d/ ntd0tj|||tt	t
ttgd1S )2a  
        Maps and transforms the state dictionary from NeMo to HuggingFace format.

        Args:
            source: The source NeMo model.
            target: The target HuggingFace model.

        Returns:
            The target HuggingFace model with the converted state.
        r_   r^   ra   r`   rc   rb   re   rd   rg   rf   ri   rh   rk   rj   rm   rl   ro   rn   rq   rp   rs   rr   ru   rt   rw   rv   ry   rx   r{   rz   r}   r|   r   r~   r   r   )r   r   r   r   r   r   r   r   r   )r   r   r   r   r   )r   r   r   r   r   r   )r+   r%   r   r   r   r   r   r   _export_language_qkv_export_language_qkv_bias_export_vision_qkv_export_vision_qkv_bias_export_linear_fc1)rO   rZ   r[   r   r   r&   r&   r'   rW   A  s   	
	zHFQwen2VLExporter.convert_stater   c                 C   s   t jt| ddjS )z~
        Gets the tokenizer from the loaded model context.

        Returns:
            The tokenizer specification.
        modelsubpath)r   load_contextrT   rL   rN   r&   r&   r'   rL     s   zHFQwen2VLExporter.tokenizerpathc              
   C   s   t jt| dd}|d }i }|jj}|jj}t|d }|D ]>\}}	d|v r)q |dd}
d|
v rZ|		d	|ksA|		d	|krZt
|		d	D ]}|	| ||
dd
t| < qH|	||
< q t|}||fS )a  
        This function loads the state dict directly from a distributed checkpoint, and modify the state dict
        so that it is consistent with the key names you would get from loading the checkpoint into a model.
        This is a more memory-efficient method to obtain a state dict without initializing the nemo model.

        Args:
            path (Path): The path from which the model will be loaded.

        Returns
        -------
            Tuple[Dict, Dict]: The loaded state dict and the yaml config dict.
        model.configr   weightsT_extra_statezmodule. layersr   zlayers.)r   r   rT   r+   r0   r3   r   itemsreplacesizeranger   )rO   r   rM   dist_ckpt_folderr   langauge_layersvision_layersdistributed_model_weightskvnew_kirZ   r&   r&   r'   r     s    $ 
zHFQwen2VLExporter.ckpt_loadr   c                 C   s   t jt| dd}|j}|j}|j}t|j|j|j	dt
|j|j	 |jd|j|j|j|j|jddd }t|j|j	|j|j|j|j|j|j|j|j|ddS )	z
        Generates the configuration for the HuggingFace Qwen2VL model based on the NeMo model.

        Returns:
            HFQwen2VLConfig: A configuration object for the HuggingFace Qwen2VL model.
        r   r   
quick_gelu   qwen2_vlr   )depthr   r8   
hidden_act	mlp_ratio	num_headsin_channels
patch_sizer   spatial_patch_sizetemporal_patch_sizer   
model_typer   )r   r8   r   r1   r   r   r   r   r   r   r   r   )r   r   rT   r+   r3   r;   HFQwen2VLVisionConfigr0   r   r8   intr9   r1   	patch_dimr   r   r   r   to_dictr   r   r   r   r%   r   )rO   rZ   language_configvision_model_configr;   r   r&   r&   r'   rM     sH   	zHFQwen2VLExporter.configN)rK   r   )rK   r   )rK   r   )r=   r>   r?   r@   r   r   rP   r   r]   rW   r   rL   r   r   r   rM   r&   r&   r&   r'   r     s    J	!r   c                 C   sd  |   }||f|dd   }	||f|dd   }
| j|	 } |j|
 }|j|
 }g }t|D ]<}|| || |d | d d d d f  ||||d d d d d f  ||||d d d d d f  q-t|}|jdksyJ |j|jd |d | ksJ |j|jd |ksJ |j|jd |d ksJ |j|||d|   |g}|S )N   r   r   r   )	r   viewr   appendr   catndimshapereshape)qr   r   head_numr   heads_per_groupr8   	head_sizeold_tensor_shapenew_q_tensor_shapenew_kv_tensor_shapeqkv_weights_lr   qkv_weightsr&   r&   r'   
import_qkv  s$   


,$&
 r  )visual.blocks.*.attn.qkv.weightz>vision_model.decoder.layers.*.self_attention.linear_qkv.weight)
source_key
target_keyctxc              
   C   s   | j jj}t|jd d }||jksJ |d |d d f }|||d d d f }||d d d d f }t||||j|j|j|j |j|j	dS )Nr   r   r   r  r   r  r8   r  )
r[   rM   r3   r   r	  r8   r  r1   r   kv_channels)r  hf_qkv_weightsmegatron_configslicer  r   r   r&   r&   r'   r     s    

r   )visual.blocks.*.attn.qkv.biasz<vision_model.decoder.layers.*.self_attention.linear_qkv.biasc              
   C   s   | j jj}t|jd d }||jksJ |d | }|||d  }||d d  }t|d|d|d|j|j	|j|j	 d|j
ddS )Nr   r   r   r   r  r  )r[   rM   r3   r   r	  r8   r  	unsqueezer1   r   r  squeeze)r  hf_qkv_biasr  r  q_biask_biasv_biasr&   r&   r'   r     s$   

	r   )z&model.layers.*.self_attn.q_proj.weightz&model.layers.*.self_attn.k_proj.weightz&model.layers.*.self_attn.v_proj.weightz@language_model.decoder.layers.*.self_attention.linear_qkv.weightc              
   C   s2   | j jj}t||||j|j|j|j |j|jdS Nr  )r[   rM   r+   r  r1   r   r8   r  )r  r  r   r   r  r&   r&   r'   r   8  s   


r   )z$model.layers.*.self_attn.q_proj.biasz$model.layers.*.self_attn.k_proj.biasz$model.layers.*.self_attn.v_proj.biasz>language_model.decoder.layers.*.self_attention.linear_qkv.biasc              
   C   sH   | j jj}t|d|d|d|j|j|j|j d|jddS )Nr   r  r  )	r[   rM   r+   r  r  r1   r   r  r   )r  r"  r#  r$  r  r&   r&   r'   r   O  s   


	r   )'vision_model.embeddings.class_embeddingzvision_model.class_tokenc                 C   s   | dddS )Nr  r   )r
  r  	cls_tokenr&   r&   r'   _import_cls_tokenf  s   r)  )z#model.layers.*.mlp.gate_proj.weightz!model.layers.*.mlp.up_proj.weightz5language_model.decoder.layers.*.mlp.linear_fc1.weightc                 C   s   t j| |fddS )Nr   axis)r   r  )downgater&   r&   r'   r   o  s   	r   r  c              	   C   s>   | j jj}tjt||j|j|j|j |j|j|j dddS )Nr  r   r*  )r[   rM   r   r   r  r   r   r   r  qkvr   r&   r&   r'   r   {  s   


r   r  c                 C   s:   | j jj}tjt||j|j|j|j |j|j dddS )Nr  r   r  r  r   r*  )r[   rM   r   r   r  r   r   r   r  qkv_biasr   r&   r&   r'   r     s   


r   c                 C   s2   | j j}t||j|j|j|j |j|j|j dS r%  )r[   rM   r   r1   r   r8   r.  r&   r&   r'   r     s   


r   c                 C   s.   | j j}t||j|j|j|j |j|j dS )Nr0  )r[   rM   r   r1   r   r8   r1  r&   r&   r'   r     s   


r   r&  c                 C   s   |  S rE   )r   r'  r&   r&   r'   _export_cls_token  s   r3  c                 C   s   t j| ddd\}}||fS )Nr   r   )dim)r   chunk)
linear_fc1	gate_projup_projr&   r&   r'   r     s   	r   )Fdataclassesr   r   pathlibr   typingr   r   r   r   r   rA   ,megatron.core.transformer.transformer_configr	   r   r   r   3transformers.models.qwen2_vl.configuration_qwen2_vlr   r   1nemo.collections.common.tokenizers.tokenizer_specr   nemo.collections.llmr   r   r   r   %nemo.collections.vlm.neva.model.llavar   r   'nemo.collections.vlm.qwen2vl.model.baser   nemo.collections.vlm.visionr   .nemo.export.trt_llm.nemo_ckpt_loader.nemo_filer   nemo.lightningr   r   nemo.lightning.io.stater   nemo.lightning.pytorch.utilsr   
nemo.utilsr   r   r    r"   rC   rH   model_importerModelConnectorrJ   model_exporterr   r  state_transformTransformCTXr   r   r   r   r)  r   r   r   r   r   r3  r   r&   r&   r&   r'   <module>   s   
 
, c
