from dataclasses import dataclass, field
from pathlib import Path

import torch
import torch.distributed

from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer
from nemo.collections.nlp.modules.common.megatron.utils import ApproxGELUActivation
from nemo.collections.vlm.clip.model.base import CLIPConfig, CLIPModel, CLIPTextModelConfig, CLIPViTConfig
from nemo.lightning import io, teardown


@dataclass
class CLIPViTL_14_224_Config(CLIPViTConfig):
    """Clip vit large patch14 config"""

    vision_model_type: str = "clip"
    patch_dim: int = 14
    img_h: int = 224
    img_w: int = 224
    num_layers: int = 24
    num_attention_heads: int = 16
    hidden_size: int = 1024
    hidden_dropout: float = 0.0
    attention_dropout: float = 0.0
    ffn_hidden_size: int = 4096
    gated_linear_unit: bool = False
    kv_channels: int = 64
    layernorm_zero_centered_gamma: bool = False
    apply_query_key_layer_scaling: bool = False
    bias_activation_fusion: bool = False
    bias_dropout_fusion: bool = True
    attention_softmax_in_fp32: bool = True
    normalization: str = "LayerNorm"
    apply_rope_fusion: bool = False
    masked_softmax_fusion: bool = False
    persist_layer_norm: bool = True
 r   c                   @   s&  e Zd ZU dZdZeed< dZeed< dZ	eed< dZ
eed< d	Zeed
< d	Zeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed < d!Zeed"< dZeed#< dZeed$< dZeed%< dS )&CLIPViTB_32_224_Configr   r   r       r   r   r   r   r   r   r   r   r   r   r   r   r   r   Fr    Nr"      class_token_len{Gz?init_method_stdr#   r$   r%   Tr&   r'   r(   r)   r*   r+   r,   ) r-   r.   r/   r0   r   r1   r2   r   r3   r   r   r   r   r   r   r4   r   r   r    r5   r"   r;   r=   r#   r$   r%   r&   r'   r)   r*   r+   r,   r6   r6   r6   r7   r8   6   s2   
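
# Note on kv_channels: the Large config above pins it explicitly (1024 hidden / 16 heads
# = 64 channels per head), while the Base config leaves it as None, in which case
# Megatron's TransformerConfig derives hidden_size // num_attention_heads
# (768 // 12 = 64 as well).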


@dataclass
class CLIPTextModelB_32_224_Config(CLIPTextModelConfig):
    """Clip text model Base config"""

    max_seq_length: int = 80
    max_position_embeddings: int = 80
    num_layers: int = 12
    num_attention_heads: int = 8
    hidden_size: int = 512
    ffn_hidden_size: int = 2048
    init_method_std: float = 0.02
    use_scaled_init_method: bool = True
    hidden_dropout: float = 0.0
    attention_dropout: float = 0.0
    apply_query_key_layer_scaling: bool = False
    attention_softmax_in_fp32: bool = False
    normalization: str = "LayerNorm"
    do_layer_norm_weight_decay: bool = False
    persist_layer_norm: bool = True
    masked_softmax_fusion: bool = True
    bias_dropout_fusion: bool = True
    bias_activation_fusion: bool = False


@dataclass
class CLIPTextModelL_14_224_Config(CLIPTextModelConfig):
    """Clip text model large config"""

    max_seq_length: int = 77
    max_position_embeddings: int = 77
    num_layers: int = 12
    num_attention_heads: int = 12
    hidden_size: int = 768
    ffn_hidden_size: int = 3072
    init_method_std: float = 0.02
    use_scaled_init_method: bool = True
    hidden_dropout: float = 0.0
    attention_dropout: float = 0.0
    apply_query_key_layer_scaling: bool = False
    do_layer_norm_weight_decay: bool = False
    persist_layer_norm: bool = True
    masked_softmax_fusion: bool = True
    bias_dropout_fusion: bool = True


@dataclass
class CLIPConfigL14(CLIPConfig):
    """Main Clip config for Large model"""

    text_transformer_config: CLIPTextModelConfig = field(default_factory=lambda: CLIPTextModelL_14_224_Config())
    vision_transformer_config: CLIPViTConfig = field(default_factory=lambda: CLIPViTL_14_224_Config())


@dataclass
class CLIPConfigB32(CLIPConfig):
    """Main Clip config for Base model"""

    text_transformer_config: CLIPTextModelConfig = field(default_factory=lambda: CLIPTextModelB_32_224_Config())
    vision_transformer_config: CLIPViTConfig = field(default_factory=lambda: CLIPViTB_32_224_Config())
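
# Illustrative usage (a sketch, not part of this module's public API): either recipe
# builds a matched text+vision pair through its default factories, and individual
# fields can be overridden with ordinary dataclass construction, e.g.
#
#     config = CLIPConfigL14()
#     assert config.vision_transformer_config.hidden_size == 1024
#     model = CLIPModel(config)
#
#     # or, overriding one tower:
#     CLIPConfigL14(vision_transformer_config=CLIPViTL_14_224_Config(hidden_dropout=0.1))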


@io.model_importer(CLIPModel, "hf")
class HFClipImporter(io.ModelConnector["CLIPModel", CLIPModel]):
    """Import model from Hugging Face"""

    def init(self) -> CLIPModel:
        return CLIPModel(self.config, tokenizer=self.tokenizer)

    def apply(self, output_path: Path) -> Path:
        from transformers import CLIPModel

        source = CLIPModel.from_pretrained(str(self))
        target = self.init()
        trainer = self.nemo_setup(target)
        source = source.to(torch.bfloat16)
        target = target.to(torch.bfloat16)
        self.convert_state(source, target)
        print(f"Converted Clip model to Nemo, saving to {output_path}")

        self.nemo_save(output_path, trainer)

        print(f"Converted Clip model saved to {output_path}")

        teardown(trainer, target)
        del trainer, source

        return output_path

    def convert_state(self, source, target, image_newline=False):
        # The projection heads live under different names on the two sides.
        mapping = {
            'text_projection.weight': 'text_model.head.weight',
            'visual_projection.weight': 'vision_model.head.weight',
        }

        # One-to-one renames. Note that "pre_layrnorm" is the (misspelled)
        # attribute name used by Hugging Face transformers itself.
        mapping.update(
            {
                "text_model.embeddings.token_embedding.weight": "text_model.embedding.word_embeddings.weight",
                "text_model.embeddings.position_embedding.weight": "text_model.embedding.position_embeddings.weight",
                "text_model.final_layer_norm.weight": "text_model.final_layernorm.weight",
                "text_model.final_layer_norm.bias": "text_model.final_layernorm.bias",
                "vision_model.embeddings.class_embedding": "vision_model.class_token",
                "vision_model.embeddings.patch_embedding.weight": "vision_model.conv1.weight",
                "vision_model.embeddings.position_embedding.weight": "vision_model.position_embeddings.weight",
                "vision_model.pre_layrnorm.weight": "vision_model.ln_pre.weight",
                "vision_model.pre_layrnorm.bias": "vision_model.ln_pre.bias",
                "vision_model.post_layernorm.weight": "vision_model.final_layernorm.weight",
                "vision_model.post_layernorm.bias": "vision_model.final_layernorm.bias",
                "text_model.encoder.layers.*.self_attn.out_proj.weight": "text_model.decoder.layers.*.self_attention.linear_proj.weight",
                "text_model.encoder.layers.*.self_attn.out_proj.bias": "text_model.decoder.layers.*.self_attention.linear_proj.bias",
                "text_model.encoder.layers.*.layer_norm1.weight": "text_model.decoder.layers.*.self_attention.linear_qkv.layer_norm_weight",
                "text_model.encoder.layers.*.layer_norm1.bias": "text_model.decoder.layers.*.self_attention.linear_qkv.layer_norm_bias",
                "text_model.encoder.layers.*.mlp.fc1.weight": "text_model.decoder.layers.*.mlp.linear_fc1.weight",
                "text_model.encoder.layers.*.mlp.fc1.bias": "text_model.decoder.layers.*.mlp.linear_fc1.bias",
                "text_model.encoder.layers.*.mlp.fc2.weight": "text_model.decoder.layers.*.mlp.linear_fc2.weight",
                "text_model.encoder.layers.*.mlp.fc2.bias": "text_model.decoder.layers.*.mlp.linear_fc2.bias",
                "text_model.encoder.layers.*.layer_norm2.weight": "text_model.decoder.layers.*.mlp.linear_fc1.layer_norm_weight",
                "text_model.encoder.layers.*.layer_norm2.bias": "text_model.decoder.layers.*.mlp.linear_fc1.layer_norm_bias",
                "vision_model.encoder.layers.*.self_attn.out_proj.weight": "vision_model.decoder.layers.*.self_attention.linear_proj.weight",
                "vision_model.encoder.layers.*.self_attn.out_proj.bias": "vision_model.decoder.layers.*.self_attention.linear_proj.bias",
                "vision_model.encoder.layers.*.layer_norm1.weight": "vision_model.decoder.layers.*.self_attention.linear_qkv.layer_norm_weight",
                "vision_model.encoder.layers.*.layer_norm1.bias": "vision_model.decoder.layers.*.self_attention.linear_qkv.layer_norm_bias",
                "vision_model.encoder.layers.*.mlp.fc1.weight": "vision_model.decoder.layers.*.mlp.linear_fc1.weight",
                "vision_model.encoder.layers.*.mlp.fc1.bias": "vision_model.decoder.layers.*.mlp.linear_fc1.bias",
                "vision_model.encoder.layers.*.mlp.fc2.weight": "vision_model.decoder.layers.*.mlp.linear_fc2.weight",
                "vision_model.encoder.layers.*.mlp.fc2.bias": "vision_model.decoder.layers.*.mlp.linear_fc2.bias",
                "vision_model.encoder.layers.*.layer_norm2.weight": "vision_model.decoder.layers.*.mlp.linear_fc1.layer_norm_weight",
                "vision_model.encoder.layers.*.layer_norm2.bias": "vision_model.decoder.layers.*.mlp.linear_fc1.layer_norm_bias",
            }
        )

        return io.apply_transforms(
            source,
            target,
            mapping=mapping,
            transforms=[
                _import_cls_token,
                _import_vision_qkv_bias,
                _import_vision_qkv,
                _import_language_qkv_bias,
                _import_language_qkv,
            ],
        )
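
    # The "*" in the mapping keys above is a layer-index wildcard expanded by the
    # io layer: e.g. "text_model.encoder.layers.*.mlp.fc1.weight" matches every
    # layer and each match is renamed to the corresponding
    # "text_model.decoder.layers.*.mlp.linear_fc1.weight" slot. Keys that need
    # more than a rename (the fused QKV projections and the class token) are
    # handled by the _import_* transforms defined at module scope below.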
%zHFClipImporter.convert_stater   c                 C   s   t t| S rL   )r   r1   r[   r6   r6   r7   rY      s   zHFClipImporter.tokenizerc           	      C   s  ddl m} |t| }|j}dd }t|j|j|j|j	|j
|j|jd||jd|j|jt|jdd}|j}tdi ddd	|jd
|jd|jd|jd|j
d|jd|jd|jd|j	dddddtd|jd|jd|jdd}t||d}|S )Nr   )r   c                 S   s(   d}| | dkr|d }| | dks|S )N   r      r6   )
vocab_sizebaser6   r6   r7   make_vocab_size_divisible_by  s
   z;HFClipImporter.config.<locals>.make_vocab_size_divisible_byF)
output_dimr   r   r   r   r=   layernorm_epsilonr    r|   #share_embeddings_and_output_weightsr   r   activation_funcr@   r$   r   r   r   r   r   r   r   r   r   r   r   r    r$   r   r}   r=   r~   r;      )rQ   rR   r6   )r_   r   r`   r1   text_configr	   projection_dimnum_hidden_layersr   intermediate_sizer   initializer_rangelayer_norm_epsrz   r   dropoutr   rA   vision_configr
   
patch_size
image_size)	r\   HFCLIPConfigrh   text_conifgr|   language_transformer_configr   rR   outputr6   r6   r7   rZ      sz   	
zHFClipImporter.configN)F)rX   r   )r-   r.   r/   r0   r   r]   r   rk   re   propertyrY   r   rZ   r6   r6   r6   r7   rW      s    
:rW   r   c                 C   sd  |   }||f|dd   }	||f|dd   }
| j|	 } |j|
 }|j|
 }g }t|D ]<}|| || |d | d d d d f  ||||d d d d d f  ||||d d d d d f  q-t|}|jdksyJ |j|jd |d | ksJ |j|jd |ksJ |j|jd |d ksJ |j|||d|   |g}|S )Nr      r   ry   )	sizeviewrangeappendrc   catndimshapereshape)qkvhead_numnum_query_groupsheads_per_groupr   	head_sizeold_tensor_shapenew_q_tensor_shapenew_kv_tensor_shapeqkv_weights_liqkv_weightsr6   r6   r7   
import_qkv8  s$   


,$&


@io.state_transform(
    source_key=(
        "vision_model.encoder.layers.*.self_attn.q_proj.weight",
        "vision_model.encoder.layers.*.self_attn.k_proj.weight",
        "vision_model.encoder.layers.*.self_attn.v_proj.weight",
    ),
    target_key="vision_model.decoder.layers.*.self_attention.linear_qkv.weight",
)
def _import_vision_qkv(ctx: io.TransformCTX, q, k, v):
    megatron_config = ctx.target.config.vision_transformer_config
    return import_qkv(
        q,
        k,
        v,
        head_num=megatron_config.num_attention_heads,
        num_query_groups=megatron_config.num_query_groups,
        heads_per_group=megatron_config.num_attention_heads // megatron_config.num_query_groups,
        hidden_size=megatron_config.hidden_size,
        head_size=megatron_config.kv_channels,
    )


@io.state_transform(
    source_key=(
        "vision_model.encoder.layers.*.self_attn.q_proj.bias",
        "vision_model.encoder.layers.*.self_attn.k_proj.bias",
        "vision_model.encoder.layers.*.self_attn.v_proj.bias",
    ),
    target_key="vision_model.decoder.layers.*.self_attention.linear_qkv.bias",
)
def _import_vision_qkv_bias(ctx: io.TransformCTX, q_bias, k_bias, v_bias):
    megatron_config = ctx.target.config.vision_transformer_config
    return import_qkv(
        q_bias.unsqueeze(-1),
        k_bias.unsqueeze(-1),
        v_bias.unsqueeze(-1),
        head_num=megatron_config.num_attention_heads,
        num_query_groups=megatron_config.num_query_groups,
        heads_per_group=megatron_config.num_attention_heads // megatron_config.num_query_groups,
        hidden_size=1,
        head_size=megatron_config.kv_channels,
    ).squeeze(-1)


@io.state_transform(
    source_key=(
        "text_model.encoder.layers.*.self_attn.q_proj.bias",
        "text_model.encoder.layers.*.self_attn.k_proj.bias",
        "text_model.encoder.layers.*.self_attn.v_proj.bias",
    ),
    target_key="text_model.decoder.layers.*.self_attention.linear_qkv.bias",
)
def _import_language_qkv_bias(ctx: io.TransformCTX, q_bias, k_bias, v_bias):
    megatron_config = ctx.target.config.text_transformer_config
    return import_qkv(
        q_bias.unsqueeze(-1),
        k_bias.unsqueeze(-1),
        v_bias.unsqueeze(-1),
        head_num=megatron_config.num_attention_heads,
        num_query_groups=megatron_config.num_query_groups,
        heads_per_group=megatron_config.num_attention_heads // megatron_config.num_query_groups,
        hidden_size=1,
        head_size=megatron_config.kv_channels,
    ).squeeze(-1)


@io.state_transform(
    source_key=(
        "text_model.encoder.layers.*.self_attn.q_proj.weight",
        "text_model.encoder.layers.*.self_attn.k_proj.weight",
        "text_model.encoder.layers.*.self_attn.v_proj.weight",
    ),
    target_key="text_model.decoder.layers.*.self_attention.linear_qkv.weight",
)
def _import_language_qkv(ctx: io.TransformCTX, q, k, v):
    megatron_config = ctx.target.config.text_transformer_config
    return import_qkv(
        q,
        k,
        v,
        head_num=megatron_config.num_attention_heads,
        num_query_groups=megatron_config.num_query_groups,
        heads_per_group=megatron_config.num_attention_heads // megatron_config.num_query_groups,
        hidden_size=megatron_config.hidden_size,
        head_size=megatron_config.kv_channels,
    )


@io.state_transform(
    source_key="vision_model.embeddings.class_embedding",
    target_key="vision_model.class_token",
)
def _import_cls_token(ctx: io.TransformCTX, cls_token):
    # HF stores a single flat class embedding; NeMo expects [1, class_token_len, hidden].
    return cls_token.reshape(1, 1, -1)
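

# Minimal end-to-end sketch (an assumption-laden example, not part of the module:
# it presumes network access or a local copy of "openai/clip-vit-large-patch14",
# and uses NeMo 2.0's generic import_ckpt driver, which dispatches "hf://" sources
# to the io.model_importer connector registered above):
if __name__ == "__main__":
    from nemo.collections import llm

    llm.import_ckpt(
        model=CLIPModel(CLIPConfigL14()),
        source="hf://openai/clip-vit-large-patch14",
    )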