o
    wi"                     @   s  d dl mZ d dlmZ d dlmZ d dlmZ d dl	Z	d dl
mZ d dlmZ d dlmZmZ eG dd	 d	eZG d
d dejejejZeedG dd dejdef Zdd ZejddddejfddZejddddejfddZejddddejfddZdS )     )	dataclass)Path)OptionalN)
quick_gelu)CLIPViTConfig)ioteardownc                   @   s&  e Zd ZU dZdZeed< dZeed< dZ	eed< dZ
eed< d	Zeed
< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< eZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed < dZeed!< d"Z eed#< dZ!eed$< d%S )&CLIPViTL_14_336_ConfigzClip vit large patch14 configclipvision_model_type   	patch_dimiP  img_himg_w   
num_layers   num_attention_headsTadd_bias_linearadd_qkv_biasi   hidden_sizeg        hidden_dropoutattention_dropouti   ffn_hidden_sizeFgated_linear_unitactivation_func@   kv_channelsnum_query_groupslayernorm_zero_centered_gammaapply_query_key_layer_scalingbias_activation_fusionbias_dropout_fusionattention_softmax_in_fp32	LayerNormnormalizationapply_rope_fusionN)"__name__
__module____qualname____doc__r   str__annotations__r   intr   r   r   r   r   boolr   r   r   floatr   r   r   r   r   callabler   r   r   r    r!   r"   r#   r%   r&    r1   r1   a/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/vlm/vision/clip_vit.pyr	      s2   
 r	   c                       s6   e Zd ZdZd	dee f fddZd
ddZ  ZS )CLIPViTModelzCLIP ViT Model WrapperNconfigc                    s   t    || _d S N)super__init__r4   )selfr4   	__class__r1   r2   r7   ;   s   

zCLIPViTModel.__init__returnc                 C   s   t | ds| j | _d S d S )Nmodule)hasattrr4   configure_modelr<   r8   r1   r1   r2   r>   @   s   
zCLIPViTModel.configure_modelr5   )r;   N)	r'   r(   r)   r*   r   r   r7   r>   __classcell__r1   r1   r9   r2   r3   8   s    r3   hfc                   @   sJ   e Zd ZdZdefddZdedefddZede	fdd	Z
d
d ZdS )CLIPViTImporterzCLIP HF Importerr;   c                 C   s
   t | jS r5   )r3   r4   r?   r1   r1   r2   initJ   s   
zCLIPViTImporter.initoutput_pathc                 C   sh   ddl m} |jt| dd}|  }| |}| || | || td|  t	|| ~~|S )Nr   )	AutoModelTtrust_remote_codez!Converted CLIPViT model saved to )
transformersrE   from_pretrainedr+   rC   
nemo_setupconvert_state	nemo_saveprintr   )r8   rD   rE   sourcetargettrainerr1   r1   r2   applyN   s   

zCLIPViTImporter.applyc                 C   sl   ddl m} |jt| dd}t|jj|jj|jj|jj|jj	|jj
|jjt|jj|jj
 |jj
d	}|S )Nr   )
AutoConfigTrF   )	r   r   r   r   r   r   r   r   r   )rH   rR   rI   r+   r	   vision_config
patch_sizer   
image_sizeintermediate_sizer   num_hidden_layersr-   )r8   rR   rN   outputr1   r1   r2   r4   `   s   zCLIPViTImporter.configc                 C   sD   i }| ddddddddd	d
ddddd tj|||tttgdS )Nzconv1.weightzposition_embeddings.weightzln_pre.weightzln_pre.biasz2decoder.layers.*.self_attention.linear_proj.weightz0decoder.layers.*.self_attention.linear_proj.biasz<decoder.layers.*.self_attention.linear_qkv.layer_norm_weightz:decoder.layers.*.self_attention.linear_qkv.layer_norm_biasz&decoder.layers.*.mlp.linear_fc1.weightz$decoder.layers.*.mlp.linear_fc1.biasz&decoder.layers.*.mlp.linear_fc2.weightz$decoder.layers.*.mlp.linear_fc2.biasz1decoder.layers.*.mlp.linear_fc1.layer_norm_weightz/decoder.layers.*.mlp.linear_fc1.layer_norm_bias)z.vision_model.embeddings.patch_embedding.weightz1vision_model.embeddings.position_embedding.weightz vision_model.pre_layrnorm.weightzvision_model.pre_layrnorm.biasz7vision_model.encoder.layers.*.self_attn.out_proj.weightz5vision_model.encoder.layers.*.self_attn.out_proj.biasz0vision_model.encoder.layers.*.layer_norm1.weightz.vision_model.encoder.layers.*.layer_norm1.biasz,vision_model.encoder.layers.*.mlp.fc1.weightz*vision_model.encoder.layers.*.mlp.fc1.biasz,vision_model.encoder.layers.*.mlp.fc2.weightz*vision_model.encoder.layers.*.mlp.fc2.biasz0vision_model.encoder.layers.*.layer_norm2.weightz.vision_model.encoder.layers.*.layer_norm2.bias)mapping
transforms)updater   apply_transforms_import_cls_token_import_vision_qkv_bias_import_vision_qkv)r8   rN   rO   rY   r1   r1   r2   rK   t   s6   zCLIPViTImporter.convert_stateN)r'   r(   r)   r*   r3   rC   r   rQ   propertyr   r4   rK   r1   r1   r1   r2   rB   F   s    rB   CLIPVisionModelc                 C   sd  |   }||f|dd   }	||f|dd   }
| j|	 } |j|
 }|j|
 }g }t|D ]<}|| || |d | d d d d f  ||||d d d d d f  ||||d d d d d f  q-t|}|jdksyJ |j|jd |d | ksJ |j|jd |ksJ |j|jd |d ksJ |j|||d|   |g}|S )N      r      )	sizeviewrangeappendtorchcatndimshapereshape)qkvhead_numr   heads_per_groupr   	head_sizeold_tensor_shapenew_q_tensor_shapenew_kv_tensor_shapeqkv_weights_liqkv_weightsr1   r1   r2   
import_qkv   s$   


,$&
 rz   )z3vision_model.encoder.layers.*.self_attn.q_proj.biasz3vision_model.encoder.layers.*.self_attn.k_proj.biasz3vision_model.encoder.layers.*.self_attn.v_proj.biasz/decoder.layers.*.self_attention.linear_qkv.bias)
source_key
target_keyctxc              
   C   sF   | j j}t|d|d|d|j|j|j|j d|jddS )Nrb   rq   r   rr   r   rs   )rO   r4   rz   	unsqueezer   r   r   squeeze)r}   q_biask_biasv_biasmegatron_configr1   r1   r2   r^      s   

	r^   )z5vision_model.encoder.layers.*.self_attn.q_proj.weightz5vision_model.encoder.layers.*.self_attn.k_proj.weightz5vision_model.encoder.layers.*.self_attn.v_proj.weightz1decoder.layers.*.self_attention.linear_qkv.weightc              
   C   s0   | j j}t||||j|j|j|j |j|jdS )Nr   )rO   r4   rz   r   r   r   r   )r}   rn   ro   rp   r   r1   r1   r2   r_      s   

r_   )z'vision_model.embeddings.class_embeddingclass_tokenc                 C   s   | dddS )Nrb   r~   )rm   )r}   	cls_tokenr1   r1   r2   r]      s   r]   )dataclassesr   pathlibr   typingr   lightning.pytorchpytorchLri   "nemo.collections.llm.fn.activationr    nemo.collections.vlm.vision.baser   nemo.lightningr   r   r	   LightningModuleIOMixinConnectorMixinr3   model_importerModelConnectorrB   rz   state_transformTransformCTXr^   r_   r]   r1   r1   r1   r2   <module>   s:   
P