o
    }oi"                     @   s   d dl mZ d dlmZ d dlmZ d dlmZ d dl	Z	d dl
mZ d dlmZ d dlmZmZ eG dd	 d	eZG d
d dejejejZeedG dd dejdef Zdd ZejddddejfddZejddddejfddZdS )    )	dataclass)Path)OptionalN)openai_gelu)CLIPViTConfig)ioteardownc                   @   s>  e Zd ZU dZdZeed< dZeed< dZ	eed< dZ
eed< d	Zeed
< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< eZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed < dZeed!< d"Z eed#< dZ!eed$< dZ"eed%< d&Z#eed'< d(S ))SigLIPViT400M_14_384_Configz Siglip so400m patch14 384 configsiglipvision_model_type   	patch_dimi  img_himg_w   
num_layers   num_attention_headsTadd_bias_linearadd_qkv_biasi  hidden_sizeg        hidden_dropoutattention_dropouti  ffn_hidden_sizeFgated_linear_unitactivation_funcH   kv_channelsnum_query_groupslayernorm_zero_centered_gammaapply_query_key_layer_scalingbias_activation_fusionbias_dropout_fusionattention_softmax_in_fp32	LayerNormnormalizationapply_rope_fusionqk_layernormgư>layernorm_epsilonN)$__name__
__module____qualname____doc__r   str__annotations__r   intr   r   r   r   r   boolr   r   r   floatr   r   r   r   r   callabler   r   r   r    r!   r"   r#   r%   r&   r'   r(    r3   r3   Z/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/vlm/vision/siglip_vit.pyr	      s6   
 r	   c                       s6   e Zd ZdZd	dee f fddZd
ddZ  ZS )SigLIPViTModelzSigLIP ViT NeMo WrapperNconfigc                    s   t    || _d S N)super__init__r6   )selfr6   	__class__r3   r4   r9   >   s   

zSigLIPViTModel.__init__returnc                 C   s   t | ds| j | _d S d S )Nmodule)hasattrr6   configure_modelr>   r:   r3   r3   r4   r@   C   s   
zSigLIPViTModel.configure_modelr7   )r=   N)	r)   r*   r+   r,   r   r   r9   r@   __classcell__r3   r3   r;   r4   r5   ;   s    r5   hfc                   @   sJ   e Zd ZdZdefddZdedefddZede	fdd	Z
d
d ZdS )SigLIPViTImporterzHF SigLIP ViT Importerr=   c                 C   s
   t | jS r7   )r5   r6   rA   r3   r3   r4   initM   s   
zSigLIPViTImporter.initoutput_pathc                 C   sh   ddl m} |jt| dd}|  }| |}| || | || td|  t	|| ~~|S )Nr   )	AutoModelTtrust_remote_codez#Converted SigLIPViT model saved to )
transformersrG   from_pretrainedr-   rE   
nemo_setupconvert_state	nemo_saveprintr   )r:   rF   rG   sourcetargettrainerr3   r3   r4   applyQ   s   

zSigLIPViTImporter.applyc                 C   s|   ddl m} |jt| dd}|jj}t||jj|jj| | |jj| | |jj	|jj
|jj|jj|jj
 |jj
d	}|S )Nr   )
AutoConfigTrH   )	r   r   r   r   r   r   r   r   r   )rJ   rT   rK   r-   vision_config
patch_sizer	   r   
image_sizeintermediate_sizer   num_hidden_layers)r:   rT   rP   r   outputr3   r3   r4   r6   c   s   zSigLIPViTImporter.configc                 C   sD   i }| ddddddddd	d
dddddd tj|||ttgdS )Nzconv1.weightz
conv1.biaszposition_embeddings.weightzln_post.weightzln_post.biasz2decoder.layers.*.self_attention.linear_proj.weightz0decoder.layers.*.self_attention.linear_proj.biasz<decoder.layers.*.self_attention.linear_qkv.layer_norm_weightz:decoder.layers.*.self_attention.linear_qkv.layer_norm_biasz&decoder.layers.*.mlp.linear_fc1.weightz$decoder.layers.*.mlp.linear_fc1.biasz&decoder.layers.*.mlp.linear_fc2.weightz$decoder.layers.*.mlp.linear_fc2.biasz1decoder.layers.*.mlp.linear_fc1.layer_norm_weightz/decoder.layers.*.mlp.linear_fc1.layer_norm_bias)z.vision_model.embeddings.patch_embedding.weightz,vision_model.embeddings.patch_embedding.biasz1vision_model.embeddings.position_embedding.weightz"vision_model.post_layernorm.weightz vision_model.post_layernorm.biasz7vision_model.encoder.layers.*.self_attn.out_proj.weightz5vision_model.encoder.layers.*.self_attn.out_proj.biasz0vision_model.encoder.layers.*.layer_norm1.weightz.vision_model.encoder.layers.*.layer_norm1.biasz,vision_model.encoder.layers.*.mlp.fc1.weightz*vision_model.encoder.layers.*.mlp.fc1.biasz,vision_model.encoder.layers.*.mlp.fc2.weightz*vision_model.encoder.layers.*.mlp.fc2.biasz0vision_model.encoder.layers.*.layer_norm2.weightz.vision_model.encoder.layers.*.layer_norm2.bias)mapping
transforms)updater   apply_transforms_import_vision_qkv_bias_import_vision_qkv)r:   rP   rQ   r[   r3   r3   r4   rM   x   s6   zSigLIPViTImporter.convert_stateN)r)   r*   r+   r,   r5   rE   r   rS   propertyr   r6   rM   r3   r3   r3   r4   rD   I   s    rD   SigLIPVisionModelc                 C   sd  |   }||f|dd   }	||f|dd   }
| j|	 } |j|
 }|j|
 }g }t|D ]<}|| || |d | d d d d f  ||||d d d d d f  ||||d d d d d f  q-t|}|jdksyJ |j|jd |d | ksJ |j|jd |ksJ |j|jd |d ksJ |j|||d|   |g}|S )N      r      )	sizeviewrangeappendtorchcatndimshapereshape)qkvhead_numr   heads_per_groupr   	head_sizeold_tensor_shapenew_q_tensor_shapenew_kv_tensor_shapeqkv_weights_liqkv_weightsr3   r3   r4   
import_qkv   s$   


,$&
 r{   )z3vision_model.encoder.layers.*.self_attn.q_proj.biasz3vision_model.encoder.layers.*.self_attn.k_proj.biasz3vision_model.encoder.layers.*.self_attn.v_proj.biasz/decoder.layers.*.self_attention.linear_qkv.bias)
source_key
target_keyctxc              
   C   sF   | j j}t|d|d|d|j|j|j|j d|jddS )Nrc   rr   r   rs   r   rt   )rQ   r6   r{   	unsqueezer   r   r   squeeze)r~   q_biask_biasv_biasmegatron_configr3   r3   r4   r_      s   

	r_   )z5vision_model.encoder.layers.*.self_attn.q_proj.weightz5vision_model.encoder.layers.*.self_attn.k_proj.weightz5vision_model.encoder.layers.*.self_attn.v_proj.weightz1decoder.layers.*.self_attention.linear_qkv.weightc              
   C   s0   | j j}t||||j|j|j|j |j|jdS )Nr   )rQ   r6   r{   r   r   r   r   )r~   ro   rp   rq   r   r3   r3   r4   r`      s   

r`   )dataclassesr   pathlibr   typingr   lightning.pytorchpytorchLrj   "nemo.collections.llm.fn.activationr    nemo.collections.vlm.vision.baser   nemo.lightningr   r   r	   LightningModuleIOMixinConnectorMixinr5   model_importerModelConnectorrD   r{   state_transformTransformCTXr_   r`   r3   r3   r3   r4   <module>   s0   
P