o
    ίi]                     @   s  d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZ d dl	m
Z
 d dlmZmZmZmZmZmZmZmZmZ d dlZd dlZd dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dl m!Z!m"Z"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z) d dl*m+Z+m,Z,m-Z-m.Z.m/Z/ e
 Z0G dd dej1Z2G dd dej1Z3G dd dej1Z4G dd dej1Z5G dd dej1Z6G dd dej1Z7G dd dej1Z8G dd de8Z9dS )     N)OrderedDict)asdict)partial)	getLogger)	AnyCallableDictListOptionalSequenceTupleUnionLiteral)	rearrange)DropPath)nn)
functional)	constant_xavier_normal_xavier_uniform_)	Parameter)
checkpoint)Rope2D)PEConfigPETextConfigPE_VISION_CONFIGPE_TEXT_CONFIGfetch_pe_checkpointc                       s.   e Zd Zd	 fdd	Zdd Zdd Z  ZS )

LayerScaleh㈵>Fc                    s    t    || _|| _|| _d S N)super__init__inplacediminit_values)selfr$   r%   r#   	__class__ J/home/ubuntu/.local/lib/python3.10/site-packages/core/vision_encoder/pe.pyr"      s   

zLayerScale.__init__c                 C   s   | j r	|| jS || j S r    )r#   mul_gammar&   xr)   r)   r*   forward%   s   zLayerScale.forwardc                 C   s   t | jt| j | _d S r    )r   r   r%   torchonesr$   r,   r&   r)   r)   r*   init_tensors(   s   zLayerScale.init_tensors)r   F)__name__
__module____qualname__r"   r/   r3   __classcell__r)   r)   r'   r*   r      s    r   c                       sT   e Zd Zddejejfdedededededef fd	d
Zde	j
fddZ  ZS )AttentionPooling      	embed_dim	num_heads	num_probe	mlp_ratio	act_layer
norm_layerc              
      s   t    || _|| _| j| dksJ dttd|| j| _tj	| j| jdd| _
||| _t|| | _ttdt| j| jfd| fdt| j| jfg| _d S )	Nr   (embed_dim must be divisible by num_headsr9   Tbatch_firstc_fcgeluc_proj)r!   r"   r;   r<   r   r   r0   randnprobeMultiheadAttentionattn	layernormint	mlp_width
Sequentialr   Linearmlp)r&   r;   r<   r=   r>   r?   r@   r'   r)   r*   r"   -   s(   
	


zAttentionPooling.__init__r.   c                 C   sT   |j \}}}| j|ddf|j}| j|||ddd }|| | | }|S )Nr9   F)need_weightsr   )shaperH   repeattodtyperJ   rP   rK   )r&   r.   batch_qr)   r)   r*   r/   P   s
   zAttentionPooling.forward)r4   r5   r6   r   GELU	LayerNormrL   r   r"   r0   Tensorr/   r7   r)   r)   r'   r*   r8   ,   s&    #r8   c                       sJ   e Zd ZdZ	ddededeej f fddZdd	 Z	dd
dZ
  ZS )SelfAttentionz7
    Implements sequence packed attention and RoPe
    Nr;   r<   ropec                    s   t t|   || _|| _|| | _| j| | jksJ dttd| || _	ttd| | _
tj||dd| _|| _| jd | _d S )NrA      T)bias      )r!   r\   r"   r;   r<   head_dimr   r0   emptyin_proj_weightin_proj_biasr   rO   out_projr]   scale)r&   r;   r<   r]   r'   r)   r*   r"   _   s   
zSelfAttention.__init__c                 C   s(   t | j t| jd t| jjd d S )N        )r   rc   r   rd   re   r_   r2   r)   r)   r*   r3   v   s   
zSelfAttention.init_tensorsc              	   C   s   |j \}}}t|| j| j}|dd|fdddd	 }|d |d |d }}}	t
|d| jd}t
|d| jd}t
|	d| jd}	| jrT| ||\}}tj|||	d d	d
| jd}
t
|
d}
t|
| jj| jjS )Nr^   r   r9      zb s (h d) -> b h s d)hrg   F)	attn_mask	dropout_p	is_causalrf   zb h s d -> b s (h d))rR   Flinearrc   rd   	unflatten	unsqueeze	transposesqueeze
contiguousr   r<   r]   scaled_dot_product_attentionrf   re   weightr_   )r&   r.   rl   rV   seqr;   projrX   kvrJ   r)   r)   r*   r/   {   s&   
zSelfAttention.forwardr    )r4   r5   r6   __doc__rL   r
   r   Moduler"   r3   r/   r7   r)   r)   r'   r*   r\   Z   s    r\   c                       s   e Zd Zddejejddfdededededed	ed
ede	ej
 f fddZ	ddejde	ej fddZ	ddejde	ej fddZ  ZS )ResidualAttentionBlock      @Nrg   d_modeln_headr>   ls_init_valuer?   r@   	drop_pathr]   c	           
   
      s   t    |rt|||d| _n	tj||dd| _|d ur"t||nt | _|d ur0t||nt | _	||| _
||| _|dkrGt|nt | _|dkrTt|nt | _t|| }	ttdt||	fd| fdt|	|fg| _d S )N)r]   TrB   rg   rD   rE   rF   )r!   r"   r\   rJ   r   rI   r   Identityls_1ls_2ln_1ln_2r   
drop_path1
drop_path2rL   rN   r   rO   rP   )
r&   r   r   r>   r   r?   r@   r   r]   rM   r'   r)   r*   r"      s2   





zResidualAttentionBlock.__init__q_xrl   c                 C   sR   |d ur|j tjks||j }t| jtr| j||dS | j||||ddd S )Nrl   F)rl   rQ   r   )rU   r0   boolrT   
isinstancerJ   r\   )r&   r   rl   r)   r)   r*   
_call_attn   s   z!ResidualAttentionBlock._call_attnr.   c              
   C   sH   ||  | | j| ||d }|| | | | | }|S Nr   )r   r   r   r   r   r   rP   r   )r&   r.   rl   r)   r)   r*   r/      s
    zResidualAttentionBlock.forwardr    )r4   r5   r6   r   rY   rZ   rL   floatr   r
   r}   r"   r0   r[   r   r/   r7   r)   r)   r'   r*   r~      sH    	1
r~   c                       s   e Zd Zddejejddfdededededed	ed
edede	ej
 f fddZejjdddZejjdefddZ		ddejde	ej defddZ  ZS )Transformerr   Nrg   widthlayersheadsr>   r   r?   r@   r   r]   c
           
   
      sL   t    | _|| _d| _t fddt|D | _d S )NFc                    s$   g | ]}t  d qS )r   r?   r@   r   r]   )r~   ).0rW   r?   r   r   r   r>   r@   r]   r   r)   r*   
<listcomp>   s    z(Transformer.__init__.<locals>.<listcomp>)	r!   r"   r   r   grad_checkpointingr   
ModuleListrange	resblocks)
r&   r   r   r   r>   r   r?   r@   r   r]   r'   r   r*   r"      s   

zTransformer.__init__Tc                 C   s
   || _ d S r    )r   r&   enabler)   r)   r*   set_grad_checkpointing  s   
z"Transformer.set_grad_checkpointing	layer_idxc                 C   s2   | j | | j  d | _ t| jd| j  | _dS ); Delete layers so the last layer is the given layer index. r9   N)r   r   r   r   r&   r   r)   r)   r*   truncate
  s   zTransformer.truncaterh   r.   rl   c                 C   sd   | j | | j  }t| jD ]"\}}| jr"tj s"t||d d |}n|||d}||kr/ |S q|S r   )r   	enumerater   r   r0   jitis_scriptingr   )r&   r.   rl   r   stop_idxirr)   r)   r*   r/     s   zTransformer.forwardT)Nrh   )r4   r5   r6   r   rY   rZ   rL   r   r   r
   r}   r"   r0   r   ignorer   r   r[   r/   r7   r)   r)   r'   r*   r      sN    	
!r   c                &       sX  e Zd Zejeejdddddddddddd	d
fdededededede	de	de
de
dededede
de
de
dee deded f$ fddZd d! Zd@d"ed#e
fd$d%Zd&efd'd(Ze		dAd)ed*e
d+ee fd,d-Zed.d/ Zejjd@d0d1Zd2ed3efd4d5Zd6ejfd7d8Z		9	dBd6ejd:e
d&ed;e
fd<d=Zd6ejfd>d?Z  ZS )CVisionTransformerr   epsTNrg   i  F      rJ   
patch_sizer   r   r   r>   r?   r@   
use_ln_preuse_ln_postr   r   
image_sizeuse_abs_posemb
use_rope2duse_cls_token
output_dimattn_pooler_heads	pool_typerJ   tokavgnonec                    s
  t    |dv sJ || _|| _|p|| _|| _|| _|| _|| _|| _	|| _
|| _|| _tjd|||dd| _| jrEt|| | j
dnd | _|rN||nt | _|	rZ|| jnt | _t|||||
|||| jd	| _|dkr|t||||d| _nd | _|   d S )	Nr   r^   F)in_channelsout_channelskernel_sizestrider_   )r$   r   r   rJ   r;   r<   r?   r@   )r!   r"   r   r   r   proj_dimr   r   r   r   r   r   r   r   Conv2dconv1r   r]   r   ln_preln_postr   transformerr8   	attn_poolr3   )r&   r   r   r   r   r>   r?   r@   r   r   r   r   r   r   r   r   r   r   r   r'   r)   r*   r"   &  sb   

	
zVisionTransformer.__init__c                    s    fdd  |  | j   | jd }| jr#t|t| j | _| j	rB| j
| j | _t|tt| j| jd  | j | _| jd urWt|t| j| j | _d S d S )Nc                    s@   |   D ]\}}t|drtd|  |   | qd S )Nr3   z$Initializing tensors for submodule: )named_childrenhasattrloggerdebugr3   )modulenamechildinit_submodule_tensorsr)   r*   r   x  s   

z>VisionTransformer.init_tensors.<locals>.init_submodule_tensorsr`   rj   )r]   r3   r   r   r   r   r0   rG   class_embeddingr   r   r   posemb_grid_sizerL   positional_embeddingr   ry   )r&   
init_scaler)   r   r*   r3   w  s(   



zVisionTransformer.init_tensors	ckpt_pathverbosec                 C   s   t j|dd}d|v r|d }nd|v r|d }dd | D }tdd |D r3d	d | D }| j|d
d\}}|sB|sB|rbtd|  td|  td|  td|  d S d S )NTweights_only
state_dictweightsc                 S      i | ]\}}| d d|qS zmodule. replacer   rz   r{   r)   r)   r*   
<dictcomp>      z/VisionTransformer.load_ckpt.<locals>.<dictcomp>c                 s   s    | ]}| d V  qdS )visual.N)
startswithr   rz   r)   r)   r*   	<genexpr>  s    z.VisionTransformer.load_ckpt.<locals>.<genexpr>c                 S   s&   i | ]\}}d |v r| dd|qS )visualr   r   r   r   r)   r)   r*   r     s   & Fstrictz)Missing keys for loading vision encoder: z,Unexpected keys for loading vision encoder: )r0   loaditemsanyload_state_dictr   infoprintr&   r   r   _sdmur)   r)   r*   	load_ckpt  s   
zVisionTransformer.load_ckptr   c                 C   s   | j | | j j| _dS )r   N)r   r   r   r   r)   r)   r*   r     s   zVisionTransformer.truncater   
pretrainedcheckpoint_pathc                 K   sR   |t vrt| dtt | }|| | di |}|r'|t|| |S )N not found in configs.r)   )r   RuntimeErrorr   updater   r   )clsr   r   r   kwdargsargsmodelr)   r)   r*   from_config  s   
zVisionTransformer.from_configc                 C   s   t t S r    )listr   keysr   r)   r)   r*   available_configs  s   z#VisionTransformer.available_configsc                 C   s   | j j|d d S )N)r   )r   r   r   r)   r)   r*   r     s   z(VisionTransformer.set_grad_checkpointinggrid_hgrid_wc                 C   s   | j |kr| j |kr| jd S | j}| jr"|dd |dd }}|d| j | j ddddd }tj|||fdd	d
}|ddddd| j }| jrZt	j
||gdd}|d S )z:Interpolates the absolute position embedding if necessary.)N.Nr9   rh   r   r^   rj   bilinearF)sizemodealign_cornersr$   )r   r   r   reshapepermuteru   ro   interpolater   r0   cat)r&   r  r  	pos_embedcls_token_embedr)   r)   r*   _sample_abs_posemb  s    
z$VisionTransformer._sample_abs_posembr.   c                 C   s\   | j dkr|d d df S | j dkr|jddS | j dkr%| |dS | j dkr,|S t)Nr   r   r   r9   r  rJ   r   )r   meanr   rt   NotImplementedErrorr-   r)   r)   r*   _pool  s   



zVisionTransformer._poolrh   normstrip_cls_tokenc                 C   s   |j \}}}}|| j || j }	}
| |}|dddd|d| j}| jr<tj| j	
ddd|dd|gdd}| jrG|| |	|
 }| jrS| j|j|	|
 | |}| j||d}|rf| |}|rx| jrx|d d dd d d f }|S )Nr   rj   r^   r9   rh   r  )r   )rR   r   r   r  r  r   r   r0   r  r   viewexpandr   r  r   r]   update_griddevicer   r   r   )r&   r.   r  r   r  rV   rW   rk   wr  r  r)   r)   r*   forward_features  s(   



z"VisionTransformer.forward_featuresc                 K   s8   | j |fddi|}| |}| jd ur|| j }|S )Nr  T)r  r  r   ry   )r&   r.   kwargsr)   r)   r*   r/     s
   


zVisionTransformer.forwardr   FN)Frh   F)r4   r5   r6   r   rY   r   rZ   rL   r   r   r   r
   r   r"   r3   strr   r   classmethodr  r  r0   r   r   r   r  r[   r  r  r/   r7   r)   r)   r'   r*   r   %  s    	
Q 

$r   c                !       s   e Zd Zddddddddd	d
dd	ejeejddd	dfdedededededededede	dede
de	dedede	de	f  fdd Zd!d" Zd.d#e
d$e	fd%d&Zd'd( Z	d/d)eej de
fd*d+Zd,d- Z  ZS )0TextTransformerH   i   i   r      r   Nr   Fr   argmaxr   r   Tcontext_length
vocab_sizer   r   r   r>   r   r   no_causal_maskpad_idr   	proj_biasr?   r@   output_tokensr   c              	      s  t    |dv sJ || _|| _| | _| _|| _|| _|| _|| _	|
| _
|| _t||| _tt| j|| _t|||||||d| _|rM||nt | _|	rXd | _n
| jd|  dd |dksj|dkrtt||||d| _nd | _|rt||| _d S tt||| _d S )	N)firstlastr'  r   )r   r   r   r>   r   r?   r@   rl   F)
persistentrJ   attn_eosr   )r!   r"   r   r-  num_posr(  r)  r   r   r   r+  r   r   	Embeddingtoken_embeddingr   r0   rb   r   r   r   r   ln_finalrl   register_bufferbuild_causal_maskr8   r   rO   text_projection)r&   r(  r)  r   r   r   r>   r   r   r*  r+  r   r,  r?   r@   r-  r   r'   r)   r*   r"   )  sN   



zTextTransformer.__init__c                 C   s,   t | j| j}|td |d |S )N-infr9   )r0   rb   r2  fill_r   triu_)r&   maskr)   r)   r*   r7  m  s   
z!TextTransformer.build_causal_maskr   r   c                 C   s   t j|dd}d|v r|d }nd|v r|d }dd | D }| j|dd\}}|s0|s0|rPtd	|  td
|  td	|  td
|  d S d S )NTr   r   r   c                 S   r   r   r   r   r)   r)   r*   r   |  r   z-TextTransformer.load_ckpt.<locals>.<dictcomp>Fr   z Missing keys for loading model: z#Unexpected keys for loading model: )r0   r   r   r   r   r   r   r   r)   r)   r*   r   u  s   
zTextTransformer.load_ckptc                 C   sp   || j kd}tj|dd|jd dfdd}tj|j|jd}|d |	| t
d t|| jd}|S )Nr9   r   rj   T)value)r  r9  )r+  rr   ro   padrR   r0   rb   r  r:  masked_fill_r   repeat_interleaver   )r&   textcls_maskadditive_maskr)   r)   r*   build_cls_mask  s   
zTextTransformer.build_cls_maskrA  c                 C   s   |dkr|d d df |d d dd f }}||fS |dkr6|d d df |d d d df }}||fS |dkrW|d us@J |t |jd |jddf |}}||fS | }}||fS )Nr.  r   r9   r/  rh   r'  r  )r0   arangerR   r'  )r&   r.   rA  r   pooledtokensr)   r)   r*   text_global_pool  s   &
&&z TextTransformer.text_global_poolc                 C   s   |j d }| |}| j}|d ur|d |d |f }|| jd |  }| j||d}| |}| j||| jd\}}| jd urRt	| jt
jrM| |}n|| j }| jrY||fS |S )Nr9   r   )r   )rR   r4  rl   r   r   r5  rH  r   r8  r   r   rO   r-  )r&   rA  seq_lenr.   rl   rF  rG  r)   r)   r*   r/     s$   



zTextTransformer.forwardr   )Nr'  )r4   r5   r6   r   rY   r   rZ   rL   r   r   r"  r   r"   r7  r   rD  r
   r0   r[   rH  r/   r7   r)   r)   r'   r*   r$  (  sx    	
D

r$  c                	       s   e Zd Zedfdededef fddZdde	fd	d
Z
dde	fddZdde	f fddZ		ddeej deej fddZe		ddede	dee fddZedd Z  ZS )CLIPg$I$I,@
vision_cfgtext_cfginit_logit_scalec                    sR   t t| jdi t| tdi t|| _| jj| _tt	
g | | _d S )Nr)   )r!   rJ  r"   r   r   r   r   r   r   r0   r1   logit_scale)r&   rK  rL  rM  r'   r)   r*   r"     s   
zCLIP.__init__F	normalizec                 C   s    |  |}|rtj|ddS |S Nrh   r  )r   ro   rO  )r&   imagerO  r.   r)   r)   r*   encode_image  s   
zCLIP.encode_imagec                 C   sP   |j \}}}}}||| |||}| j||d}	|	||d}
|
jdd}
|
S )NrO  rh   r9   r  )rR   r  rR  r  )r&   videorO  bncrk   r  frms	frm_featsvideo_featsr)   r)   r*   encode_video  s   zCLIP.encode_videoc                    s"   t  |}|rtj|ddS |S rP  )r!   r/   ro   rO  )r&   rA  rO  r.   r'   r)   r*   encode_text  s   zCLIP.encode_textNrQ  rA  c                 C   sD   |d ur| j |ddnd }|d ur| j|ddnd }||| j fS )NTrS  )rR  r\  rN  exp)r&   rQ  rA  image_featurestext_featuresr)   r)   r*   r/     s
   zCLIP.forwardr   r   r   c                 C   sH   |t vs|tvrt| d| t | t| }|r"|t|| |S )Nr   )r   r   r   r   r   )r   r   r   r   r  r)   r)   r*   r    s   zCLIP.from_configc                 C   s   dd t D S )Nc                 S   s   g | ]}|t v r|qS r)   )r   r   r)   r)   r*   r     s    z*CLIP.available_configs.<locals>.<listcomp>)r   r  r)   r)   r*   r    s   zCLIP.available_configs)F)NNr!  )r4   r5   r6   nplogr   r   r   r"   r   rR  r[  r\  r
   r0   r[   r/   r#  r"  r  r  r7   r)   r)   r'   r*   rJ    s>    
rJ  ):copymathrandomcollectionsr   dataclassesr   	functoolsr   loggingr   typingr   r   r   r	   r
   r   r   r   r   numpyr`  r0   torch.nnr   einopsr   timm.layersr   r   ro   torch.nn.initr   r   r   torch.nn.parameterr   torch.utils.checkpointr   core.vision_encoder.roper   core.vision_encoder.configr   r   r   r   r   r   r}   r   r8   r\   r~   r   r   r$  rJ  r)   r)   r)   r*   <module>   s@    ,.?KA   