o
    ߥi/                     @   s&  d dl Z d dlZd dlZd dlZd dlmZ d dlmZmZ d dl	Z
d dlZd dlm  mZ d dlmZ d dlmZ d dlmZ G dd dejZG d	d
 d
eZG dd deZG dd deZG dd deZG dd deZdefddZej rdnddfdedeeejf fddZdS )    N)OrderedDict)TupleUnion)nn)tqdm)
TorchModelc                       s$   e Zd Zdejf fddZ  ZS )	LayerNormxc                    s$   |j }t |tj}||S N)dtypesuperforwardtypetorchfloat32)selfr	   	orig_typeret	__class__ _/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/cv/vop_retrieval/backbone.pyr      s   
zLayerNorm.forward)__name__
__module____qualname__r   Tensorr   __classcell__r   r   r   r   r      s    r   c                   @   s   e Zd ZdejfddZdS )	QuickGELUr	   c                 C   s   |t d|  S )NgZd;?)r   sigmoidr   r	   r   r   r   r          zQuickGELU.forwardN)r   r   r   r   r   r   r   r   r   r   r      s    r   c                       sP   e Zd Z	ddededejf fddZdejfdd	Zdejfd
dZ  Z	S )ResidualAttentionBlockNd_modeln_head	attn_maskc              
      sr   t    t||| _t|| _ttdt	||d fdt
 fdt	|d |fg| _t|| _|| _d S )Nc_fc   geluc_proj)r   __init__r   MultiheadAttentionattnr   ln_1
Sequentialr   Linearr   mlpln_2r$   )r   r"   r#   r$   r   r   r   r)   &   s   



zResidualAttentionBlock.__init__r	   c                 C   s>   | j d ur| j j|j|jdnd | _ | j|||d| j dd S )Nr   deviceF)need_weightsr$   r   )r$   tor   r2   r+   r   r   r   r   	attention4   s   
z ResidualAttentionBlock.attentionc                 C   s,   ||  | | }|| | | }|S r
   )r5   r,   r/   r0   r   r   r   r   r   ;   s   zResidualAttentionBlock.forwardr
   )
r   r   r   intr   r   r)   r5   r   r   r   r   r   r   r!   $   s    r!   c                	       sD   e Zd Z	ddedededejf fddZdejfd	d
Z  ZS )TransformerNwidthlayersheadsr$   c                    s<   t    | _|| _tj fddt|D  | _d S )Nc                    s   g | ]}t  qS r   )r!   ).0_r$   r:   r8   r   r   
<listcomp>K   s    
z(Transformer.__init__.<locals>.<listcomp>)r   r)   r8   r9   r   r-   range	resblocks)r   r8   r9   r:   r$   r   r=   r   r)   C   s   
zTransformer.__init__r	   c                 C   s
   |  |S r
   )r@   r   r   r   r   r   P   s   
zTransformer.forwardr
   )	r   r   r   r6   r   r   r)   r   r   r   r   r   r   r7   A   s    r7   c                       sF   e Zd Zdedededededef fddZd	ejfd
dZ  ZS )VisualTransformerinput_resolution
patch_sizer8   r9   r:   
output_dimc                    s   t    || _|| _tjd|||dd| _|d }t|t	| | _
t|t	|| d d | | _t|| _t|||| _t|| _t|t	|| | _d S )N   F)in_channelsout_channelskernel_sizestridebias            )r   r)   rB   rD   r   Conv2dconv1	Parameterr   randnclass_embeddingpositional_embeddingr   ln_prer7   transformerln_postproj)r   rB   rC   r8   r9   r:   rD   scaler   r   r   r)   V   s&   




zVisualTransformer.__init__r	   c                 C   s   |  |}||jd |jd d}|ddd}| j|j}tj|jd d|jd |j|j	d}|| }tj
||gdd}|| j|j }| |}|ddd}| |}|ddd}| |d d dd d f }| jd urx|| j }|S )Nr   rM   rL   r1   dim)rO   reshapeshapepermuterR   r4   r   r   zerosr2   catrS   rT   rU   rV   rW   )r   r	   x_1x_2r   r   r   r   m   s$   




zVisualTransformer.forward)	r   r   r   r6   r)   r   r   r   r   r   r   r   r   rA   T   s    rA   c                       s   e Zd Zdededeeeeeef ef dededededed	ed
ef fddZdd Zdd Ze	dd Z
dd ZdddZdd Z  ZS )CLIP	embed_dimimage_resolutionvision_layersvision_widthvision_patch_sizecontext_length
vocab_sizetransformer_widthtransformer_headstransformer_layersc                    s   t    || _|d }t||||||d| _t||
|	|  d| _|| _t	
||| _t	t| j|| _t|| _t	t||| _t	tg td | _|   d S )N@   )rB   rC   r8   r9   r:   rD   )r8   r9   r:   r$   g$I$I,@)r   r)   ri   rA   visualr7   build_attention_maskrU   rj   r   	Embeddingtoken_embeddingrP   r   emptyrS   r   ln_finaltext_projectiononesnploglogit_scaleinitialize_parameters)r   rd   re   rf   rg   rh   ri   rj   rk   rl   rm   vision_headsr   r   r   r)      s8   


zCLIP.__init__c                 C   s   t jj| jjdd t jj| jdd | jjd d| jj d  }| jjd }d| jj d }| jj	D ]-}t jj|j
j|d t jj|j
jj|d t jj|jjj|d t jj|jjj|d q3| jd urut jj| j| jjd d d S d S )Ng{Gz?)stdg{Gz?rK   rL   )r   initnormal_rr   weightrS   rU   r8   r9   r@   r+   in_proj_weightout_projr/   r%   r(   ru   )r   proj_stdattn_stdfc_stdblockr   r   r   rz      s"   


zCLIP.initialize_parametersc                 C   s,   t | j| j}|td |d |S )Nz-infrM   )r   rs   ri   fill_floattriu_)r   maskr   r   r   rp      s   
zCLIP.build_attention_maskc                 C   s   | j jjjS r
   )ro   rO   r   r   )r   r   r   r   r      s   z
CLIP.dtypec                 C   s   |  || jS r
   )ro   r   r   )r   imager   r   r   encode_image   r    zCLIP.encode_imageFc                 C   s   |  || j}|| j| j }|ddd}| |}|ddd}| || j}|r5|| j S |t	|j
d |jddf | j }|S )NrM   r   rL   rY   rZ   )rr   r   r   rS   r^   rU   rt   ru   r   aranger]   argmax)r   textreturn_all_tokensr	   r   r   r   encode_text   s   


zCLIP.encode_textc                 C   sj   |  |}| |}||jddd }||jddd }| j }|| |  }|| |  }||fS )NrY   T)r[   keepdim)r   r   normry   expt)r   r   r   image_featurestext_featuresry   logits_per_imagelogits_per_textr   r   r   r      s   


zCLIP.forward)F)r   r   r   r6   r   r   r)   rz   rp   propertyr   r   r   r   r   r   r   r   r   rc      s2    
$

rc   
state_dictc                    sh  d v }|r1 d j d }tdd   D } d j d }t d j d d d	 }|| }n6 fd
ddD }t|} d j d }t d j d d d	 }d }|d d  d j d kscJ |d } d j d }	 d j d }
 d j d } d j d }|d }ttdd  D }t|	|||||
||||
}dD ]	}| v r |= q|  | S )Nzvisual.projzvisual.conv1.weightr   c                 S   s$   g | ]}| d r|dr|qS )zvisual.z.attn.in_proj_weight)
startswithendswithr;   kr   r   r   r>      s    zbuild_model.<locals>.<listcomp>rY   zvisual.positional_embeddingrM   g      ?c                    s&   g | ] t t fd dD qS )c                 3   s.    | ]}| d   r|dd V  qdS )zvisual.layer.rL   Nr   splitr   br   r   	<genexpr>   s    
z)build_model.<locals>.<listcomp>.<genexpr>)lenset)r;   r   r   r   r>      s    )rM   rL   rE   r&   zvisual.layer1.0.conv1.weightz$visual.attnpool.positional_embeddingrL       ru   rS   ztoken_embedding.weightzln_final.weightrn   c                 s   s(    | ]}| d r|dd V  qdS )ztransformer.resblocksr   rL   Nr   r   r   r   r   r     s    
zbuild_model.<locals>.<genexpr>)rB   ri   rj   )	r]   r   keysroundtupler   rc   load_state_dicteval)r   vitrg   rf   rh   	grid_sizere   countsoutput_widthrd   ri   rj   rk   rl   rm   modelkeyr   r   r   build_model   sd   




r   cudacpuTnamer2   c                    sl  d}| }zt jj||r ndd }d }W n ty3   |r*td| d d}t j|dd}Y nw |sMt|p<| 	 }t
 dkrK|  |S t jj fddg d}d	d
 |jdD d fdd}|| ||j ||j t
 dkrt jjdd g d}t|jd d }	|	 fdd}
||
 |
|j |
|j |  |S )NFr   )map_locationzFile z6 is not a JIT archive. Loading as a state dict insteadc                      s   t g t  S r
   )r   rv   r4   r2   r   )r2   r   r   <lambda>5  s    zload_clip.<locals>.<lambda>)example_inputsc                 S   s   g | ]
}d t |v r|qS )Device)repr)r;   nr   r   r   r>   6  s
    zload_clip.<locals>.<listcomp>prim::ConstantrY   c                    st   t | dr	| jgng }t | dr|| jj |D ]}|dD ]}d| v r6t|d dr6|  q qd S )Ngraphforward1r   valuer   )	hasattrr   appendr   findAllNodesattributeNamesstrr   copyAttributes)modulegraphsr   node)device_noder   r   patch_device;  s   

zload_clip.<locals>.patch_devicec                   S   s   t g  S r
   )r   rv   r   r   r   r   r   r   L  s    aten::torM   c                    s   t | dr	| jgng }t | dr|| jj |D ](}|dD ] }t| }dD ]}||  d dkr?||    q*q qd S )Nr   r   r   )rM   rL   r      )	r   r   r   r   r   listinputsr   r   )r   r   r   r   r   i)
float_noder   r   patch_floatP  s   
zload_clip.<locals>.patch_float)r   jitloadr   RuntimeErrorwarningswarnr   r   r4   r   r   tracer   r   applyr   r   r   findNoder   r   )r   r2   r   
model_pathr   r   device_holderr   float_holderfloat_inputr   r   )r2   r   r   r   	load_clip  s\   







r   ) hashlibosurllibr   collectionsr   typingr   r   numpyrw   r   torch.nn.functionalr   
functionalFr   'modelscope.models.base.base_torch_modelr   r   r   r!   r7   rA   rc   dictr   r   is_availabler   r2   r   r   r   r   r   <module>   s4   1b7