o
    ߥix.                     @   s   d dl Z d dlmZ d dlmZmZ d dlZd dlZd dlm	Z	 G dd de	j
ZG dd de	jZG d	d
 d
e	j
ZG dd de	j
ZG dd de	j
ZG dd de	j
ZdefddZej rddnddfdedeeejf fddZdS )    N)OrderedDict)TupleUnion)nnc                       s   e Zd Zdededeeeeeef ef dededededed	ed
ef fddZdd Zdd Ze	dd Z
dd Zdd Zdd Z  ZS )CLIP	embed_dimimage_resolutionvision_layersvision_widthvision_patch_sizecontext_length
vocab_sizetransformer_widthtransformer_headstransformer_layersc                    s   t    || _|d }t||||||d| _t||
|	|  d| _|| _t	
||| _t	t| j|| _t|| _t	t||| _t	tg td | _|   d S )N@   )input_resolution
patch_sizewidthlayersheads
output_dim)r   r   r   	attn_maskg$I$I,@)super__init__r   VisionTransformervisualTransformerbuild_attention_masktransformerr   r   	Embeddingtoken_embedding	Parametertorchemptypositional_embedding	LayerNormln_finaltext_projectiononesnploglogit_scaleinitialize_parameters)selfr   r   r	   r
   r   r   r   r   r   r   vision_heads	__class__ ]/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/multi_modal/soonet/clip.pyr      s8   


zCLIP.__init__c                 C   s   t jj| jjdd t jj| jdd | jjd d| jj d  }| jjd }d| jj d }| jj	D ]-}t jj|j
j|d t jj|j
jj|d t jj|jjj|d t jj|jjj|d q3| jd urut jj| j| jjd d d S d S )Ng{Gz?)stdg{Gz?         )r   initnormal_r!   weightr%   r   r   r   	resblocksattnin_proj_weightout_projmlpc_fcc_projr(   )r.   proj_stdattn_stdfc_stdblockr2   r2   r3   r-   <   s"   


zCLIP.initialize_parametersc                 C   s,   t | j| j}|td |d |S )Nz-inf   )r#   r$   r   fill_floattriu_)r.   maskr2   r2   r3   r   N   s   
zCLIP.build_attention_maskc                 C   s   | j jjjS N)r   conv1r9   dtype)r.   r2   r2   r3   rL   V   s   z
CLIP.dtypec                 C   s   |  || jS rJ   )r   typerL   )r.   imager2   r2   r3   encode_imageZ      zCLIP.encode_imagec                 C   s   |  || j}|| j| j }|ddd}| |}|ddd}| || j}|t|j	d |j
ddf }|S )NrE   r   r6   dim)r!   rM   rL   r%   permuter   r'   r#   arangeshapeargmax)r.   textxr2   r2   r3   encode_text]   s   

 zCLIP.encode_textc                 C   sb   |  |}| |}||jddd }||jddd }| j }|| |  }| }||fS )NrE   T)rS   keepdim)rO   rZ   normr,   expt)r.   rN   rX   image_featurestext_featuresr,   logits_per_imagelogits_per_textr2   r2   r3   forwardm   s   


zCLIP.forward)__name__
__module____qualname__intr   r   r   r-   r   propertyrL   rO   rZ   rc   __classcell__r2   r2   r0   r3   r      s:    	
-
r   c                       s(   e Zd ZdZdejf fddZ  ZS )r&   z*Subclass torch's LayerNorm to handle fp16.rY   c                    s$   |j }t |tj}||S rJ   )rL   r   rc   rM   r#   float32)r.   rY   	orig_typeretr0   r2   r3   rc      s   
zLayerNorm.forward)rd   re   rf   __doc__r#   Tensorrc   ri   r2   r2   r0   r3   r&      s    r&   c                   @   s   e Zd ZdejfddZdS )	QuickGELUrY   c                 C   s   |t d|  S )NgZd;?)r#   sigmoidr.   rY   r2   r2   r3   rc      rP   zQuickGELU.forwardN)rd   re   rf   r#   rn   rc   r2   r2   r2   r3   ro      s    ro   c                       sP   e Zd Z	ddededejf fddZdejfdd	Zdejfd
dZ  Z	S )ResidualAttentionBlockNd_modeln_headr   c              
      sr   t    t||| _t|| _ttdt	||d fdt
 fdt	|d |fg| _t|| _|| _d S )Nr?      gelur@   )r   r   r   MultiheadAttentionr;   r&   ln_1
Sequentialr   Linearro   r>   ln_2r   )r.   rs   rt   r   r0   r2   r3   r      s   



zResidualAttentionBlock.__init__rY   c                 C   s>   | j d ur| j j|j|jdnd | _ | j|||d| j dd S )NrL   deviceF)need_weightsr   r   )r   torL   r}   r;   rq   r2   r2   r3   	attention   s   
z ResidualAttentionBlock.attentionc                 C   s,   ||  | | }|| | | }|S rJ   )r   rx   r>   r{   rq   r2   r2   r3   rc      s   zResidualAttentionBlock.forwardrJ   )
rd   re   rf   rg   r#   rn   r   r   rc   ri   r2   r2   r0   r3   rr      s    rr   c                	       sD   e Zd Z	ddedededejf fddZdejfd	d
Z  ZS )r   Nr   r   r   r   c                    s<   t    | _|| _tj fddt|D  | _d S )Nc                    s   g | ]}t  qS r2   )rr   ).0_r   r   r   r2   r3   
<listcomp>   s    
z(Transformer.__init__.<locals>.<listcomp>)r   r   r   r   r   ry   ranger:   )r.   r   r   r   r   r0   r   r3   r      s   
zTransformer.__init__rY   c                 C   s
   |  |S rJ   )r:   rq   r2   r2   r3   rc      s   
zTransformer.forwardrJ   )	rd   re   rf   rg   r#   rn   r   rc   ri   r2   r2   r0   r3   r      s    r   c                       sF   e Zd Zdedededededef fddZd	ejfd
dZ  ZS )r   r   r   r   r   r   r   c                    s   t    || _|| _tjd|||dd| _|d }t|t	| | _
t|t	|| d d | | _t|| _t|||| _t|| _t|t	|| | _d S )N   F)in_channelsout_channelskernel_sizestridebiasr5   r6   rE   )r   r   r   r   r   Conv2drK   r"   r#   randnclass_embeddingr%   r&   ln_prer   r   ln_postproj)r.   r   r   r   r   r   r   scaler0   r2   r3   r      s&   




zVisionTransformer.__init__rY   c                 C   s   |  |}||jd |jd d}|ddd}| j|jtj|jd d|jd |j|j	d }tj
||gdd}|| j|j }| |}|ddd}| |}|ddd}| |d d dd d f }| jd urt|| j }|S )Nr   rE   rQ   r6   r|   rR   )rK   reshaperV   rT   r   r   rL   r#   zerosr}   catr%   r   r   r   r   )r.   rY   class_tokenr2   r2   r3   rc      s$   




zVisionTransformer.forward)	rd   re   rf   rg   r   r#   rn   rc   ri   r2   r2   r0   r3   r      s    r   
state_dictc                 C   s   | d j d }tdd |  D }| d j d }t| d j d d d }|| }| d	 j d }| d
 j d }| d j d }| d j d }	|	d }
ttdd | D }t||||||||	|
|
}dD ]	}|| v rm| |= qd||  | S )Nzvisual.conv1.weightr   c                 S   s$   g | ]}| d r|dr|qS )zvisual.z.attn.in_proj_weight)
startswithendswithr   kr2   r2   r3   r      s    zbuild_model.<locals>.<listcomp>rQ   zvisual.positional_embeddingrE   g      ?r(   r%   ztoken_embedding.weightzln_final.weightr   c                 s   s(    | ]}| d r|dd V  qdS )ztransformer.resblocks.r6   N)r   splitr   r2   r2   r3   	<genexpr>   s    
zbuild_model.<locals>.<genexpr>)r   r   r   )rV   lenkeysroundsetr   load_state_dicteval)r   r
   r	   r   	grid_sizer   r   r   r   r   r   r   modelkeyr2   r2   r3   build_model   s:   

r   cudacpuTnamer}   c                    sl  d}| }zt jj||r ndd }d }W n ty3   |r*td| d d}t j|dd}Y nw |sMt|p<| 	 }t
 dkrK|  |S t jj fddg d}d	d
 |jdD d fdd}|| ||j ||j t
 dkrt jjdd g d}t|jd d }	|	 fdd}
||
 |
|j |
|j |  |S )NFr   )map_locationzFile z6 is not a JIT archive. Loading as a state dict insteadc                      s   t g t  S rJ   )r#   r)   r   r}   r2   )r}   r2   r3   <lambda>)  s    zload_clip.<locals>.<lambda>)example_inputsc                 S   s   g | ]
}d t |v r|qS )Device)repr)r   nr2   r2   r3   r   *  s
    zload_clip.<locals>.<listcomp>prim::ConstantrQ   c                    st   t | dr	| jgng }t | dr|| jj |D ]}|dD ]}d| v r6t|d dr6|  q qd S )Ngraphforward1r   valuer   )	hasattrr   appendr   findAllNodesattributeNamesstrr   copyAttributes)modulegraphsr   node)device_noder2   r3   patch_device/  s   

zload_clip.<locals>.patch_devicec                   S   s   t g  S rJ   )r#   r)   rG   r2   r2   r2   r3   r   @  s    aten::torE   c                    s   t | dr	| jgng }t | dr|| jj |D ](}|dD ] }t| }dD ]}||  d dkr?||    q*q qd S )Nr   r   r   )rE   r6   r      )	r   r   r   r   r   listinputsr   r   )r   r   r   r   r   i)
float_noder2   r3   patch_floatD  s   
zload_clip.<locals>.patch_float)r#   jitloadr   RuntimeErrorwarningswarnr   r   r   r   rG   tracer   r   applyrO   rZ   r   findNoder   r   )r   r}   r   
model_pathr   r   device_holderr   float_holderfloat_inputr   r2   )r}   r   r   r3   	load_clip  s\   







r   )r   collectionsr   typingr   r   numpyr*   r#   r   Moduler   r&   ro   rr   r   r   dictr   r   is_availabler   r}   r   r2   r2   r2   r3   <module>   s(   r	0#