o
    ߥinW                     @   s  d dl Z d dlmZ d dlmZmZmZmZ d dlZd dl	Z
d dlZd dlmZ d dlm  mZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZm Z  d dl!m"Z" e" Z#dgZ$G dd dej%Z&G dd dej%Z'G dd dej%Z(G dd dej)Z)G dd dej%Z*G dd dej%Z+G dd dej%Z,G dd dej%Z-G dd dej%Z.dd  Z/d!ej%fd"d#Z0ej1e j2ej3d$G d%d deZ4dS )&    N)OrderedDict)AnyDictTupleUnion)Models)
TorchModel)MODELS)FullTokenizer)
BertConfig)	BertModel)ModeKeys	ModelFileTasks)
get_loggerCLIPForMultiModalEmbeddingc                       s2   e Zd ZdZd fdd	ZdejfddZ  ZS )	
Bottleneck      c                    s  t    tj||ddd| _t|| _tj||dddd| _t|| _|dkr/t	|nt
 | _tj||| j ddd| _t|| j | _tjdd| _d | _|| _|dksb||tj krttdt	|fd	tj||| j dddd
fdt|| j fg| _d S d S )Nr   F)bias   )paddingr   Tinplacez-10)strider   1)super__init__nnConv2dconv1BatchNorm2dbn1conv2bn2	AvgPool2dIdentityavgpool	expansionconv3bn3ReLUrelu
downsampler   r   
Sequentialr   )selfinplanesplanesr   	__class__ \/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/multi_modal/clip/model.pyr   +   s6   

zBottleneck.__init__xc                 C   st   |}|  | | |}|  | | |}| |}| | |}| jd ur/| |}||7 }|  |}|S N)	r-   r#   r!   r%   r$   r(   r+   r*   r.   )r0   r7   identityoutr5   r5   r6   forwardK   s   



zBottleneck.forwardr   )	__name__
__module____qualname__r)   r   torchTensorr;   __classcell__r5   r5   r3   r6   r   (   s     r   c                	       s:   e Zd Z	d
dedededef fddZdd	 Z  ZS )AttentionPool2dNspacial_dim	embed_dim	num_heads
output_dimc                    st   t    tt|d d ||d  | _t||| _t||| _	t||| _
t||p2|| _|| _d S )N   r   g      ?)r   r   r   	Parameterr@   randnpositional_embeddingLineark_projq_projv_projc_projrF   )r0   rD   rE   rF   rG   r3   r5   r6   r   ]   s   

zAttentionPool2d.__init__c              	   C   s4  | |jd |jd |jd |jd  ddd}tj|jddd|gdd}|| jd d d d d f |j }t	j
di d|d	|d
|d|jd d| jd| jjd| jjd| jjdd dt| jj| jj| jjgdd dd ddddd| jjd| jjddd| jdd\}}|d S )Nr   r   rH   r   TdimkeepdimrR   querykeyvalueembed_dim_to_checkrF   q_proj_weightk_proj_weightv_proj_weightin_proj_weightin_proj_biasbias_kbias_vadd_zero_attnF	dropout_pout_proj_weightout_proj_biasuse_separate_proj_weighttrainingneed_weightsr5   )reshapeshapepermuter@   catmeanrK   todtypeFmulti_head_attention_forwardrF   rN   weightrM   rO   r   rP   rf   )r0   r7   _r5   r5   r6   r;   k   sf   $

	
zAttentionPool2d.forwardr8   )r=   r>   r?   intr   r;   rB   r5   r5   r3   r6   rC   [   s    rC   c                       s8   e Zd ZdZ		d fdd	ZdddZd	d
 Z  ZS )ModifiedResNeta  
    A ResNet class that is similar to torchvision's but contains the following changes:
    - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
    - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
    - The final pooling layer is a QKV attention instead of an average pool
       @   c                    s6  t    || _|| _tjd|d ddddd| _t|d | _tj|d |d dddd| _	t|d | _
tj|d |dddd| _t|| _td| _tjdd| _|| _| ||d	 | _| j|d |d dd
| _| j|d |d dd
| _| j|d |d dd
| _|d }t|d |||| _d S )Nr   rH   r   F)kernel_sizer   r   r   )rw   r   r   Tr   r   )r   r          )r   r   rG   input_resolutionr   r    r!   r"   r#   r$   r%   r*   r+   r&   r(   r,   r-   	_inplanes_make_layerlayer1layer2layer3layer4rC   attnpool)r0   layersrG   headsrz   widthrE   r3   r5   r6   r      s4   


zModifiedResNet.__init__r   c                 C   sH   t | j||g}|t j | _td|D ]}|t | j| qtj| S )Nr   )r   r{   r)   rangeappendr   r/   )r0   r2   blocksr   r   rr   r5   r5   r6   r|      s
   
zModifiedResNet._make_layerc                    sZ    fdd}|  jjj}||} |} |} |} |} |}|S )Nc                    sL    j  jf j jf j jffD ]\}} ||| } q | } | S r8   )r!   r#   r$   r%   r*   r+   r-   r(   )r7   convbnr0   r5   r6   stem   s   

z$ModifiedResNet.forward.<locals>.stem)	typer!   rq   rn   r}   r~   r   r   r   )r0   r7   r   r5   r   r6   r;      s   




zModifiedResNet.forward)ru   rv   r<   )r=   r>   r?   __doc__r   r|   r;   rB   r5   r5   r3   r6   rt      s    
"	rt   c                       s(   e Zd ZdZdejf fddZ  ZS )	LayerNormz*Subclass torch's LayerNorm to handle fp16.r7   c                    s$   |j }t |tj}||S r8   )rn   r   r;   r   r@   float32)r0   r7   	orig_typeretr3   r5   r6   r;      s   
zLayerNorm.forward)r=   r>   r?   r   r@   rA   r;   rB   r5   r5   r3   r6   r      s    r   c                   @   s   e Zd ZdejfddZdS )	QuickGELUr7   c                 C   s   |t d|  S )NgZd;?)r@   sigmoidr0   r7   r5   r5   r6   r;         zQuickGELU.forwardN)r=   r>   r?   r@   rA   r;   r5   r5   r5   r6   r      s    r   c                       sP   e Zd Z	ddededejf fddZdejfdd	Zdejfd
dZ  Z	S )ResidualAttentionBlockNd_modeln_head	attn_maskc              
      sr   t    t||| _t|| _ttdt	||d fdt
 fdt	|d |fg| _t|| _|| _d S )Nc_fcr   gelurP   )r   r   r   MultiheadAttentionattnr   ln_1r/   r   rL   r   mlpln_2r   )r0   r   r   r   r3   r5   r6   r      s   



zResidualAttentionBlock.__init__r7   c                 C   s>   | j d ur| j j|j|jdnd | _ | j|||d| j dd S )Nrn   deviceF)rg   r   r   )r   rm   rn   r   r   r   r5   r5   r6   	attention   s   
z ResidualAttentionBlock.attentionc                 C   s,   ||  | | }|| | | }|S r8   )r   r   r   r   r   r5   r5   r6   r;      s   zResidualAttentionBlock.forwardr8   )
r=   r>   r?   rs   r@   rA   r   r   r;   rB   r5   r5   r3   r6   r      s    r   c                	       sD   e Zd Z	ddedededejf fddZdejfd	d
Z  ZS )TransformerNr   r   r   r   c                    s<   t    | _|| _tj fddt|D  | _d S )Nc                    s   g | ]}t  qS r5   )r   ).0rr   r   r   r   r5   r6   
<listcomp>  s    
z(Transformer.__init__.<locals>.<listcomp>)r   r   r   r   r   r/   r   	resblocks)r0   r   r   r   r   r3   r   r6   r      s   
zTransformer.__init__r7   c                 C   s
   |  |S r8   )r   r   r5   r5   r6   r;     s   
zTransformer.forwardr8   )	r=   r>   r?   rs   r@   rA   r   r;   rB   r5   r5   r3   r6   r      s    r   c                       sF   e Zd Zdedededededef fddZd	ejfd
dZ  ZS )VisualTransformerrz   
patch_sizer   r   r   rG   c                    s   t    || _|| _tjd|||dd| _|d }t|t	| | _
t|t	|| d d | | _t|| _t|||| _t|| _t|t	|| | _d S )Nr   F)in_channelsout_channelsrw   r   r         rH   r   )r   r   rz   rG   r   r    r!   rI   r@   rJ   class_embeddingrK   r   ln_prer   transformerln_postproj)r0   rz   r   r   r   r   rG   scaler3   r5   r6   r     s&   




zVisualTransformer.__init__r7   c              	   C   s   |  |}||jd |jd d}|ddd}tj| j|jtj	|jd d|jd |j|j
d |gdd}|| j|j }| |}|ddd}| |}|ddd}| |d d dd d f }| jd urr|| j }|S )Nr   r   rY   rH   r   rT   )r!   rh   ri   rj   r@   rk   r   rm   rn   zerosr   rK   r   r   r   r   r   r5   r5   r6   r;   *  s4   
	



zVisualTransformer.forward)	r=   r>   r?   rs   r   r@   rA   r;   rB   r5   r5   r3   r6   r     s    r   c                %       s   e Zd Z	d"dededeeeeeef ef dedededed	ed
edededededededededef$ fddZ	dd Z
edd Zdd Zdd Zdd Zd d! Z  ZS )#CLIPrv   rE   image_resolutionvision_layersvision_widthvision_patch_size
vocab_size!text_attention_probs_dropout_probtext_hidden_acttext_hidden_dropout_probtext_hidden_sizetext_initializer_rangetext_intermediate_sizetext_max_position_embeddingstext_num_attention_headstext_num_hidden_layerstext_type_vocab_size	tokenizervision_head_widthc                    s   t    t|ttfr|d | }t|||||d| _n|| }t||||||d| _t||
|||||	||||dd| _	t
| j	| _tt|
|| _ttg td | _|| _|   d S )Nry   )r   rG   r   rz   r   )rz   r   r   r   r   rG   g-q=)vocab_size_or_config_json_filehidden_sizenum_hidden_layersnum_attention_headsintermediate_size
hidden_acthidden_dropout_probattention_probs_dropout_probmax_position_embeddingstype_vocab_sizeinitializer_rangelayer_norm_eps$I$I,@)r   r   
isinstancetuplelistrt   visualr   r   bert_configr   bertr   rI   r@   emptytext_projectiononesnploglogit_scaler   initialize_parameters)r0   rE   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   vision_headsr3   r5   r6   r   K  sP   


zCLIP.__init__c                 C   s  t tg td | _t| jt	rw| jj
d urR| jj
jjd }t jj| jj
jj|d t jj| jj
jj|d t jj| jj
jj|d t jj| jj
jj|d | jj| jj| jj| jjfD ]}| D ]\}}|drut j| qfq`| jd urt jj| j| jjd d d S d S )Nr   r   )stdz
bn3.weight)r   rI   r@   r   r   r   r   r   r   rt   r   rP   in_featuresinitnormal_rN   rq   rM   rO   r}   r~   r   r   named_parametersendswithzeros_r   r   r   )r0   r   resnet_blocknameparamr5   r5   r6   r     s*   


zCLIP.initialize_parametersc                 C   s   | j jjjS r8   )r   r!   rq   rn   r   r5   r5   r6   rn     s   z
CLIP.dtypec                 C   s   |  || jS r8   )r   r   rn   )r0   imager5   r5   r6   encode_image  r   zCLIP.encode_imagec                 C   sT   | j jd }||| j}| j||dd | j}|d d dd d f | j S )Nz[PAD])attention_maskr   )r   vocabner   rn   r   r   )r0   text	pad_indexr   r7   r5   r5   r6   encode_text  s   zCLIP.encode_textc                 C   s   |d us|d usJ d|d u r|  |S |d u r| |S | |}|  |}||jddd }||jddd }||| j fS )Nz#text and image cannot both be None!rY   TrQ   )r   r   normr   exp)r0   r   r   image_featurestext_featuresr5   r5   r6   r;     s   



zCLIP.forwardc                 C   sb   |  |}| |}||jddd }||jddd }| j }|| |  }| }||fS )Nr   TrQ   )r   r   r   r   r   t)r0   r   r   r   r   r   logits_per_imagelogits_per_textr5   r5   r6   get_similarity  s   


zCLIP.get_similarity)rv   )r=   r>   r?   rs   r   r   floatstrr
   r   r   propertyrn   r   r   r;   r   rB   r5   r5   r3   r6   r   I  s\    	
D
r   c                 C   s4   |   D ]}|j |_|jr|jj |j_qd S r8   )
parametersdatar   grad)modelpr5   r5   r6   convert_models_to_fp32  s   r  r  c                 C   s   dd }|  | dS )z+Convert applicable model parameters to fp16c                 S   s   t | tjtjtjfr | jj | j_| jd ur | jj | j_t | tj	rGg dd dD dddD ]}t
| |}|d urF|j |_q5t | trR| tj dD ]}t| |rjt
| |}|d urj|j |_qTd S )Nc                 S   s   g | ]}| d qS )_proj_weightr5   )r   sr5   r5   r6   r     s    zEconvert_weights.<locals>._convert_weights_to_fp16.<locals>.<listcomp>)inqkvr^   r_   r`   )r   r   )r   r   Conv1dr    rL   rq   r   halfr   r   getattrr   rm   r@   hasattr)moduleattrtensorr   r5   r5   r6   _convert_weights_to_fp16  s6   




z1convert_weights.<locals>._convert_weights_to_fp16N)apply)r  r  r5   r5   r6   convert_weights  s   r  )module_namec                       sl   e Zd Z fddZdeeef deeef fddZdeeef deeef fdd	Ze	d
d Z
  ZS )r   c              	      sL  t  j|d|i| d|}td|  tj|s J d|}td|  tj|s5J t|ddd6}t|ddd}t	
|| _t	
| D ]	\}}	|	| j|< qRW d    n1 sfw   Y  W d    n1 suw   Y  | d	tj }
t|
d
| _tdi | jd| ji| _t| j t
| d	tj d}d|v r|d n|}tt| d drdd | D }tt| d drdd | D }| j| | j  tj rdttjddnd| _ tj r| j!| j  tdttjdd d S | j"  td d S )N	model_dirz{}/vision_model_config.jsonz!Loading vision model config from z{}/text_model_config.jsonzLoading text model config from rzutf-8)encoding/)
vocab_filer   cpu
state_dictr   r  c                 S   "   i | ]\}}|t d d |qS )zmodule.Nlenr   r	  r
  r5   r5   r6   
<dictcomp>#     " z7CLIPForMultiModalEmbedding.__init__.<locals>.<dictcomp>
clip_modelc                 S   r  )zclip_model.Nr  r   r5   r5   r6   r!  &  r"  zcuda:{}
LOCAL_RANKz%Use GPU {} for finetuning & inferencez"Use CPU for finetuning & inferencer5   )#r   r   formatloggerinfoospathexistsopenjsonload
model_infoitemsr   
VOCAB_FILEr
   r   r   r#  r  r@   TORCH_MODEL_BIN_FILEnextiter
startswithload_state_dictevalcudais_availablers   environgetr   rm   r   )r0   r  argskwargsvision_model_config_filetext_model_config_filefvftr	  r
  r  
checkpointsdr3   r5   r6   r      st   
 



z#CLIPForMultiModalEmbedding.__init__inputreturnc           	      C   s  ddl m} |jd |jd i}|dtj}d|v rht|d tj	rh|d 
| j}| dkr;|jd dkr;|d}tj|tjk | j|}||jddd	 }W d    n1 s^w   Y  |||j< d
|v rt|d
 tj	r|d
 
| j}| dkr|jd dkr|d}tj|tjk | j|}||jddd	 }W d    n1 sw   Y  |||j< |tjkr| jjd   |d< |S )Nr   )
OutputKeysmodeimg   r   rY   TrQ   r   r         ?r   )modelscope.outputsrE  IMG_EMBEDDINGTEXT_EMBEDDINGr:  r   	INFERENCEr   r@   rA   rm   r   rR   ri   squeezeautogradset_grad_enabledTRAINr#  r   r   r   r   r   rl   )	r0   rC  rE  outputrF  image_tensorr   text_tensorr   r5   r5   r6   r;   5  sD   






z"CLIPForMultiModalEmbedding.forwardinputsc                 C   s   |S r8   r5   )r0   rU  r5   r5   r6   postprocess[  s   z&CLIPForMultiModalEmbedding.postprocessc                 C   s   d| j j  S )NrI  )r#  r   r   r   r5   r5   r6   temperature^  s   z&CLIPForMultiModalEmbedding.temperature)r=   r>   r?   r   r   r   r   r;   rV  r   rW  rB   r5   r5   r3   r6   r     s    "5"&)5r(  collectionsr   typingr   r   r   r   r,  numpyr   r@   torch.nnr   torch.nn.functional
functionalro   modelscope.metainfor   modelscope.modelsr   modelscope.models.builderr	   1modelscope.models.multi_modal.clip.bert_tokenizerr
   5modelscope.models.multi_modal.clip.configuration_bertr   0modelscope.models.multi_modal.clip.modeling_bertr   modelscope.utils.constantr   r   r   modelscope.utils.loggerr   r&  __all__Moduler   rC   rt   r   r   r   r   r   r   r  r  register_modulemulti_modal_embeddingclipr   r5   r5   r5   r6   <module>   s@   3/G	8 