o
    ߥi@                     @   s  d dl mZ d dlmZmZ d dlZd dlm  mZ	 d dl
m  mZ d dlmZ G dd dejZG dd dejZG d	d
 d
ejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZdd ZdS )    )OrderedDict)TupleUnionN)nnc                   @   s   e Zd ZdejfddZdS )	QuickGELUxc                 C   s   |t d|  S )NgZd;?)torchsigmoidselfr    r   a/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/multi_modal/mplug/clip/clip.pyforward      zQuickGELU.forwardN)__name__
__module____qualname__r   Tensorr   r   r   r   r   r      s    r   c                       sP   e Zd Z	ddededejf fddZdejfdd	Zdejfd
dZ  Z	S )ResidualAttentionBlockNd_modeln_head	attn_maskc              
      sr   t    t||| _t|| _ttdt	||d fdt
 fdt	|d |fg| _t|| _|| _d S )Nc_fc   geluc_proj)super__init__r   MultiheadAttentionattn	LayerNormln_1
Sequentialr   Linearr   mlpln_2r   )r   r   r   r   	__class__r   r   r      s   



zResidualAttentionBlock.__init__r   c                 C   s>   | j d ur| j j|j|jdnd | _ | j|||d| j dd S )NdtypedeviceF)need_weightsr   r   )r   tor)   r*   r   r
   r   r   r   	attention"   s   
z ResidualAttentionBlock.attentionc                 C   s,   ||  | | }|| | | }|S N)r-   r!   r$   r%   r
   r   r   r   r   )   s   zResidualAttentionBlock.forwardr.   )
r   r   r   intr   r   r   r-   r   __classcell__r   r   r&   r   r      s    r   c                       sJ   e Zd Z		ddedededejdef
 fdd	Zd
ejfddZ  Z	S )TransformerNTwidthlayersheadsr   use_grad_ckpc                    sB   t    | _|| _tj fddt|D  | _|| _d S )Nc                    s   g | ]}t  qS r   )r   ).0_r   r4   r2   r   r   
<listcomp>:   s    
z(Transformer.__init__.<locals>.<listcomp>)	r   r   r2   r3   r   r"   range	resblocksr5   )r   r2   r3   r4   r   r5   r&   r8   r   r   1   s   


zTransformer.__init__r   c                 C   s,   | j r| jD ]}t||}q|S | |S r.   )r5   r;   
checkpoint)r   r   
each_blockr   r   r   r   @   s
   

zTransformer.forward)NT)
r   r   r   r/   r   r   boolr   r   r0   r   r   r&   r   r1   /   s    r1   c                       s2   e Zd ZdZd fdd	ZdejfddZ  ZS )	
Bottleneckr      c                    s  t    tj||ddd| _t|| _tj||dddd| _t|| _|dkr/t	|nt
 | _tj||| j ddd| _t|| j | _tjdd| _d | _|| _|dksb||tj krttdt	|fd	tj||| j dddd
fdt|| j fg| _d S d S )Nr@   F)bias   )paddingrA   Tinplacez-10)striderA   1)r   r   r   Conv2dconv1BatchNorm2dbn1conv2bn2	AvgPool2dIdentityavgpool	expansionconv3bn3ReLUrelu
downsamplerG   r?   r"   r   )r   inplanesplanesrG   r&   r   r   r   L   s6   

zBottleneck.__init__r   c                 C   st   |}|  | | |}|  | | |}| |}| | |}| jd ur/| |}||7 }|  |}|S r.   )	rV   rL   rJ   rN   rM   rQ   rT   rS   rW   )r   r   identityoutr   r   r   r   l   s   



zBottleneck.forwardr@   )	r   r   r   rR   r   r   r   r   r0   r   r   r&   r   r?   I   s     r?   c                	       s:   e Zd Z	d
dedededef fddZdd	 Z  ZS )AttentionPool2dNspacial_dim	embed_dim	num_heads
output_dimc                    st   t    tt|d d ||d  | _t||| _t||| _	t||| _
t||p2|| _|| _d S )N   r@   g      ?)r   r   r   	Parameterr   randnpositional_embeddingr#   k_projq_projv_projr   r`   )r   r^   r_   r`   ra   r&   r   r   r   ~   s   

zAttentionPool2d.__init__c              	   C   sD  | |jd |jd |jd |jd  ddd}tj|jddd|gdd}|| jd d d d d f |j }| j	r@d}nd	}t
jdi d
|d|d|d|jd d| jd| jjd| jjd| jjdd dt| jj| jj| jjgdd dd ddd|d| jjd| jjddd| j	dd\}}|d S ) Nr   r@   rb   rB   Tdimkeepdimrj   g?g        querykeyvalueembed_dim_to_checkr`   q_proj_weightk_proj_weightv_proj_weightin_proj_weightin_proj_biasbias_kbias_vadd_zero_attnF	dropout_pout_proj_weightout_proj_biasuse_separate_proj_weighttrainingr+   r   )reshapeshapepermuter   catmeanre   r,   r)   r~   Fmulti_head_attention_forwardr`   rg   weightrf   rh   rA   r   )r   r   dropoutr7   r   r   r   r      sl   $

	
zAttentionPool2d.forwardr.   )r   r   r   r/   r   r   r0   r   r   r&   r   r]   |   s    r]   c                       s:   e Zd ZdZ		d fdd	ZdddZdd
dZ  ZS )ModifiedResNeta  
    A ResNet class that is similar to torchvision's but contains the following changes:
    - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
    - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
    - The final pooling layer is a QKV attention instead of an average pool
       @   c                    s6  t    || _|| _tjd|d ddddd| _t|d | _tj|d |d dddd| _	t|d | _
tj|d |dddd| _t|| _td| _tjdd| _|| _| ||d	 | _| j|d |d dd
| _| j|d |d dd
| _| j|d |d dd
| _|d }t|d |||| _d S )NrB   rb   r@   F)kernel_sizerG   rC   rA   )r   rC   rA   TrD   r   )rG   r          )r   r   ra   input_resolutionr   rI   rJ   rK   rL   rM   rN   rS   rT   rO   rQ   rU   rV   	_inplanes_make_layerlayer1layer2layer3layer4r]   attnpool)r   r3   ra   r4   r   r2   r_   r&   r   r   r      s4   


zModifiedResNet.__init__r@   c                 C   sH   t | j||g}|t j | _td|D ]}|t | j| qtj| S )Nr@   )r?   r   rR   r:   appendr   r"   )r   rY   blocksrG   r3   r7   r   r   r   r      s
   
zModifiedResNet._make_layerFc                    s^    fdd}|  jjj}||} |} |} |} |}|s- |}|S )Nc                    sL    j  jf j jf j jffD ]\}} ||| } q | } | S r.   )rJ   rL   rM   rN   rS   rT   rV   rQ   )r   convbnr   r   r   stem   s   

z$ModifiedResNet.forward.<locals>.stem)	typerJ   r   r)   r   r   r   r   r   )r   r   skip_last_layerr   r   r   r   r      s   




zModifiedResNet.forward)r   r   r\   )F)r   r   r   __doc__r   r   r   r0   r   r   r&   r   r      s    
"	r   c                       s(   e Zd ZdZdejf fddZ  ZS )r    z*Subclass torch's LayerNorm to handle fp16.r   c                    s   |j }t |}||S r.   )r)   r   r   r   )r   r   	orig_typeretr&   r   r   r      s   
zLayerNorm.forward)r   r   r   r   r   r   r   r0   r   r   r&   r   r       s    r    c                       sN   e Zd Zdedededededef fddZ			
	
ddejfddZ  ZS )VisualTransformerr   
patch_sizer2   r3   r4   ra   c                    s   t    || _|| _|| _tjd|||dd| _|d }t|t	
| | _t|t	
|| d d | | _t|| _t|||| _t|| _t|t	
|| | _d S )NrB   F)in_channelsout_channelsr   rG   rA         rb   r@   )r   r   r   ra   r4   r   rI   rJ   rc   r   rd   class_embeddingre   r    ln_prer1   transformerln_postproj)r   r   r   r2   r3   r4   ra   scaler&   r   r   r     s(   




zVisualTransformer.__init__FNr   c                 C   s   |  |}||jd |jd d}|ddd}| j|j}tj|jd d|jd |j|j	d}tj
|| |gdd}|| j|jd |dd d f  }| |}|ddd}| |}|ddd}|rr| |}|S || j }|S )Nr   r@   rq   rb   r(   rl   )rJ   r   r   r   r   r,   r)   r   zerosr*   r   re   sizer   r   r   r   )r   r   r   text_embedding	text_maskcls_embx_zerosr   r   r   r     s,   
(



zVisualTransformer.forward)FNN)	r   r   r   r/   r   r   r   r   r0   r   r   r&   r   r      s    r   c                       s   e Zd Zdededeeeeeef ef dededededed	ed
ef fddZdd Zdd Ze	dd Z
dd Zdd Zdd Z  ZS )CLIPr_   image_resolutionvision_layersvision_widthvision_patch_sizecontext_length
vocab_sizetransformer_widthtransformer_headstransformer_layersc                    s   t    || _t|ttfr |d d }t|||||d| _n|d }t||||||d| _t	||
|	| 
 d| _|| _t||| _tt| j|| _t|| _tt||| _ttg | _|   d S )Nr   r   )r3   ra   r4   r   r2   )r   r   r2   r3   r4   ra   )r2   r3   r4   r   )r   r   r   
isinstancetuplelistr   visualr   r1   build_attention_maskr   r   r   	Embeddingtoken_embeddingrc   r   emptyre   r    ln_finaltext_projectiononeslogit_scaleinitialize_parameters)r   r_   r   r   r   r   r   r   r   r   r   vision_headsr&   r   r   r   =  sJ   



zCLIP.__init__c           	      C   s  t jj| jjdd t jj| jdd t| jtr|| jj	d urW| jj	j
jd }t jj| jj	jj|d t jj| jj	jj|d t jj| jj	jj|d t jj| jj	j
j|d | jj| jj| jj| jjfD ]}| D ]\}}|drzt j| qkqe| jjd d| jj d  }| jjd }d| jj d }| jjD ]-}t jj|jj|d t jj|jjj|d t jj|jjj|d t jj|jj
j|d q| jd urt jj| j| jjd d d S d S )Ng{Gz?)stdg{Gz?r   z
bn3.weightrb   ) r   initnormal_r   r   re   r   r   r   r   r   in_featuresrg   rf   rh   r   r   r   r   named_parametersendswithzeros_r   r2   r3   r;   r   ru   out_projr$   r   r   )	r   r   resnet_blocknameparamproj_stdattn_stdfc_stdblockr   r   r   r   s  s@   



zCLIP.initialize_parametersc                 C   s,   t | j| j}|td |d |S )Nz-infr@   )r   r   r   fill_floattriu_)r   maskr   r   r   r     s   
zCLIP.build_attention_maskc                 C   s   | j jjjS r.   )r   rJ   r   r)   r   r   r   r   r)     s   z
CLIP.dtypec                 C   s   |  || jS r.   )r   r   r)   )r   imager   r   r   encode_image  r   zCLIP.encode_imagec                 C   s   |  || j}|| j| j }|ddd}| |}|ddd}| || j}|t|j	d |j
ddf | j }|S )Nr@   r   rb   rq   rl   )r   r   r)   re   r   r   r   r   aranger   argmaxr   )r   textr   r   r   r   encode_text  s   


zCLIP.encode_textc                 C   sj   |  |}| |}||jddd }||jddd }| j }|| |  }|| |  }||fS )Nrq   Tri   )r   r   normr   expt)r   r   r   image_featurestext_featuresr   logits_per_imagelogits_per_textr   r   r   r     s   


zCLIP.forward)r   r   r   r/   r   r   r   r   r   propertyr)   r   r   r   r0   r   r   r&   r   r   ;  s:    	
6"
r   c                 C   s.   t | j| j| j| j| j| j| j| j| j	| j

S r.   )r   clip_embed_dimclip_image_resolutionclip_vision_layersclip_vision_widthclip_vision_patch_sizeclip_context_lengthclip_vocab_sizeclip_transformer_widthclip_transformer_headsclip_transformer_layers)configr   r   r   load_from_config  s   
r   )collectionsr   typingr   r   r   torch.nn.functionalr   
functionalr   torch.utils.checkpointutilsr<   Moduler   r   r1   r?   r]   r   r    r   r   r   r   r   r   r   <module>   s"   33H	; 