o
    ߥi"@                     @   s  d dl mZ d dlmZmZ d dlZd dlZd dlm	  m
Z d dlm	Z	 G dd de	jZG dd de	jZG d	d
 d
e	jZG dd de	jZG dd de	jZG dd de	jZG dd de	jZG dd de	jZG dd de	jZde	jfddZdd ZdS )    )OrderedDict)TupleUnionN)nnc                       s2   e Zd ZdZd fdd	ZdejfddZ  ZS )	
Bottleneck      c                    s2  t    tj||ddd| _t|| _tjdd| _tj||dddd| _	t|| _
tjdd| _|dkr=t|nt | _tj||| j ddd| _t|| j | _tjdd| _d | _|| _|dksp||tj krttdt|fd	tj||| j dddd
fdt|| j fg| _d S d S )Nr   F)biasTinplace   )paddingr	   z-10)strider	   1)super__init__r   Conv2dconv1BatchNorm2dbn1ReLUrelu1conv2bn2relu2	AvgPool2dIdentityavgpool	expansionconv3bn3relu3
downsampler   r   
Sequentialr   )selfinplanesplanesr   	__class__ g/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/cv/text_driven_segmentation/model.pyr      s:   

zBottleneck.__init__xc                 C   st   |}|  | | |}| | | |}| |}| | |}| j	d ur/| 	|}||7 }| 
|}|S N)r   r   r   r   r   r   r   r!   r    r#   r"   )r%   r,   identityoutr*   r*   r+   forward2   s   



zBottleneck.forwardr   )	__name__
__module____qualname__r   r   torchTensorr0   __classcell__r*   r*   r(   r+   r      s    "r   c                	       s:   e Zd Z	d
dedededef fddZdd	 Z  ZS )AttentionPool2dNspacial_dim	embed_dim	num_heads
output_dimc                    st   t    tt|d d ||d  | _t||| _t||| _	t||| _
t||p2|| _|| _d S )N   r   g      ?)r   r   r   	Parameterr5   randnpositional_embeddingLineark_projq_projv_projc_projr;   )r%   r9   r:   r;   r<   r(   r*   r+   r   D   s   

zAttentionPool2d.__init__c              	   C   s   |j ddddd}tj|jddd|gdd}|| jd d d d d f |j }tj	di d|d d d	|d
|d|j
d d| jd| jjd| jjd| jjdd dt| jj| jj| jjgdd dd ddddd| jjd| jjddd| jdd\}}|dS )Nr=   )	start_dimr   r   TdimkeepdimrH   querykeyvalueembed_dim_to_checkr;   q_proj_weightk_proj_weightv_proj_weightin_proj_weightin_proj_biasbias_kbias_vadd_zero_attnF	dropout_pout_proj_weightout_proj_biasuse_separate_proj_weighttrainingneed_weightsr*   )flattenpermuter5   catmeanr@   todtypeFmulti_head_attention_forwardshaper;   rC   weightrB   rD   r	   rE   r\   squeeze)r%   r,   _r*   r*   r+   r0   R   sZ   $

	

zAttentionPool2d.forwardr-   )r2   r3   r4   intr   r0   r7   r*   r*   r(   r+   r8   B   s    r8   c                       s8   e Zd ZdZ		d fdd	ZdddZd	d
 Z  ZS )ModifiedResNeta  
    A ResNet class that is similar to torchvision's but contains the following changes:
    - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
    - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
    - The final pooling layer is a QKV attention instead of an average pool
       @   c                    sR  t    || _|| _tjd|d ddddd| _t|d | _tj	dd| _
tj|d |d dddd| _t|d | _tj	dd| _tj|d |dddd| _t|| _tj	dd| _td| _|| _| ||d	 | _| j|d |d dd
| _| j|d |d dd
| _| j|d |d dd
| _|d }t|d |||| _d S )Nr   r=   r   F)kernel_sizer   r   r	   Tr
   )rn   r   r	   r   )r   r          )r   r   r<   input_resolutionr   r   r   r   r   r   r   r   r   r   r    r!   r"   r   r   	_inplanes_make_layerlayer1layer2layer3layer4r8   attnpool)r%   layersr<   headsrq   widthr:   r(   r*   r+   r   v   s8   


zModifiedResNet.__init__r   c                 C   sH   t | j||g}|t j | _td|D ]}|t | j| qtj| S )Nr   )r   rr   r   rangeappendr   r$   )r%   r'   blocksr   ry   ri   r*   r*   r+   rs      s
   
zModifiedResNet._make_layerc                    sZ    fdd}|  jjj}||} |} |} |} |} |}|S )Nc                    sP       | }    | }    | }  	| } | S r-   )
r   r   r   r   r   r   r"   r!   r    r   )r,   r%   r*   r+   stem   s
   
z$ModifiedResNet.forward.<locals>.stem)	typer   rg   rc   rt   ru   rv   rw   rx   )r%   r,   r   r*   r   r+   r0      s   




zModifiedResNet.forward)rl   rm   r1   )r2   r3   r4   __doc__r   rs   r0   r7   r*   r*   r(   r+   rk   n   s    
$	rk   c                       s(   e Zd ZdZdejf fddZ  ZS )	LayerNormz*Subclass torch's LayerNorm to handle fp16.r,   c                    s$   |j }t |tj}||S r-   )rc   r   r0   r   r5   float32)r%   r,   	orig_typeretr(   r*   r+   r0      s   
zLayerNorm.forward)r2   r3   r4   r   r5   r6   r0   r7   r*   r*   r(   r+   r      s    r   c                   @   s   e Zd ZdejfddZdS )	QuickGELUr,   c                 C   s   |t d|  S )NgZd;?)r5   sigmoidr%   r,   r*   r*   r+   r0         zQuickGELU.forwardN)r2   r3   r4   r5   r6   r0   r*   r*   r*   r+   r      s    r   c                       sP   e Zd Z	ddededejf fddZdejfdd	Zdejfd
dZ  Z	S )ResidualAttentionBlockNd_modeln_head	attn_maskc              
      sr   t    t||| _t|| _ttdt	||d fdt
 fdt	|d |fg| _t|| _|| _d S )Nc_fcr   gelurE   )r   r   r   MultiheadAttentionattnr   ln_1r$   r   rA   r   mlpln_2r   )r%   r   r   r   r(   r*   r+   r      s   



zResidualAttentionBlock.__init__r,   c                 C   s>   | j d ur| j j|j|jdnd | _ | j|||d| j dd S )Nrc   deviceF)r]   r   r   )r   rb   rc   r   r   r   r*   r*   r+   	attention   s   
z ResidualAttentionBlock.attentionc                 C   s,   ||  | | }|| | | }|S r-   )r   r   r   r   r   r*   r*   r+   r0      s   zResidualAttentionBlock.forwardr-   )
r2   r3   r4   rj   r5   r6   r   r   r0   r7   r*   r*   r(   r+   r      s    r   c                       s.   e Zd Zd fdd	ZdejfddZ  ZS )TransformerNc                    s<   t    | _|| _tj fddt|D  | _d S )Nc                    s   g | ]}t  qS r*   )r   ).0ri   r   rz   r{   r*   r+   
<listcomp>   s    
z(Transformer.__init__.<locals>.<listcomp>)r   r   r{   ry   r   r$   r|   	resblocks)r%   r{   ry   rz   r   r(   r   r+   r      s   
zTransformer.__init__r,   c                 C   s
   |  |S r-   )r   r   r*   r*   r+   r0      s   
zTransformer.forwardr-   )r2   r3   r4   r   r5   r6   r0   r7   r*   r*   r(   r+   r      s    	r   c                       sF   e Zd Zdedededededef fddZd	ejfd
dZ  ZS )VisionTransformerrq   
patch_sizer{   ry   rz   r<   c                    s   t    || _|| _tjd|||dd| _|d }t|t	| | _
t|t	|| d d | | _t|| _t|||| _t|| _t|t	|| | _d S )Nr   F)in_channelsout_channelsrn   r   r	         r=   r   )r   r   rq   r<   r   r   r   r>   r5   r?   class_embeddingr@   r   ln_prer   transformerln_postproj)r%   rq   r   r{   ry   rz   r<   scaler(   r*   r+   r      s&   




zVisionTransformer.__init__r,   c                 C   s   |  |}||jd |jd d}|ddd}| j|j}tj|jd d|jd |j|j	d}tj
|| |gdd}|| j|j }| |}|ddd}| |}|ddd}| |d d dd d f }| jd urv|| j }|S )Nr   r   rO   r=   r   rJ   )r   reshaperf   r_   r   rb   rc   r5   zerosr   r`   r@   r   r   r   r   )r%   r,   x1x2r*   r*   r+   r0     s&   




zVisionTransformer.forward)	r2   r3   r4   rj   r   r5   r6   r0   r7   r*   r*   r(   r+   r      s    r   c                       s   e Zd Zdededeeeeeef ef dededededed	ed
ef fddZdd Zdd Ze	dd Z
dd Zdd Zdd Z  ZS )CLIPr:   image_resolutionvision_layersvision_widthvision_patch_sizecontext_length
vocab_sizetransformer_widthtransformer_headstransformer_layersc                    s   t    || _t|ttfr |d d }t|||||d| _n|d }t||||||d| _t	||
|	| 
 d| _|| _t||| _tt| j|| _t|| _tt||| _ttg td | _|   d S )Nrp   rm   )ry   r<   rz   rq   r{   )rq   r   r{   ry   rz   r<   )r{   ry   rz   r   g$I$I,@)r   r   r   
isinstancetuplelistrk   visualr   r   build_attention_maskr   r   r   	Embeddingtoken_embeddingr>   r5   emptyr@   r   ln_finaltext_projectiononesnploglogit_scaleinitialize_parameters)r%   r:   r   r   r   r   r   r   r   r   r   vision_headsr(   r*   r+   r   &  sJ   



zCLIP.__init__c           	      C   s  t jj| jjdd t jj| jdd t| jtr|| jj	d urW| jj	j
jd }t jj| jj	jj|d t jj| jj	jj|d t jj| jj	jj|d t jj| jj	j
j|d | jj| jj| jj| jjfD ]}| D ]\}}|drzt j| qkqe| jjd d| jj d  }| jjd }d| jj d }| jjD ]-}t jj|jj|d t jj|jjj|d t jj|jjj|d t jj|jj
j|d q| jd urt jj| j| jjd d d S d S )Ng{Gz?)stdg{Gz?r   z
bn3.weightr=   ) r   initnormal_r   rg   r@   r   r   rk   rx   rE   in_featuresrC   rB   rD   rt   ru   rv   rw   named_parametersendswithzeros_r   r{   ry   r   r   rS   out_projr   r   r   )	r%   r   resnet_blocknameparamproj_stdattn_stdfc_stdblockr*   r*   r+   r   \  s@   



zCLIP.initialize_parametersc                 C   s,   t | j| j}|td |d |S )Nz-infr   )r5   r   r   fill_floattriu_)r%   maskr*   r*   r+   r   ~  s   
zCLIP.build_attention_maskc                 C   s   | j jjjS r-   )r   r   rg   rc   r   r*   r*   r+   rc     s   z
CLIP.dtypec                 C   s   |  || jS r-   )r   r   rc   )r%   imager*   r*   r+   encode_image  r   zCLIP.encode_imagec                 C   s   |  || j}|| j| j }|ddd}| |}|ddd}| || j}|t|j	d |j
ddf | j }|S )Nr   r   r=   rO   rJ   )r   r   rc   r@   r_   r   r   r5   arangerf   argmaxr   )r%   textr,   r*   r*   r+   encode_text  s   

zCLIP.encode_textc                 C   sb   |  |}| |}||jddd }||jddd }| j }|| |  }| }||fS )Nr   TrG   )r   r   normr   expt)r%   r   r   image_featurestext_featuresr   logits_per_imagelogits_per_textr*   r*   r+   r0     s   


zCLIP.forward)r2   r3   r4   rj   r   r   r   r   r   propertyrc   r   r   r0   r7   r*   r*   r(   r+   r   $  s:    	
6"
r   modelc                 C   s   dd }|  | dS )z+Convert applicable model parameters to fp16c                 S   s   t | tjtjtjfr | jj | j_| jd ur | jj | j_t | tj	rGg dd dD dddD ]}t
| |}|d urF|j |_q5dD ]}t| |r_t
| |}|d ur_|j |_qId S )Nc                 S   s   g | ]}| d qS )_proj_weightr*   )r   sr*   r*   r+   r     s    zEconvert_weights.<locals>._convert_weights_to_fp16.<locals>.<listcomp>)inqkvrT   rU   rV   )r   r   )r   r   Conv1dr   rA   rg   datahalfr	   r   getattrhasattr)llattrtensorr   r*   r*   r+   _convert_weights_to_fp16  s2   



z1convert_weights.<locals>._convert_weights_to_fp16N)apply)r   r   r*   r*   r+   convert_weights  s   r   c                  C   s*   t dddddddddd
} t|  |  S )	Ni   rl      i   rp   M   i   ro   )r   r   eval)r   r*   r*   r+   build_model  s   r   )collectionsr   typingr   r   numpyr   r5   torch.nn.functionalr   
functionalrd   Moduler   r8   rk   r   r   r   r   r   r   r   r   r*   r*   r*   r+   <module>   s$   5,I	1 