o
    ߥiu                     @   s  d dl Z d dlZd dlmZ d dlmZmZ d dlZd dl	Z	d dl
m  mZ d dl	mZ ddlmZ G dd dejZG d	d
 d
ejZG dd dejZG dd dejZG dd dejZeddddddedddddddZdddZedkre ZdS dS )    N)OrderedDict)TupleUnion)nn   )ViMc                       s$   e Zd Zdejf fddZ  ZS )	LayerNormxc                    s$   |j }t |tj}||S N)dtypesuperforwardtypetorchfloat32)selfr	   	orig_typeret	__class__ c/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/cv/vision_middleware/backbone.pyr      s   
zLayerNorm.forward)__name__
__module____qualname__r   Tensorr   __classcell__r   r   r   r   r      s    r   c                   @   s   e Zd ZdejfddZdS )	QuickGELUr	   c                 C   s   |t d|  S )NgZd;?)r   sigmoidr   r	   r   r   r   r      s   zQuickGELU.forwardN)r   r   r   r   r   r   r   r   r   r   r      s    r   c                       sT   e Zd Z	ddededejf fddZdejfdd	Zdejd
efddZ	  Z
S )ResidualAttentionBlockNd_modeln_head	attn_maskc              
      s   t    t||| _t|| _ttdt	||d fdt
 fdt	|d |fg| _t|| _|| _t | _t | _d S )Nc_fc   geluc_proj)r   __init__r   MultiheadAttentionattnr   ln_1
Sequentialr   Linearr   mlpln_2r#   r   vim_attvim_mlp)r   r!   r"   r#   r   r   r   r(   !   s   


zResidualAttentionBlock.__init__r	   c                 C   s>   | j d ur| j j|j|jdnd | _ | j|||d| j dd S )N)r   deviceF)need_weightsr#   r   )r#   tor   r2   r*   r   r   r   r   	attention3   s   
z ResidualAttentionBlock.attention	task_namec                 C   sT   |  |}|| | }|| || }| |}|| | }|| || }|S r
   )r+   r5   r0   r/   r.   r1   )r   r	   r6   
x_normed_1
x_normed_2r   r   r   r   :   s   

zResidualAttentionBlock.forwardr
   )r   r   r   intr   r   r(   r5   strr   r   r   r   r   r   r       s    r    c                	       sD   e Zd Z	ddedededejf fddZdejfd	d
Z  ZS )TransformerNwidthlayersheadsr#   c                    s<   t    | _|| _t fddt|D | _d S )Nc                    s   g | ]}t  qS r   )r    ).0_r#   r>   r<   r   r   
<listcomp>P   s    
z(Transformer.__init__.<locals>.<listcomp>)r   r(   r<   r=   r   
ModuleListrange	resblocks)r   r<   r=   r>   r#   r   rA   r   r(   H   s   
zTransformer.__init__r	   c           	      K   sD   |  \}}}g }t| jD ]\}}||fi |}|| q|S r
   )size	enumeraterE   append)	r   r	   kwargsLBDfeaturesiblkr   r   r   r   U   s   zTransformer.forwardr
   )	r   r   r   r9   r   r   r(   r   r   r   r   r   r   r;   F   s    r;   c                       sN   e Zd ZdZ	ddedededededef fd	d
ZdejfddZ  Z	S )VisionTransformeraf  
    The Vision Transformer (ViT) model
    Args:
        - input_resolution (int): shape of input image
        - patch_width (int): size of patch tokens
        - width (int): feature channels
        - layers (int): number of transformer layers
        - heads (int): number of multi-head attention
        - output_dim (int): output feature channels
       input_resolution
patch_sizer<   r=   r>   
output_dimc                    s   t    || _tjd|||dd| _|d }t|t| | _	t|t|| d d | | _
t|| _|| | _t|||| _t|| _t|t|| | _|| _d S )N   F)in_channelsout_channelskernel_sizestridebiasg         r   )r   r(   rR   r   Conv2dconv1	Parameterr   randnclass_embeddingpositional_embeddingr   ln_prepatch_per_sider;   transformerln_postprojrT   )r   rR   rS   r<   r=   r>   rT   scaler   r   r   r(   j   s(   






zVisionTransformer.__init__r	   c           	   	   K   sN  |  |}|d}|d}||jd |jd d}|ddd}| j|jddd|dd}t	j
||gdd}|| j|j }| |}|ddd}| j|fi |}|d }|ddd}| |d d dd d f }| jd ur~|| j }g }|D ]}||dd d d d d f ddd|d|| q|| |S )Nr   r[   r   )dim)r]   rF   reshapeshapepermuter`   r4   r   repeatr   catra   rb   rd   re   rf   rH   )	r   r	   rI   rK   P	cls_tokenx_per_layeroutputsoutputr   r   r   r      s:   






"
zVisionTransformer.forward)rQ   )
r   r   r   __doc__r9   r(   r   r   r   r   r   r   r   r   rP   ^   s"    rP         i      )rR   rS   r<   r=   r>       )vit_b16_224vit_b32_224ry   c                 C   s$   t |  }tdi |}|| |S )z build a ViT + ViM model
        Args:
            arch: name of backbone
            pretrained: weights of pretrained model
    Nr   )
model_dictrP   load_state_dict)arch
pretrained
model_argsmodelr   r   r   build_backbone   s   
r   __main__)ry   N)mathoscollectionsr   typingr   r   numpynpr   torch.nn.functionalr   
functionalFvimr   r   Moduler   r    r;   rP   dictr{   r   r   r   r   r   r   r   <module>   s*   'L

