o
    ߥi                     @   s
  d dl Z d dlZd dlZd dlZd dlmZ d dlm  mZ	 d dl
m  mZ d dlmZmZmZ G dd dejZdde j fddZd	d
 Zdd ZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZdS )    N)DropPath	to_2tupletrunc_normal_c                       s4   e Zd ZdZddejdf fdd	Zdd Z  ZS )Mlpz Multilayer perceptron.N        c                    sN   t    |p|}|p|}t||| _| | _t||| _t|| _d S N)	super__init__nnLinearfc1actfc2Dropoutdrop)selfin_featureshidden_featuresout_features	act_layerr   	__class__ V/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/cv/vidt/backbone.pyr	      s   
zMlp.__init__c                 C   s6   |  |}| |}| |}| |}| |}|S r   )r   r   r   r   )r   xr   r   r   forward!   s   




zMlp.forward)	__name__
__module____qualname____doc__r
   GELUr	   r   __classcell__r   r   r   r   r      s    r   i'     c              	   C   s  |d }| }|j dtjd}|j dtjd}d}||ddddddf |  | }||ddddddf |  | }tj|tj| jd}	|d|	d  |  }	|dddddddf |	 }
|dddddddf |	 }tj|
dddddddddf  |
dddddddddf  fd	d
d}
tj|dddddddddf  |dddddddddf  fd	d
d}tj	||
fdd
}|S )aD   Masked Sinusoidal Positional Encoding

    Args:
        x: [PATCH] tokens
        mask: the padding mask for [PATCH] tokens
        num_pos_feats: the size of channel dimension
        temperature: the temperature value
        scale: the normalization scale

    Returns:
        pos: Sinusoidal positional encodings
    r"      )dtypegư>N)r$   devicer      dim   )
cumsumtorchfloat32aranger&   stacksincosflattencat)r   masknum_pos_featstemperaturescalenot_masky_embedx_embedepsdim_tpos_xpos_yposr   r   r   masked_sin_pos_encoding*   s2   &&  JJr@   c                 C   sR   | j \}}}}| ||| ||| ||} | dddddd d|||}|S )z
    Args:
        x: (B, H, W, C)
        window_size (int): window size

    Returns:
        windows: (num_windows*B, window_size, window_size, C)
    r   r#   r*   r"   r'      r%   )shapeviewpermute
contiguous)r   window_sizeBHWCwindowsr   r   r   window_partitionW   s   	rL   c                 C   sb   t | jd || | |  }| ||| || ||d}|dddddd |||d}|S )z
    Args:
        windows: (num_windows*B, window_size, window_size, C)
        window_size (int): Window size
        H (int): Height of image
        W (int): Width of image

    Returns:
        x: (B, H, W, C)
    r   r%   r#   r*   r"   r'   rA   )intrB   rC   rD   rE   )rK   rF   rH   rI   rG   r   r   r   r   window_reverseh   s   $rN   c                       s:   e Zd ZdZ				d
 fdd	Z			ddd	Z  ZS )ReconfiguredAttentionModulea   Window based multi-head self attention (W-MSA) module with relative position bias -> extended with RAM.
    It supports both of shifted and non-shifted window.

    !!!!!!!!!!! IMPORTANT !!!!!!!!!!!
    The original attention module in Swin is replaced with the reconfigured attention module in Section 3.
    All the Args are shared, so only the forward function is modified.
    See https://arxiv.org/pdf/2110.03921.pdf
    !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

    Args:
        dim (int): Number of input channels.
        window_size (tuple[int]): The height and width of the window.
        num_heads (int): Number of attention heads.
        qkv_bias (bool, optional):  If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
    TNr   c                    s  t    || _|| _|| _|| }|p|d | _tt	d|d  d d|d  d  || _
t| jd }	t| jd }
tt|	|
g}t|d}|d d d d d f |d d d d d f  }|ddd }|d d d d df  | jd d 7  < |d d d d df  | jd d 7  < |d d d d df  d| jd  d 9  < |d}| d| tj||d |d| _t|| _t||| _t|| _t| j
d	d
 tjdd| _d S )Ng      r"   r   r#   r%   relative_position_indexr*   bias{Gz?stdr(   )r   r	   r)   rF   	num_headsr7   r
   	Parameterr,   zerosrelative_position_bias_tabler.   r/   meshgridr2   rD   rE   sumregister_bufferr   qkvr   	attn_dropproj	proj_dropr   Softmaxsoftmax)r   r)   rF   rV   qkv_biasqk_scaler^   r`   head_dimcoords_hcoords_wcoordscoords_flattenrelative_coordsrP   r   r   r   r	      sX   
	"(,
z$ReconfiguredAttentionModule.__init__Fc           *   	   C   s  | j d | j d ksJ | j d }|| }|sS|j\}}	}
}|	|
 }||||}tj||gdd}| |}|ddd|ddf |dd|dddf }}nh|d j\}}	}
}|	|
 }|d j\}}}}|| }|d |||}|d |||}tj|||gdd}| |}|ddd|ddf |dd||| ddf |dd|| dddf }}}|||	|
d}t||}|jd }|||| d| j|| j }|	ddddd}|d |d |d }}}|| j
 }||d	d }| j| jd | j d | j d  | j d | j d  d}|	ddd }||d }|durY|jd }||| || j||}|dd} ||  }|d| j||}| |}| |}|| dd||||}!||dd| j|| j }|	ddddd}|d |d |d }"}#}$|r||||d| j|| j }|ddddddddddddf 	dddddd
 }%|%d|| j|| d}%|%d |%d }&}'tj|&|#gddtj|'|$gdd}#}$|"| j
 }"|"|#d	d }(|dur|(| }(| |(}(| |(}(|(|$ dd|d|})t|!||	|
}!tj|!||	|
 ||)gdd}| |}| |}|ddd|	|
 ddf ||	|
|}!|dd|	|
 dddf })|!|)fS )al   Forward function.
        RAM module receives [Patch] and [DET] tokens and returns their calibrated ones

        Args:
            x: [PATCH] tokens
            det: [DET] tokens
            mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None -> mask for shifted window attention

            "additional inputs for RAM"
            cross_attn: whether to use cross-attention [det x patch] (for selective cross-attention)
            cross_attn_mask: mask for cross-attention

        Returns:
            patch_x: the calibrated [PATCH] tokens
            det_x: the calibrated [DET] tokens
        r   r#   r(   Nr%   r*   r"   r'   rA   )rF   rB   rC   r,   r3   r]   rL   reshaperV   rD   r7   	transposerY   rP   rE   	unsqueezerb   r^   rN   r_   r`   )*r   r   detr4   
cross_attncross_attn_maskrF   local_map_sizerG   rH   rI   rJ   Nfull_qkv	patch_qkvdet_qkv_ori_Hori_Wori_N	shifted_xcross_xcross_patch_qkvB_
_patch_qkvpatch_qpatch_kpatch_v
patch_attnrelative_position_biasnWtmp0tmp1patch_xdet_qdet_kdet_vpatch_kvcross_patch_kcross_patch_vdet_attndet_xr   r   r   r      s   

8
R












 

*z#ReconfiguredAttentionModule.forward)TNr   r   )NFNr   r   r   r   r	   r   r!   r   r   r   r   rO   z   s    2rO   c                
       sB   e Zd ZdZddddddddejejf
 fdd		Zd
d Z  Z	S )SwinTransformerBlocka]   Swin Transformer Block.

    Args:
        dim (int): Number of input channels.
        num_heads (int): Number of attention heads.
        window_size (int): Window size.
        shift_size (int): Shift size for SW-MSA.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float, optional): Stochastic depth rate. Default: 0.0
        act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
       r         @TNr   c              	      s   t    || _|| _|| _|| _|| _d| j  kr#| jk s(J d J d||| _t|t	| j||||	|d| _
|
dkrDt|
nt | _||| _t|| }t||||d| _d | _d | _d S )Nr   z shift_size must in 0-window_size)rF   rV   rc   rd   r^   r`   r   )r   r   r   r   )r   r	   r)   rV   rF   
shift_size	mlp_rationorm1rO   r   attnr   r
   Identity	drop_pathnorm2rM   r   mlprH   rI   )r   r)   rV   rF   r   r   rc   rd   r   r^   r   r   
norm_layermlp_hidden_dimr   r   r   r	   W  sB   
(



zSwinTransformerBlock.__init__c              	   C   s"  |j \}}}| j| j}	}
||	|
 | j ksJ d|}| |}|ddd|	|
 ddf |dd|	|
 dddf }}|||	|
|}|}d }}| j|
| j  | j }| j|	| j  | j }t|dd||||f}|j \}}}}|\}}| 	|}| j
dkrtj|| j
 | j
 fdd}|}n|}d}|r|| }|| }||f}n|| }|}| j|||||d\}}| j
dkrtj|| j
| j
fdd}n|}|dks|dkr|ddd|	d|
ddf  }|||	|
 |}tj||gdd}|| | }|| | | | }|S )	a   Forward function.

        Args:
            x: Input feature, tensor size (B, H*W + DET, C). i.e., binded [PATCH, DET] tokens
            H, W: Spatial resolution of the input feature.
            mask_matrix: Attention mask for cyclic shift.

            "additional inputs'
            pos: (patch_pos, det_pos)
            cross_attn: whether to use cross attn [det x [det + patch]]
            cross_attn_mask: attention mask for cross-attention

        Returns:
            x: calibrated & binded [PATCH, DET] tokens
        input feature has wrong sizeNr   )r#   r"   )shiftsdims)r4   ro   rp   rq   r#   r(   )rB   rH   rI   det_token_numr   rC   rF   Fpaddet_pos_linearr   r,   rollr   rE   r3   r   r   r   )r   r   mask_matrixr?   rp   rq   rG   LrJ   rH   rI   shortcutro   orig_xpad_lpad_tpad_rpad_brw   HpWp	patch_posdet_posr{   	attn_maskcross_patchr   r   r   r     sb   
>




	
$zSwinTransformerBlock.forward)
r   r   r   r   r
   r    	LayerNormr	   r   r!   r   r   r   r   r   E  s    ,r   c                       s0   e Zd ZdZejdf fdd	Zdd Z  ZS )PatchMergingz Patch Merging Layer

    Args:
        dim (int): Number of input channels.
        norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
    Tc                    sd   t    || _|rd| nd}tjd| |dd| _|d| | _tj||dd| _||| _d S )Nr"      r'   FrQ   )	r   r	   r)   r
   r   	reductionnorm	expansionr   )r   r)   r   expand
expand_dimr   r   r   r	     s   
zPatchMerging.__init__c              
   C   s  |j \}}}||| | j ksJ d|ddd|| ddf |dd|| dddf }}|||||}|d dkpE|d dk}|rXt|ddd|d d|d f}|ddddddddddf }	|ddddddddddf }
|ddddddddddf }|ddddddddddf }t|	|
||gd}||dd| }|ddd}tj||gdd}| |}| 	|}|S )	aV   Forward function.

        Args:
            x: Input feature, tensor size (B, H*W, C), i.e., binded [PATCH, DET] tokens
            H, W: Spatial resolution of the input feature.

        Returns:
            x: merged [PATCH, DET] tokens;
            only [PATCH] tokens are reduced in spatial dim, while [DET] tokens is fix-scale
        r   Nr"   r#   r   r%   r'   r(   )
rB   r   rC   r   r   r,   r3   repeatr   r   )r   r   rH   rI   rG   r   rJ   ro   	pad_inputx0x1x2x3r   r   r   r     s$   > $$$$

zPatchMerging.forward	r   r   r   r   r
   r   r	   r   r!   r   r   r   r   r     s    r   c                       sD   e Zd ZdZdddddddejdddf fdd		Zdd
dZ  ZS )
BasicLayera   A basic Swin Transformer layer for one stage.

    Args:
        dim (int): Number of feature channels
        depth (int): Depths of this stage.
        num_heads (int): Number of attention head.
        window_size (int): Local window size. Default: 7.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
        downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
    r   r   TNr   Fc                    s   t    	| _	d | _|| _| _|| _t 	f
ddt	|D | _
|d ur=|| d| _d S d | _d S )Nr"   c                    sP   g | ]$}t 	|d  dkrdn	d   ttr!| ndqS )r"   r   )r)   rV   rF   r   r   rc   rd   r   r^   r   r   )r   
isinstancelist.0i
r^   r)   r   r   r   r   rV   rd   rc   rF   r   r   
<listcomp>G  s$    
z'BasicLayer.__init__.<locals>.<listcomp>)r)   r   r   )r   r	   rF   r   depthr)   use_checkpointr
   
ModuleListrangeblocks
downsample)r   r)   r   rV   rF   r   rc   rd   r   r^   r   r   r   lastr   r   r   r   r	   /  s   

 

zBasicLayer.__init__c              	   C   s  |j d }tt|| j | j }tt|| j | j }	tjd||	df|jd}
td| j t| j | j	 t| j	 df}td| j t| j | j	 t| j	 df}d}|D ]}|D ]}||
dd||ddf< |d7 }q_q[t
|
| j}|d| j| j }|d|d }||dktd|dktd}|r|j dd \}}||kr||kstj|d  ||fd	tjd }t||| j}| }||dktd|dktd}|||| dd}tj|d| jfdd
}nd}d}||f}t| jD ]8\}}|||_|_|rd}|}|}nd}d}d|f}| jr7tj||||||d}q	||||||d}q	| jdurd| |||}|d d |d d }}||||||fS ||||||fS )aD   Forward function.

        Args:
            x: Input feature, tensor size (B, H*W, C).
            H, W: Spatial resolution of the input feature.
            det_pos: pos encoding for det token
            input_mask: padding mask for inputs
            cross_attn: whether to use cross attn [det x [det + patch]]
        r   r#   )r&   Nr%   r"   g      Yr   size)valueTF)r?   rp   rq   )rB   rM   npceilrF   r,   rX   r&   slicer   rL   rC   rn   masked_fillfloatr   interpolatetoboolr@   r)   r   r   	enumerater   rH   rI   r   
checkpointr   )r   r   rH   rI   r   
input_maskrp   rG   r   r   img_maskh_slicesw_slicescnthwmask_windowsr   _H_Wr   rq   r?   n_blkblk_cross_attn_cross_attn_mask_posx_downWhWwr   r   r   r   ^  s   





	

zBasicLayer.forward)Fr   r   r   r   r   r     s    /r   c                       s2   e Zd ZdZ				d
 fdd	Zdd	 Z  ZS )
PatchEmbedaE   Image to Patch Embedding

    Args:
        patch_size (int): Patch token size. Default: 4.
        in_chans (int): Number of input image channels. Default: 3.
        embed_dim (int): Number of linear projection output channels. Default: 96.
        norm_layer (nn.Module, optional): Normalization layer. Default: None
    r'   r*   `   Nc                    sX   t    t|}|| _|| _|| _tj||||d| _|d ur'||| _	d S d | _	d S )N)kernel_sizestride)
r   r	   r   
patch_sizein_chans	embed_dimr
   Conv2dr_   r   )r   r   r   r   r   r   r   r   r	     s   

zPatchEmbed.__init__c              
   C   s   |  \}}}}|| jd  dkr#t|d| jd || jd   f}|| jd  dkr@t|ddd| jd || jd   f}| |}| jdurp| d| d}}|ddd}| |}|ddd| j	||}|S )zForward function.r#   r   Nr"   r*   r%   )
r   r   r   r   r_   r   r2   rm   rC   r   )r   r   rw   rH   rI   r   r   r   r   r   r     s   $


zPatchEmbed.forward)r'   r*   r   Nr   r   r   r   r   r     s    
r   c                       s   e Zd ZdZddddg dg ddd	d
ddddejdd
g dddf fdd	Zdd Zej	j
dd ZdddgfddZdd Zd! fdd	Zdd  Z  ZS )"SwinTransformera   Swin Transformer backbone.
        A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows`  -
          https://arxiv.org/pdf/2103.14030

    Args:
        pretrain_img_size (int): Input image size for training the pretrained model,
            used in absolute postion embedding. Default 224.
        patch_size (int | tuple(int)): Patch size. Default: 4.
        in_chans (int): Number of input image channels. Default: 3.
        embed_dim (int): Number of linear projection output channels. Default: 96.
        depths (tuple[int]): Depths of each Swin Transformer stage.
        num_heads (tuple[int]): Number of attention head of each stage.
        window_size (int): Window size. Default: 7.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
        qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float): Override default qk scale of head_dim ** -0.5 if set.
        drop_rate (float): Dropout rate.
        attn_drop_rate (float): Attention dropout rate. Default: 0.
        drop_path_rate (float): Stochastic depth rate. Default: 0.2.
        norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
        ape (bool): If True, add absolute position embedding to the patch embedding. Default: False.
        patch_norm (bool): If True, add normalization after patch embedding. Default: True.
        out_indices (Sequence[int]): Output from which stages.
        frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
            -1 means not freezing any args.
        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
       r'   r*   r   )r"   r"      r"   )r*   r         r   r   TNr   g?F)r#   r"   r*   r%   c                    s  t    || _t|| _ | _|| _|| _|| _|| _	t
|| | jr%|nd d| _| jr\t|}t|}|d |d  |d |d  g}ttd |d |d | _t| jdd tj|d| _dd td|t|D }t | _t| jD ]F}tt d	|  || || |||	|
|||t|d | t|d |d   ||| jk rtnd || jd k rd nd
|d}| j| q{ fddt| jD }|| _|D ]}||| }d| }| || q|    d S )N)r   r   r   r   r   r#   rS   rT   )pc                 S   s   g | ]}|  qS r   )item)r   r   r   r   r   r   R  s    z,SwinTransformer.__init__.<locals>.<listcomp>r"   T)r)   r   rV   rF   r   rc   rd   r   r^   r   r   r   r   r   c                    s   g | ]
}t  d |  qS )r"   )rM   r   r   r   r   r   m  s    r   )!r   r	   pretrain_img_sizelen
num_layersr   ape
patch_normout_indicesfrozen_stagesr   patch_embedr   r
   rW   r,   rX   absolute_pos_embedr   r   pos_droplinspacer[   r   layersr   r   rM   r   appendnum_features
add_module_freeze_stages)r   r  r   r   r   depthsrV   rF   r   rc   rd   	drop_rateattn_drop_ratedrop_path_rater   r  r  r  r  r   patches_resolutiondpri_layerlayerr  
layer_namer   r  r   r	     st   


&

zSwinTransformer.__init__c                 C   s   | j dkr| j  | j D ]}d|_q| j dkr!| jr!d| j_| j dkrI| j  td| j d D ]}| j	| }|  | D ]}d|_qBq3d S d S )Nr   Fr#   r"   )
r  r	  eval
parametersrequires_gradr  r
  r  r   r  )r   paramr   mr   r   r   r  z  s    




zSwinTransformer._freeze_stagesc                 C   s   ddhS )Ndet_pos_embed	det_tokenr   r   r   r   r   no_weight_decay  s   zSwinTransformer.no_weight_decayd   r   c                    s   | _ | _ttd| jd  _t jdd _| _	td||}t|dd}tj| _
 fddtt jd D  _|dkrO j j	 | _dt jt j   _ jD ]}||_|jd	urp||j_|jD ]}||_t||j|_qsqb|d
krd	 jd _d	S d	S )a*   A funtion to add neccessary (leanable) variables to Swin Transformer for object detection

            Args:
                method: vidt or vidt_wo_neck
                det_token_num: the number of object to detect, i.e., number of object queries
                pos_dim: the channel dimension of positional encodings for [DET] and [PATCH] tokens
                cross_indices: the indices where to use the [DET X PATCH] cross-attention
                    there are four possible stages in [0, 1, 2, 3]. 3 indicates Stage 4 in the ViDT paper.
        r#   r   rS   rT   c                    s   g | ]	} j |d   qS )r#   )r  r   r"  r   r   r     s    z0SwinTransformer.finetune_det.<locals>.<listcomp>vidtr"   Nvidt_wo_neckr%   )methodr   r
   rW   r,   rX   r  r!  r   pos_dimr   r   r  num_channelsr  cross_indicesr  mask_divisorr   r   r   r)   r   )r   r'  r   r(  r*  r   r  blockr   r"  r   finetune_det  s<   



zSwinTransformer.finetune_detc              	   C   s  |j d |j d |j d }}}| |}|d|d}}|ddd}| |}| j|dd}| j}t	j
|d  || j || j fdtjd }g }	t| jD ]l}
| j|
 }|
| jv rhdnd	}tj||gdd
}|||||||d\}}}}}}|ddd| j ddf |dd| j dddf }}|
dkr|ddd| j ddf |||ddddd}|	| qZ|	||||ddddd |dd| j dddf ddd}|ddd}|	\}}}}||||||fS )a   Forward function.

            Args:
                x: input rgb images
                mask: input padding masks [0: rgb values, 1: padded values]

            Returns:
                patch_outs: multi-scale [PATCH] tokens (four scales are used)
                    these tokens are the first input of the neck decoder
                det_tgt: final [DET] tokens obtained at the last stage
                    this tokens are the second input of the neck decoder
                det_pos: the learnable pos encoding for [DET] tokens.
                    these encodings are used to generate reference points in deformable attention
        r   r"   r*   r#   r%   Nr   TFr(   )r   r   rp   )rB   r	  r   r2   rm   r  r!  r   r   r   r   r   r+  r   r,   r   r   r  r  r*  r3   r   rC   rD   r  )r   r   r4   rG   rw   r   r   r!  r   
patch_outsstager  rp   x_outrH   rI   	patch_outdet_tgt
features_0
features_1
features_2
features_3r   r   r   r     s\   "




&	

"(zSwinTransformer.forwardc                    s   t t| | |   dS )z?Convert the model into training mode while keep layers freezed.N)r   r   trainr  )r   moder   r   r   r7    s   zSwinTransformer.trainc                 C   sn   d}|| j  7 }t| jD ]
\}}|| 7 }q|| j| jd  | jd  d| j  7 }|| j| j 7 }|S )Nr   r#   r"   )r	  flopsr   r  r  r  r  num_classes)r   r9  r   r  r   r   r   r9    s   
zSwinTransformer.flops)T)r   r   r   r   r
   r   r	   r  r,   jitignorer#  r-  r   r7  r9  r!   r   r   r   r   r     s@    `

9Or   )mathosnumpyr   r,   torch.nnr
   torch.nn.functional
functionalr   torch.utils.checkpointutilsr   timm.models.layersr   r   r   Moduler   pir@   rL   rN   rO   r   r   r   r   r   r   r   r   r   <module>   s.   
- L > 02