o
    ϯi^S                     @   s   d dl mZmZmZmZ d dlZd dlZd dlmZ d dl	m
Z
 ddlmZ G dd dejZ		dd	ejd
ee dee dedee dejfddZG dd dejZG dd dejZdS )    )CallableListOptionalTupleN)	_size_3_t   )DropPathc                       sd   e Zd ZdZddejdfdedee dee dede	d	df fd
dZ
dejd	ejfddZ  ZS )Mlpu  
    A MLP block that contains two linear layers with a normalization layer. The MLP
    block is used in a transformer model after the attention block.

    ::

                         Linear (in_features, hidden_features)
                                           ↓
                                 Normalization (act_layer)
                                           ↓
                                Dropout (p=dropout_rate)
                                           ↓
                         Linear (hidden_features, out_features)
                                           ↓
                                Dropout (p=dropout_rate)
    N        in_featureshidden_featuresout_features	act_layerdropout_ratereturnc                    sb   t    || _|p|}|p|}t||| _| | _t||| _| jdkr/t|| _	dS dS )aO  
        Args:
            in_features (int): Input feature dimension.
            hidden_features (Optional[int]): Hidden feature dimension. By default,
                hidden feature is set to input feature dimension.
            out_features (Optional[int]): Output feature dimension. By default, output
                features dimension is set to input feature dimension.
            act_layer (Callable): Activation layer used after the first linear layer.
            dropout_rate (float): Dropout rate after each linear layer. Dropout is not used
                by default.
        r
   N)
super__init__r   nnLinearfc1actfc2Dropoutdropout)selfr   r   r   r   r   	__class__ Q/home/ubuntu/.local/lib/python3.10/site-packages/pytorchvideo/layers/attention.pyr      s   

zMlp.__init__xc                 C   sJ   |  |}| |}| jdkr| |}| |}| jdkr#| |}|S )z=
        Args:
            x (tensor): Input tensor.
        r
   )r   r   r   r   r   )r   r   r   r   r   forward<   s   






zMlp.forward)__name__
__module____qualname____doc__r   GELUintr   r   floatr   torchTensorr    __classcell__r   r   r   r   r	      s(    r	   Ttensorpool	thw_shapehas_cls_embednormr   c                 C   sn  |du r| |fS | j }|dkrn|dkr| d} ntd| j |rE| ddddddddf | ddddddddf }} | j\}}}	}
|\}}}| || ||||
ddddd } || } | jd | jd | jd g}| jd | jd  | jd  }| |||
|dd} |rtj	|| fdd} |dur|| } |dkr	 | |fS | 
d} | |fS )	u  
    Apply pool to a flattened input (given pool operation and the unflattened shape).


                                         Input
                                           ↓
                                        Reshape
                                           ↓
                                          Pool
                                           ↓
                                        Reshape
                                           ↓
                                          Norm


    Args:
        tensor (torch.Tensor): Input tensor.
        pool (Optional[Callable]): Pool operation that is applied to the input tensor.
            If pool is none, return the input tensor.
        thw_shape (List): The shape of the input tensor (before flattening).
        has_cls_embed (bool): Whether the input tensor contains cls token. Pool
            operation excludes cls token.
        norm: (Optional[Callable]): Optional normalization operation applied to tensor
            after pool.

    Returns:
        tensor (torch.Tensor): Input tensor after pool.
        thw_shape (List[int]): Output tensor shape (before flattening).
    N      r   zUnsupported input dimension r      dim)ndim	unsqueezeNotImplementedErrorshapereshapepermute
contiguous	transposer(   catsqueeze)r+   r,   r-   r.   r/   
tensor_dimcls_tokBNLCTHWL_pooledr   r   r   _attention_poolK   s4   $B
(
rI   c                       s  e Zd ZdZdddddddejdddfded	ed
edede	de	de	de	de
dedededdf fddZdejdee dejdee dejdee dee dee deejejejf fd d!Zdejdejdejd"eejee f deejee ejee ejee f f
d#d$Zd%ee d&ee d'ee dee fd(d)Zdejdejdejd*ed+ed,ed-ed.edee fd/d0Zd1ejd"ee deejee f fd2d3Z  ZS )4MultiScaleAttentionu  
    Implementation of a multiscale attention block. Compare to a conventional attention
    block, a multiscale attention block optionally supports pooling (either
    before or after qkv projection). If pooling is not used, a multiscale attention
    block is equivalent to a conventional attention block.

    ::
                                   Input
                                     |
                    |----------------|-----------------|
                    ↓                ↓                 ↓
                  Linear           Linear            Linear
                    &                &                 &
                 Pool (Q)         Pool (K)          Pool (V)
                    → -------------- ←                 |
                             ↓                         |
                       MatMul & Scale                  |
                             ↓                         |
                          Softmax                      |
                             → ----------------------- ←
                                         ↓
                                   MatMul & Scale
                                         ↓
                                      DropOut
       Fr
   r   r   r   Tconvr4   	num_headsqkv_biasr   kernel_q	kernel_kvstride_q	stride_kv
norm_layerr.   	pool_mode
pool_firstr   Nc              	      s`  t    |dv sJ || _|| _|| _|| }|d | _|
| _dd |D }dd |D }tj|||d| _	tj|||d| _
tj|||d| _t||| _|dkrZt|| _|durnt|d	krnt|d	krnd}|durt|d	krt|d	krd}|d
v r|dkrtjntj}|dur||||ddnd| _|dur||||ddnd| _|dur||||dd| _dS d| _dS |dkr)|durtj||||||ddnd| _|dur|	|nd| _|durtj||||||ddnd| _|dur|	|nd| _|durtj||||||ddnd| _|dur$|	|| _dS d| _dS td| )a  
        Args:
            dim (int): Input feature dimension.
            num_heads (int): Number of heads in the attention layer.
            qkv_bias (bool): If set to False, the qkv layer will not learn an additive
                bias. Default: False.
            dropout_rate (float): Dropout rate.
            kernel_q (_size_3_t): Pooling kernel size for q. If both pooling kernel
                size and pooling stride size are 1 for all the dimensions, pooling is
                disabled.
            kernel_kv (_size_3_t): Pooling kernel size for kv. If both pooling kernel
                size and pooling stride size are 1 for all the dimensions, pooling is
                disabled.
            stride_q (_size_3_t): Pooling kernel stride for q.
            stride_kv (_size_3_t): Pooling kernel stride for kv.
            norm_layer (nn.Module): Normalization layer used after pooling.
            has_cls_embed (bool): If set to True, the first token of the input tensor
                should be a cls token. Otherwise, the input tensor does not contain a
                cls token. Pooling is not applied to the cls token.
            pool_mode (str): Pooling mode. Option includes "conv" (learned pooling), "avg"
                (average pooling), and "max" (max pooling).
            pool_first (bool): If set to True, pool is applied before qkv projection.
                Otherwise, pool is applied after qkv projection. Default: False.
        )rM   avgmaxg      c                 S      g | ]}t |d  qS r2   r&   ).0qr   r   r   
<listcomp>       z0MultiScaleAttention.__init__.<locals>.<listcomp>c                 S   rY   rZ   r[   )r\   kvr   r   r   r^      r_   )biasr
   Nr   )rW   rX   rX   F	ceil_moderM   )stridepaddinggroupsra   zUnsupported model )r   r   rV   r   rN   scaler.   r   r   r]   kvprojr   	proj_dropnumpyprod	MaxPool3d	AvgPool3dpool_qpool_kpool_vConv3dnorm_qnorm_knorm_vr7   )r   r4   rN   rO   r   rP   rQ   rR   rS   rT   r.   rU   rV   head_dim	padding_q
padding_kvpool_opr   r   r   r      s   
(





"zMultiScaleAttention.__init__r]   q_sizerh   k_sizeri   v_size
batch_size	chan_sizec	           	      C   s   |  |||| j|| j dddd}| |||| j|| j dddd}| |||| j|| j dddd}|||fS Nr   r2   r   r1   )r]   r9   rN   r:   rh   ri   )	r   r]   r{   rh   r|   ri   r}   r~   r   r   r   r   	_qkv_proj4  s   
zMultiScaleAttention._qkv_projr-   c                 C   s   t || j|| jt| dr| jnd d\}}t || j|| jt| dr$| jnd d\}}t || j|| jt| dr9| jnd d\}}||||||fS )Nrt   )r.   r/   ru   rv   )	rI   rp   r.   hasattrrt   rq   ru   rr   rv   )r   r]   rh   ri   r-   q_shapek_shapev_shaper   r   r   	_qkv_poolP  s,   	


zMultiScaleAttention._qkv_poolr   r   r   c                 C   sd   | j r
t|d nt|}| j rt|d nt|}| j r(t|d nt|}|||fS )Nr   )r.   rl   rm   )r   r   r   r   q_Nk_Nv_Nr   r   r   _get_qkv_lengthp  s   
z#MultiScaleAttention._get_qkv_lengthr   r   r   rA   rD   c	           	      C   sX   | dddd|||}| dddd|||}| dddd|||}|||fS r   )r:   r9   )	r   r]   rh   ri   r   r   r   rA   rD   r   r   r   _reshape_qkv_to_seq{  s   
z'MultiScaleAttention._reshape_qkv_to_seqr   c              
   C   sb  |j \}}}| jrX|||| j|| j dddd}| } }}| ||||\}}	}}
}}| |	|
|\}}}| ||||||||\}}}| ||||||||\}}}n#| } }}| ||||||||\}}}| ||||\}}	}}
}}||	dd | j
 }|jdd}|j d }|| 	dd|||}| |}| jdkr| |}||	fS )	
        Args:
            x (torch.Tensor): Input tensor.
            thw_shape (List): The shape of the input tensor (before flattening).
        r   r2   r   r1   r3   r
   )r8   rV   r9   rN   r:   r   r   r   r   r<   rg   softmaxrj   r   rk   )r   r   r-   rA   rB   rD   r]   rh   ri   r   r   r   r   r   r   attnr   r   r   r      s&   	$ 



zMultiScaleAttention.forward)r!   r"   r#   r$   r   	LayerNormr&   boolr'   r   r   strr   r(   r)   r   r   r   r   r   r   r    r*   r   r   r   r   rJ      s    	
 		

 
 
	

rJ   c                #       s   e Zd ZdZddddejejdddddddfded	ed
edede	dededej
dej
dededededede	de	ddf" fddZdejdee deejee f fddZ  ZS ) MultiScaleBlocku\  
    Implementation of a multiscale vision transformer block. Each block contains a
    multiscale attention layer and a Mlp layer.

    ::


                                      Input
                                        |-------------------+
                                        ↓                   |
                                       Norm                 |
                                        ↓                   |
                                MultiScaleAttention        Pool
                                        ↓                   |
                                     DropPath               |
                                        ↓                   |
                                    Summation ←-------------+
                                        |
                                        |-------------------+
                                        ↓                   |
                                       Norm                 |
                                        ↓                   |
                                       Mlp                 Proj
                                        ↓                   |
                                     DropPath               |
                                        ↓                   |
                                    Summation  ←------------+
    g      @Fr
   rL   rM   Tr4   dim_outrN   	mlp_ratiorO   r   droppath_rater   rT   rP   rQ   rR   rS   rU   r.   rV   r   Nc                    s   t    || _|| _|	|| _dd |D }|}dd |D }t|||||
|||tj|||d| _|dkr:t	|nt
 | _|	|| _t|| }|| _t|||||d| _||krbt||| _t|dkrttj|||dd	| _d
S d
| _d
S )a  
        Args:
            dim (int): Input feature dimension.
            dim_out (int): Output feature dimension.
            num_heads (int): Number of heads in the attention layer.
            mlp_ratio (float): Mlp ratio which controls the feature dimension in the
                hidden layer of the Mlp block.
            qkv_bias (bool): If set to False, the qkv layer will not learn an additive
                bias. Default: False.
            dropout_rate (float): DropOut rate. If set to 0, DropOut is disabled.
            droppath_rate (float): DropPath rate. If set to 0, DropPath is disabled.
            act_layer (nn.Module): Activation layer used in the Mlp layer.
            norm_layer (nn.Module): Normalization layer.
            kernel_q (_size_3_t): Pooling kernel size for q. If pooling kernel size is
                1 for all the dimensions, pooling is not used (by default).
            kernel_kv (_size_3_t): Pooling kernel size for kv. If pooling kernel size
                is 1 for all the dimensions, pooling is not used. By default, pooling
                is disabled.
            stride_q (_size_3_t): Pooling kernel stride for q.
            stride_kv (_size_3_t): Pooling kernel stride for kv.
            pool_mode (str): Pooling mode. Option includes "conv" (learned pooling), "avg"
                (average pooling), and "max" (max pooling).
            has_cls_embed (bool): If set to True, the first token of the input tensor
                should be a cls token. Otherwise, the input tensor does not contain a
                cls token. Pooling is not applied to the cls token.
            pool_first (bool): If set to True, pool is applied before qkv projection.
                Otherwise, pool is applied after qkv projection. Default: False.
        c                 S   s    g | ]}|d kr|d  n|qS )r   r   )r\   sr   r   r   r^     s     z,MultiScaleBlock.__init__.<locals>.<listcomp>c                 S   rY   rZ   r[   )r\   skipr   r   r   r^     r_   )rN   rO   r   rP   rQ   rR   rS   rT   r.   rU   rV   r
   )r   r   r   r   r   r   Frb   N)r   r   r4   r   norm1rJ   r   r   r   r   Identity	drop_pathnorm2r&   r.   r	   mlpr   rj   lenrn   	pool_skip)r   r4   r   rN   r   rO   r   r   r   rT   rP   rQ   rR   rS   rU   r.   rV   kernel_skipstride_skippadding_skipmlp_hidden_dimr   r   r   r     sP   
/

zMultiScaleBlock.__init__r   r-   c           	      C   s|   |  | ||\}}t|| j|| jd\}}|| | }| |}| |}| j| j	kr3| 
|}|| | }||fS )r   )r.   )r   r   rI   r   r.   r   r   r   r4   r   rj   )	r   r   r-   x_blockthw_shape_newx_res_x_normx_mlpr   r   r   r    $  s   	



zMultiScaleBlock.forward)r!   r"   r#   r$   r   r%   r   r&   r'   r   Moduler   r   r   r(   r)   r   r   r    r*   r   r   r   r   r     sr    "	
Zr   )TN)typingr   r   r   r   rl   r(   torch.nnr   torch.nn.common_typesr   r   r   r   r	   r)   r&   r   rI   rJ   r   r   r   r   r   <module>   s4   B
F  