o
    ϯiz;                  A   @   s  d dl mZ d dlmZmZmZmZ d dlZd dlm	Z	 d dl
mZmZ d dlmZmZ d dlmZ d dlmZ d dlmZmZ d	d
lmZ G dd de	jZddddddddddddd	ddddddddddddedddddedededed ed!ed"ed#ed$ed%ee d&ee d'ee d(ed)ed*ed+ed,ed-ed.ed/ed0ed1eeee   d2eeee   d3eeee   d4eeee   d5ee d6ee d7ee d8ed9ed:ed;e	jf@d<d=ZdS )>    )partial)CallableListOptionalTupleN)MultiScaleBlock#SpatioTemporalClsPositionalEncoding)round_widthset_attributes)create_vit_basic_head)init_net_weights)	_size_2_t	_size_3_t   )create_conv_patch_embedc                       sz   e Zd ZdZdeej dejdeej deej dejdeej deej d	d
f fddZde	j
d	e	j
fddZ  ZS )MultiscaleVisionTransformersu  
    Multiscale Vision Transformers
    Haoqi Fan, Bo Xiong, Karttikeya Mangalam, Yanghao Li, Zhicheng Yan, Jitendra Malik,
    Christoph Feichtenhofer
    https://arxiv.org/abs/2104.11227

    ::

                                       PatchEmbed
                                           ↓
                                   PositionalEncoding
                                           ↓
                                        Dropout
                                           ↓
                                     Normalization
                                           ↓
                                         Block 1
                                           ↓
                                           .
                                           .
                                           .
                                           ↓
                                         Block N
                                           ↓
                                     Normalization
                                           ↓
                                          Head


    The builder can be found in `create_mvit`.
    patch_embedcls_positional_encodingpos_dropnorm_patch_embedblocks
norm_embedheadreturnNc                   s:   t    t| t  t|dsJ dt| ddd dS )a  
        Args:
            patch_embed (nn.Module): Patch embed module.
            cls_positional_encoding (nn.Module): Positional encoding module.
            pos_drop (Optional[nn.Module]): Dropout module after patch embed.
            blocks (nn.ModuleList): Stack of multi-scale transformer blocks.
            norm_layer (nn.Module): Normalization layer before head.
            head (Optional[nn.Module]): Head module.
        patch_embed_shapez@cls_positional_encoding should have attribute patch_embed_shape.g{Gz?vit)init_stdstyleN)super__init__r
   localshasattrr   )selfr   r   r   r   r   r   r   	__class__ [/home/ubuntu/.local/lib/python3.10/site-packages/pytorchvideo/models/vision_transformers.pyr   2   s   
z%MultiscaleVisionTransformers.__init__xc                 C   s   | j d ur
|  |}| |}| jd ur| |}| jd ur#| |}| jj}| jD ]	}|||\}}q*| jd ur>| |}| jd urH| |}|S )N)r   r   r   r   r   r   r   r   )r"   r'   thwblkr%   r%   r&   forwardM   s   











z$MultiscaleVisionTransformers.forward)__name__
__module____qualname____doc__r   nnModule
ModuleListr   torchTensorr*   __classcell__r%   r%   r#   r&   r      s(     	
r   T   	layernorm   `   )r7      r9   )      r;   )r   r7   r7   Fg      @        convg      ?i  )cls_embed_onsep_pos_embeddepthnormenable_patch_embedinput_channelspatch_embed_dimconv_patch_embed_kernelconv_patch_embed_strideconv_patch_embed_paddingenable_patch_embed_normuse_2d_patch	num_heads	mlp_ratioqkv_biasdropout_rate_blockdroppath_rate_blockpooling_mode
pool_firstembed_dim_mulatten_head_mulpool_q_stride_sizepool_kv_stride_sizepool_kv_stride_adaptivepool_kvq_kernelr   head_dropout_ratehead_activationhead_num_classesspatial_sizetemporal_sizer>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   r   rW   rX   rY   r   c           1         sF  |r
|dks
J d|dur|du sJ d|dkr"t tjdd}ntdt| tr/| | f} |r4tjntj} |rCt|||	|
|| d	nd}!|| d
 | d g|rVdt	|
 n|
|rhfddt
tD n}"t||"||d}#dd td
||D }$|dkrtj|d}%t|d t|d }&}'|durt
t|D ]| d |&| d
 < q|durt
t|D ]| d |'| d
 < q|r||nd}(t })dd t
|D }*dd t
|D }+dd t
|D dd t
|D },|dur8t
t|D ]3| dd | d
 < |dur#||*| d
 < qdd | dd D |*| d
 < q|duri| g }t
|D ]#t d
kr_ fddt
t D  |g   qE|durt
t|D ]3| dd |,| d
 < |dur||+| d
 < qtdd | dd D |+| d
 < qtt
|D ]Ht||' ddd}t||& |d}t||&d  t||'d  d}-|)t||-|||||$ ||* |+  |, |||d q|-}.||.}/|dur||.||rdnd||d}0nd}0t|!|#|dkr|%nd|(|)|/|0dS ) a  
    Build Multiscale Vision Transformers (MViT) for recognition. A Vision Transformer
    (ViT) is a specific case of MViT that only uses a single scale attention block.

    Args:
        spatial_size (_size_2_t): Input video spatial resolution (H, W). If a single
            int is given, it assumes the width and the height are the same.
        temporal_size (int): Number of frames in the input video.
        cls_embed_on (bool): If True, use cls embed in the model. Otherwise features
            are average pooled before going to the final classifier.
        sep_pos_embed (bool): If True, perform separate spatiotemporal embedding.
        depth (int): The depth of the model.
        norm (str): Normalization layer. It currently supports "layernorm".

        enable_patch_embed (bool): If true, patchify the input video. If false, it
            assumes the input should have the feature dimension of patch_embed_dim.
        input_channels (int): Channel dimension of the input video.
        patch_embed_dim (int): Embedding dimension after patchifing the video input.
        conv_patch_embed_kernel (Tuple[int]): Kernel size of the convolution for
            patchifing the video input.
        conv_patch_embed_stride (Tuple[int]): Stride size of the convolution for
            patchifing the video input.
        conv_patch_embed_padding (Tuple[int]): Padding size of the convolution for
            patchifing the video input.
        enable_patch_embed_norm (bool): If True, apply normalization after patchifing
            the video input.
        use_2d_patch (bool): If True, use 2D convolutions to get patch embed.
            Otherwise, use 3D convolutions.

        num_heads (int): Number of heads in the first transformer block.
        mlp_ratio (float): Mlp ratio which controls the feature dimension in the
            hidden layer of the Mlp block.
        qkv_bias (bool): If set to False, the qkv layer will not learn an additive
            bias. Default: False.
        dropout_rate_block (float): Dropout rate for the attention block.
        droppath_rate_block (float): Droppath rate for the attention block.
        pooling_mode (str): Pooling mode. Option includes "conv" (learned pooling), "avg"
            (average pooling), and "max" (max pooling).
        pool_first (bool): If set to True, pool is applied before qkv projection.
            Otherwise, pool is applied after qkv projection. Default: False.
        embed_dim_mul (Optional[List[List[int]]]): Dimension multiplication at layer i.
            If X is used, then the next block will increase the embed dimension by X
            times. Format: [depth_i, mul_dim_ratio].
        atten_head_mul (Optional[List[List[int]]]): Head dimension multiplication at
            layer i. If X is used, then the next block will increase the head by
            X times. Format: [depth_i, mul_dim_ratio].
        pool_q_stride_size (Optional[List[List[int]]]): List of stride sizes for the
            pool q at each layer. Format:
            [[i, stride_t_i, stride_h_i, stride_w_i], ...,].
        pool_kv_stride_size (Optional[List[List[int]]]): List of stride sizes for the
            pool kv at each layer. Format:
            [[i, stride_t_i, stride_h_i, stride_w_i], ...,].
        pool_kv_stride_adaptive (Optional[_size_3_t]): Initial kv stride size for the
            first block. The stride size will be further reduced at the layer where q
            is pooled with the ratio of the stride of q pooling. If
            pool_kv_stride_adaptive is set, then pool_kv_stride_size should be none.
        pool_kvq_kernel (Optional[_size_3_t]): Pooling kernel size for q and kv. It None,
            the kernel_size is [s + 1 if s > 1 else s for s in stride_size].

        head (Callable): Head model.
        head_dropout_rate (float): Dropout rate in the head.
        head_activation (Callable): Activation in the head.
        head_num_classes (int): Number of classes in the final classification head.

    Example usage (building a MViT_B model for Kinetics400):

        spatial_size = 224
        temporal_size = 16
        embed_dim_mul = [[1, 2.0], [3, 2.0], [14, 2.0]]
        atten_head_mul = [[1, 2.0], [3, 2.0], [14, 2.0]]
        pool_q_stride_size = [[1, 1, 2, 2], [3, 1, 2, 2], [14, 1, 2, 2]]
        pool_kv_stride_adaptive = [1, 8, 8]
        pool_kvq_kernel = [3, 3, 3]
        head_num_classes = 400
        MViT_B = create_multiscale_vision_transformers(
            spatial_size=spatial_size,
            temporal_size=temporal_size,
            embed_dim_mul=embed_dim_mul,
            atten_head_mul=atten_head_mul,
            pool_q_stride_size=pool_q_stride_size,
            pool_kv_stride_adaptive=pool_kv_stride_adaptive,
            pool_kvq_kernel=pool_kvq_kernel,
            head_num_classes=head_num_classes,
        )
    r   z-If use_2d_patch, temporal_size needs to be 1.NzEpool_kv_stride_size should be none if pool_kv_stride_adaptive is set.r6   gư>)epszOnly supports layernorm.)in_channelsout_channelsconv_kernel_sizeconv_strideconv_paddingr=   r   r   c                    s   g | ]
} | |  qS r%   r%   .0i)
input_dimsinput_stirder%   r&   
<listcomp>  s    z9create_multiscale_vision_transformers.<locals>.<listcomp>)	embed_dimr   r?   has_clsc                 S   s   g | ]}|  qS r%   )item)rd   r'   r%   r%   r&   rh     s    r<   )pc                 S      g | ]}g qS r%   r%   rc   r%   r%   r&   rh          c                 S   rm   r%   r%   rc   r%   r%   r&   rh   !  rn   c                 S   rm   r%   r%   rc   r%   r%   r&   rh   "  rn   c                 S   rm   r%   r%   rc   r%   r%   r&   rh   #  rn   c                 S       g | ]}|d kr|d  n|qS rb   r%   rd   sr%   r%   r&   rh   +      c                    s&   g | ]}t  |  |  d qS rb   )max)rd   d)
_stride_kvre   stride_qr%   r&   rh   5  s    c                 S   ro   rb   r%   rp   r%   r%   r&   rh   A  rr   )	min_widthdivisor)rx   )dimdim_outrJ   rK   rL   dropout_ratedroppath_rate
norm_layerkernel_q	kernel_kvrv   	stride_kv	pool_modehas_cls_embedrP   clsmean)in_featuresout_featuresseq_pool_typer{   
activation)r   r   r   r   r   r   r   )r   r/   	LayerNormNotImplementedError
isinstanceintConv2dConv3dr   tuplerangelenr   r2   linspaceDropoutonesr1   appendr	   r   r   )1rZ   r[   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   r   rW   rX   rY   r}   conv_patch_opr   r   r   dprr   dim_mulhead_mulr   mvit_blockspool_qpool_kvr   rz   ri   r   
head_modelr%   )ru   re   rf   rg   rv   r&   %create_multiscale_vision_transformersb   s   {

	







r   ) 	functoolsr   typingr   r   r   r   r2   torch.nnr/   pytorchvideo.layersr   r   pytorchvideo.layers.utilsr	   r
   pytorchvideo.models.headr   pytorchvideo.models.weight_initr   torch.nn.common_typesr   r   stemr   r0   r   r   boolstrfloatr   r%   r%   r%   r&   <module>   s   U	
 !"#$