o
    ϯi.                  1   @   s  d dl mZ d dlmZmZ d dlZd dlmZ d dlm	Z	 d dl
mZ d dlmZ d dlmZmZ d dlmZ d	d	d
ejddd	dd	e	ejejddejddedededee dee dee dedee dee dee dedee dededed ed!ed"ed#ejf&d$d%Zd&d'd(d)ejddejd*d+d,d-d.d/d-d0d1eeeefejd2d	ejd3d4d5ed6ed7ed8eded ed!ed"ed9ed:ee d;ee d<eee  d=eee  d>ee d?eee  d@ee dAee dBee dCedDee dEee dFedGed#ejf0dHdIZdS )J    )partial)CallableTupleN)create_conv_2plus1d)create_res_basic_head)Net)create_bottleneck_blockcreate_res_stage)create_res_basic_stem   r   r   )r   r   r      r   r   )   r   r   r   gh㈵>g?)conv_a_kernel_sizeconv_a_strideconv_a_paddingconv_aconv_b_kernel_sizeconv_b_strideconv_b_paddingconv_b_num_groupsconv_b_dilationconv_bconv_cnormnorm_epsnorm_momentum
activationdim_in	dim_innerdim_outr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   returnc                 C   s   t di d| d|d|d|d|d|d|d|d	|d
|	d|
d|dtt||||dd|d|d|d|d|S )u
  
    2plus1d bottleneck block: a sequence of spatiotemporal Convolution, Normalization,
    and Activations repeated in the following order:

    ::

                                    Conv3d (conv_a)
                                           ↓
                                 Normalization (norm_a)
                                           ↓
                                   Activation (act_a)
                                           ↓
                                  Conv(2+1)d (conv_b)
                                           ↓
                                 Normalization (norm_b)
                                           ↓
                                   Activation (act_b)
                                           ↓
                                    Conv3d (conv_c)
                                           ↓
                                 Normalization (norm_c)

    Normalization examples include: BatchNorm3d and None (no normalization).
    Activation examples include: ReLU, Softmax, Sigmoid, and None (no activation).

    Args:
        dim_in (int): input channel size to the bottleneck block.
        dim_inner (int): intermediate channel size of the bottleneck.
        dim_out (int): output channel size of the bottleneck.
        conv_a_kernel_size (tuple): convolutional kernel size(s) for conv_a.
        conv_a_stride (tuple): convolutional stride size(s) for conv_a.
        conv_a_padding (tuple): convolutional padding(s) for conv_a.
        conv_a (callable): a callable that constructs the conv_a conv layer, examples
            include nn.Conv3d, OctaveConv, etc
        conv_b_kernel_size (tuple): convolutional kernel size(s) for conv_b.
        conv_b_stride (tuple): convolutional stride size(s) for conv_b.
        conv_b_padding (tuple): convolutional padding(s) for conv_b.
        conv_b_num_groups (int): number of groups for groupwise convolution for
            conv_b.
        conv_b_dilation (tuple): dilation for 3D convolution for conv_b.
        conv_b (callable): a callable that constructs the conv_b conv layer, examples
            include nn.Conv3d, OctaveConv, etc
        conv_c (callable): a callable that constructs the conv_c conv layer, examples
            include nn.Conv3d, OctaveConv, etc

        norm (callable): a callable that constructs normalization layer, examples
            include nn.BatchNorm3d, None (not performing normalization).
        norm_eps (float): normalization epsilon.
        norm_momentum (float): normalization momentum.

        activation (callable): a callable that constructs activation layer, examples
            include: nn.ReLU, nn.Softmax, nn.Sigmoid, and None (not performing
            activation).

    Returns:
        (nn.Module): 2plus1d bottleneck block.
    r   r    r!   r   r   r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   r   N )r   r   r   )r   r    r!   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r#   r#   P/home/ubuntu/.local/lib/python3.10/site-packages/pytorchvideo/models/r2plus1d.pycreate_2plus1d_bottleneck_block   sV   Q	
r%   r   2   i  g        @   )r      r(   )r   r   r   )r   r   r   r   )r   r   r   r   )r   r   r   r   )r   r   r   r   )r   r   r   r   )   r(   r(   T)input_channelmodel_depthmodel_num_classdropout_rater   r   r   r   stem_dim_outstem_conv_kernel_sizestem_conv_stridestage_conv_a_kernel_sizestage_conv_b_kernel_sizestage_conv_b_num_groupsstage_conv_b_dilationstage_spatial_stridestage_temporal_stridestage_bottleneck	head_poolhead_pool_kernel_sizehead_output_sizehead_activationhead_output_with_global_averager*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   c           #      C   sX  t jd dddd}|| v sJ | d|  || }g }t| ||	|
dd |	D d	||d
}|| |}|d }tt|D ]M}|d }|| }|| || || f} t|||||| || g ddd || D || | dd || D || || ||d}!||! |}|d }qEt	||||||||d}"||" t
t|dS )u_  
    Build the R(2+1)D network from::
    A closer look at spatiotemporal convolutions for action recognition.
    Du Tran, Heng Wang, Lorenzo Torresani, Jamie Ray, Yann LeCun, Manohar Paluri. CVPR 2018.

    R(2+1)D follows the ResNet style architecture including three parts: Stem,
    Stages and Head. The three parts are assembled in the following order:

    ::

                                         Input
                                           ↓
                                         Stem
                                           ↓
                                         Stage 1
                                           ↓
                                           .
                                           .
                                           .
                                           ↓
                                         Stage N
                                           ↓
                                         Head

    Args:

        input_channel (int): number of channels for the input video clip.

        model_depth (int): the depth of the resnet.
        model_num_class (int): the number of classes for the video dataset.
        dropout_rate (float): dropout rate.

        norm (callable): a callable that constructs normalization layer.
        norm_eps (float): normalization epsilon.
        norm_momentum (float): normalization momentum.

        activation (callable): a callable that constructs activation layer.

        stem_dim_out (int): output channel size for stem.
        stem_conv_kernel_size (tuple): convolutional kernel size(s) of stem.
        stem_conv_stride (tuple): convolutional stride size(s) of stem.

        stage_conv_a_kernel_size (tuple): convolutional kernel size(s) for conv_a.
        stage_conv_b_kernel_size (tuple): convolutional kernel size(s) for conv_b.
        stage_conv_b_num_groups (tuple): number of groups for groupwise convolution
            for conv_b. 1 for ResNet, and larger than 1 for ResNeXt.
        stage_conv_b_dilation (tuple): dilation for 3D convolution for conv_b.
        stage_spatial_stride (tuple): the spatial stride for each stage.
        stage_temporal_stride (tuple): the temporal stride for each stage.
        stage_bottleneck (tuple): a callable that constructs bottleneck block layer
            for each stage. Examples include: create_bottleneck_block,
            create_2plus1d_bottleneck_block.

        head_pool (callable): a callable that constructs resnet head pooling layer.
        head_pool_kernel_size (tuple): the pooling kernel size.
        head_output_size (tuple): the size of output tensor for head.
        head_activation (callable): a callable that constructs activation layer.
        head_output_with_global_average (bool): if True, perform global averaging on
            the head output.

    Returns:
        (nn.Module): basic resnet.
    z"PYTORCHVIDEO.model.create_r2plus1d)r   r)      r   )r   r)      r   )r      $   r   )r&   e      z is not in c                 S      g | ]}|d  qS r   r#   .0sizer#   r#   r$   
<listcomp>      z#create_r2plus1d.<locals>.<listcomp>N)in_channelsout_channelsconv_kernel_sizeconv_strideconv_paddingpoolr   r   r)   r   c                 S   rC   rD   r#   rE   r#   r#   r$   rH     rI   c                 S   rC   rD   r#   rE   r#   r#   r$   rH   "  rI   )depthr   r    r!   
bottleneckr   r   r   r   r   r   r   r   r   r   r   )in_featuresout_featuresrO   output_sizepool_kernel_sizer-   r   output_with_global_average)blocks)torch_C_log_api_usage_oncekeysr
   appendrangelenr	   r   r   nn
ModuleList)#r*   r+   r,   r-   r   r   r   r   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   _MODEL_STAGE_DEPTHstage_depthsrW   stemstage_dim_instage_dim_outidxstage_dim_innerrP   stage_conv_b_stridestageheadr#   r#   r$   create_r2plus1d{   sv   u





rk   )	functoolsr   typingr   r   rX   torch.nnr_    pytorchvideo.layers.convolutionsr   pytorchvideo.models.headr   pytorchvideo.models.netr   pytorchvideo.models.resnetr   r	   pytorchvideo.models.stemr
   Conv3dBatchNorm3dReLUintfloatModuler%   	AvgPool3dSoftmaxboolrk   r#   r#   r#   r$   <module>   s  		

p	



 &'(/01234