o
    ϯi:                  3   @   s&  d dl mZmZ d dlZd dlmZ d dlmZ d dlm	Z	m
Z
mZ d dlmZ dddd ejejd	d
ddddddddde
dejddddddedededededededee dee dedee d ee d!ee d"ee d#ed$ee d%ee d&ed'ed(ed)ee d*ee d+ed,ed-ejf2d.d/ZdS )0    )CallableTupleN)create_res_basic_head)Netcreate_bottleneck_blockcreate_res_stage)create_res_basic_stem   2   i  @   )r	      r   )      r   )r   r	   r	   r   r   r   )r	   r	   r	   r   )r   r   r   r      )r   r   r   T)input_channelmodel_depthmodel_num_classdropout_ratenorm
activationstem_dim_outstem_conv_kernel_sizestem_conv_stride	stem_poolstem_pool_kernel_sizestem_pool_stridestage_conv_a_kernel_sizestage_conv_b_kernel_sizestage_conv_b_width_per_groupstage_spatial_stridestage_temporal_stride
bottleneckbottleneck_ratio	head_poolhead_pool_kernel_sizehead_output_sizehead_activationhead_output_with_global_averager   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   returnc           $      C   sL  t jd dddd}|| v sJ | d|  || }g }t| |||dd |D |	|
|d	d |
D ||d
}|| |}|d }tt|D ]?}|| }|| } || || || f}!t| |||||ddd |D ||!dd |D || d||d}"||" |}|d }qMt	||||||||d}#||# t
t|dS )uj  
    Build Channel-Separated Convolutional Networks (CSN):
    Video classification with channel-separated convolutional networks.
    Du Tran, Heng Wang, Lorenzo Torresani, Matt Feiszli. ICCV 2019.

    CSN follows the ResNet style architecture including three parts: Stem,
    Stages and Head. The three parts are assembled in the following order:

    ::

                                         Input
                                           ↓
                                         Stem
                                           ↓
                                         Stage 1
                                           ↓
                                           .
                                           .
                                           .
                                           ↓
                                         Stage N
                                           ↓
                                         Head

    CSN uses depthwise convolution. To further reduce the computational cost, it uses
    low resolution (112x112), short clips (4 frames), different striding and kernel
    size, etc.

    Args:

        input_channel (int): number of channels for the input video clip.

        model_depth (int): the depth of the resnet. Options include: 50, 101, 152.
            model_num_class (int): the number of classes for the video dataset.
            dropout_rate (float): dropout rate.

        norm (callable): a callable that constructs normalization layer.

        activation (callable): a callable that constructs activation layer.

        stem_dim_out (int): output channel size to stem.
        stem_conv_kernel_size (tuple): convolutional kernel size(s) of stem.
        stem_conv_stride (tuple): convolutional stride size(s) of stem.
        stem_pool (callable): a callable that constructs resnet head pooling layer.
        stem_pool_kernel_size (tuple): pooling kernel size(s).
        stem_pool_stride (tuple): pooling stride size(s).

        stage_conv_a_kernel_size (tuple): convolutional kernel size(s) for conv_a.
        stage_conv_b_kernel_size (tuple): convolutional kernel size(s) for conv_b.
        stage_conv_b_width_per_group(int): the width of each group for conv_b. Set
            it to 1 for depthwise convolution.
        stage_spatial_stride (tuple): the spatial stride for each stage.
        stage_temporal_stride (tuple): the temporal stride for each stage.
        bottleneck (callable): a callable that constructs bottleneck block layer.
            Examples include: create_bottleneck_block.
        bottleneck_ratio (int): the ratio between inner and outer dimensions for
            the bottleneck block.

        head_pool (callable): a callable that constructs resnet head pooling layer.
        head_pool_kernel_size (tuple): the pooling kernel size.
        head_output_size (tuple): the size of output tensor for head.
        head_activation (callable): a callable that constructs activation layer.
        head_output_with_global_average (bool): if True, perform global averaging on
            the head output.

    Returns:
        (nn.Module): the csn model.
    zPYTORCHVIDEO.model.create_csn)r	   r      r	   )r	   r      r	   )r	      $   r	   )r
   e      z is not in c                 S      g | ]}|d  qS r    .0sizer2   r2   K/home/ubuntu/.local/lib/python3.10/site-packages/pytorchvideo/models/csn.py
<listcomp>       zcreate_csn.<locals>.<listcomp>c                 S   r0   r1   r2   r3   r2   r2   r6   r7      r8   )in_channelsout_channelsconv_kernel_sizeconv_strideconv_paddingpoolpool_kernel_sizepool_stridepool_paddingr   r   r   r   c                 S   r0   r1   r2   r3   r2   r2   r6   r7      r8   c                 S   r0   r1   r2   r3   r2   r2   r6   r7      r8   )depthdim_in	dim_innerdim_outr"   conv_a_kernel_sizeconv_a_strideconv_a_paddingconv_b_kernel_sizeconv_b_strideconv_b_paddingconv_b_num_groupsconv_b_dilationr   r   r   )in_featuresout_featuresr>   output_sizer?   r   r   output_with_global_average)blocks)torch_C_log_api_usage_oncekeysr   appendrangelenr   r   r   nn
ModuleList)$r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   _MODEL_STAGE_DEPTHstage_depthsrR   stemstage_dim_instage_dim_outidxstage_dim_innerrB   stage_conv_b_stridestageheadr2   r2   r6   
create_csn   s|   g




rf   )typingr   r   rS   torch.nnrZ   pytorchvideo.models.headr   pytorchvideo.models.resnetr   r   r   pytorchvideo.models.stemr   BatchNorm3dReLU	AvgPool3dintfloatboolModulerf   r2   r2   r2   r6   <module>   s   	 !