o
    ϯiX                  C   @   sj  d dl mZmZmZmZ d dlZd dlZd dlm	Z	 d dl
mZ d dlmZmZ d dlmZmZ d dlmZmZ ddd	e	jd
dddde	je	je	jdde	jddedededee dee dee dedee dee dee dedee deded ed!ed"ed#ed$e	jf&d%d&Zddd	e	jddd'dde	je	je	jdde	jddedededee dee dee dedee dee dee dedee deded ed!ed"ed#ed$e	jf&d(d)Zd*d+ Zd,eddd	e	jd
dddde	je	je	je	jdde	je	jd-dededed.ed/ed0edee dee dee dedee dee dee dedee deded1ed ed!ed"ed2ed3ed$e	jf0d4d5Zddd	e	jd
dddde	je	je	jdde	jdd6edededed.edeee eee  f dee deee eee  f dedee dee dee dedee deded ed!ed"ed#ed$e	jf*d7d8Z d9d:d;d<Z!d=d>d?d@e	je	jdAdBde	j"d
dedddCdDdEdFdGdGdEeee	j#dHdddIdJdKedLedMedNed ed#edOedPee dQee dRedSee dTee dUedVedWee dXeee eee  f dYeee eee  f dZee d[eee eee  f d\ee d]ee d^ee d.eee ef d_ed`edaee dbee dcedded$e	jf<dedfZ$d=d>dgd@e	je	jdAdhde	j"d
dedddCdDdEdidjdjdEeee	j#dkde	j%d,dldmd dn dKedLedMedNed ed#edOedPee dQee dRedSee dTee dUedVedWee dXeee eee  f dYeee eee  f dZee d[eee eee  f d\ee d]ee d^ee d.eee ef d_ed`edaee dbee dceddedoee dpedqed$e	jfBdrdsZ&dd>d?d@e	je	jdAdtduddvdweddddvdEddEdGdGeeeefe	j#dxdddIdydKedLedMedNed ed#edOedPee dQee dRedSee dTee dUedVedWee dXeee eee  f dYeee eee  f dZee d[eee eee  f d\ee d]ee d^ee d.eee ef d`edaee dbee dcedded$e	jf:dzd{Z'G d|d} d}e	jZ(G d~d de	jZ)G dd de	jZ*G dd de	jZ+dS )    )CallableListTupleUnionN)set_attributes)create_res_basic_headcreate_res_roi_pooling_head)DetectionBBoxNetworkNet)create_acoustic_res_basic_stemcreate_res_basic_stem      r   )   r   r   )r   r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   gh㈵>g?)conv_a_kernel_sizeconv_a_strideconv_a_paddingconv_aconv_b_kernel_sizeconv_b_strideconv_b_paddingconv_b_num_groupsconv_b_dilationconv_bconv_cnormnorm_epsnorm_momentum
activationdim_in	dim_innerdim_outr   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   returnc              
   C   s   || ||||dd}|du rdn||||d}|du rdn| }||||||	d|
|d}|du r2dn||||d}|du r?dn| }|||ddd}|du rPdn||||d}t ||||||||dS )	u
  
    Bottleneck block: a sequence of spatiotemporal Convolution, Normalization,
    and Activations repeated in the following order:

    ::

                                    Conv3d (conv_a)
                                           ↓
                                 Normalization (norm_a)
                                           ↓
                                   Activation (act_a)
                                           ↓
                                    Conv3d (conv_b)
                                           ↓
                                 Normalization (norm_b)
                                           ↓
                                   Activation (act_b)
                                           ↓
                                    Conv3d (conv_c)
                                           ↓
                                 Normalization (norm_c)

    Normalization examples include: BatchNorm3d and None (no normalization).
    Activation examples include: ReLU, Softmax, Sigmoid, and None (no activation).

    Args:
        dim_in (int): input channel size to the bottleneck block.
        dim_inner (int): intermediate channel size of the bottleneck.
        dim_out (int): output channel size of the bottleneck.
        conv_a_kernel_size (tuple): convolutional kernel size(s) for conv_a.
        conv_a_stride (tuple): convolutional stride size(s) for conv_a.
        conv_a_padding (tuple): convolutional padding(s) for conv_a.
        conv_a (callable): a callable that constructs the conv_a conv layer, examples
            include nn.Conv3d, OctaveConv, etc
        conv_b_kernel_size (tuple): convolutional kernel size(s) for conv_b.
        conv_b_stride (tuple): convolutional stride size(s) for conv_b.
        conv_b_padding (tuple): convolutional padding(s) for conv_b.
        conv_b_num_groups (int): number of groups for groupwise convolution for
            conv_b.
        conv_b_dilation (tuple): dilation for 3D convolution for conv_b.
        conv_b (callable): a callable that constructs the conv_b conv layer, examples
            include nn.Conv3d, OctaveConv, etc
        conv_c (callable): a callable that constructs the conv_c conv layer, examples
            include nn.Conv3d, OctaveConv, etc

        norm (callable): a callable that constructs normalization layer, examples
            include nn.BatchNorm3d, None (not performing normalization).
        norm_eps (float): normalization epsilon.
        norm_momentum (float): normalization momentum.

        activation (callable): a callable that constructs activation layer, examples
            include: nn.ReLU, nn.Softmax, nn.Sigmoid, and None (not performing
            activation).

    Returns:
        (nn.Module): resnet bottleneck block.
    Fin_channelsout_channelskernel_sizestridepaddingbiasNnum_featuresepsmomentumr(   r)   r*   r+   r,   r-   groupsdilationr   r(   r)   r*   r-   r   norm_aact_ar   norm_bact_br   norm_c)BottleneckBlock)r#   r$   r%   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r7   r8   r9   r:   r;    r=   N/home/ubuntu/.local/lib/python3.10/site-packages/pytorchvideo/models/resnet.pycreate_bottleneck_block   sZ   Q
r?   r   r   r   c           %   
   C   s  || ||||dd}|du rdn||||d}|du rdn| }|d ddg}|}|	d ddg}d|d |d g}|}d|	d |	d g}|
fd \}}|d ddg}d|d |d g}||||||d||d}|du rmdn||||d}|du rzdn| } ||||||d||d}!|du rdn||||d}"|du rdn| }#|||d	dd
}|du rdn||||d}$t |||t|!|gt|"|gt|#| g||$dS )uP  
    Acoustic Bottleneck block: a sequence of spatiotemporal Convolution, Normalization,
    and Activations repeated in the following order:

    ::

                                    Conv3d (conv_a)
                                           ↓
                                 Normalization (norm_a)
                                           ↓
                                   Activation (act_a)
                                           ↓
                           ---------------------------------
                           ↓                               ↓
                Temporal Conv3d (conv_b)        Spatial Conv3d (conv_b)
                           ↓                               ↓
                 Normalization (norm_b)         Normalization (norm_b)
                           ↓                               ↓
                   Activation (act_b)              Activation (act_b)
                           ↓                               ↓
                           ---------------------------------
                                           ↓
                                    Conv3d (conv_c)
                                           ↓
                                 Normalization (norm_c)

    Normalization examples include: BatchNorm3d and None (no normalization).
    Activation examples include: ReLU, Softmax, Sigmoid, and None (no activation).

    Args:
        dim_in (int): input channel size to the bottleneck block.
        dim_inner (int): intermediate channel size of the bottleneck.
        dim_out (int): output channel size of the bottleneck.
        conv_a_kernel_size (tuple): convolutional kernel size(s) for conv_a.
        conv_a_stride (tuple): convolutional stride size(s) for conv_a.
        conv_a_padding (tuple): convolutional padding(s) for conv_a.
        conv_a (callable): a callable that constructs the conv_a conv layer, examples
            include nn.Conv3d, OctaveConv, etc
        conv_b_kernel_size (tuple): convolutional kernel size(s) for conv_b.
        conv_b_stride (tuple): convolutional stride size(s) for conv_b.
        conv_b_padding (tuple): convolutional padding(s) for conv_b.
        conv_b_num_groups (int): number of groups for groupwise convolution for
            conv_b.
        conv_b_dilation (tuple): dilation for 3D convolution for conv_b.
        conv_b (callable): a callable that constructs the conv_b conv layer, examples
            include nn.Conv3d, OctaveConv, etc
        conv_c (callable): a callable that constructs the conv_c conv layer, examples
            include nn.Conv3d, OctaveConv, etc

        norm (callable): a callable that constructs normalization layer, examples
            include nn.BatchNorm3d, None (not performing normalization).
        norm_eps (float): normalization epsilon.
        norm_momentum (float): normalization momentum.

        activation (callable): a callable that constructs activation layer, examples
            include: nn.ReLU, nn.Softmax, nn.Sigmoid, and None (not performing
            activation).

    Returns:
        (nn.Module): resnet acoustic bottleneck block.
    Fr'   Nr.   r   r   r   r2   r   r5   r6   )SeparableBottleneckBlocknn
ModuleList)%r#   r$   r%   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r7   r8   conv_b_1_kernel_sizeconv_b_1_strideconv_b_1_paddingconv_b_2_kernel_sizeconv_b_2_strideconv_b_2_paddingconv_b_1_num_groupsconv_b_2_num_groupsconv_b_1_dilationconv_b_2_dilationconv_b_1norm_b_1act_b_1conv_b_2norm_b_2act_b_2r;   r=   r=   r>    create_acoustic_bottleneck_block   s   V
rT   c                 C   s   | | S )zH
    Utility function used in lieu of lamda which are not picklable
    r=   )xyr=   r=   r>   _trivial_sum?  s   rW   F)use_shortcutbranch_fusionr   r   r   r   r   r   r   r   r   r   r   	conv_skipr   r    r!   activation_bottleneckactivation_block
bottleneckrX   rY   rZ   r[   r\   c                 C   s  t ttjt||}d}|s|dur%| |kst|dkr%||||d}t| |ks3t|dks3|r<|| |d|ddnd||di d| d|d	|d
|d|d|d|	d|
d|d|d|d|d|d|d|d|d|d||du rd|dS | |dS )u  
    Residual block. Performs a summation between an identity shortcut in branch1 and a
    main block in branch2. When the input and output dimensions are different, a
    convolution followed by a normalization will be performed.

    ::


                                         Input
                                           |-------+
                                           ↓       |
                                         Block     |
                                           ↓       |
                                       Summation ←-+
                                           ↓
                                       Activation

    Normalization examples include: BatchNorm3d and None (no normalization).
    Activation examples include: ReLU, Softmax, Sigmoid, and None (no activation).
    Transform examples include: BottleneckBlock.

    Args:
        dim_in (int): input channel size to the bottleneck block.
        dim_inner (int): intermediate channel size of the bottleneck.
        dim_out (int): output channel size of the bottleneck.
        bottleneck (callable): a callable that constructs bottleneck block layer.
            Examples include: create_bottleneck_block.
        use_shortcut (bool): If true, use conv and norm layers in skip connection.
        branch_fusion (callable): a callable that constructs summation layer.
            Examples include: lambda x, y: x + y, OctaveSum.

        conv_a_kernel_size (tuple): convolutional kernel size(s) for conv_a.
        conv_a_stride (tuple): convolutional stride size(s) for conv_a.
        conv_a_padding (tuple): convolutional padding(s) for conv_a.
        conv_a (callable): a callable that constructs the conv_a conv layer, examples
            include nn.Conv3d, OctaveConv, etc
        conv_b_kernel_size (tuple): convolutional kernel size(s) for conv_b.
        conv_b_stride (tuple): convolutional stride size(s) for conv_b.
        conv_b_padding (tuple): convolutional padding(s) for conv_b.
        conv_b_num_groups (int): number of groups for groupwise convolution for
            conv_b.
        conv_b_dilation (tuple): dilation for 3D convolution for conv_b.
        conv_b (callable): a callable that constructs the conv_b conv layer, examples
            include nn.Conv3d, OctaveConv, etc
        conv_c (callable): a callable that constructs the conv_c conv layer, examples
            include nn.Conv3d, OctaveConv, etc
        conv_skip (callable): a callable that constructs the conv_skip conv layer,
        examples include nn.Conv3d, OctaveConv, etc

        norm (callable): a callable that constructs normalization layer. Examples
            include nn.BatchNorm3d, None (not performing normalization).
        norm_eps (float): normalization epsilon.
        norm_momentum (float): normalization momentum.

        activation_bottleneck (callable): a callable that constructs activation layer in
            bottleneck. Examples include: nn.ReLU, nn.Softmax, nn.Sigmoid, and None
            (not performing activation).
        activation_block (callable): a callable that constructs activation layer used
            at the end of the block. Examples include: nn.ReLU, nn.Softmax, nn.Sigmoid,
            and None (not performing activation).

    Returns:
        (nn.Module): resnet basic block layer.
    Nr   r.   r   F)r*   r+   r-   r#   r$   r%   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   )branch1_convbranch1_normbranch2r"   rY   r=   )tuplemapnpprodzipResBlock)r#   r$   r%   r]   rX   rY   r   r   r   r   r   r   r   r   r   r   r   rZ   r   r    r!   r[   r\   branch1_conv_stride
norm_modelr=   r=   r>   create_res_blockF  sv   ^	

ri   depthc                 C   s"  g }t |d tr|g}t |d tr|g}||  d|  }||  d|  }t| D ]^}tdi d|dkr6|n|d|d|d|d|| d|dkrM|nd	d
|| d|d|	d|dkra|
nd	d|d|d|d|d|d|d|d|d|d|}|| q*tt|dS )u  
    Create Residual Stage, which composes sequential blocks that make up a ResNet. These
    blocks could be, for example, Residual blocks, Non-Local layers, or
    Squeeze-Excitation layers.

    ::


                                        Input
                                           ↓
                                       ResBlock
                                           ↓
                                           .
                                           .
                                           .
                                           ↓
                                       ResBlock

    Normalization examples include: BatchNorm3d and None (no normalization).
    Activation examples include: ReLU, Softmax, Sigmoid, and None (no activation).
    Bottleneck examples include: create_bottleneck_block.

    Args:
        depth (init): number of blocks to create.

        dim_in (int): input channel size to the bottleneck block.
        dim_inner (int): intermediate channel size of the bottleneck.
        dim_out (int): output channel size of the bottleneck.
        bottleneck (callable): a callable that constructs bottleneck block layer.
            Examples include: create_bottleneck_block.

        conv_a_kernel_size (tuple or list of tuple): convolutional kernel size(s)
            for conv_a. If conv_a_kernel_size is a tuple, use it for all blocks in
            the stage. If conv_a_kernel_size is a list of tuple, the kernel sizes
            will be repeated until having same length of depth in the stage. For
            example, for conv_a_kernel_size = [(3, 1, 1), (1, 1, 1)], the kernel
            size for the first 6 blocks would be [(3, 1, 1), (1, 1, 1), (3, 1, 1),
            (1, 1, 1), (3, 1, 1)].
        conv_a_stride (tuple): convolutional stride size(s) for conv_a.
        conv_a_padding (tuple or list of tuple): convolutional padding(s) for
            conv_a. If conv_a_padding is a tuple, use it for all blocks in
            the stage. If conv_a_padding is a list of tuple, the padding sizes
            will be repeated until having same length of depth in the stage.
        conv_a (callable): a callable that constructs the conv_a conv layer, examples
            include nn.Conv3d, OctaveConv, etc
        conv_b_kernel_size (tuple): convolutional kernel size(s) for conv_b.
        conv_b_stride (tuple): convolutional stride size(s) for conv_b.
        conv_b_padding (tuple): convolutional padding(s) for conv_b.
        conv_b_num_groups (int): number of groups for groupwise convolution for
            conv_b.
        conv_b_dilation (tuple): dilation for 3D convolution for conv_b.
        conv_b (callable): a callable that constructs the conv_b conv layer, examples
            include nn.Conv3d, OctaveConv, etc
        conv_c (callable): a callable that constructs the conv_c conv layer, examples
            include nn.Conv3d, OctaveConv, etc

        norm (callable): a callable that constructs normalization layer. Examples
            include nn.BatchNorm3d, and None (not performing normalization).
        norm_eps (float): normalization epsilon.
        norm_momentum (float): normalization momentum.

        activation (callable): a callable that constructs activation layer. Examples
            include: nn.ReLU, nn.Softmax, nn.Sigmoid, and None (not performing
            activation).

    Returns:
        (nn.Module): resnet basic stage layer.
    r   Nr#   r$   r%   r]   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r[   r\   )
res_blocksr=   )
isinstanceintrangeri   appendResStagerB   rC   )rj   r#   r$   r%   r]   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   rk   indblockr=   r=   r>   create_res_stage  sl   `	
rs   )r         r   )r   rt      r   )r      $   r   )2   e      r   ry   i  g      ?@   )r      r}   )r   r   r   r   )r   r   r   r   )r   r   r   r   )r   r   r   r   )r   r   r   r   )rt   r}   r}   T)input_channelmodel_depthmodel_num_classdropout_rater   r"   stem_dim_outstem_conv_kernel_sizestem_conv_stride	stem_poolstem_pool_kernel_sizestem_pool_stridestemstage1_poolstage1_pool_kernel_sizestage_conv_a_kernel_sizestage_conv_b_kernel_sizestage_conv_b_num_groupsstage_conv_b_dilationstage_spatial_h_stridestage_spatial_w_stridestage_temporal_strider]   head	head_poolhead_pool_kernel_sizehead_output_sizehead_activationhead_output_with_global_averager~   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   c           )      C   st  t jd |t v sJ | dt  t| }t|d tr)|ft| }t|d tr7|ft| }t|d trE|ft| }t|trQ|gt| }g }|| |||dd |D |	|
|dd |
D ||d}|	| |}|d } t
t|D ]}!| d }"||! }#||! }$||! d	d	f}%t|$d trd
d |$D ndd |$D }&d	||! ||! f}'t|#||"| ||! |$|%|&||! |'||! d d ||! d	 d	kr||! d	 n||! d	 d ||! d d	kr||! d n||! d d f||! ||! ||d}(|	|( | }| d } |!dkr|dur|	|||dd q}|dur2|||||||||d}|	| tt|dS )uU  
    Build ResNet style models for video recognition. ResNet has three parts:
    Stem, Stages and Head. Stem is the first Convolution layer (Conv1) with an
    optional pooling layer. Stages are grouped residual blocks. There are usually
    multiple stages and each stage may include multiple residual blocks. Head
    may include pooling, dropout, a fully-connected layer and global spatial
    temporal averaging. The three parts are assembled in the following order:

    ::

                                         Input
                                           ↓
                                         Stem
                                           ↓
                                         Stage 1
                                           ↓
                                           .
                                           .
                                           .
                                           ↓
                                         Stage N
                                           ↓
                                         Head

    Args:

        input_channel (int): number of channels for the input video clip.

        model_depth (int): the depth of the resnet. Options include: 50, 101, 152.
        model_num_class (int): the number of classes for the video dataset.
        dropout_rate (float): dropout rate.


        norm (callable): a callable that constructs normalization layer.

        activation (callable): a callable that constructs activation layer.

        stem_dim_out (int): output channel size to stem.
        stem_conv_kernel_size (tuple): convolutional kernel size(s) of stem.
        stem_conv_stride (tuple): convolutional stride size(s) of stem.
        stem_pool (callable): a callable that constructs resnet head pooling layer.
        stem_pool_kernel_size (tuple): pooling kernel size(s).
        stem_pool_stride (tuple): pooling stride size(s).
        stem (callable): a callable that constructs stem layer.
            Examples include: create_res_video_stem.

        stage_conv_a_kernel_size (tuple): convolutional kernel size(s) for conv_a.
        stage_conv_b_kernel_size (tuple): convolutional kernel size(s) for conv_b.
        stage_conv_b_num_groups (tuple): number of groups for groupwise convolution
            for conv_b. 1 for ResNet, and larger than 1 for ResNeXt.
        stage_conv_b_dilation (tuple): dilation for 3D convolution for conv_b.
        stage_spatial_h_stride (tuple): the spatial height stride for each stage.
        stage_spatial_w_stride (tuple): the spatial width stride for each stage.
        stage_temporal_stride (tuple): the temporal stride for each stage.
        bottleneck (callable): a callable that constructs bottleneck block layer.
            Examples include: create_bottleneck_block.

        head (callable): a callable that constructs the resnet-style head.
            Ex: create_res_basic_head
        head_pool (callable): a callable that constructs resnet head pooling layer.
        head_pool_kernel_size (tuple): the pooling kernel size.
        head_output_size (tuple): the size of output tensor for head.
        head_activation (callable): a callable that constructs activation layer.
        head_output_with_global_average (bool): if True, perform global averaging on
            the head output.

    Returns:
        (nn.Module): basic resnet.
    z PYTORCHVIDEO.model.create_resnetz is not in r   c                 S      g | ]}|d  qS r   r=   .0sizer=   r=   r>   
<listcomp>      z!create_resnet.<locals>.<listcomp>c                 S   r   r   r=   r   r=   r=   r>   r     r   )r(   r)   conv_kernel_sizeconv_strideconv_paddingpoolpool_kernel_sizepool_stridepool_paddingr   r"   rt   r   c                 S   r   r   r=   r   r=   r=   r>   r     r   c                 S   s   g | ]	}d d |D qS )c                 S   r   r   r=   r   r=   r=   r>   r     r   z,create_resnet.<locals>.<listcomp>.<listcomp>r=   )r   sizesr=   r=   r>   r     s    r   )rj   r#   r$   r%   r]   r   r   r   r   r   r   r   r   r   r"   Nr@   )r*   r+   r,   )in_featuresout_featuresr   output_sizer   r   r"   output_with_global_average)blocks)torch_C_log_api_usage_once_MODEL_STAGE_DEPTHkeysrl   rm   lenr   ro   rn   rs   r
   rB   rC   ))r~   r   r   r   r   r"   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r]   r   r   r   r   r   r   stage_depthsr   stage_dim_instage_dim_outidxstage_dim_innerrj   stage_conv_a_kernelstage_conv_a_stridestage_conv_a_paddingstage_conv_b_stridestager=   r=   r>   create_resnetW  s   |

	



r   P   )r   r}   r}   )r   r   r   r   )r   r   r   r   )rt   r   r   )r}   r}   g      ?) r~   r   r   r   r   r"   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r]   r   r   r   r   r   r   head_spatial_resolutionhead_spatial_scalehead_sampling_ratior   r   r   c            !      C   s   t di d| d|d|d|d|d|d|d|d	|d
|	d|
d|d|d|d|d|d|d|d|d|d|dd} ||dtt| d   ||||||||||d}t| |S )u0  
    Build ResNet style models for video detection. ResNet has three parts:
    Stem, Stages and Head. Stem is the first Convolution layer (Conv1) with an
    optional pooling layer. Stages are grouped residual blocks. There are usually
    multiple stages and each stage may include multiple residual blocks. Head
    may include pooling, dropout, a fully-connected layer and global spatial
    temporal averaging. The three parts are assembled in the following order:

    ::

                            Input Clip    Input Bounding Boxes
                              ↓                       ↓
                            Stem                      ↓
                              ↓                       ↓
                            Stage 1                   ↓
                              ↓                       ↓
                              .                       ↓
                              .                       ↓
                              .                       ↓
                              ↓                       ↓
                            Stage N                   ↓
                              ↓--------> Head <-------↓

    Args:

        input_channel (int): number of channels for the input video clip.

        model_depth (int): the depth of the resnet. Options include: 50, 101, 152.
        model_num_class (int): the number of classes for the video dataset.
        dropout_rate (float): dropout rate.


        norm (callable): a callable that constructs normalization layer.

        activation (callable): a callable that constructs activation layer.

        stem_dim_out (int): output channel size to stem.
        stem_conv_kernel_size (tuple): convolutional kernel size(s) of stem.
        stem_conv_stride (tuple): convolutional stride size(s) of stem.
        stem_pool (callable): a callable that constructs resnet head pooling layer.
        stem_pool_kernel_size (tuple): pooling kernel size(s).
        stem_pool_stride (tuple): pooling stride size(s).
        stem (callable): a callable that constructs stem layer.
            Examples include: create_res_video_stem.

        stage_conv_a_kernel_size (tuple): convolutional kernel size(s) for conv_a.
        stage_conv_b_kernel_size (tuple): convolutional kernel size(s) for conv_b.
        stage_conv_b_num_groups (tuple): number of groups for groupwise convolution
            for conv_b. 1 for ResNet, and larger than 1 for ResNeXt.
        stage_conv_b_dilation (tuple): dilation for 3D convolution for conv_b.
        stage_spatial_h_stride (tuple): the spatial height stride for each stage.
        stage_spatial_w_stride (tuple): the spatial width stride for each stage.
        stage_temporal_stride (tuple): the temporal stride for each stage.
        bottleneck (callable): a callable that constructs bottleneck block layer.
            Examples include: create_bottleneck_block.

        head (callable): a callable that constructs the detection head which can
            take in the additional input of bounding boxes.
            Ex: create_res_roi_pooling_head
        head_pool (callable): a callable that constructs resnet head pooling layer.
        head_pool_kernel_size (tuple): the pooling kernel size.
        head_output_size (tuple): the size of output tensor for head.
        head_activation (callable): a callable that constructs activation layer.
        head_output_with_global_average (bool): if True, perform global averaging on
            the head output.
        head_spatial_resolution (tuple): h, w sizes of the RoI interpolation.
        head_spatial_scale (float): scale the input boxes by this number.
        head_sampling_ratio (int): number of inputs samples to take for each output
                sample interpolation. 0 to take samples densely.

    Returns:
        (nn.Module): basic resnet.
    r~   r   r   r   r   r"   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r]   r   Nr   r   )r   r   r   r   r   r   r"   r   
resolutionspatial_scalesampling_ratior=   )r   r   r   r	   )!r~   r   r   r   r   r"   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r]   r   r   r   r   r   r   r   r   r   modelr=   r=   r>   create_resnet_with_roi_headF  sx    

r   )	   r   r   )r   r   r   )r   r   r   )r   r   r   )rt   r   r   r~   r   r   r   r   r"   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r]   r   r   r   r   r   c                 C   s   t di t S )u  
    Build ResNet style models for acoustic recognition. ResNet has three parts:
    Stem, Stages and Head. Stem is the first Convolution layer (Conv1) with an
    optional pooling layer. Stages are grouped residual blocks. There are usually
    multiple stages and each stage may include multiple residual blocks. Head
    may include pooling, dropout, a fully-connected layer and global spatial
    temporal averaging. The three parts are assembled in the following order:

    ::

                                         Input
                                           ↓
                                         Stem
                                           ↓
                                         Stage 1
                                           ↓
                                           .
                                           .
                                           .
                                           ↓
                                         Stage N
                                           ↓
                                         Head

    Args:

        input_channel (int): number of channels for the input video clip.

        model_depth (int): the depth of the resnet. Options include: 50, 101, 152.
        model_num_class (int): the number of classes for the video dataset.
        dropout_rate (float): dropout rate.


        norm (callable): a callable that constructs normalization layer.

        activation (callable): a callable that constructs activation layer.

        stem_dim_out (int): output channel size to stem.
        stem_conv_kernel_size (tuple): convolutional kernel size(s) of stem.
        stem_conv_stride (tuple): convolutional stride size(s) of stem.
        stem_pool (callable): a callable that constructs resnet head pooling layer.
        stem_pool_kernel_size (tuple): pooling kernel size(s).
        stem_pool_stride (tuple): pooling stride size(s).
        stem (callable): a callable that constructs stem layer.
            Examples include: create_res_video_stem.

        stage_conv_a_kernel_size (tuple): convolutional kernel size(s) for conv_a.
        stage_conv_b_kernel_size (tuple): convolutional kernel size(s) for conv_b.
        stage_conv_b_num_groups (tuple): number of groups for groupwise convolution
            for conv_b. 1 for ResNet, and larger than 1 for ResNeXt.
        stage_conv_b_dilation (tuple): dilation for 3D convolution for conv_b.
        stage_spatial_h_stride (tuple): the spatial height stride for each stage.
        stage_spatial_w_stride (tuple): the spatial width stride for each stage.
        stage_temporal_stride (tuple): the temporal stride for each stage.
        bottleneck (callable): a callable that constructs bottleneck block layer.
            Examples include: create_bottleneck_block.

        head_pool (callable): a callable that constructs resnet head pooling layer.
        head_pool_kernel_size (tuple): the pooling kernel size.
        head_output_size (tuple): the size of output tensor for head.
        head_activation (callable): a callable that constructs activation layer.
        head_output_with_global_average (bool): if True, perform global averaging on
            the head output.

    Returns:
        (nn.Module): audio resnet, that takes spectragram image input with
            shape: (B, C, T, 1, F), where T is the time dimension and F is the
            frequency dimension.
    Nr=   )r   localsr   r=   r=   r>   create_acoustic_resnet  s   pr   c                       s`   e Zd ZdZ					ddejdejdejdejdedejf fd	d
Zdej	fddZ
  ZS )rf   u  
    Residual block. Performs a summation between an identity shortcut in branch1 and a
    main block in branch2. When the input and output dimensions are different, a
    convolution followed by a normalization will be performed.

    ::


                                         Input
                                           |-------+
                                           ↓       |
                                         Block     |
                                           ↓       |
                                       Summation ←-+
                                           ↓
                                       Activation

    The builder can be found in `create_res_block`.
    Nr^   r_   r`   r"   rY   r&   c                    s(   t    t| t  | jdusJ dS )a  
        Args:
            branch1_conv (torch.nn.modules): convolutional module in branch1.
            branch1_norm (torch.nn.modules): normalization module in branch1.
            branch2 (torch.nn.modules): bottleneck block module in branch2.
            activation (torch.nn.modules): activation module.
            branch_fusion: (Callable): A callable or layer that combines branch1
                and branch2.
        N)super__init__r   r   r`   )selfr^   r_   r`   r"   rY   	__class__r=   r>   r     s   
zResBlock.__init__c                 C   sf   | j d u r| || |}n|  |}| jd ur| |}| || |}| jd ur1| |}|S N)r^   rY   r`   r_   r"   )r   rU   shortcutr=   r=   r>   forward  s   





zResBlock.forward)NNNNN)__name__
__module____qualname____doc__rB   Moduler   r   r   Tensorr   __classcell__r=   r=   r   r>   rf   k  s*    rf   c                       sv   e Zd ZdZdddejdejdejdejdejd	ejd
ejdejdeddf fddZde	j
de	j
fddZ  ZS )rA   u  
    Separable Bottleneck block: a sequence of spatiotemporal Convolution, Normalization,
    and Activations repeated in the following order. Requires a tuple of models to be
    provided to conv_b, norm_b, act_b to perform Convolution, Normalization, and
    Activations in parallel Separably.

    ::


                                    Conv3d (conv_a)
                                           ↓
                                 Normalization (norm_a)
                                           ↓
                                   Activation (act_a)
                                           ↓
                                 Conv3d(s) (conv_b), ...
                                         ↓ (↓)
                              Normalization(s) (norm_b), ...
                                         ↓ (↓)
                                 Activation(s) (act_b), ...
                                         ↓ (↓)
                                  Reduce (sum or cat)
                                           ↓
                                    Conv3d (conv_c)
                                           ↓
                                 Normalization (norm_c)
    sum)reduce_methodr   r7   r8   r   r9   r:   r   r;   r   r&   Nc       	   
         sv   t    t| t  tdd | j| jfD s(J | j d| j d| j d|	dv s.J | jdur9d| j_	dS dS )a  
        Args:
            conv_a (torch.nn.modules): convolutional module.
            norm_a (torch.nn.modules): normalization module.
            act_a (torch.nn.modules): activation module.
            conv_b (torch.nn.modules_list): convolutional module(s).
            norm_b (torch.nn.modules_list): normalization module(s).
            act_b (torch.nn.modules_list): activation module(s).
            conv_c (torch.nn.modules): convolutional module.
            norm_c (torch.nn.modules): normalization module.
            reduce_method (str): if multiple conv_b is used, reduce the output with
                `sum`, or `cat`.
        c                 s       | ]}|d uV  qd S r   r=   r   opr=   r=   r>   	<genexpr>  s    
z4SeparableBottleneckBlock.__init__.<locals>.<genexpr>z, z	 has None)r   catNT)
r   r   r   r   allr   r   r   r;   block_final_bn)
r   r   r7   r8   r   r9   r:   r   r;   r   r   r=   r>   r     s   



z!SeparableBottleneckBlock.__init__rU   c                 C   s  | j d ur
|  |}| jd ur| |}| jd ur| |}g }tt| jD ]*}| j| |}| j| d ur>| j| |}| j| d urL| j| |}|| q'| j	dkrdt
j|ddjddd}n| j	dkrpt
j|dd}| |}| jd ur| |}|S )Nr   r   )dimF)r   keepdimr   r   )r   r7   r8   rn   r   r   r9   r:   ro   r   r   stackr   r   r   r;   )r   rU   outputrq   x_r=   r=   r>   r     s,   










z SeparableBottleneckBlock.forward)r   r   r   r   rB   r   rC   strr   r   r   r   r   r=   r=   r   r>   rA     s2    '	
$rA   c                       s   e Zd ZdZddddddddddejdejdejdejdejd	ejd
ejdejddf fddZdejdejfddZ	  Z
S )r<   u  
    Bottleneck block: a sequence of spatiotemporal Convolution, Normalization,
    and Activations repeated in the following order:

    ::


                                    Conv3d (conv_a)
                                           ↓
                                 Normalization (norm_a)
                                           ↓
                                   Activation (act_a)
                                           ↓
                                    Conv3d (conv_b)
                                           ↓
                                 Normalization (norm_b)
                                           ↓
                                   Activation (act_b)
                                           ↓
                                    Conv3d (conv_c)
                                           ↓
                                 Normalization (norm_c)

    The builder can be found in `create_bottleneck_block`.
    Nr6   r   r7   r8   r   r9   r:   r   r;   r&   c          	         sR   t    t| t  tdd | j| j| jfD sJ | jdur'd| j_	dS dS )a  
        Args:
            conv_a (torch.nn.modules): convolutional module.
            norm_a (torch.nn.modules): normalization module.
            act_a (torch.nn.modules): activation module.
            conv_b (torch.nn.modules): convolutional module.
            norm_b (torch.nn.modules): normalization module.
            act_b (torch.nn.modules): activation module.
            conv_c (torch.nn.modules): convolutional module.
            norm_c (torch.nn.modules): normalization module.
        c                 s   r   r   r=   r   r=   r=   r>   r   6  s    z+BottleneckBlock.__init__.<locals>.<genexpr>NT)
r   r   r   r   r   r   r   r   r;   r   )	r   r   r7   r8   r   r9   r:   r   r;   r   r=   r>   r     s   
"
zBottleneckBlock.__init__rU   c                 C   s   |  |}| jd ur| |}| jd ur| |}| |}| jd ur(| |}| jd ur2| |}| |}| jd urA| |}|S r   r6   )r   rU   r=   r=   r>   r   ;  s   












zBottleneckBlock.forward)r   r   r   r   rB   r   r   r   r   r   r   r=   r=   r   r>   r<     s<    	
r<   c                       sD   e Zd ZdZdejdejf fddZdej	dej	fddZ
  ZS )	rp   u  
    ResStage composes sequential blocks that make up a ResNet. These blocks could be,
    for example, Residual blocks, Non-Local layers, or Squeeze-Excitation layers.

    ::


                                        Input
                                           ↓
                                       ResBlock
                                           ↓
                                           .
                                           .
                                           .
                                           ↓
                                       ResBlock

    The builder can be found in `create_res_stage`.
    rk   r&   c                    s   t    || _dS )zZ
        Args:
            res_blocks (torch.nn.module_list): ResBlock module(s).
        N)r   r   rk   )r   rk   r   r=   r>   r   g  s   

zResStage.__init__rU   c                 C   s    t | jD ]\}}||}q|S r   )	enumeraterk   )r   rU   _	res_blockr=   r=   r>   r   o  s   
zResStage.forward)r   r   r   r   rB   rC   r   r   r   r   r   r   r=   r=   r   r>   rp   R  s    rp   ),typingr   r   r   r   numpyrc   r   torch.nnrB   pytorchvideo.layers.utilsr   pytorchvideo.models.headr   r   pytorchvideo.models.netr	   r
   pytorchvideo.models.stemr   r   Conv3dBatchNorm3dReLUrm   floatr   r?   rT   rW   boolri   rs   r   	MaxPool3d	AvgPool3dr   Sigmoidr   r   rf   rA   r<   rp   r=   r=   r=   r>   <module>   s,  	

 	
 )

 

 	#$*+,-/012345
 s	#$*+,-/012345678
 6	%&'()*
s7`P