o
    ϯiMn                  7   @   sj  d dl Z d dlmZmZ d dlZd dlZd dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZmZmZ d dlmZ d dlmZ d d	lmZmZmZ d d
lmZ dddejddejddededee dee dee dede de dedej!fddZ"ddejdddejedd ed!ed"edee dee dede de d#e ded$edej!fd%d&Z#e#d'ddejdddejed(
d ed!ed"ed)ed*e$dee dee dede de d#e ded$edej!fd+d,Z%e#ddejdddejed-	d.ed ed!ed"ed)edee dee dede de d#e ded$edej!fd/d0Z&ejd1ejddd2d3ej'd'd4	d ed!ed"ed5ed6ed7ee dede de d8e ded9e$dej!fd:d;Z(d<d=d>d?d3d@dAejddejdBdddCdDdEe#dFdedGejd2ej'd'dHdIedJedKedLed8e dMe dNe dede de dedOedPee dQee dReee  dSee dTee d)edUe d#e d$edVedWedXe$dYedZe$dej!f6d[d\Z)G d]d^ d^ej!Z*dS )_    N)CallableTuple)SqueezeExcitation)Conv2plus1d)Swish)round_repeatsround_widthset_attributes)ResNetBasicHead)Net)BottleneckBlockResBlockResStage)ResNetBasicStem)      r   )      r   )r   r   r   gh㈵>g?)conv_kernel_sizeconv_strideconv_paddingnormnorm_epsnorm_momentum
activationin_channelsout_channelsr   r   r   r   r   r   r   returnc        	      	   C   s   t j| |d|d |d fd|d |d fd|d |d fdd}	t j|||d ddf|d ddf|d ddfd|d}
t|	dd|
d}|du rIdn||||d	}|du rVdn| }t|||dd
S )u  
    Creates the stem layer for X3D. It performs spatial Conv, temporal Conv, BN, and Relu.

    ::

                                        Conv_xy
                                           ↓
                                        Conv_t
                                           ↓
                                     Normalization
                                           ↓
                                       Activation

    Args:
        in_channels (int): input channel size of the convolution.
        out_channels (int): output channel size of the convolution.
        conv_kernel_size (tuple): convolutional kernel size(s).
        conv_stride (tuple): convolutional stride size(s).
        conv_padding (tuple): convolutional padding size(s).

        norm (callable): a callable that constructs normalization layer, options
            include nn.BatchNorm3d, None (not performing normalization).
        norm_eps (float): normalization epsilon.
        norm_momentum (float): normalization momentum.

        activation (callable): a callable that constructs activation layer, options
            include: nn.ReLU, nn.Softmax, nn.Sigmoid, and None (not performing
            activation).

    Returns:
        (nn.Module): X3D stem layer.
    r   r   r   F)r   r   kernel_sizestridepaddingbias)r   r   r   r   r    r!   groupsN)conv_tr   r   conv_xynum_featuresepsmomentum)convr   r   pool)nnConv3dr   r   )r   r   r   r   r   r   r   r   r   conv_xy_moduleconv_t_modulestacked_conv_modulenorm_moduleactivation_module r2   K/home/ubuntu/.local/lib/python3.10/site-packages/pytorchvideo/models/x3d.pycreate_x3d_stem   sD   /		r4   r   r   r   g      ?)r   r   r   r   r   se_ratior   	inner_actdim_in	dim_innerdim_outr6   r7   c              
   C   s  t j| |ddd}|du rdn||||d}|	du rdn|	 }t j||||dd |D d|dd}|d	kr?t|t||d
dnt  }t |du rMt  n||||d|}|
du r\dn|
 }t j||ddd}|du rndn||||d}t||||||||dS )u;  
    Bottleneck block for X3D: a sequence of Conv, Normalization with optional SE block,
    and Activations repeated in the following order:

    ::

                                    Conv3d (conv_a)
                                           ↓
                                 Normalization (norm_a)
                                           ↓
                                   Activation (act_a)
                                           ↓
                                    Conv3d (conv_b)
                                           ↓
                                 Normalization (norm_b)
                                           ↓
                                 Squeeze-and-Excitation
                                           ↓
                                   Activation (act_b)
                                           ↓
                                    Conv3d (conv_c)
                                           ↓
                                 Normalization (norm_c)

    Args:
        dim_in (int): input channel size to the bottleneck block.
        dim_inner (int): intermediate channel size of the bottleneck.
        dim_out (int): output channel size of the bottleneck.
        conv_kernel_size (tuple): convolutional kernel size(s) for conv_b.
        conv_stride (tuple): convolutional stride size(s) for conv_b.

        norm (callable): a callable that constructs normalization layer, examples
            include nn.BatchNorm3d, None (not performing normalization).
        norm_eps (float): normalization epsilon.
        norm_momentum (float): normalization momentum.
        se_ratio (float): if > 0, apply SE to the 3x3x3 conv, with the SE
            channel dimensionality being se_ratio times the 3x3x3 conv dim.

        activation (callable): a callable that constructs activation layer, examples
            include: nn.ReLU, nn.Softmax, nn.Sigmoid, and None (not performing
            activation).
        inner_act (callable): whether use Swish activation for act_b or not.

    Returns:
        (nn.Module): X3D bottleneck block.
    r   r   r   Fr   r   r   r!   Nr%   c                 S      g | ]}|d  qS r   r2   .0sizer2   r2   r3   
<listcomp>       z/create_x3d_bottleneck_block.<locals>.<listcomp>)r   r   r   r   r    r!   r"   dilation        T)num_channelsnum_channels_reducedis_3d)conv_anorm_aact_aconv_bnorm_bact_bconv_cnorm_c)r+   r,   r   r   Identity
Sequentialr   )r8   r9   r:   r   r   r   r   r   r6   r   r7   rI   rJ   rK   rL   serM   rN   rO   rP   r2   r2   r3   create_x3d_bottleneck_blocki   sd   @	rT   T)

bottleneckuse_shortcutr   r   r   r   r   r6   r   r7   rU   rV   c                 C   s   d}|dur| |kr||d}t | |kst|dkr'|r'tj| |d|ddnd| |kr0|r0|nd|| |||||||	|
||d|du rEdn| dd	 d
S )u#  
    Residual block for X3D. Performs a summation between an identity shortcut in branch1 and a
    main block in branch2. When the input and output dimensions are different, a
    convolution followed by a normalization will be performed.

    ::

                                         Input
                                           |-------+
                                           ↓       |
                                         Block     |
                                           ↓       |
                                       Summation ←-+
                                           ↓
                                       Activation

    Args:
        dim_in (int): input channel size to the bottleneck block.
        dim_inner (int): intermediate channel size of the bottleneck.
        dim_out (int): output channel size of the bottleneck.
        bottleneck (callable): a callable for create_x3d_bottleneck_block.

        conv_kernel_size (tuple): convolutional kernel size(s) for conv_b.
        conv_stride (tuple): convolutional stride size(s) for conv_b.

        norm (callable): a callable that constructs normalization layer, examples
            include nn.BatchNorm3d, None (not performing normalization).
        norm_eps (float): normalization epsilon.
        norm_momentum (float): normalization momentum.
        se_ratio (float): if > 0, apply SE to the 3x3x3 conv, with the SE
            channel dimensionality being se_ratio times the 3x3x3 conv dim.

        activation (callable): a callable that constructs activation layer, examples
            include: nn.ReLU, nn.Softmax, nn.Sigmoid, and None (not performing
            activation).
        inner_act (callable): whether use Swish activation for act_b or not.

    Returns:
        (nn.Module): X3D block layer.
    N)r&   r   r;   F)r   r   r!   )r8   r9   r:   r   r   r   r   r   r6   r   r7   c                 S   s   | | S Nr2   )xyr2   r2   r3   <lambda>A  s    z&create_x3d_res_block.<locals>.<lambda>)branch1_convbranch1_normbranch2r   branch_fusion)r   npprodr+   r,   )r8   r9   r:   rU   rV   r   r   r   r   r   r6   r   r7   
norm_modelr2   r2   r3   create_x3d_res_block   s<   =
rb   )	rU   r   r   r   r   r   r6   r   r7   depthc                 C   st   g }t | D ]+}t|dkr|n||||||dkr|nd|||	|d d r&|
nd||d}|| qtt|dS )u  
    Create Residual Stage, which composes sequential blocks that make up X3D.

    ::

                                        Input
                                           ↓
                                       ResBlock
                                           ↓
                                           .
                                           .
                                           .
                                           ↓
                                       ResBlock

    Args:

        depth (init): number of blocks to create.

        dim_in (int): input channel size to the bottleneck block.
        dim_inner (int): intermediate channel size of the bottleneck.
        dim_out (int): output channel size of the bottleneck.
        bottleneck (callable): a callable for create_x3d_bottleneck_block.

        conv_kernel_size (tuple): convolutional kernel size(s) for conv_b.
        conv_stride (tuple): convolutional stride size(s) for conv_b.

        norm (callable): a callable that constructs normalization layer, examples
            include nn.BatchNorm3d, None (not performing normalization).
        norm_eps (float): normalization epsilon.
        norm_momentum (float): normalization momentum.
        se_ratio (float): if > 0, apply SE to the 3x3x3 conv, with the SE
            channel dimensionality being se_ratio times the 3x3x3 conv dim.

        activation (callable): a callable that constructs activation layer, examples
            include: nn.ReLU, nn.Softmax, nn.Sigmoid, and None (not performing
            activation).
        inner_act (callable): whether use Swish activation for act_b or not.

    Returns:
        (nn.Module): X3D stage layer.
    r   r;   r   r   rE   )r8   r9   r:   rU   r   r   r   r   r   r6   r   r7   )
res_blocks)rangerb   appendr   r+   
ModuleList)rc   r8   r9   r:   rU   r   r   r   r   r   r6   r   r7   rd   idxblockr2   r2   r3   create_x3d_res_stageE  s$   ?rj   )   r   r   Fg      ?)	pool_actpool_kernel_sizer   r   r   
bn_lin5_ondropout_rater   output_with_global_averagenum_classesrl   rm   ro   rp   c              	   C   s4  t j| |ddd}||||d}|du rdn| }|du r#t d}nt j|dd}t j||ddd}|	r=||||d}nd}|du rEdn| }t|||||||d}|du rZd}n|t jkre|dd	}n|t jkrn| }ntd
||r}t d}nd}t	t j
||dd|||
dkrt |
|dS d|dS )u  
    Creates X3D head. This layer performs an projected pooling operation followed
    by an dropout, a fully-connected projection, an activation layer and a global
    spatiotemporal averaging.

    ::

                                     ProjectedPool
                                           ↓
                                        Dropout
                                           ↓
                                       Projection
                                           ↓
                                       Activation
                                           ↓
                                       Averaging

    Args:
        dim_in (int): input channel size of the X3D head.
        dim_inner (int): intermediate channel size of the X3D head.
        dim_out (int): output channel size of the X3D head.
        num_classes (int): the number of classes for the video dataset.

        pool_act (callable): a callable that constructs resnet pool activation
            layer such as nn.ReLU.
        pool_kernel_size (tuple): pooling kernel size(s) when not using adaptive
            pooling.

        norm (callable): a callable that constructs normalization layer, examples
            include nn.BatchNorm3d, None (not performing normalization).
        norm_eps (float): normalization epsilon.
        norm_momentum (float): normalization momentum.
        bn_lin5_on (bool): if True, perform normalization on the features
            before the classifier.

        dropout_rate (float): dropout rate.

        activation (callable): a callable that constructs resnet head activation
            layer, examples include: nn.ReLU, nn.Softmax, nn.Sigmoid, and None (not
            applying activation).

        output_with_global_average (bool): if True, perform global averaging on temporal
            and spatial dimensions and reshape output to batch_size x out_features.

    Returns:
        (nn.Module): X3D head layer.
    r;   Fr<   r%   Nr   )r   pre_convpre_normpre_actr*   	post_conv	post_normpost_act)dimz-{} is not supported as an activationfunction.T)r!   r   )projr   r*   dropoutoutput_pool)r+   r,   AdaptiveAvgPool3d	AvgPool3dProjectedPoolSoftmaxSigmoidNotImplementedErrorformatr
   LinearDropout)r8   r9   r:   rq   rl   rm   r   r   r   rn   ro   r   rp   pre_conv_modulepre_norm_modulepre_act_modulepool_modulepost_conv_modulepost_norm_modulepost_act_moduleprojected_pool_moduler1   r|   r2   r2   r3   create_x3d_head  s`   E


r   r   rk      i         @g@   )r5   r5   r5   r5   )r   r   r   r   )r   r   r   r   g      @i   )input_channelinput_clip_lengthinput_crop_sizemodel_num_classro   width_factordepth_factorr   r   r   r   stem_dim_instem_conv_kernel_sizestem_conv_stridestage_conv_kernel_sizestage_spatial_stridestage_temporal_striderU   bottleneck_factorr6   r7   head_dim_outhead_pool_acthead_bn_lin5_onhead_activationhead_output_with_global_averager   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   c           /      C   s  t jd g }t||}t| |||dd |D |||	|
d	}|| g d}d}|}t||dd} t| |dd}!t|!|dd}"|| |!|"g}#|}$tt|D ];}%t|#|% |}&t||& }'t	||% |}(||% ||% ||% f})t
|(|$|'|&|||% |)|||	||
|d	}*||* |&}$qO|d
 t| }+|d t| },||,ksJ d||+ksJ d||, tt||+ tt||+ f}-t|&|'||||-|||	||||d}.||. tt|dS )u4  
    X3D model builder. It builds a X3D network backbone, which is a ResNet.

    Christoph Feichtenhofer.
    "X3D: Expanding Architectures for Efficient Video Recognition."
    https://arxiv.org/abs/2004.04730

    ::

                                         Input
                                           ↓
                                         Stem
                                           ↓
                                         Stage 1
                                           ↓
                                           .
                                           .
                                           .
                                           ↓
                                         Stage N
                                           ↓
                                         Head

    Args:
        input_channel (int): number of channels for the input video clip.
        input_clip_length (int): length of the input video clip. Value for
            different models: X3D-XS: 4; X3D-S: 13; X3D-M: 16; X3D-L: 16.
        input_crop_size (int): spatial resolution of the input video clip.
            Value for different models: X3D-XS: 160; X3D-S: 160; X3D-M: 224;
            X3D-L: 312.

        model_num_class (int): the number of classes for the video dataset.
        dropout_rate (float): dropout rate.
        width_factor (float): width expansion factor.
        depth_factor (float): depth expansion factor. Value for different
            models: X3D-XS: 2.2; X3D-S: 2.2; X3D-M: 2.2; X3D-L: 5.0.

        norm (callable): a callable that constructs normalization layer.
        norm_eps (float): normalization epsilon.
        norm_momentum (float): normalization momentum.

        activation (callable): a callable that constructs activation layer.

        stem_dim_in (int): input channel size for stem before expansion.
        stem_conv_kernel_size (tuple): convolutional kernel size(s) of stem.
        stem_conv_stride (tuple): convolutional stride size(s) of stem.

        stage_conv_kernel_size (tuple): convolutional kernel size(s) for conv_b.
        stage_spatial_stride (tuple): the spatial stride for each stage.
        stage_temporal_stride (tuple): the temporal stride for each stage.
        bottleneck_factor (float): bottleneck expansion factor for the 3x3x3 conv.
        se_ratio (float): if > 0, apply SE to the 3x3x3 conv, with the SE
            channel dimensionality being se_ratio times the 3x3x3 conv dim.
        inner_act (callable): whether use Swish activation for act_b or not.

        head_dim_out (int): output channel size of the X3D head.
        head_pool_act (callable): a callable that constructs resnet pool activation
            layer such as nn.ReLU.
        head_bn_lin5_on (bool): if True, perform normalization on the features
            before the classifier.
        head_activation (callable): a callable that constructs activation layer.
        head_output_with_global_average (bool): if True, perform global averaging on
            the head output.

    Returns:
        (nn.Module): the X3D network.
    zPYTORCHVIDEO.model.create_x3dc                 S   r=   r>   r2   r?   r2   r2   r3   rB     rC   zcreate_x3d.<locals>.<listcomp>)	r   r   r   r   r   r   r   r   r   )r   r   r   r   r      )divisor)rc   r8   r9   r:   rU   r   r   r   r   r   r6   r   r7   r   r   z*Clip length doesn't match temporal stride!z'Crop size doesn't match spatial stride!)r8   r9   r:   rq   rl   rm   r   r   r   rn   ro   r   rp   )blocks)torch_C_log_api_usage_oncer   r4   rf   re   lenintr   rj   r_   r`   mathceilr   r   r+   rg   )/r   r   r   r   ro   r   r   r   r   r   r   r   r   r   r   r   r   rU   r   r6   r7   r   r   r   r   r   r   stem_dim_outstemstage_depths	exp_stage
stage_dim1
stage_dim2
stage_dim3
stage_dim4
stage_dimsr8   rh   r:   r9   rc   stage_conv_stridestagetotal_spatial_stridetotal_temporal_stridehead_pool_kernel_sizeheadr2   r2   r3   
create_x3d  s   m





r   c                       sx   e Zd ZdZdddddddddejdejdejdejdejd	ejd
ejddf fddZdejdejfddZ	  Z
S )r   u.  
    A pooling module augmented with Conv, Normalization and Activation both
    before and after pooling for the head layer of X3D.

    ::

                                    Conv3d (pre_conv)
                                           ↓
                                 Normalization (pre_norm)
                                           ↓
                                   Activation (pre_act)
                                           ↓
                                        Pool3d
                                           ↓
                                    Conv3d (post_conv)
                                           ↓
                                 Normalization (post_norm)
                                           ↓
                                   Activation (post_act)
    Nrr   rs   rt   ru   r*   rv   rw   rx   r   c                   sD   t    t| t  | jdusJ | jdusJ | jdus J dS )a  
        Args:
            pre_conv (torch.nn.modules): convolutional module.
            pre_norm (torch.nn.modules): normalization module.
            pre_act (torch.nn.modules): activation module.
            pool (torch.nn.modules): pooling module.
            post_conv (torch.nn.modules): convolutional module.
            post_norm (torch.nn.modules): normalization module.
            post_act (torch.nn.modules): activation module.
        N)super__init__r	   localsrs   r*   rv   )selfrs   rt   ru   r*   rv   rw   rx   	__class__r2   r3   r     s
   
zProjectedPool.__init__rX   c                 C   sr   |  |}| jd ur| |}| jd ur| |}| |}| |}| jd ur-| |}| jd ur7| |}|S rW   rr   )r   rX   r2   r2   r3   forward  s   










zProjectedPool.forward)__name__
__module____qualname____doc__r+   Moduler   r   Tensorr   __classcell__r2   r2   r   r3   r     s6    	
r   )+r   typingr   r   numpyr_   r   torch.nnr+   fvcore.nn.squeeze_excitationr    pytorchvideo.layers.convolutionsr   pytorchvideo.layers.swishr   pytorchvideo.layers.utilsr   r   r	   pytorchvideo.models.headr
   pytorchvideo.models.netr   pytorchvideo.models.resnetr   r   r   pytorchvideo.models.stemr   BatchNorm3dReLUr   floatr   r4   rT   boolrb   rj   r   r   r   r   r2   r2   r2   r3   <module>   s   	

\	

 	

f

\	
 	

 !#$%&'(
 L