o
    ϯi,                  !   @   s  d dl mZmZ d dlZd dlmZ d dlmZ d dlm	Z	 ddddej
ejddd	ejd
dejddededee dee dee dedededee dee dee dededededejf ddZddddejddd	ejd
dejd dededee dee dee dededee dee dee dededededejfd!d"ZG d#d$ d$ejZG d%d& d&ejZd'd(d)d*ej
d+dededee dee dee dededejfd,d-ZdS ).    )CallableTupleN)ConvReduce3D)set_attributes)      r   )      r	   )r   r   r   F)r   r   r   gh㈵>g?)conv_kernel_sizeconv_strideconv_padding	conv_biasconvpoolpool_kernel_sizepool_stridepool_paddingnormnorm_epsnorm_momentum
activationin_channelsout_channelsr
   r   r   r   r   r   r   r   r   r   r   r   r   returnc                 C   sj   || |||||d}|du rdn||||d}|du rdn| }|du r&dn|||	|
d}t ||||dS )u
  
    Creates the basic resnet stem layer. It performs spatiotemporal Convolution, BN, and
    Relu following by a spatiotemporal pooling.

    ::

                                        Conv3d
                                           ↓
                                     Normalization
                                           ↓
                                       Activation
                                           ↓
                                        Pool3d

    Normalization options include: BatchNorm3d and None (no normalization).
    Activation options include: ReLU, Softmax, Sigmoid, and None (no activation).
    Pool3d options include: AvgPool3d, MaxPool3d, and None (no pooling).

    Args:

        in_channels (int): input channel size of the convolution.
        out_channels (int): output channel size of the convolution.
        conv_kernel_size (tuple): convolutional kernel size(s).
        conv_stride (tuple): convolutional stride size(s).
        conv_padding (tuple): convolutional padding size(s).
        conv_bias (bool): convolutional bias. If true, adds a learnable bias to the
            output.
        conv (callable): Callable used to build the convolution layer.

        pool (callable): a callable that constructs pooling layer, options include:
            nn.AvgPool3d, nn.MaxPool3d, and None (not performing pooling).
        pool_kernel_size (tuple): pooling kernel size(s).
        pool_stride (tuple): pooling stride size(s).
        pool_padding (tuple): pooling padding size(s).

        norm (callable): a callable that constructs normalization layer, options
            include nn.BatchNorm3d, None (not performing normalization).
        norm_eps (float): normalization epsilon.
        norm_momentum (float): normalization momentum.

        activation (callable): a callable that constructs activation layer, options
            include: nn.ReLU, nn.Softmax, nn.Sigmoid, and None (not performing
            activation).

    Returns:
        (nn.Module): resnet basic stem layer.
    r   r   kernel_sizestridepaddingbiasNnum_featuresepsmomentumr   r   r   r   r   r   r   )ResNetBasicStem)r   r   r
   r   r   r   r   r   r   r   r   r   r   r   r   conv_modulenorm_moduleactivation_modulepool_module r*   L/home/ubuntu/.local/lib/python3.10/site-packages/pytorchvideo/models/stem.pycreate_res_basic_stem   s2   E
r,   )r   r   r   )r
   r   r   r   r   r   r   r   r   r   r   r   c              
   C   s   t | ||d ddfd|d |d ff||f|d ddfd|d |d ff||fdd}|
du r1dn|
|||d}|du r>dn| }|du rGdn||||	d}t||||d	S )
u  
    Creates the acoustic resnet stem layer. It performs a spatial and a temporal
    Convolution in parallel, then performs, BN, and Relu following by a spatiotemporal
    pooling.

    ::

                                    Conv3d   Conv3d
                                           ↓
                                     Normalization
                                           ↓
                                       Activation
                                           ↓
                                        Pool3d

    Normalization options include: BatchNorm3d and None (no normalization).
    Activation options include: ReLU, Softmax, Sigmoid, and None (no activation).
    Pool3d options include: AvgPool3d, MaxPool3d, and None (no pooling).

    Args:
        in_channels (int): input channel size of the convolution.
        out_channels (int): output channel size of the convolution.
        conv_kernel_size (tuple): convolutional kernel size(s).
        conv_stride (tuple): convolutional stride size(s), it will be performed as
            temporal and spatial convolution in parallel.
        conv_padding (tuple): convolutional padding size(s), it  will be performed
            as temporal and spatial convolution in parallel.
        conv_bias (bool): convolutional bias. If true, adds a learnable bias to the
            output.

        pool (callable): a callable that constructs pooling layer, options include:
            nn.AvgPool3d, nn.MaxPool3d, and None (not performing pooling).
        pool_kernel_size (tuple): pooling kernel size(s).
        pool_stride (tuple): pooling stride size(s).
        pool_padding (tuple): pooling padding size(s).

        norm (callable): a callable that constructs normalization layer, options
            include nn.BatchNorm3d, None (not performing normalization).
        norm_eps (float): normalization epsilon.
        norm_momentum (float): normalization momentum.

        activation (callable): a callable that constructs activation layer, options
            include: nn.ReLU, nn.Softmax, nn.Sigmoid, and None (not performing
            activation).

    Returns:
        (nn.Module): resnet basic stem layer.
    r   r   r	   sum)r   r   r   r   r   r   reduction_methodNr   r#   r$   )r   r%   )r   r   r
   r   r   r   r   r   r   r   r   r   r   r   r&   r'   r(   r)   r*   r*   r+   create_acoustic_res_basic_stemn   s8   Er/   c                       s`   e Zd ZdZddddddejdejdejdejddf
 fd	d
ZdejdejfddZ	  Z
S )r%   u  
    ResNet basic 3D stem module. Performs spatiotemporal Convolution, BN, and activation
    following by a spatiotemporal pooling.

    ::

                                        Conv3d
                                           ↓
                                     Normalization
                                           ↓
                                       Activation
                                           ↓
                                        Pool3d

    The builder can be found in `create_res_basic_stem`.
    Nr$   r   r   r   r   r   c                   s(   t    t| t  | jdusJ dS )a   
        Args:
            conv (torch.nn.modules): convolutional module.
            norm (torch.nn.modules): normalization module.
            activation (torch.nn.modules): activation module.
            pool (torch.nn.modules): pooling module.
        N)super__init__r   localsr   )selfr   r   r   r   	__class__r*   r+   r1      s   
zResNetBasicStem.__init__xc                 C   sJ   |  |}| jd ur| |}| jd ur| |}| jd ur#| |}|S Nr$   r3   r6   r*   r*   r+   forward   s   






zResNetBasicStem.forward__name__
__module____qualname____doc__nnModuler1   torchTensorr9   __classcell__r*   r*   r4   r+   r%      s$    r%   c                       sB   e Zd ZdZdddejddf fddZdejfdd	Z	  Z
S )

PatchEmbedu  
    Transformer basic patch embedding module. Performs patchifying input, flatten and
    and transpose.

    ::

                                       PatchModel
                                           ↓
                                        flatten
                                           ↓
                                       transpose

    The builder can be found in `create_patch_embed`.

    Npatch_modelrF   r   c                   s(   t    t| t  | jd usJ d S r7   )r0   r1   r   r2   rF   )r3   rF   r4   r*   r+   r1     s   
zPatchEmbed.__init__c                 C   s   |  |}|dddS )Nr	   r   )rF   flatten	transposer8   r*   r*   r+   r9   !  s   
zPatchEmbed.forwardr:   r*   r*   r4   r+   rD     s    	rD   )r      rI   )r      rJ   )r   r   r   T)r
   r   r   r   r   c                 C   s   || |||||d}t |dS )u  
    Creates the transformer basic patch embedding. It performs Convolution, flatten and
    transpose.

    ::

                                        Conv3d
                                           ↓
                                        flatten
                                           ↓
                                       transpose

    Args:
        in_channels (int): input channel size of the convolution.
        out_channels (int): output channel size of the convolution.
        conv_kernel_size (tuple): convolutional kernel size(s).
        conv_stride (tuple): convolutional stride size(s).
        conv_padding (tuple): convolutional padding size(s).
        conv_bias (bool): convolutional bias. If true, adds a learnable bias to the
            output.
        conv (callable): Callable used to build the convolution layer.

    Returns:
        (nn.Module): transformer patch embedding layer.
    r   rE   )rD   )r   r   r
   r   r   r   r   r&   r*   r*   r+   create_conv_patch_embed'  s   #
rK   )typingr   r   rA   torch.nnr?    pytorchvideo.layers.convolutionsr   pytorchvideo.layers.utilsr   Conv3d	MaxPool3dBatchNorm3dReLUintboolfloatr@   r,   r/   r%   rD   rK   r*   r*   r*   r+   <module>   s   	
h

i0$	