o
    ϯiD                  9   @   s  d dl mZmZmZ d dlZd dlmZ d dlmZ d dl	m
Z
mZ d dlmZ d dlmZmZ dddd	d
ddejejdddejejdfddddddddeeeefeeeefe
e
eeffejddddddee dedeeegejf dee ded ed!ed"ed#ed$ee d%eee  d&eee  d'ee d(eee  d)eee  d*eeee   d+eeee   d,eee  d-eeee   d.eee  d/eee  d0eee  d1ed2eee  d3ee d4ed5ed6ejf8d7d8ZG d9d: d:ZG d;d< d<ejZdS )=    )CallableTupleUnionN)set_attributes) create_acoustic_bottleneck_blockcreate_bottleneck_block)create_slowfast)create_acoustic_res_basic_stemcreate_res_basic_stem)      r   )   r      2   i  g      ?)@   r       ))r      r   )   r   r   )	   r   r   )r   r   r   r   r   r   r   )r   r   r   r   r   ))r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   )r   r   r   r   r   r   )r   r   r   r   r   r   )r   r   r   )r   r   r   r   r   r   )r   r   r   ))r   r   r   )r   r   r   )   r   
   r   T slowfast_channel_reduction_ratio"slowfast_conv_channel_fusion_ratiofusion_builderinput_channelsmodel_depthmodel_num_classdropout_ratenorm
activationstem_dim_outsstem_conv_kernel_sizesstem_conv_strides	stem_poolstem_pool_kernel_sizesstem_pool_stridesstage_conv_a_kernel_sizesstage_conv_b_kernel_sizesstage_conv_b_num_groupsstage_conv_b_dilationsstage_spatial_stridesstage_temporal_strides
bottleneck	head_poolhead_pool_kernel_sizeshead_output_sizehead_activationhead_output_with_global_averager!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   returnc                 C   s   t jd |du rt| d | d |dddd||d		j}td&i d
| d|d|d|d|d|d|d|d|dtttfd|	d|
d|d|d|d|d|d|d|d|d|d|d |d!|d"|d#|d$|d%|S )'u  
    Model builder for AVSlowFast network.
    Fanyi Xiao, Yong Jae Lee, Kristen Grauman, Jitendra Malik, Christoph Feichtenhofer.
    "Audiovisual SlowFast Networks for Video Recognition."
    https://arxiv.org/abs/2001.08740

                             Slow Input  Fast Input  Audio Input
                                  ↓           ↓            ↓
                                 Stem       Stem         Stem
                                  ↓ ⭠ Fusion- ↓ ⭠ Fusion- ↓
                               Stage 1     Stage 1      Stage 1
                                  ↓ ⭠ Fusion- ↓ ⭠ Fusion- ↓
                                  .            .           .
                                  ↓            ↓           ↓
                               Stage N      Stage N     Stage N
                                  ↓ ⭠ Fusion- ↓ ⭠ Fusion- ↓
                                         ↓
                                       Head

    Args:
        SlowFast configs:
            slowfast_channel_reduction_ratio (int): Corresponds to the inverse of the channel
                reduction ratio, $eta$F between the Slow and Fast pathways.
            slowfast_audio_reduction_ratio (int): Corresponds to the inverse of the channel
                reduction ratio, $eta$A between the Slow and Audio pathways.
            slowfast_conv_channel_fusion_ratio (int): Ratio of channel dimensions
                between the Slow and Fast pathways.
            fusion_builder (Callable[[int, int], nn.Module]): Builder function for generating
                the fusion modules based on stage dimension and index

        Input clip configs:
            input_channels (tuple): number of channels for the input video clip.

        Model configs:
            model_depth (int): the depth of the resnet.
            model_num_class (int): the number of classes for the video dataset.
            dropout_rate (float): dropout rate.

        Normalization configs:
            norm (callable): a callable that constructs normalization layer.

        Activation configs:
            activation (callable): a callable that constructs activation layer.

        Stem configs:
            stem_function (Tuple[Callable]): a callable that constructs stem layer.
                Examples include create_res_basic_stem. Indexed by pathway
            stem_dim_outs (tuple): output channel size to stem.
            stem_conv_kernel_sizes (tuple): convolutional kernel size(s) of stem.
            stem_conv_strides (tuple): convolutional stride size(s) of stem.
            stem_pool (Tuple[Callable]): a callable that constructs resnet head pooling layer.
                Indexed by pathway
            stem_pool_kernel_sizes (tuple): pooling kernel size(s).
            stem_pool_strides (tuple): pooling stride size(s).

        Stage configs:
            stage_conv_a_kernel_sizes (tuple): convolutional kernel size(s) for conv_a.
            stage_conv_b_kernel_sizes (tuple): convolutional kernel size(s) for conv_b.
            stage_conv_b_num_groups (tuple): number of groups for groupwise convolution
                for conv_b. 1 for ResNet, and larger than 1 for ResNeXt.
            stage_conv_b_dilations (tuple): dilation for 3D convolution for conv_b.
            stage_spatial_strides (tuple): the spatial stride for each stage.
            stage_temporal_strides (tuple): the temporal stride for each stage.
            bottleneck (Tuple[Tuple[Callable]]): a callable that constructs bottleneck
                block layer. Examples include: create_bottleneck_block.
                Indexed by pathway and stage index

        Head configs:
            head_pool (callable): a callable that constructs resnet head pooling layer.
            head_output_sizes (tuple): the size of output tensor for head.
            head_activation (callable): a callable that constructs activation layer.
            head_output_with_global_average (bool): if True, perform global averaging on
                the head output.
    Returns:
        (nn.Module): SlowFast model.
    z/PYTORCHVIDEO.model.create_audio_visual_slowfastNr   r   )r   r   r   )r   r   r      r   r   )r   r   r   r?   )r   r   r   r=   )r   r   r   )	r!   slowfast_audio_reduction_ratioconv_fusion_channel_ratioconv_kernel_sizeconv_kernel_size_aconv_strideconv_stride_ar(   r)   r!   r"   r#   r$   r%   r&   r'   r(   r)   stem_functionr*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;    )torch_C_log_api_usage_onceAudioToSlowFastFusionBuildercreate_moduler   r
   r	   r    rG   rG   ]/home/ubuntu/.local/lib/python3.10/site-packages/pytorchvideo/models/audio_visual_slowfast.pycreate_audio_visual_slowfast   s    !
	 !#$%&'rN   c                   @   s   e Zd Zddejddejdfdededed	ee d
ee de	ee eee  f de	ee eee  f de	eef dede
dedede
deddfddZdededejfddZdS )rK   g      ?r   gh㈵>g?r   r!   r@   rA   rB   rC   rD   rE   conv_fusion_channel_interm_dim
conv_num_ar(   norm_epsnorm_momentumr)   max_stage_idxr<   Nc                 C   s   t | t  dS )a  
        Given a list of two tensors from Slow pathway and Fast pathway, fusion information
        from the Fast pathway to the Slow on through a convolution followed by a
        concatenation, then return the fused list of tensors from Slow and Fast pathway in
        order.
        Args:
            slowfast_channel_reduction_ratio (int): Reduction ratio from the stage dimension.
                Used to compute conv_dim_in = fusion_dim_in // slowfast_channel_reduction_ratio
            slowfast_audio_reduction_ratio (int): Audio Reduction ratio from the stage dimension.
                Used to compute conv_dim_in_a = fusion_dim_in // slowfast_audio_reduction_ratio
            conv_fusion_channel_ratio (int): channel ratio for the convolution used to fuse
                from Fast pathway to Slow pathway.
            conv_kernel_size (int): kernel size of the convolution used to fuse from Fast
                pathway to Slow pathway.
            conv_kernel_size_a (int): kernel size of the convolution used to fuse from Audio
                pathway to FastSlow pathway.
            conv_stride (int): stride size of the convolution used to fuse from Fast pathway
                to Slow pathway. Optionally indexed by stage.
            conv_stride_a (int): stride size of the convolution used to fuse from Audio pathway
                to FastSlow pathway. Optionally indexed by stage.
            conv_fusion_channel_interm_dim (Union[int, float]): When conv_num_a > 1 this value
                controls the dimensions of the intermediate conv
            conv_num_a (int): Number of intermediate conv for audio channel
            norm (callable): a callable that constructs normalization layer, examples
                include nn.BatchNorm3d, None (not performing normalization).
            norm_eps (float): normalization epsilon.
            norm_momentum (float): normalization momentum.
            activation (callable): a callable that constructs activation layer, examples
                include: nn.ReLU, nn.Softmax, nn.Sigmoid, and None (not performing
                activation).
            max_stage_idx (int): Returns identity module if we exceed this
        N)r   locals)selfr!   r@   rA   rB   rC   rD   rE   rO   rP   r(   rQ   rR   r)   rS   rG   rG   rM   __init__   s   1z%AudioToSlowFastFusionBuilder.__init__fusion_dim_in	stage_idxc                 C   s  || j kr	t S t| jd tr| j| n| j}t| jd tr&| j| n| j}|| j }|| j }g }|	tj
|t|| j | j|dd | jD dd | jdurd|	| j|| j | j| jd | jdurp|	|   t| jtrz| j}nt|| j }g }	|}
t| jD ]N}|| jd kr|}t|| j | }nd	}|}|		tj
|
|| j|d
d | jD dd | jdur|		| j|| j| jd | jdur|		|   |}
qttj| tj|	 dS )z
        Creates the module for the given stage
        Args:
            fusion_dim_in (int): input stage dimension
            stage_idx (int): which stage this is
        r   c                 S      g | ]}|d  qS r   rG   .0k_sizerG   rG   rM   
<listcomp>B      z>AudioToSlowFastFusionBuilder.create_module.<locals>.<listcomp>F)kernel_sizestridepaddingbiasN)num_featuresepsmomentumr   r   c                 S   rY   rZ   rG   r[   rG   rG   rM   r^   j  r_   )block_fast_to_slowblock_audio_to_fastslow)rS   nnIdentity
isinstancerD   r   rE   r!   r@   appendConv3dintrA   rB   r(   rQ   rR   r)   rO   rangerP   rC   FuseAudioToFastSlow
Sequential)rU   rW   rX   rD   rE   conv_dim_inconv_dim_in_afastslow_moduleafs_fusion_interm_dimrh   
cur_dim_inidx
cur_stridecur_dim_outrG   rG   rM   rL   $  s   








z*AudioToSlowFastFusionBuilder.create_module)__name__
__module____qualname__ri   BatchNorm3dReLUrn   floatr   r   r   rV   ModulerL   rG   rG   rG   rM   rK      sP    

	

3rK   c                       s:   e Zd ZdZdejdejddf fddZdd	 Z  ZS )
rp   a  
    Given a list of two tensors from Slow pathway and Fast pathway, fusion information
    from the Fast pathway to the Slow on through a convolution followed by a
    concatenation, then return the fused list of tensors from Slow and Fast pathway in
    order.
    rg   rh   r<   Nc                    s   t    t| t  dS )z
        Args:
            conv_fast_to_slow (nn.module): convolution to perform fusion.
            norm (nn.module): normalization module.
            activation (torch.nn.modules): activation module.
        N)superrV   r   rT   )rU   rg   rh   	__class__rG   rM   rV     s   
zFuseAudioToFastSlow.__init__c           	      C   sf   |d }|d }|d }|  |}tj|ddd}| |}t||gd}t|  || ||gS )Nr   r   r   T)dimkeepdim)rg   rH   meanrh   catprintsize)	rU   xx_sx_fx_afuse	average_afuse_ax_s_fuserG   rG   rM   forward  s   

zFuseAudioToFastSlow.forward)	rz   r{   r|   __doc__ri   r   rV   r   __classcell__rG   rG   r   rM   rp     s    rp   )typingr   r   r   rH   torch.nnri   pytorchvideo.layers.utilsr   pytorchvideo.models.resnetr   r   pytorchvideo.models.slowfastr   pytorchvideo.models.stemr	   r
   r}   r~   	MaxPool3d	AvgPool3drn   r   r   boolrN   rK   rp   rG   rG   rG   rM   <module>   s   
	




$)
.
3
8M
NOPQR
 ] 