o
    ϯiw                  G   @   s$  d dl mZmZmZmZmZ d dlZd dlmZ d dl	m
Z
 d dlmZmZ d dlmZmZmZ d dlmZmZ d dlmZ dd	d
ddZdddddddddejejeefdddejejfddddddddeeeefeeeeffeejdddd d!d"eee ef d#ed$ee d%ee d&eeegejf d'ee d(ed)ed*ed+ed,ed-ee d.ee d/eee  d0eee  d1eeee f d2eee  d3eee  d4eeee   d5eeee   d6eee  d7eeee   d8eee  d9eee  d:eeeee  f d;ed<ed=eee  d>ee d?ed@edAejf@dBdCZ ddddddddDdejejeefdddejejfddddddEdFdeeeefeeeeffeejdGdej!dHdIdJd dK"d"eee ef d#ed$ee d%ee d&eeegejf d'ee d(ed)ed*ed+ed,ed-ee d.ee d/eee  d0eee  d1eeee f d2eee  d3eee  d4eeee   d5eeee   d6eee  d7eeee   d8eee  d9eee  d:eeeee  f d;ed<ed=eee  d>ee d?ed@edLee dMedNedAejfFdOdPZ"G dQdR dRejZ#G dSdT dTZ$G dUdV dVejZ%dS )W    )CallableListOptionalTupleUnionN)set_attributes)create_res_basic_headcreate_res_roi_pooling_head)DetectionBBoxNetworkMultiPathWayWithFuseNet)create_bottleneck_blockcreate_res_stage)create_res_basic_stem   r   r   r   )         r   )r   r      r   )r      $   r   )   2   e      )r      )   r   r   )r   r   r   )r   r   r   i  g      ?)@   r   ))r   r   r   )   r   r   )r   r   r   r    )r   r   r   r!   ))r   r   r   r"   r   r   r   r#   )r#   r#   r#   r#   )r!   r!   r!   r!   r$   )r   r   )r"   r"   r"   r"   r%   )r   r   r   r   r&   ))r   r   r   )    r   r   r"   T) slowfast_channel_reduction_ratio"slowfast_conv_channel_fusion_ratio slowfast_fusion_conv_kernel_sizeslowfast_fusion_conv_stridefusion_builderinput_channelsmodel_depthmodel_num_classdropout_ratenorm
activationstem_functionstem_dim_outsstem_conv_kernel_sizesstem_conv_strides	stem_poolstem_pool_kernel_sizesstem_pool_stridesstage_conv_a_kernel_sizesstage_conv_b_kernel_sizesstage_conv_b_num_groupsstage_conv_b_dilationsstage_spatial_stridesstage_temporal_strides
bottleneckhead	head_poolhead_pool_kernel_sizeshead_output_sizehead_activationhead_output_with_global_averager(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   returnc           3         s&  t jd t|}|t v sJ | dt  t| } t| tr'| f} t|tr1|f| }t|trB|ft|  }|f| }|du rXt	| d ||||	|
t| d dj
}g }!t|D ]2}"|!||" ||" ||" ||" ||" dd ||" D ||" ||" ||" d	d ||" D |	|
d
 q^g }#|#tt|!||d ddd |d }$|$d }%tt| D ]}&|$|$| | d   g}'|%d g}(|%g})| D ]}*|'|$|* g }'|(|%d |* g }(|)|%|* g })qg }+t|D ]}"| |& },||" |& }-||" |& ddf}.t|-d trdd |-D ndd |-D }/d||" |& ||" |& f}0|+t|,|'|" |(|" |)|" ||" |& |-|.|/||" |& |0||" |& d d ||" |& d dkrY||" |& d n	||" |& d d ||" |& d dkru||" |& d n	||" |& d d f||" |& ||" |& |	|
d q|#tt|+||%|&d dd |%}$|%d }%qdu rd}1n-tjkrć fddt|D }1ntjkrׇfddt|D }1ntd|1 |#tdt|1d |$}2| D ]	}*|2|$|*  }2q|dur|#||2|d |||d tt|#dS )u_  
    Build SlowFast model for video recognition, SlowFast model involves a Slow pathway,
    operating at low frame rate, to capture spatial semantics, and a Fast pathway,
    operating at high frame rate, to capture motion at fine temporal resolution. The
    Fast pathway can be made very lightweight by reducing its channel capacity, yet can
    learn useful temporal information for video recognition. Details can be found from
    the paper:

    Christoph Feichtenhofer, Haoqi Fan, Jitendra Malik, and Kaiming He.
    "SlowFast networks for video recognition."
    https://arxiv.org/pdf/1812.03982.pdf

    ::

                             Slow Input  Fast Input
                                  ↓           ↓
                                 Stem       Stem
                                  ↓ ⭠ Fusion- ↓
                               Stage 1     Stage 1
                                  ↓ ⭠ Fusion- ↓
                                  .           .
                                  ↓           ↓
                               Stage N     Stage N
                                  ↓ ⭠ Fusion- ↓
                                         ↓
                                       Head

    Args:
        slowfast_channel_reduction_ratio (int): Corresponds to the inverse of the channel
            reduction ratio, $eta$ between the Slow and Fast pathways.
        slowfast_conv_channel_fusion_ratio (int): Ratio of channel dimensions
            between the Slow and Fast pathways.
        DEPRECATED slowfast_fusion_conv_kernel_size (tuple): the convolutional kernel
            size used for fusion.
        DEPRECATED slowfast_fusion_conv_stride (tuple): the convolutional stride size
            used for fusion.
        fusion_builder (Callable[[int, int], nn.Module]): Builder function for generating
            the fusion modules based on stage dimension and index

        input_channels (tuple): number of channels for the input video clip.

        model_depth (int): the depth of the resnet.
        model_num_class (int): the number of classes for the video dataset.
        dropout_rate (float): dropout rate.

        norm (callable): a callable that constructs normalization layer.

        activation (callable): a callable that constructs activation layer.

        stem_function (Tuple[Callable]): a callable that constructs stem layer.
            Examples include create_res_basic_stem. Indexed by pathway
        stem_dim_outs (tuple): output channel size to stem.
        stem_conv_kernel_sizes (tuple): convolutional kernel size(s) of stem.
        stem_conv_strides (tuple): convolutional stride size(s) of stem.
        stem_pool (Tuple[Callable]): a callable that constructs resnet head pooling layer.
            Indexed by pathway
        stem_pool_kernel_sizes (tuple): pooling kernel size(s).
        stem_pool_strides (tuple): pooling stride size(s).

        stage_conv_a_kernel_sizes (tuple): convolutional kernel size(s) for conv_a.
        stage_conv_b_kernel_sizes (tuple): convolutional kernel size(s) for conv_b.
        stage_conv_b_num_groups (tuple): number of groups for groupwise convolution
            for conv_b. 1 for ResNet, and larger than 1 for ResNeXt.
        stage_conv_b_dilations (tuple): dilation for 3D convolution for conv_b.
        stage_spatial_strides (tuple): the spatial stride for each stage.
        stage_temporal_strides (tuple): the temporal stride for each stage.
        bottleneck (Tuple[Tuple[Callable]]): a callable that constructs bottleneck
            block layer. Examples include: create_bottleneck_block.
            Indexed by pathway and stage index

        head (callable): a callable that constructs the resnet-style head.
            Ex: create_res_basic_head
        head_pool (callable): a callable that constructs resnet head pooling layer.
        head_output_sizes (tuple): the size of output tensor for head.
        head_activation (callable): a callable that constructs activation layer.
        head_output_with_global_average (bool): if True, perform global averaging on
            the head output.
    Returns:
        (nn.Module): SlowFast model.
    z"PYTORCHVIDEO.model.create_slowfastz is not in Nr   r   )r(   conv_fusion_channel_ratioconv_kernel_sizeconv_strider1   r2   max_stage_idxc                 S      g | ]}|d  qS r    .0sizerN   rN   P/home/ubuntu/.local/lib/python3.10/site-packages/pytorchvideo/models/slowfast.py
<listcomp>       z#create_slowfast.<locals>.<listcomp>c                 S   rL   rM   rN   rO   rN   rN   rR   rS      rT   )in_channelsout_channelsrI   rJ   conv_paddingpoolpool_kernel_sizepool_stridepool_paddingr1   r2   )fusion_dim_in	stage_idx)multipathway_blocksmultipathway_fusionr   c                 S   rL   rM   rN   rO   rN   rN   rR   rS         c                 S   s   g | ]	}d d |D qS )c                 S   rL   rM   rN   rO   rN   rN   rR   rS     r`   z.create_slowfast.<locals>.<listcomp>.<listcomp>rN   )rP   sizesrN   rN   rR   rS     s    r   )depthdim_in	dim_innerdim_outr@   conv_a_kernel_sizeconv_a_strideconv_a_paddingconv_b_kernel_sizeconv_b_strideconv_b_paddingconv_b_num_groupsconv_b_dilationr1   r2   c                    s   g | ]} | qS rN   rN   rP   idx)rD   rB   rN   rR   rS   H  s    c                    s   g | ]} | d ddqS )r"   )r   r   r   )kernel_sizestridepaddingrN   rn   )rB   rC   rN   rR   rS   J  s    zUnsupported pool_model type F)retain_listrX   )in_featuresout_featuresrX   output_sizer0   r2   output_with_global_average)blocks)torch_C_log_api_usage_oncelen_MODEL_STAGE_DEPTHkeys
isinstanceintr   FastToSlowFusionBuildercreate_modulerangeappendr   nn
ModuleListr   AdaptiveAvgPool3d	AvgPool3dNotImplementedErrorPoolConcatPathwayr   )3r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   _num_pathwaystage_depthsstemspathway_idxstagesstage_dim_instage_dim_outro   pathway_stage_dim_inpathway_stage_dim_innerpathway_stage_dim_outreduction_ratiostagerb   stage_conv_a_kernelstage_conv_a_stridestage_conv_a_paddingstage_conv_b_stride
pool_modelhead_in_featuresrN   )rD   rB   rC   rR   create_slowfast   s<   











	
	

	
r   P   )r"   r"   r"   r    r   )r   r   r   r   r   ))r   r   r   )r'   r   r   F)r   r   g      ?)"r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   head_spatial_resolutionhead_spatial_scalehead_sampling_ratior   r   r   c        "   &      C   s  t d i d| d|d|d|d|d|d|d|d	|	d
|
d|d|d|d|d|d|d|d|d|d|d|d|dtddd|d|}"|d dtt| d   }#|d |d  }$|#|#|$  }%t|%|d|||||| |!d
}t|"|S )!u  
    Build SlowFast model for video detection, SlowFast model involves a Slow pathway,
    operating at low frame rate, to capture spatial semantics, and a Fast pathway,
    operating at high frame rate, to capture motion at fine temporal resolution. The
    Fast pathway can be made very lightweight by reducing its channel capacity, yet can
    learn useful temporal information for video recognition. Details can be found from
    the paper:

    Christoph Feichtenhofer, Haoqi Fan, Jitendra Malik, and Kaiming He.
    "SlowFast networks for video recognition."
    https://arxiv.org/pdf/1812.03982.pdf

    ::

                        Slow Input  Fast Input         Bounding Box Input
                            ↓           ↓                      ↓
                           Stem       Stem                     ↓
                            ↓ ⭠ Fusion- ↓                     ↓
                          Stage 1     Stage 1                  ↓
                            ↓ ⭠ Fusion- ↓                     ↓
                            .           .                      ↓
                            ↓           ↓                      ↓
                          Stage N     Stage N                  ↓
                            ↓ ⭠ Fusion- ↓                     ↓
                                    ↓                          ↓
                                    ↓----------> Head <--------↓

    Args:
        slowfast_channel_reduction_ratio (int): Corresponds to the inverse of the channel
            reduction ratio, $eta$ between the Slow and Fast pathways.
        slowfast_conv_channel_fusion_ratio (int): Ratio of channel dimensions
            between the Slow and Fast pathways.
        DEPRECATED slowfast_fusion_conv_kernel_size (tuple): the convolutional kernel
            size used for fusion.
        DEPRECATED slowfast_fusion_conv_stride (tuple): the convolutional stride size
            used for fusion.
        fusion_builder (Callable[[int, int], nn.Module]): Builder function for generating
            the fusion modules based on stage dimension and index

        input_channels (tuple): number of channels for the input video clip.

        model_depth (int): the depth of the resnet.
        model_num_class (int): the number of classes for the video dataset.
        dropout_rate (float): dropout rate.

        norm (callable): a callable that constructs normalization layer.

        activation (callable): a callable that constructs activation layer.

        stem_function (Tuple[Callable]): a callable that constructs stem layer.
            Examples include create_res_basic_stem. Indexed by pathway
        stem_dim_outs (tuple): output channel size to stem.
        stem_conv_kernel_sizes (tuple): convolutional kernel size(s) of stem.
        stem_conv_strides (tuple): convolutional stride size(s) of stem.
        stem_pool (Tuple[Callable]): a callable that constructs resnet head pooling layer.
            Indexed by pathway
        stem_pool_kernel_sizes (tuple): pooling kernel size(s).
        stem_pool_strides (tuple): pooling stride size(s).

        stage_conv_a_kernel_sizes (tuple): convolutional kernel size(s) for conv_a.
        stage_conv_b_kernel_sizes (tuple): convolutional kernel size(s) for conv_b.
        stage_conv_b_num_groups (tuple): number of groups for groupwise convolution
            for conv_b. 1 for ResNet, and larger than 1 for ResNeXt.
        stage_conv_b_dilations (tuple): dilation for 3D convolution for conv_b.
        stage_spatial_strides (tuple): the spatial stride for each stage.
        stage_temporal_strides (tuple): the temporal stride for each stage.
        bottleneck (Tuple[Tuple[Callable]]): a callable that constructs bottleneck
            block layer. Examples include: create_bottleneck_block.
            Indexed by pathway and stage index

        head (callable): a a callable that constructs the detection head which can
            take in the additional input of bounding boxes.
            Ex: create_res_roi_pooling_head
        head_pool (callable): a callable that constructs resnet head pooling layer.
        head_output_sizes (tuple): the size of output tensor for head.
        head_activation (callable): a callable that constructs activation layer.
        head_output_with_global_average (bool): if True, perform global averaging on
            the head output.
        head_spatial_resolution (tuple): h, w sizes of the RoI interpolation.
        head_spatial_scale (float): scale the input boxes by this number.
        head_sampling_ratio (int): number of inputs samples to take for each output
                sample interpolation. 0 to take samples densely.
    Returns:
        (nn.Module): SlowFast model.
    r(   r)   r*   r+   r-   r.   r/   r0   r1   r2   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   NrB   rC   r   r   r   )
rt   ru   rX   rv   r0   r2   rw   
resolutionspatial_scalesampling_ratiorN   )r   r   r|   r}   r	   r
   )&r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   r   r   r   modelr   slow_fast_betar   rN   rN   rR   create_slowfast_with_roi_headh  s    '	
 !"%
r   c                	       sZ   e Zd ZdZ			ddedeej deddf fd	d
Z	de
ej dejfddZ  ZS )r   z
    Given a list of tensors, perform optional spatio-temporal pool and concatenate the
        tensors along the channel dimension.
    FNr   rs   rX   dimrG   c                       t    t| t  dS )a7  
        Args:
            retain_list (bool): if True, return the concatenated tensor in a list.
            pool (nn.module_list): if not None, list of pooling models for different
                pathway before performing concatenation.
            dim (int): dimension to performance concatenation.
        Nsuper__init__r   locals)selfrs   rX   r   	__class__rN   rR   r   L  s   
zPoolConcatPathway.__init__xc                 C   s   | j d urt|t| j ksJ g }tt|D ]&}|| d ur>| j d ur7| j | d ur7| j | || ||< |||  q| jrIt|dgS t|dS )Nr   )rX   r|   r   r   rs   ry   cat)r   r   outputindrN   rN   rR   forward\  s   
zPoolConcatPathway.forward)FNr   )__name__
__module____qualname____doc__boolr   r   r   r   r   r   ry   Tensorr   __classcell__rN   rN   r   rR   r   F  s    "r   c                   @   sn   e Zd Zejddejdfdededee dee de	d	ed
ede	deddfddZ
dededejfddZdS )r   gh㈵>g?r   r(   rH   rI   rJ   r1   norm_epsnorm_momentumr2   rK   rG   Nc
           
      C   s   t | t  dS )a  
        Given a list of two tensors from Slow pathway and Fast pathway, fusion information
        from the Fast pathway to the Slow on through a convolution followed by a
        concatenation, then return the fused list of tensors from Slow and Fast pathway in
        order.
        Args:
            slowfast_channel_reduction_ratio (int): Reduction ratio from the stage dimension.
                Used to compute conv_dim_in = fusion_dim_in // slowfast_channel_reduction_ratio
            conv_fusion_channel_ratio (int): channel ratio for the convolution used to fuse
                from Fast pathway to Slow pathway.
            conv_kernel_size (int): kernel size of the convolution used to fuse from Fast
                pathway to Slow pathway.
            conv_stride (int): stride size of the convolution used to fuse from Fast pathway
                to Slow pathway.
            norm (callable): a callable that constructs normalization layer, examples
                include nn.BatchNorm3d, None (not performing normalization).
            norm_eps (float): normalization epsilon.
            norm_momentum (float): normalization momentum.
            activation (callable): a callable that constructs activation layer, examples
                include: nn.ReLU, nn.Softmax, nn.Sigmoid, and None (not performing
                activation).
            max_stage_idx (int): Returns identity module if we exceed this
        N)r   r   )
r   r(   rH   rI   rJ   r1   r   r   r2   rK   rN   rN   rR   r   l  s   #z FastToSlowFusionBuilder.__init__r\   r]   c                 C   s   || j kr	t S || j }tj|t|| j | j| jdd | jD dd}| j	du r-dn| j	|| j | j
| jd}| jdu rAdn|  }t|||dS )z
        Creates the module for the given stage
        Args:
            fusion_dim_in (int): input stage dimension
            stage_idx (int): which stage this is
        c                 S   rL   rM   rN   )rP   k_sizerN   rN   rR   rS     r`   z9FastToSlowFusionBuilder.create_module.<locals>.<listcomp>F)rp   rq   rr   biasN)num_featuresepsmomentum)conv_fast_to_slowr1   r2   )rK   r   Identityr(   Conv3dr   rH   rI   rJ   r1   r   r   r2   FuseFastToSlow)r   r\   r]   conv_dim_inr   norm_moduleactivation_modulerN   rN   rR   r     s2   



	z%FastToSlowFusionBuilder.create_module)r   r   r   r   BatchNorm3dReLUr   floatr   r   r   Moduler   rN   rN   rN   rR   r   k  s8    	

%r   c                	       sN   e Zd ZdZ		ddejdeej deej ddf fddZd	d
 Z  Z	S )r   a  
    Given a list of two tensors from Slow pathway and Fast pathway, fusion information
    from the Fast pathway to the Slow on through a convolution followed by a
    concatenation, then return the fused list of tensors from Slow and Fast pathway in
    order.
    Nr   r1   r2   rG   c                    r   )z
        Args:
            conv_fast_to_slow (nn.module): convolution to perform fusion.
            norm (nn.module): normalization module.
            activation (torch.nn.modules): activation module.
        Nr   )r   r   r1   r2   r   rN   rR   r     s   
zFuseFastToSlow.__init__c                 C   sZ   |d }|d }|  |}| jd ur| |}| jd ur!| |}t||gd}||gS )Nr   r   )r   r1   r2   ry   r   )r   r   x_sx_ffusex_s_fuserN   rN   rR   r     s   




zFuseFastToSlow.forward)NN)
r   r   r   r   r   r   r   r   r   r   rN   rN   r   rR   r     s    
r   )&typingr   r   r   r   r   ry   torch.nnr   pytorchvideo.layers.utilsr   pytorchvideo.models.headr   r	   pytorchvideo.models.netr
   r   r   pytorchvideo.models.resnetr   r   pytorchvideo.models.stemr   r}   r   r   	MaxPool3dr   r   r   r   r   r   Sigmoidr   r   r   r   rN   rN   rN   rR   <module>   s  

!
"
#$
%
&(,
01
5
67FG
HIJKL
  W

!
"
#$
%
&(,
01
5
67FG
HIJKLMNO
 _%J