o
    ϯiI                  !   @   s  d dl mZmZ d dlZd dlmZ d dlmZ d dlm	Z	 G dd dej
Zejdddd	d
ddddedededee dee dee dee dedededej
fddZdd
dddedededededej
fddZd e	ejdddd	ejd
ddddeded ed!ed"ed#ededee dee dee dee d$ededededej
f d%d&ZG d'd( d(ej
ZG d)d* d*ej
ZG d+d, d,ej
ZdS )-    )CallableTupleN)set_attributes)RoIAlignc                       s@   e Zd ZdZdeddf fddZdejdejfdd	Z  Z	S )
SequencePoolz
    Sequence pool produces a single embedding from a sequence of embeddings. Currently
    it supports "mean" and "cls".

    modereturnNc                    s$   t    |dv sJ d|| _dS )a  
        Args:
            mode (str): Optionals include "cls" and "mean". If set to "cls", it assumes
                the first element in the input is the cls token and returns it. If set
                to "mean", it returns the mean of the entire sequence.
        clsmeanz"Unsupported mode for SequencePool.N)super__init__r   )selfr   	__class__ L/home/ubuntu/.local/lib/python3.10/site-packages/pytorchvideo/models/head.pyr      s   

zSequencePool.__init__xc                 C   s:   | j dkr|d d df }|S | j dkr|d}|S t)Nr
   r   r      )r   r   NotImplementedErrorr   r   r   r   r   forward   s   


zSequencePool.forward)
__name__
__module____qualname____doc__strr   torchTensorr   __classcell__r   r   r   r   r      s    r   )r   r   r   )r      r    )r   r   r   g      ?T)pooloutput_sizepool_kernel_sizepool_stridepool_paddingdropout_rate
activationoutput_with_global_averagein_featuresout_featuresr!   r"   r#   r$   r%   r&   r'   r(   r   c        
         C   s   |du rd}
n|t jkr|dd}
n| }
|du rd}n|t jkr&||}n||||d}|	r5t d}nd}tt | ||
||dkrKt ||dS d|dS )u  
    Creates ResNet basic head. This layer performs an optional pooling operation
    followed by an optional dropout, a fully-connected projection, an activation layer
    and a global spatiotemporal averaging.

    ::


                                        Pooling
                                           ↓
                                        Dropout
                                           ↓
                                       Projection
                                           ↓
                                       Activation
                                           ↓
                                       Averaging

    Activation examples include: ReLU, Softmax, Sigmoid, and None.
    Pool3d examples include: AvgPool3d, MaxPool3d, AdaptiveAvgPool3d, and None.

    Args:

        in_features: input channel size of the resnet head.
        out_features: output channel size of the resnet head.

        pool (callable): a callable that constructs resnet head pooling layer,
            examples include: nn.AvgPool3d, nn.MaxPool3d, nn.AdaptiveAvgPool3d, and
            None (not applying pooling).
        pool_kernel_size (tuple): pooling kernel size(s) when not using adaptive
            pooling.
        pool_stride (tuple): pooling stride size(s) when not using adaptive pooling.
        pool_padding (tuple): pooling padding size(s) when not using adaptive
            pooling.
        output_size (tuple): spatial temporal output size when using adaptive
            pooling.

        activation (callable): a callable that constructs resnet head activation
            layer, examples include: nn.ReLU, nn.Softmax, nn.Sigmoid, and None (not
            applying activation).

        dropout_rate (float): dropout rate.

        output_with_global_average (bool): if True, perform global averaging on temporal
            and spatial dimensions and reshape output to batch_size x out_features.
    Nr   dimkernel_sizestridepaddingr   )projr'   r!   dropoutoutput_pool)nnSoftmaxAdaptiveAvgPool3dResNetBasicHeadLinearDropout)r)   r*   r!   r"   r#   r$   r%   r&   r'   r(   activation_model
pool_modelr3   r   r   r   create_res_basic_head'   s2   A



r<   r
   )seq_pool_typer&   r'   r=   c                 C   s   |dv sJ |dv rt |}n	|dkrd}nt|du rd}n|tjkr*|dd}n| }t||dkr8t|ndt| ||dS )	u  
    Creates vision transformer basic head.

    ::


                                        Pooling
                                           ↓
                                        Dropout
                                           ↓
                                       Projection
                                           ↓
                                       Activation


    Activation examples include: ReLU, Softmax, Sigmoid, and None.
    Pool type examples include: cls, mean and none.

    Args:

        in_features: input channel size of the resnet head.
        out_features: output channel size of the resnet head.

        pool_type (str): Pooling type. It supports "cls", "mean " and "none". If set to
            "cls", it assumes the first element in the input is the cls token and
            returns it. If set to "mean", it returns the mean of the entire sequence.

        activation (callable): a callable that constructs vision transformer head
            activation layer, examples include: nn.ReLU, nn.Softmax, nn.Sigmoid, and
            None (not applying activation).

        dropout_rate (float): dropout rate.
    )r
   r   noner	   r>   Nr   r+   g        sequence_poolr2   r1   r'   )r   r   r4   r5   VisionTransformerBasicHeadr9   r8   )r)   r*   r=   r&   r'   seq_pool_modelr:   r   r   r   create_vit_basic_head   s"   -


rC   )sampling_ratioroir!   r"   r#   r$   r%   pool_spatialr&   r'   r(   
resolutionspatial_scalerD   rE   rF   c              
   C   s   |du rd}n|t jkr|dd}n| }|du rd}n|t jkr&||}n|||	|
d}|r5t d}nd}tt | ||||rG||ddnd||||d|dkrZt ||dS d|dS )	u0  
    Creates ResNet RoI head. This layer performs an optional pooling operation
    followed by an RoI projection, an optional 2D spatial pool, an optional dropout,
    a fully-connected projection, an activation layer
    and a global spatiotemporal averaging.

                                        Pool3d
                                           ↓
                                       RoI Align
                                           ↓
                                        Pool2d
                                           ↓
                                        Dropout
                                           ↓
                                       Projection
                                           ↓
                                       Activation
                                           ↓
                                       Averaging

    Activation examples include: ReLU, Softmax, Sigmoid, and None.
    Pool3d examples include: AvgPool3d, MaxPool3d, AdaptiveAvgPool3d, and None.
    RoI examples include: detectron2.layers.ROIAlign, detectron2.layers.ROIAlignRotated,
        tochvision.ops.RoIAlign and None
    Pool2d examples include: MaxPool2e, AvgPool2d, and None.

    Args:
        Projection related configs:
            in_features: input channel size of the resnet head.
            out_features: output channel size of the resnet head.

        RoI layer related configs:
            resolution (tuple): h, w sizes of the RoI interpolation.
            spatial_scale (float): scale the input boxes by this number
            sampling_ratio (int): number of inputs samples to take for each output
                sample interpolation. 0 to take samples densely.
            roi (callable): a callable that constructs the roi interpolation layer,
                examples include detectron2.layers.ROIAlign,
                detectron2.layers.ROIAlignRotated, and None.

        Pooling related configs:
            pool (callable): a callable that constructs resnet head pooling layer,
                examples include: nn.AvgPool3d, nn.MaxPool3d, nn.AdaptiveAvgPool3d, and
                None (not applying pooling).
            pool_kernel_size (tuple): pooling kernel size(s) when not using adaptive
                pooling.
            pool_stride (tuple): pooling stride size(s) when not using adaptive pooling.
            pool_padding (tuple): pooling padding size(s) when not using adaptive
                pooling.
            output_size (tuple): spatial temporal output size when using adaptive
                pooling.
            pool_spatial (callable): a callable that constructs the 2d pooling layer which
                follows the RoI layer, examples include: nn.AvgPool2d, nn.MaxPool2d, and
                None (not applying spatial pooling).

        Activation related configs:
            activation (callable): a callable that constructs resnet head activation
                layer, examples include: nn.ReLU, nn.Softmax, nn.Sigmoid, and None (not
                applying activation).

        Dropout related configs:
            dropout_rate (float): dropout rate.

        Output related configs:
            output_with_global_average (bool): if True, perform global averaging on temporal
                and spatial dimensions and reshape output to batch_size x out_features.
    Nr   r+   r-   )r/   )r"   rH   rD   r   )r1   r'   r!   rF   	roi_layerr2   r3   )r4   r5   r6   ResNetRoIHeadr8   r9   )r)   r*   rG   rH   rD   rE   r!   r"   r#   r$   r%   rF   r&   r'   r(   r:   r;   r3   r   r   r   create_res_roi_pooling_head   s>   [




rK   c                       sf   e Zd ZdZ					ddejdejdejdejdejddf fd	d
ZdejdejfddZ	  Z
S )r7   u  
    ResNet basic head. This layer performs an optional pooling operation followed by an
    optional dropout, a fully-connected projection, an optional activation layer and a
    global spatiotemporal averaging.

    ::

                                        Pool3d
                                           ↓
                                        Dropout
                                           ↓
                                       Projection
                                           ↓
                                       Activation
                                           ↓
                                       Averaging

    The builder can be found in `create_res_basic_head`.
    Nr!   r2   r1   r'   r3   r   c                    (   t    t| t  | jdusJ dS )a<  
        Args:
            pool (torch.nn.modules): pooling module.
            dropout(torch.nn.modules): dropout module.
            proj (torch.nn.modules): project module.
            activation (torch.nn.modules): activation module.
            output_pool (torch.nn.Module): pooling module for output.
        Nr   r   r   localsr1   )r   r!   r2   r1   r'   r3   r   r   r   r   _  s   
zResNetBasicHead.__init__r   c                 C   s   | j d ur
|  |}| jd ur| |}| jd ur(|d}| |}|d}| jd ur2| |}| jd urE| |}||jd d}|S )Nr            r   r   rR   r   rP   rQ   r   )r!   r2   r1   permuter'   r3   viewshaper   r   r   r   r   s  s   











zResNetBasicHead.forward)NNNNNr   r   r   r   r4   Moduler   r   r   r   r   r   r   r   r   r7   J  s*    r7   c                       s|   e Zd ZdZ							ddejdejdejdejdejdejd	ejd
df fddZdejdejd
ejfddZ	  Z
S )rJ   u  
    ResNet RoI head. This layer performs an optional pooling operation
    followed by an RoI projection, an optional 2D spatial pool, an optional dropout,
    a fully-connected projection, an activation layer
    and a global spatiotemporal averaging.
                                        Pool3d
                                           ↓
                                       RoI Align
                                           ↓
                                        Pool2d
                                           ↓
                                        Dropout
                                           ↓
                                       Projection
                                           ↓
                                       Activation
                                           ↓
                                       Averaging

    The builder can be found in `create_res_roi_pooling_head`.
    Nr!   rF   rI   r2   r1   r'   r3   r   c                    rL   )a  
        Args:
            pool (torch.nn.modules): pooling module.
            pool_spatial (torch.nn.modules): pooling module.
            roi_spatial (torch.nn.modules): RoI (Ex: Align, pool) module.
            dropout(torch.nn.modules): dropout module.
            proj (torch.nn.modules): project module.
            activation (torch.nn.modules): activation module.
            output_pool (torch.nn.Module): pooling module for output.
        NrM   )r   r!   rF   rI   r2   r1   r'   r3   r   r   r   r     s   
zResNetRoIHead.__init__r   bboxesc                 C   s   | j dur
|  |}| jdur7|jd }|dkrtdt|d}| ||}| jdur2| |}|d}| jdurA| |}| j	durU|
d}| 	|}|
d}| jdur_| |}| jdurr| |}||jd d}|S )	aC  
        Args:
            x (torch.tensor): input tensor
            bboxes (torch.tensor): Accociated bounding boxes.
                The format is N*5 (Index, X_1,Y_1,X_2,Y_2) if using RoIAlign
                and N*6 (Index, x_ctr, y_ctr, width, height, angle_degrees) if
                using RoIAlignRotated.
        Nr   zBTemporal dimension should be 1. Consider modifying the pool layer.rO   rS   r   rT   )r!   rI   rW   	Exceptionr   squeezerF   	unsqueezer2   r1   rU   r'   r3   rV   )r   r   rZ   temporal_dimr   r   r   r     s2   

















zResNetRoIHead.forward)NNNNNNNrX   r   r   r   r   rJ     s6    	$rJ   c                       s^   e Zd ZdZ				ddejdejdejdejddf
 fdd	Zd
ejdejfddZ	  Z
S )rA   u  
    Vision transformer basic head.

    ::

                                      SequencePool
                                           ↓
                                        Dropout
                                           ↓
                                       Projection
                                           ↓
                                       Activation


    The builder can be found in `create_vit_basic_head`.
    Nr@   r2   r1   r'   r   c                    rL   )z
        Args:
            sequence_pool (torch.nn.modules): pooling module.
            dropout(torch.nn.modules): dropout module.
            proj (torch.nn.modules): project module.
            activation (torch.nn.modules): activation module.
        NrM   )r   r@   r2   r1   r'   r   r   r   r     s   
z#VisionTransformerBasicHead.__init__r   c                 C   sJ   | j d ur
|  |}| jd ur| |}| |}| jd ur#| |}|S )Nr?   r   r   r   r   r   	  s   






z"VisionTransformerBasicHead.forward)NNNNrX   r   r   r   r   rA     s$    rA   )typingr   r   r   torch.nnr4   pytorchvideo.layers.utilsr   torchvision.opsr   rY   r   	AvgPool3dintfloatboolr<   r   rC   	MaxPool2drK   r7   rJ   rA   r   r   r   r   <module>   s   "	

e

M	
@[