o
    ϯi9                     @   s  d dl mZ d dlmZ d dlmZ ddlmZmZ d	ed	ed	ed	Z
d
dddgddgddggddgddgddggg dg dg dgg dg ddZd
dddgddgddggddgddgddggg dg dg dgg dg ddZd
ddddgddgddgdddgddgddggddgddgddggg dg dg dgg dg ddZ		d'dededed ejfd!d"Z		d'dededed ejfd#d$Z		d'dededed ejfd%d&ZdS )(    )AnyN)%create_multiscale_vision_transformers   )MODEL_ZOO_ROOT_DIRhub_model_builderz{}/kinetics/MVIT_B_16x4.pythz'{}/kinetics/MVIT_B_32x3_f294077834.pythz%{}/imagenet/MVIT_B_16_f292487636.pyth)mvit_base_16x4mvit_base_32x3mvit_base_16      g       @      )r   r      r   )r   r   r   r   )r   r   r   r   )r      r   )r   r   r   )spatial_sizetemporal_sizeembed_dim_mulatten_head_mulpool_q_stride_sizepool_kv_stride_adaptivepool_kvq_kernel          T)r   r   r   )r   r   r   )r   r   depthconv_patch_embed_kernelconv_patch_embed_strideconv_patch_embed_paddinguse_2d_patchr   r   r   r   r   F
pretrainedprogresskwargsreturnc                 K      t dt| |td td|S )a  
    Multiscale Vision Transformers model architecture [1] trained with an 16x4
    setting on the Kinetics400 dataset. Model with pretrained weights has top1
    accuracy of 78.9%.

    [1] Haoqi Fan, Bo Xiong, Karttikeya Mangalam, Yanghao Li, Zhicheng Yan, Jitendra
    Malik, Christoph Feichtenhofer, "Multiscale Vision Transformers"
    https://arxiv.org/pdf/2104.11227.pdf

    Args:
        pretrained (bool): If True, returns a model pre-trained on Kinetics400 dataset.
        progress (bool): If True, displays a progress bar of the download to stderr.
        kwargs: Use these to modify any of the other model settings. All the
            options are defined in create_multiscale_vision_transformers.

    NOTE: to use the pretrained model, do not modify the model configuration
    via the kwargs. Only modify settings via kwargs to initialize a new model
    without pretrained weights.
    r   model_builder_funcr   r    checkpoint_pathdefault_configN )r   r   checkpoint_pathsmvit_video_base_configr   r    r!   r(   r(   _/home/ubuntu/.local/lib/python3.10/site-packages/pytorchvideo/models/hub/vision_transformers.pyr   9      r   c                 K   r#   )a  
    Multiscale Vision Transformers model architecture [1] trained with an 32x3
    setting on the Kinetics400 dataset. Model with pretrained weights has top1
    accuracy of 80.3%.

    [1] Haoqi Fan, Bo Xiong, Karttikeya Mangalam, Yanghao Li, Zhicheng Yan, Jitendra
    Malik, Christoph Feichtenhofer, "Multiscale Vision Transformers"
    https://arxiv.org/pdf/2104.11227.pdf

    Args:
        pretrained (bool): If True, returns a model pre-trained on Kinetics400 dataset.
        progress (bool): If True, displays a progress bar of the download to stderr.
        kwargs: Use these to modify any of the other model settings. All the
            options are defined in create_multiscale_vision_transformers.

    NOTE: to use the pretrained model, do not modify the model configuration
    via the kwargs. Only modify settings via kwargs to initialize a new model
    without pretrained weights.
    r   r$   Nr(   )r   r   r)   mvit_video_base_32x3_configr+   r(   r(   r,   r   \   r-   r   c                 K   r#   )az  
    Multiscale Vision Transformers model architecture [1] with a depth 16 trained on
    ImageNet-1k dataset. Model with pretrained weights has top1 accuracy of 83%.

    [1] Haoqi Fan, Bo Xiong, Karttikeya Mangalam, Yanghao Li, Zhicheng Yan, Jitendra
    Malik, Christoph Feichtenhofer, "Multiscale Vision Transformers"
    https://arxiv.org/pdf/2104.11227.pdf

    Args:
        pretrained (bool): If True, returns a model pre-trained on Kinetics400 dataset.
        progress (bool): If True, displays a progress bar of the download to stderr.
        kwargs: Use these to modify any of the other model settings. All the
            options are defined in create_multiscale_vision_transformers.

    NOTE: to use the pretrained model, do not modify the model configuration
    via the kwargs. Only modify settings via kwargs to initialize a new model
    without pretrained weights.
    r	   r$   Nr(   )r   r   r)   mvit_image_base_16_configr+   r(   r(   r,   r	      s   r	   )FT)typingr   torch.nnnn'pytorchvideo.models.vision_transformersr   utilsr   r   formatr)   r*   r.   r/   boolModuler   r   r	   r(   r(   r(   r,   <module>   s   	
$
$