o
    پi                     @   sb  d dl Z d dlmZ d dlmZmZmZmZmZ d dl	Z	d dl
mZ d dlmZmZ d dlmZmZmZmZmZmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZ dgZ dd Z!G dd dej"Z#G dd dej"Z$ej%dfde&de&deej" de'dej(f
ddZ)G dd dej"Z*G dd dej"Z+G dd dej"Z,G d d! d!ej"Z-G d"d# d#ej"Z.G d$d% d%ej"Z/G d&d' d'ej"Z0G d(d) d)ej"Z1G d*d+ d+ej"Z2G d,d dej"Z3dgd.d/Z4ei d0e4d1d2d3e4d1d2d4e4d1d2d5e4d1d2d6e4d1d2d7e4d1d2d8e4d1d9d:d;e4d1d2d<e4d1d2d=e4d1d2d>e4d1d2d?e4d1d2d@e4d1d2dAe4d1d9d:dBe4dCdDd9dEdFdGdHdIe4dJdKd9dEdFdGdHdLe4dMdNd9dEdFdGdHZ5dOdP Z6dhdQdRZ7edhdSdTZ8edhdUdVZ9edhdWdXZ:edhdYdZZ;edhd[d\Z<edhd]d^Z=edhd_d`Z>edhdadbZ?edhdcddZ@edhdedfZAdS )i    N)partial)ListOptionalTupleTypeUnionIMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STD)DropPathtrunc_normal_create_conv2dConvNormActSqueezeExciteuse_fused_attnClassifierHead   )build_model_with_cfg)feature_take_indices)checkpoint_seq)register_modelgenerate_default_cfgsFastVitc                 C   s    | sdS ||  dksJ ||  S )Nr   r    )
group_sizechannelsr   r   G/home/ubuntu/.local/lib/python3.10/site-packages/timm/models/fastvit.py
num_groups   s   r   c                       s   e Zd ZdZddddddddejf	dededed	ed
edededededededeej	 ddf fddZ
dejdejfddZdd Zdeejejf fddZdeejejf deejejf fddZ  ZS ) MobileOneBlocka#  MobileOne building block.

    This block has a multi-branched architecture at train-time
    and plain-CNN style architecture at inference time
    For more details, please refer to our paper:
    `An Improved One millisecond Mobile Backbone` -
    https://arxiv.org/pdf/2206.04040.pdf
    r   r   FTin_chsout_chskernel_sizestridedilationr   inference_modeuse_seuse_actuse_scale_branchnum_conv_branches	act_layerreturnNc              	      s&  t t  |_t||_|_|_ _|_	|_
|_|r*t|ddnt _|r?t|| ||jdd_nEd_||krP|dkrPtj|dnd_|dkrit fdd	tjD _nd_d_ dkr|
rtj	j
djjd
d_|	r| _dS t _dS )a  Construct a MobileOneBlock module.

        Args:
            in_chs: Number of channels in the input.
            out_chs: Number of channels produced by the block.
            kernel_size: Size of the convolution kernel.
            stride: Stride size.
            dilation: Kernel dilation factor.
            group_size: Convolution group size.
            inference_mode: If True, instantiates model in inference mode.
            use_se: Whether to use SE-ReLU activations.
            use_act: Whether to use activation. Default: ``True``
            use_scale_branch: Whether to use scale branch. Default: ``True``
            num_conv_branches: Number of linear conv branches.
        r   )
rd_divisorTr!   r"   r#   groupsbiasN)num_featuresr   c              
      s(   g | ]}t jj jjd dqS )Fr!   r"   r-   	apply_act)r   r   r    r"   r-   ).0_r!   selfr   r   
<listcomp>m   s    z+MobileOneBlock.__init__.<locals>.<listcomp>Fr0   )superr   __init__r$   r   r-   r"   r#   r!   r   r    r(   r   nnIdentityser   reparam_convBatchNorm2didentity
ModuleListrangeconv_kxk
conv_scaler   act)r5   r   r    r!   r"   r#   r   r$   r%   r&   r'   r(   r)   	__class__r4   r   r8   ,   sR   
	zMobileOneBlock.__init__xc                 C   s   | j dur| | |  |S d}| jdur| |}d}| jdur(| |}|| }| jdur=| jD ]}|||7 }q4| | |S )zApply forward pass.Nr   )r<   rC   r;   r>   rB   rA   )r5   rF   identity_out	scale_outoutrcr   r   r   forward   s   






zMobileOneBlock.forwardc              	   C   s   | j durdS |  \}}t| j| j| j| j| j| jdd| _ || j j	_
|| j j_
|  D ]\}}d|v r6q-|  q-| d | d t| drO| d d| _dS )a  Following works like `RepVGG: Making VGG-style ConvNets Great Again` -
        https://arxiv.org/pdf/2101.03697.pdf. We re-parameterize multi-branched
        architecture used at training time to obtain a plain CNN-like structure
        for inference.
        NT)in_channelsout_channelsr!   r"   r#   r-   r.   r<   rA   rB   r>   )r<   _get_kernel_biasr   r   r    r!   r"   r#   r-   weightdatar.   named_parametersdetach___delattr__hasattrr$   )r5   kernelr.   nameparar   r   r   reparameterize   s.   

	






zMobileOneBlock.reparameterizec                 C   s   d}d}| j dur"| | j \}}| jd }tjj|||||g}d}d}| jdur3| | j\}}d}d}| jdurVt	| j
D ]}| | j| \}	}
||	7 }||
7 }qA|| | }|| | }||fS )zMethod to obtain re-parameterized kernel and bias.
        Reference: https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py#L83

        Returns:
            Tuple of (kernel, bias) after fusing branches.
        r   N   )rB   _fuse_bn_tensorr!   torchr9   
functionalpadr>   rA   r@   r(   )r5   kernel_scale
bias_scaler]   kernel_identitybias_identitykernel_conv	bias_convix_kernel_biaskernel_final
bias_finalr   r   r   rN      s(   




zMobileOneBlock._get_kernel_biasbranchc                 C   s  t |tr|jj}|jj}|jj}|jj}|jj}|jj}nSt |t	j
s&J t| ds_| j| j }tj| j|| j| jf|jj|jjd}	t| jD ]}
d|	|
|
| | jd | jd f< qI|	| _| j}|j}|j}|j}|j}|j}||  }|| dddd}|| ||| |  fS )a  Method to fuse batchnorm layer with preceding conv layer.
        Reference: https://github.com/DingXiaoH/RepVGG/blob/main/repvgg.py#L95

        Args:
            branch: Sequence of ops to be fused.

        Returns:
            Tuple of (kernel, bias) after fusing batchnorm.
        	id_tensordtypedevicer   rY   )
isinstancer   convrO   bnrunning_meanrunning_varr.   epsr9   r=   rT   r   r-   r[   zerosr!   rl   rm   r@   rj   sqrtreshape)r5   ri   rU   rr   rs   gammabetart   	input_dimkernel_valueistdtr   r   r   rZ      s<   


zMobileOneBlock._fuse_bn_tensor)__name__
__module____qualname____doc__r9   GELUintboolr   Moduler8   r[   TensorrK   rX   r   rN   r   
Sequentialr=   rZ   __classcell__r   r   rD   r   r   "   s\    	
\##r   c                       s   e Zd ZdZ				ddededededed	ee d
edeej deddf fddZ	de
jde
jfddZdee
je
jf fddZdddZedejdejdee
je
jf fddZ  ZS )ReparamLargeKernelConvzBuilding Block of RepLKNet

    This class defines overparameterized large kernel conv block
    introduced in `RepLKNet <https://arxiv.org/abs/2203.06717>`_

    Reference: https://github.com/DingXiaoH/RepLKNet-pytorch
    NFr   r    r!   r"   r   small_kernelr%   r)   r$   r*   c
           
   	      s   t t|   || _t||| _|| _|| _|| _|| _	|	r,t
||||d| jdd| _n)d| _t|||| j| jdd| _|durU||ksHJ dt|||| j| jdd| _|r]t|dd	nt | _|durl| | _dS t | _dS )
a!  Construct a ReparamLargeKernelConv module.

        Args:
            in_chs: Number of input channels.
            out_chs: Number of output channels.
            kernel_size: Kernel size of the large kernel conv branch.
            stride: Stride size. Default: 1
            group_size: Group size. Default: 1
            small_kernel: Kernel size of small kernel conv branch.
            act_layer: Activation module. Default: ``nn.GELU``
            inference_mode: If True, instantiates model in inference mode. Default: ``False``
        r   Tr,   NFr0   zDThe kernel size for re-param cannot be larger than the large kernel!g      ?)rd_ratio)r7   r   r8   r"   r   r-   r   r    r!   r   r   r<   r   
large_conv
small_convr   r9   r:   r;   rC   )
r5   r   r    r!   r"   r   r   r%   r)   r$   rD   r   r   r8     sP   


"zReparamLargeKernelConv.__init__rF   c                 C   sP   | j d ur|  |}n| |}| jd ur|| | }| |}| |}|S N)r<   r   r   r;   rC   )r5   rF   rI   r   r   r   rK   `  s   




zReparamLargeKernelConv.forwardc                 C   sn   |  | jj| jj\}}t| dr3|  | jj| jj\}}||7 }|tj|| j	| j
 d gd 7 }||fS )zMethod to obtain re-parameterized kernel and bias.
        Reference: https://github.com/DingXiaoH/RepLKNet-pytorch

        Returns:
            Tuple of (kernel, bias) after fusing branches.
        r   rY      )_fuse_bnr   rp   rq   rT   r   r9   r\   r]   r!   r   )r5   eq_keq_bsmall_ksmall_br   r   r   get_kernel_biask  s   
z&ReparamLargeKernelConv.get_kernel_biasc                 C   sf   |   \}}t| j| j| j| j| jdd| _|| jj_	|| jj
_	| d t| dr1| d dS dS )a  
        Following works like `RepVGG: Making VGG-style ConvNets Great Again` -
        https://arxiv.org/pdf/2101.03697.pdf. We re-parameterize multi-branched
        architecture used at training time to obtain a plain CNN-like structure
        for inference.
        Tr!   r"   r-   r.   r   r   N)r   r   r   r    r!   r"   r-   r<   rO   rP   r.   rS   rT   )r5   r   r   r   r   r   rX   {  s   
	


z%ReparamLargeKernelConv.reparameterizerp   rq   c           
      C   s\   | j }|j}|j}|j }|j}|j}||  }|| dddd}	||	 ||| |  fS )zMethod to fuse batchnorm layer with conv layer.

        Args:
            conv: Convolutional kernel weights.
            bn: Batchnorm 2d layer.

        Returns:
            Tuple of (kernel, bias) after fusing batchnorm.
        rn   r   )rO   rr   rs   r.   rt   rv   rw   )
rp   rq   rU   rr   rs   rx   ry   rt   r}   r~   r   r   r   r     s   zReparamLargeKernelConv._fuse_bn)NFNFr*   N)r   r   r   r   r   r   r   r9   r   r8   r[   r   rK   r   r   rX   staticmethodConv2dr=   r   r   r   r   rD   r   r     sL    	
D
r   Fr   r    r)   r$   r*   c                 C   s@   t t| |dd||dt||ddd||dt||dd||dS )a,  Build convolutional stem with MobileOne blocks.

    Args:
        in_chs: Number of input channels.
        out_chs: Number of output channels.
        inference_mode: Flag to instantiate model in inference mode. Default: ``False``

    Returns:
        nn.Sequential object with stem elements.
       rY   )r   r    r!   r"   r)   r$   r   )r   r    r!   r"   r   r)   r$   )r9   r   r   )r   r    r)   r$   r   r   r   convolutional_stem  s6   	r   c                       sl   e Zd ZU dZejje ed< 				dde	de	ded	e
d
e
ddf fddZdejdejfddZ  ZS )	AttentionzMulti-headed Self Attention module.

    Source modified from:
    https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py
    
fused_attn    F        dimhead_dimqkv_bias	attn_drop	proj_dropr*   Nc                    s   t    || dksJ d|| _|| | _|d | _t | _tj||d |d| _	t
|| _t||| _t
|| _dS )a}  Build MHSA module that can handle 3D or 4D input tensors.

        Args:
            dim: Number of embedding dimensions.
            head_dim: Number of hidden dimensions per head. Default: ``32``
            qkv_bias: Use bias or not. Default: ``False``
            attn_drop: Dropout rate for attention tensor.
            proj_drop: Dropout rate for projection tensor.
        r   z#dim should be divisible by head_dimg      r   )r.   N)r7   r8   r   	num_headsscaler   r   r9   LinearqkvDropoutr   projr   )r5   r   r   r   r   r   rD   r   r   r8     s   


zAttention.__init__rF   c                 C   s  |j \}}}}|| }|ddd}| |||d| j| jddddd}|d\}}	}
| j	rGt
jjj||	|
| jrB| jjndd	}n|| j }||	dd }|jdd
}| |}||
 }|dd|||}| |}| |}|dd||||}|S )NrY   rn   r   r   r   r   r   )	dropout_pr   )shapeflatten	transposer   rw   r   r   permuteunbindr   r[   r9   r\   scaled_dot_product_attentiontrainingr   pr   softmaxr   r   )r5   rF   BCHWNr   qkvattnr   r   r   rK     s.   



zAttention.forward)r   Fr   r   )r   r   r   r   r[   jitFinalr   __annotations__r   floatr8   r   rK   r   r   r   rD   r   r     s*   
 r   c                       sn   e Zd ZdZejdddfdededededeej de	d	e	d
e	ddf fddZ
dejdejfddZ  ZS )
PatchEmbedz$Convolutional patch embedding layer.F
patch_sizer"   r   	embed_dimr)   lkc_use_actr%   r$   r*   Nc	           	         sL   t    tt||||dd||r|nd|d	t||ddd||d| _dS )a{  Build patch embedding layer.

        Args:
            patch_size: Patch size for embedding computation.
            stride: Stride for convolutional embedding layer.
            in_chs: Number of channels of input tensor.
            embed_dim: Number of embedding dimensions.
            inference_mode: Flag to instantiate model in inference mode. Default: ``False``
        r   r   N)	r   r    r!   r"   r   r   r%   r)   r$   F)r   r    r!   r"   r%   r)   r$   )r7   r8   r9   r   r   r   r   )	r5   r   r"   r   r   r)   r   r%   r$   rD   r   r   r8     s.   


zPatchEmbed.__init__rF   c                 C   s   |  |}|S r   )r   r5   rF   r   r   r   rK   J  s   
zPatchEmbed.forward)r   r   r   r   r9   r   r   r   r   r   r8   r[   r   rK   r   r   r   rD   r   r     s4    	
,r   c                       s&   e Zd Zd fdd	Zdd Z  ZS )LayerScale2dh㈵>Fc                    s.   t    || _t|t|dd | _d S )Nr   )r7   r8   inplacer9   	Parameterr[   onesrx   )r5   r   init_valuesr   rD   r   r   r8   P  s   
zLayerScale2d.__init__c                 C   s   | j r	|| jS || j S r   )r   mul_rx   r   r   r   r   rK   U     zLayerScale2d.forward)r   F)r   r   r   r8   rK   r   r   r   rD   r   r   O  s    r   c                       sN   e Zd ZdZ			ddef fddZdejd	ejfd
dZdddZ	  Z
S )RepMixerzReparameterizable token mixer.

    For more details, please refer to our paper:
    `FastViT: A Fast Hybrid Vision Transformer using Structural Reparameterization <https://arxiv.org/pdf/2303.14189.pdf>`_
    r   r   Fr$   c              	      s   t    || _|| _|| _|r&tj| j| j| jd| jd | jdd| _dS d| _t|||ddddd| _	t|||ddd	| _
|durKt||| _dS t | _dS )
a  Build RepMixer Module.

        Args:
            dim: Input feature map dimension. :math:`C_{in}` from an expected input of size :math:`(B, C_{in}, H, W)`.
            kernel_size: Kernel size for spatial mixing. Default: 3
            layer_scale_init_value: Initial value for layer scale. Default: 1e-5
            inference_mode: If True, instantiates model in inference mode. Default: ``False``
        r   rY   Tr!   r"   paddingr-   r.   NFr   )r   r&   r'   r(   )r   r&   )r7   r8   r   r!   r$   r9   r   r<   r   normmixerr   layer_scaler:   )r5   r   r!   layer_scale_init_valuer$   rD   r   r   r8   `  sD   

	zRepMixer.__init__rF   r*   c                 C   s:   | j d ur|  |}|S || | || |  }|S r   )r<   r   r   r   r   r   r   r   rK     s
   

zRepMixer.forwardNc                 C   s   | j rdS | j  | j  t| jtr<| jj| jj	d| jj
j| jj
j   }t| jj| jj
j| jj
j  }n| jj| jj
j | jj
j }| jj
j| jj
j }t| j| j| jd| jdd| _
|| j
j_|| j
j_|  D ]\}}d|v rzqq|  qq| d | d | d	 dS )
ziReparameterize mixer and norm into a single
        convolutional layer for efficient inference.
        Nrn   r   Tr   r<   r   r   r   )r$   r   rX   r   ro   r   r   rj   rx   	unsqueezer<   rO   r[   squeezer.   r   r   r!   rP   rQ   rR   rS   )r5   wbrV   rW   r   r   r   rX     sF   






zRepMixer.reparameterize)r   r   Fr   )r   r   r   r   r   r8   r[   r   rK   rX   r   r   r   rD   r   r   Y  s    	5r   c                       s~   e Zd ZdZddejdfdedee dee deej	 de
d	df fd
dZdej	d	dfddZdejd	ejfddZ  ZS )ConvMlpzConvolutional FFN Module.Nr   r   hidden_channelsr    r)   dropr*   c                    sv   t    |p|}|p|}t||d|dd| _tj||dd| _| | _tj||dd| _t	|| _
| | j dS )a_  Build convolutional FFN module.

        Args:
            in_chs: Number of input channels.
            hidden_channels: Number of channels after expansion. Default: None
            out_chs: Number of output channels. Default: None
            act_layer: Activation layer. Default: ``GELU``
            drop: Dropout rate. Default: ``0.0``.
           F)r!   r-   r1   r   )r!   N)r7   r8   r   rp   r9   r   fc1rC   fc2r   r   apply_init_weights)r5   r   r   r    r)   r   rD   r   r   r8     s   
zConvMlp.__init__mc                 C   s@   t |tjrt|jdd |jd urtj|jd d S d S d S )N{Gz?r}   r   )ro   r9   r   r   rO   r.   init	constant_r5   r   r   r   r   r     s   
zConvMlp._init_weightsrF   c                 C   s@   |  |}| |}| |}| |}| |}| |}|S r   )rp   r   rC   r   r   r   r   r   r   rK     s   





zConvMlp.forward)r   r   r   r   r9   r   r   r   r   r   r   r8   r   r[   r   rK   r   r   r   rD   r   r     s*    !r   c                       sn   e Zd ZdZ			ddedee deeeeef f ddf fd	d
Zde	j
de	j
fddZdddZ  ZS )RepConditionalPosEnca"  Implementation of conditional positional encoding.

    For more details refer to paper:
    `Conditional Positional Encodings for Vision Transformers <https://arxiv.org/pdf/2102.10882.pdf>`_

    In our implementation, we can reparameterize this module to eliminate a skip connection.
    Nr   r   Fr   dim_outspatial_shaper*   c              	      s   t t|   t|trt|gd }t|ts"J dt| dt|dks2J dt| d|| _	|| _
|p;|| _|| _|rYtj| j
| j| j	d|d d | jdd| _d	S d	| _tj| j
| j|dt|d d | jdd
| _d	S )at  Build reparameterizable conditional positional encoding

        Args:
            dim: Number of input channels.
            dim_out: Number of embedding dimensions. Default: 768
            spatial_shape: Spatial shape of kernel for positional encoding. Default: (7, 7)
            inference_mode: Flag to instantiate block in inference mode. Default: ``False``
        rY   z/"spatial_shape" must by a sequence or int, get z	 instead.z+Length of "spatial_shape" should be 2, got r   r   Tr   N)r-   r.   )r7   r   r8   ro   r   tupler   typelenr   r   r   r-   r9   r   r<   pos_enc)r5   r   r   r   r$   rD   r   r   r8     sJ   



zRepConditionalPosEnc.__init__rF   c                 C   s*   | j d ur|  |}|S | || }|S r   )r<   r   r   r   r   r   rK   <  s
   

zRepConditionalPosEnc.forwardc           	   	   C   s  | j | j }tj| j || jd | jd f| jjj| jjjd}t	| j D ]}d|||| | jd d | jd d f< q$|}|| jj }| jj
}tj| j | j| jdt| jd d | jdd| _|| jj_|| jj
_|  D ]\}}d|v rvqm|  qm| d d S )	Nr   r   rk   rY   Tr   r<   r   )r   r-   r[   ru   r   r   rO   rl   rm   r@   r.   r9   r   r   r   r<   rP   rQ   rR   rS   )	r5   rz   r{   r|   rj   w_finalb_finalrV   rW   r   r   r   rX   C  sL   

	

z#RepConditionalPosEnc.reparameterize)Nr   Fr   )r   r   r   r   r   r   r   r   r8   r[   r   rK   rX   r   r   r   rD   r   r     s     6r   c                       sb   e Zd ZdZddejddddfdeded	ed
eej	 dededede
f fddZdd Z  ZS )RepMixerBlockzImplementation of Metaformer block with RepMixer as token mixer.

    For more details on Metaformer structure, please refer to:
    `MetaFormer Is Actually What You Need for Vision <https://arxiv.org/pdf/2111.11418.pdf>`_
    r         @r   r   Fr   r!   	mlp_ratior)   r   	drop_pathr   r$   c	           	         sz   t    t||||d| _t|t|| ||d| _|dur&t||| _nt	
 | _|dkr6t|| _dS t	
 | _dS )a,  Build RepMixer Block.

        Args:
            dim: Number of embedding dimensions.
            kernel_size: Kernel size for repmixer. Default: 3
            mlp_ratio: MLP expansion ratio. Default: 4.0
            act_layer: Activation layer. Default: ``nn.GELU``
            proj_drop: Dropout rate. Default: 0.0
            drop_path: Drop path rate. Default: 0.0
            layer_scale_init_value: Layer scale value at initialization. Default: 1e-5
            inference_mode: Flag to instantiate block in inference mode. Default: ``False``
        )r!   r   r$   r   r   r)   r   Nr   )r7   r8   r   token_mixerr   r   mlpr   r   r9   r:   r   r   )	r5   r   r!   r   r)   r   r   r   r$   rD   r   r   r8   x  s"   


$zRepMixerBlock.__init__c                 C   s(   |  |}|| | | | }|S r   )r   r   r   r   r   r   r   r   rK     s   
zRepMixerBlock.forward)r   r   r   r   r9   r   r   r   r   r   r   r8   rK   r   r   r   rD   r   r   q  s6    		-r   c                       sd   e Zd ZdZdejejdddfdedede	ej
 de	ej
 d	ed
edef fddZdd Z  ZS )AttentionBlockzImplementation of metaformer block with MHSA as token mixer.

    For more details on Metaformer structure, please refer to:
    `MetaFormer Is Actually What You Need for Vision <https://arxiv.org/pdf/2111.11418.pdf>`_
    r   r   r   r   r   r)   
norm_layerr   r   r   c                    s   t    ||| _t|d| _|durt||| _nt | _|dkr(t	|nt | _
t|t|| ||d| _|durEt||| _nt | _|dkrUt	|| _dS t | _dS )a  Build Attention Block.

        Args:
            dim: Number of embedding dimensions.
            mlp_ratio: MLP expansion ratio. Default: 4.0
            act_layer: Activation layer. Default: ``nn.GELU``
            norm_layer: Normalization layer. Default: ``nn.BatchNorm2d``
            proj_drop: Dropout rate. Default: 0.0
            drop_path: Drop path rate. Default: 0.0
            layer_scale_init_value: Layer scale value at initialization. Default: 1e-5
        r   Nr   r   )r7   r8   r   r   r   r   layer_scale_1r9   r:   r   
drop_path1r   r   r   layer_scale_2
drop_path2)r5   r   r   r)   r   r   r   r   rD   r   r   r8     s"   




$zAttentionBlock.__init__c              
   C   s>   ||  | | | | }|| | | | }|S r   )r   r   r   r   r  r   r   r   r   r   r   rK     s    zAttentionBlock.forward)r   r   r   r   r9   r   r=   r   r   r   r   r8   rK   r   r   r   rD   r   r     s0    	,r   c                "       s   e Zd Zdddddddejejddd	ddfd
edededededededede	ej
 dededeej
 deej
 dedede	e f  fddZdd Z  ZS )FastVitStageTFr   rY   Nr   r   r   r   r   r   depthtoken_mixer_type
downsamplese_downsampledown_patch_sizedown_stridepos_emb_layerr!   r   r)   r   proj_drop_ratedrop_path_rater   c                    s   t    d| _|rt||||||||d| _n||ksJ t | _|	dur/|	||d| _nt | _g }t|D ]3}|dkrR|	t
||
||||| ||d q:|dkrg|	t||||||| |d q:td	|tj| | _dS )
aQ  FastViT stage.

        Args:
            dim: Number of embedding dimensions.
            depth: Number of blocks in stage
            token_mixer_type: Token mixer type.
            kernel_size: Kernel size for repmixer.
            mlp_ratio: MLP expansion ratio.
            act_layer: Activation layer.
            norm_layer: Normalization layer.
            proj_drop_rate: Dropout rate.
            drop_path_rate: Drop path rate.
            layer_scale_init_value: Layer scale value at initialization.
            inference_mode: Flag to instantiate block in inference mode.
        F)r   r"   r   r   r%   r)   r   r$   N)r$   repmixer)r!   r   r)   r   r   r   r$   	attention)r   r)   r   r   r   r   z"Token mixer type: {} not supported)r7   r8   grad_checkpointingr   r  r9   r:   pos_embr@   appendr   r   
ValueErrorformatr   blocks)r5   r   r   r  r  r  r  r  r  r	  r!   r   r)   r   r
  r  r   r   r$   r  	block_idxrD   r   r   r8     sZ   
$






zFastVitStage.__init__c                 C   sB   |  |}| |}| jrtj st| j|}|S | |}|S r   )r  r  r  r[   r   is_scriptingr   r  r   r   r   r   rK   =  s   


zFastVitStage.forward)r   r   r   r9   r   r=   r   strr   r   r   r   r   r8   rK   r   r   r   rD   r   r    sb    	
Xr  c                1       sV  e Zd ZU ejje ed< 	 ddddddddd	d
ddddddddddej	ej
dfdedeedf deedf deedf deedf deedf deedf dededeeej df dededed ed!ed"ed#eded$ed%ed&eej d'eej d(ed)d*f0 fd+d,Zd-ejd)d*fd.d/Zejjd0d1 ZejjdPd2d3ZejjdQd5d6Zejjd)ejfd7d8ZdRded%ee fd9d:Z	*			;	dSd<ejd=eeeee f  d>ed?ed@edAed)eeej eejeej f f fdBdCZ	D		4dTd=eeee f dEedFefdGdHZd<ejd)ejfdIdJZdPd<ejdKefdLdMZ d<ejd)ejfdNdOZ!  Z"S )Ur   	fork_featr   rY   rY      rY   r  r  r  r  @            r   r   r   r   )FTTT)FFFF  )NNNNr   rY   r   r   Fg       @avgin_chanslayers.token_mixers
embed_dims
mlp_ratiosdownsamplesse_downsamplesrepmixer_kernel_sizenum_classespos_embsr  r  	drop_rater
  r  r   r   	cls_ratioglobal_poolr   r)   r$   r*   Nc           $         sV  t    |r	dn|	| _|| _|| _g | _t||d ||| _|d }d}dd t	d|t
||D }g }tt|D ]v}|| pI||| k}td#i d|d|| d|| d|d	|| d
|d|d|
| d|| d|d|| d|d|d|d|| d|d|d|}|| || }|r|d9 }|  jt|d| d| dg7  _q>tj| | _t| j| _| | _| _| jrg d| _t| jD ]&\}} |dkrtjdd r	 t }!n||| }!d|  }"| |"|! qn't|d |  | _ | _}#t|d |#ddd|d |dd!	| _ t!|#|	||d"| _"| #| j$ d S )$Nr   r   c                 S   s   g | ]}|  qS r   )tolist)r2   rF   r   r   r   r6   y  s    z$FastVit.__init__.<locals>.<listcomp>r   r   r  r  r  r  r  r	  r  r!   r   r)   r   r
  r  r   r   r$   rY   r   stages.)num_chs	reductionmoduler   r   rY   r   
FORK_LAST3r   rn   r   T)	r   r    r!   r"   r   r$   r%   r)   r(   )	pool_typer-  r   )%r7   r8   r+  r  r/  feature_infor   stemr[   linspacesumsplitr@   r   r  r  dictr9   r   stages
num_stagesr/   head_hidden_sizeout_indices	enumerateosenvirongetr:   
add_moduler   r   
final_convr   headr   r   )$r5   r#  r$  r%  r&  r'  r(  r)  r*  r+  r,  r  r  r-  r
  r  r   r   r  r.  r/  r   r)   r$   prev_dimr   dprr>  r|   r  stagei_embi_layerlayer
layer_namefinal_featuresrD   r   r   r8   N  s   
"	

&


zFastVit.__init__r   c                 C   sP   t |tjr"t|jdd t |tjr$|jdur&tj|jd dS dS dS dS )zInit. for classificationr   r   Nr   )ro   r9   r   r   rO   r.   r   r   r   r   r   r   r     s   zFastVit._init_weightsc                 C   s   t  S r   )setr5   r   r   r   no_weight_decay  s   zFastVit.no_weight_decayc                 C   s   t d|rddS g ddS )Nz^stemz^stages\.(\d+)))z^stages\.(\d+).downsampler   )z^stages\.(\d+).pos_embrT  )z^stages\.(\d+)\.\w+\.(\d+)N)r9  r  )r=  )r5   coarser   r   r   group_matcher  s   zFastVit.group_matcherTc                 C   s   | j D ]}||_qd S r   )r>  r  )r5   enablesr   r   r   set_grad_checkpointing  s   
zFastVit.set_grad_checkpointingc                 C   s   | j jS r   )rH  fcrR  r   r   r   get_classifier  s   zFastVit.get_classifierc                 C   s   || _ | j|| d S r   )r+  rH  reset)r5   r+  r/  r   r   r   reset_classifier  s   zFastVit.reset_classifierNCHWrF   indicesr   
stop_early
output_fmtintermediates_onlyc                 C   s   |dv sJ dg }t t| j|\}}	| |}| jd }
tj s%|s)| j}n	| jd|	d  }d}t|D ]\}}||}||v rI|	| q8|rN|S ||
krW| 
|}||fS )a   Forward features that returns intermediates.

        Args:
            x: Input image tensor
            indices: Take last n blocks if int, all if None, select matching indices if sequence
            norm: Apply norm layer to compatible intermediates
            stop_early: Stop iterating over blocks when last desired intermediate hit
            output_fmt: Shape of intermediate feature outputs
            intermediates_only: Only return intermediate features
        Returns:

        )r^  zOutput shape must be NCHW.r   Nr   )r   r   r>  r9  r?  r[   r   r  rB  r  rG  )r5   rF   r_  r   r`  ra  rb  intermediatestake_indices	max_indexlast_idxr>  feat_idxrK  r   r   r   forward_intermediates  s&   



zFastVit.forward_intermediatesr   
prune_norm
prune_headc                 C   s<   t t| j|\}}| jd|d  | _|r| dd |S )z@ Prune layers not required for specified intermediates.
        Nr   r    )r   r   r>  r]  )r5   r_  ri  rj  rd  re  r   r   r   prune_intermediate_layers  s
   z!FastVit.prune_intermediate_layersc                 C   st   |  |}g }t| jD ]!\}}||}| jr-|| jv r-t| d| }||}|| q| jr3|S | |}|S )Nr   )r9  rB  r>  r  rA  getattrr  rG  )r5   rF   outsidxblockr   x_outr   r   r   forward_features   s   



zFastVit.forward_features
pre_logitsc                 C   s   |r	| j |ddS |  |S )NT)rs  )rH  )r5   rF   rs  r   r   r   forward_head1  r   zFastVit.forward_headc                 C   s"   |  |}| jr
|S | |}|S r   )rr  r  rt  r   r   r   r   rK   4  s
   

zFastVit.forwardF)Tr   )NFFr^  F)r   FT)#r   r   r   r[   r   r   r   r   r9   r=   r   r   r   r  r   r   r   r   r8   r   ignorerS  rV  rY  r[  r]  r   r   r   rh  rl  rr  rt  rK   r   r   r   rD   r   r   G  s   
 





	
s

 
0
rk  c                 K   s   | dddddt tddd
|S )	Nr!  )r   r  r  )   rw  g?bicubic)zstem.0.conv_kxk.0.convzstem.0.conv_scale.convzhead.fc)
urlr+  
input_size	pool_sizecrop_pctinterpolationmeanr}   
first_conv
classifierr   )ry  kwargsr   r   r   _cfg<  s   r  zfastvit_t8.apple_in1kztimm/)	hf_hub_idzfastvit_t12.apple_in1kzfastvit_s12.apple_in1kzfastvit_sa12.apple_in1kzfastvit_sa24.apple_in1kzfastvit_sa36.apple_in1kzfastvit_ma36.apple_in1kgffffff?)r  r|  zfastvit_t8.apple_dist_in1kzfastvit_t12.apple_dist_in1kzfastvit_s12.apple_dist_in1kzfastvit_sa12.apple_dist_in1kzfastvit_sa24.apple_dist_in1kzfastvit_sa36.apple_dist_in1kzfastvit_ma36.apple_dist_in1kzfastvit_mci0.apple_mclipzapple/mobileclip_s0_timmzXhttps://docs-assets.developer.apple.com/ml-research/datasets/mobileclip/mobileclip_s0.ptr  )r   r   r   )      ?r  r  )r  ry  r|  r+  r~  r}   zfastvit_mci1.apple_mclipzapple/mobileclip_s1_timmzXhttps://docs-assets.developer.apple.com/ml-research/datasets/mobileclip/mobileclip_s1.ptzfastvit_mci2.apple_mclipzapple/mobileclip_s2_timmzXhttps://docs-assets.developer.apple.com/ml-research/datasets/mobileclip/mobileclip_s2.ptc                 C   sf  d| v r| S |  d| } d| v rd}nd}ddl}ddl}g }|  D ]\}}|d|}|r9|t|d	 q#tt	t
|}i }	|  D ]\}}|rY||vrSqH||d}|d
d}|dd}|dd}|dd}|dd}|dd}|dd}|dd}|dd}|dd|}|dr|dd}|d d!}|d"r|d#krt|jd$rt|jjtjr|d#d%}|j}t|jd |	d&< n|d"d'}|d(|}d)\}
}|rt|d*}|||}
|
dur,d+| }d,|
 }|d- |v r||d- |d. }n|d/ |v r$||d/ |d0 }n|||d1 }||	|< qH|	S )2z$ Remap original checkpoints -> timm zstem.0.conv_kxk.0.conv.weight
state_dictz8image_encoder.model.patch_embed.0.rbr_conv.0.conv.weightzimage_encoder.model.rk  r   Nz^(.*?)network\.(\d+)\.proj.*rY   patch_embedr9  rbr_convrA   	rbr_scalerB   rbr_skipr>   conv_exprG  
lkb_originr   convffnr   z	se.reducezse.fc1z	se.expandzse.fc2zlayer_scale_([0-9])zlayer_scale_\1.gammar   zlayer_scale.gamma	dist_head	head_distzhead.z	head.projrZ  zhead.fc.weightzhead.fc.biaszhead.fc.z^network\.(\d+))NNr   znetwork.r1  z.projz.downsample.projz.pez.pos_emb.pos_encz.blocks)rE  rebisectitemsmatchr  r   grouplistsortedrQ  replacesubendswith
startswithrT   rH  ro   rZ  r9   r   Tr[   ru   r   bisect_right)r  modelprefixr  r  
stage_endsr   r   r  out_dict	stage_idxnet_idx
net_prefixstage_prefixr   r   r   checkpoint_filter_fn  sn   

$



r  c                 K   s2   | dd}tt| |fttd|dd|}|S )NrA  r5  T)flatten_sequentialrA  )pretrained_filter_fnfeature_cfg)popr   r   r  r=  )variant
pretrainedr  rA  r  r   r   r   _create_fastvit  s   
r  c                 K   .   t ddddd}td	d| it |fi |S )
z%Instantiate FastViT-T8 model variant.)rY   rY   r   rY   )0   `      i  r   r   r   r   r  r$  r&  r'  r%  
fastvit_t8r  N)r  r=  r  r  r  
model_argsr   r   r   r       r  c                 K   r  )
z&Instantiate FastViT-T12 model variant.r  r  r  r  r  fastvit_t12r  N)r  r  r  r   r   r   r    r  r  c                 K   r  )
z&Instantiate FastViT-S12 model variant.r  r  r   r  r  fastvit_s12r  N)r  r  r  r   r   r   r    r  r  c                 K   @   t ddddddttddfdd}tdd
| it |fi |S )z'Instantiate FastViT-SA12 model variant.r  r  r   Nr   r   r  r  r  r  r$  r&  r'  r,  r%  fastvit_sa12r  )r  r=  r   r   r  r  r   r   r   r       r  c                 K   r  )z'Instantiate FastViT-SA24 model variant.)r   r      r   r  r   Nr   r  r  r  fastvit_sa24r  )r  r  r  r   r   r   r    r  r  c                 K   r  )z'Instantiate FastViT-SA36 model variant.r  r     r  r  r   Nr   r  r  r  fastvit_sa36r  )r  r  r  r   r   r   r    r  r  c                 K   r  )z'Instantiate FastViT-MA36 model variant.r  )L      i0  i`  r   Nr   r  r  r  fastvit_ma36r  )r  r  r  r   r   r   r  +  r  r  c                 K   D   t dddddddttddfdd	d
}tdd| it |fi |S )zInstantiate MCi0 model variant.)rY   r  
   rY   r  r  FFTTNr   r  r  Tr$  r&  r'  r)  r,  r%  r   fastvit_mci0r  )r  r  r  r   r   r   r  8     	r  c                 K   r  )zInstantiate MCi1 model variant.)r   r     r   r  r  r  Nr   r  r  Tr  fastvit_mci1r  )r  r  r  r   r   r   r  G  r  r  c                 K   r  )zInstantiate MCi2 model variant.)r   r     r   )P      i@  i  r  r  Nr   r  r  Tr  fastvit_mci2r  )r  r  r  r   r   r   r  V  r  r  )rk  ru  )BrC  	functoolsr   typingr   r   r   r   r   r[   torch.nnr9   	timm.datar	   r
   timm.layersr   r   r   r   r   r   r   _builderr   	_featuresr   _manipulater   	_registryr   r   __all__r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r   r  default_cfgsr  r  r  r  r  r  r  r  r  r  r  r  r   r   r   r   <module>   s&  $	 r 
-D4
p4t:9c 
v
!&-4>
I