o
    پiO                    @   s 1  d Z ddlZddlZddlZddlZddlmZ ddlmZ ddl	m
Z
mZmZmZmZmZmZmZmZ zddl	mZ W n eyK   ddlmZ Y nw ddlZddlmZ ddlm  mZ ddlmZ ddlmZm Z m!Z!m"Z"m#Z#m$Z$ dd	l%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8 d
dl9m:Z: d
dl;m<Z< d
dl=m>Z>m?Z?m@Z@mAZA d
dlBmCZCmDZDmEZE dgZFeGeHZIG dd dejJZKG dd dejJZLG dd dejJZMG dd dejJZNG dd dejJZO		
	dRdejPdeQdeRdeSfd d!ZTG d"d dejJZUdSd$ejJd%eQd&dfd'd(ZVdTd$ejJd%eQd*eWd&dfd+d,ZXdSd$ejJd%eQd&dfd-d.ZYdUd0eQd*eWd&efd1d2ZZ	
	3	4	dVd5ejPd6ejPdeRd7eeReRf d8eQd9eSd&ejPfd:d;Z[e\ dWd<eUd=eQd>eQd?eSd&df
d@dAZ]	BdXdCeeQejPf d<eUd>eQd&eeQejPf fdDdEZ^dCeeQejPf d<eUd&eeQejPf fdFdGZ_dCeeQejPf d<eUd&eeQejPf fdHdIZ`dCeafdJdKZb		4	LdYdCeeQejPf d<eUdMeSd8eQd9eSd&eeQejPf fdNdOZcdSdPeQd&eeQe
f fdQdRZdi dSeddTdUdVed dWeddTdUdXeddYdTdLdZd[edd\dTdLd]d^d_d`eddadTdLdZdbeddcdTdLd]d^d_ddeddedTdLdZdfeddgdTdLd]d^d_dheddidTdLdZdjeddkdTdLd]d^d_dleddmdTdLdZdneddodTdLd]d^d_dpeddqdTdLdZdreddsdTdLdZdteddudTdLd]d^d_dveddwdTdxi dyeddzdTd]d^d{d|edd}dTd]d^d{d~edddTdLdZdedddTdLd]d^d_dedddTdLdZdedddTdLd]d^d_dedddTdLdZdedddTdLd]d^d_dedd#ddedd#ddedd#ddedd#ddeddTdddeddTdddeddTdddeddTdddeddTddi dedddTdLdddedddTdLdddedddTdLdddedddTdLdddedddTdLdddedddTdLdddedddTdLdddedddLdTddedddLdTddedddTee dddedddTee dddedddTee dddedddTee dddedddTdee ddd^ddedddTdee ddd^ddedddTdee ddd^ddedddTdee ddd^di dedddTdee ddd^ddedddTdee ddd^ddedddTdee ddd^ddedddTdee ddd^ddedddTdddddd͍dedddTdddddЍdedddTdxdeddTdddd׍deddTdddڍdeddTd]dddݍded deddTe#e$ddeddTe#e$d^d]ddeddTe#e$d^dddeddTe#e$dddeddTe#e$d^d]dddeddTe!e"d^di deddTe!e"d^ddddeddTe#e$d^ddeddTe#e$d^ddddede#e$ddeddTe#e$dd]dddeddTe#e$dddeddTe#e$dd]dddeddTe#e$d^ddeddTe#e$d^ddddeddTe#e$ddeddTe#e$d^ddeddTe#e$d^d]dddeddTe!e"d^ddeddTe!e"d^ddddeddTe#e$d^ddedd#e#e$d^ddddeddTe#e$di deddTe#e$ddeddTe#e$d^d]dddeddTe#e$d^dd eddTe#e$d֐ddeddTe!e"d^d֐ddeddTe#e$d^d֐ddeddTe#e$d֐ddeddTe#e$d^d֐ddeddTe#e$ddd	eddTe#e$d^ddd
eddTe!e"d^dddeddTe#e$d^dddeddTe#e$d^dddeddTe#e$d^dddeddTde#e$dddeddTe#e$d^dddeddTe#e$dd^ddi deddTe#e$d^dddeddTe#e$d^dddeddTe#e$d^dِdddeddTe#e$d^dddeddTe#e$d^dddeddTde#e$d^dd d!eddTde#e$d^dd d"eddTdde#e$d^dd#d$eddTdde#e$d^dd#d%eddTe#e$ddd^d&dd'd(eddTd)de#e$d^dd#d*eddTd)de#e$d^dd#d+eddTd)de#e$d^dd#d,eddTd)de#e$d^dd#d-eddTd)e#e$d^dd d.eddTd)de#e$d^dd#d/eddTd)de#e$d^dd#i d0eddTd)de#e$d^dd#d1eddTd)de#e$d^dd#d2eddTde#e$ddd3eddTde#e$ddd4eddTde#e$d^dd5d6eddTde#e$d^ddd7d8edd#ddՐd9d:edd#ddՐd9d;edd#dd<edd#dd=edd#dd>eddTd?e#e$d@d^dAdBeddTd?e#e$dd^dܐdCdDeddTd?e#e$d@d^dAdEeddTd?e#e$dd^dܐdCdFeddGdLdTddՐdHdIeddJdLdTddՐdHi dKeddLdLdTddՐdHdMeddNdLdTddՐdHdOeddPdLdTddՐdHdQeddRdLdTddՐdHdSeddTdLdTddddUdVeddWdLdTddddUdXeddYdLdTddՐdHdZedd[dLdTddՐdHd\edd]dLdTddՐdHd^edd_dLdTddddUd`eddadLdTddddUdbedd#ddcedd#dddedd#ddeeddfdTd)ee ddgdheddidTd)ee ddgdjeddkdTd)ee ddgi dleddmd)ee ddndoeddpd)ee ddndqeddrd)dd^ee ddsdteddud)ee ddndveddTdddwdxeddTdddyeddTdddzeddTdddwd{eddTdddwd|eddTdddwd}eddTd]ddwd~eddTd]ddwdeddTdddwdeddTdddwdeddTdddwdeddTdddwdeddTd]ddwi deddTd]ddwdeddTdddwdeddTdddeddTdddeddTd&ddwdeddTd&ddwdeddTd]ddwdeddTdddwdeddTdddwdeddTd]ddwdeddTdddwdeddTdddwdeddTd]ddwdeddTdddwdeddTdddeddTdddeddTdddwi deddTdddwdeddTdddwdeddTd]ddwdeddTd]ddwdeddTdddwdeddTdddwdeddTdddwdeddTdddwdeddTd]ddwdeddTd]ddwdeddTdddwdeddTdddeddTdddeddTdddeddTdddeddTdddeddTddi deddTd&ddwdeddTd&d^dd׍deddTd]d^dd׍deddTdd^dd׍deddTdd^dd׍deddTdd^dd׍deddTdd^dd׍deddTdd^dd׍deddTdd^dd׍deddTdd^dd׍deddTdd^dd׍deddTdd^dd׍deddTdd^dd׍deddTdd^dd׍deddTdd^dd׍deddTdd^dd׍deddTdddwi deddTdddwdeddTd]ddwdeddTdddwdeddTdddwdeddTd]ddwdeddTd&d^ddݍdeddTd&d^ddݍdeddTd?e#e$dddeddTd?e#e$dddeddTd?e#e$dddeddTd?e#e$dddeddTdddڍdeddTdddڍdeddTdddڍdeddTdddՐdɍdeddTdddڍdeddTdddڍi deddTdddڍdeddTdddڍdeddTdddՐdɍdeddTdddڍdeddTdddڍdeddTdddՐdɍdeddTdddՐdɍdeddTd]d^dڍdeddTdddڍdeddTdddڍdeddTdddڍdeddTdddڍdeddTdddՐdɍdeddTdddՐdɍdeddTd]d^dڍdeddِd܍deddTdddڍi deddTdddՐdɍdeddTd]d^dڍdeddِd܍deddTdd^dڍdeddTddd^dɍdeddTd]d^dڍdeddTdd^ddݍdeddTee dd^dddeddTee dd^dddeddTe#e$dd^dddeddTe#e$dd^dddeddTe#e$dd^dddeddTe#e$dd^dddeddTe#e$dd^dddeddTe#e$ddd^dddeddTe#e$ddd^dddeddTe#e$ddd^ddi deddTe#e$ddd^dddeddTe#e$ddd^dddeddTe#e$ddd^dddeddTe#e$ddd^dddeddTe#e$ddd^dddeddTe#e$ddd^dddeddTdddڍdeddTdddڍdeddTdddڍdedddՐddeddTee d^ddeddTee d^ddeddTee d^dd eddTee d^ddedd#ee d^ddedd#dee d^ddeddTee d^ddeddTee d^ddeddTee d^ddeddTee d^dddZedd eef D ZgegD ]#Zheieeeh Zjejd	 dTkrdTeh ejd	< ejeeehkd
d< qeCeeZeejlmddn dkZo		dZdeQdeSdeeS d&eeUdf fddZpeDd[deSd&eUfddZqeDd[deSd&eUfddZreDd[deSd&eUfddZseDd[deSd&eUfddZteDd[deSd&eUfddZueDd[deSd&eUfdd ZveDd[deSd&eUfd!d"ZweDd[deSd&eUfd#d$ZxeDd[deSd&eUfd%d&ZyeDd[deSd&eUfd'd(ZzeDd[deSd&eUfd)d*Z{eDd[deSd&eUfd+d,Z|eDd[deSd&eUfd-d.Z}eDd[deSd&eUfd/d0Z~eDd[deSd&eUfd1d2ZeDd[deSd&eUfd3d4ZeDd[deSd&eUfd5d6ZeDd[deSd&eUfd7d8ZeDd[deSd&eUfd9d:ZeDd[deSd&eUfd;d<ZeDd[deSd&eUfd=d>ZeDd[deSd&eUfd?d@ZeDd[deSd&eUfdAdBZeDd[deSd&eUfdCdDZeDd[deSd&eUfdEdFZeDd[deSd&eUfdGdބZeDd[deSd&eUfdHdIZeDd[deSd&eUfdJdKZeDd[deSd&eUfdLdMZeDd[deSd&eUfdNdOZeDd[deSd&eUfdPdQZeDd[deSd&eUfdRdSZeDd[deSd&eUfdTdUZeDd[deSd&eUfdVdWZeDd[deSd&eUfdXdYZeDd[deSd&eUfdZd[ZeDd[deSd&eUfd\d]ZeDd[deSd&eUfd^d_ZeDd[deSd&eUfd`daZeDd[deSd&eUfdbdcZeDd[deSd&eUfdddeZeDd[deSd&eUfdfdgZeDd[deSd&eUfdhdiZeDd[deSd&eUfdjdkZeDd[deSd&eUfdldmZeDd[deSd&eUfdndoZeDd[deSd&eUfdpdqZeDd[deSd&eUfdrdsZeDd[deSd&eUfdtduZeDd[deSd&eUfdvdwZeDd[deSd&eUfdxdyZeDd[deSd&eUfdzd{ZeDd[deSd&eUfd|d}ZeDd[deSd&eUfd~dZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfddÄZeDd[deSd&eUfdĐdńZeDd[deSd&eUfdƐdǄZeDd[deSd&eUfdȐdɄZeDd[deSd&eUfdʐd˄ZeDd[deSd&eUfd̐d̈́ZeDd[deSd&eUfdΐdτZeDd[deSd&eUfdАdфZeDd[deSd&eUfdҐdӄZeDd[deSd&eUfdԐdՄZeDd[deSd&eUfd֐dׄZeDd[deSd&eUfdؐdلZeDd[deSd&eUfdڐdۄZeDd[deSd&eUfdܐd݄ZeDd[deSd&eUfdސd߄ZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfd dZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfdd	ZeDd[deSd&eUfd
dZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfddZeDd[deSd&eUfd d!ZeDd[deSd&eUfd"d#ZeDd[deSd&eUfd$d%ZeDd[deSd&eUfd&d'ZeDd[deSd&eUfd(d)ZeDd[deSd&eUfd*d+ZeDd[deSd&eUfd,d-ZeDd[deSd&eUfd.d/ZeDd[deSd&eUfd0d1ZeDd[deSd&eUfd2d3Z eDd[deSd&eUfd4d5ZeDd[deSd&eUfd6d7ZeDd[deSd&eUfd8d9ZeDd[deSd&eUfd:d;ZeDd[deSd&eUfd<d=ZeEeHi d>dd?dd@ddAddBddCddDddEddFddGdHdIdJdKddLddMddNddOdƓdPdd
dddQ dS (\  a   Vision Transformer (ViT) in PyTorch

A PyTorch implement of Vision Transformers as described in:

'An Image Is Worth 16 x 16 Words: Transformers for Image Recognition at Scale'
    - https://arxiv.org/abs/2010.11929

`How to train your ViT? Data, Augmentation, and Regularization in Vision Transformers`
    - https://arxiv.org/abs/2106.10270

`FlexiViT: One Model for All Patch Sizes`
    - https://arxiv.org/abs/2212.08013

The official jax code is released and available at
  * https://github.com/google-research/vision_transformer
  * https://github.com/google-research/big_vision

Acknowledgments:
  * The paper authors for releasing code and weights, thanks!
  * I fixed my class token impl based on Phil Wang's https://github.com/lucidrains/vit-pytorch
  * Simple transformer style inspired by Andrej Karpathy's https://github.com/karpathy/minGPT
  * Bert reference code checks against Huggingface Transformers and Tensorflow Bert

Hacked together by / Copyright 2020, Ross Wightman
    N)OrderedDict)partial)	AnyCallableDictOptionalSetTupleTypeUnionList)Literal)Final)IMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STDIMAGENET_INCEPTION_MEANIMAGENET_INCEPTION_STDOPENAI_CLIP_MEANOPENAI_CLIP_STD)	AttentionAttentionPoolLatent
PatchEmbedMlpSwiGLUPackedSwiGLU	LayerNormRmsNormDropPathPatchDropouttrunc_normal_lecun_normal_resample_patch_embedresample_abs_pos_embeduse_fused_attnget_act_layerget_norm_layermaybe_add_mask	LayerType   )build_model_with_cfg)feature_take_indices)named_apply
checkpointcheckpoint_seqadapt_input_conv)generate_default_cfgsregister_modelregister_model_deprecationsVisionTransformerc                	       sN   e Zd ZdZ		ddedededdf fd	d
Zdej	dej	fddZ
  ZS )
LayerScalezRLayer scale module.

    References:
      - https://arxiv.org/abs/2103.17239
    h㈵>Fdiminit_valuesinplacereturnNc                    s*   t    || _t|t| | _dS )zInitialize LayerScale module.

        Args:
            dim: Dimension.
            init_values: Initial value for scaling.
            inplace: If True, perform inplace operations.
        N)super__init__r7   nn	Parametertorchonesgamma)selfr5   r6   r7   	__class__ R/home/ubuntu/.local/lib/python3.10/site-packages/timm/models/vision_transformer.pyr:   W   s   
zLayerScale.__init__xc                 C   s   | j r	|| jS || j S )zApply layer scaling.)r7   mul_r?   )r@   rE   rC   rC   rD   forwardh   s   zLayerScale.forward)r4   F)__name__
__module____qualname____doc__intfloatboolr:   r=   TensorrG   __classcell__rC   rC   rA   rD   r3   P   s    	r3   c                !       s   e Zd ZdZddddddddddejeefdeded	e	d
e
de
de
de
de
de	de	dee	 de	deej deej deej ddf  fddZddejdeej dejfddZ  ZS )Blockz)Transformer block with pre-normalization.      @FT        Nr5   	num_heads	mlp_ratioqkv_biasqk_normscale_attn_normscale_mlp_norm	proj_bias	proj_drop	attn_dropr6   	drop_path	act_layer
norm_layer	mlp_layerr8   c                    s   t    ||| _t|||||||
|	|d	| _|r t||dnt | _|dkr-t	|nt | _
||| _||t|| ||rC|nd||	d| _|rRt||dnt | _|dkrbt	|| _dS t | _dS )a  Initialize Block.

        Args:
            dim: Number of input channels.
            num_heads: Number of attention heads.
            mlp_ratio: Ratio of mlp hidden dim to embedding dim.
            qkv_bias: If True, add a learnable bias to query, key, value.
            qk_norm: If True, apply normalization to query and key.
            proj_bias: If True, add bias to output projection.
            proj_drop: Projection dropout rate.
            attn_drop: Attention dropout rate.
            init_values: Initial values for layer scale.
            drop_path: Stochastic depth rate.
            act_layer: Activation layer.
            norm_layer: Normalization layer.
            mlp_layer: MLP layer.
        rT   rV   rW   
scale_normrZ   r\   r[   r_   r6   rS   Nin_featureshidden_featuresr^   r_   biasdrop)r9   r:   norm1r   attnr3   r;   Identityls1r   
drop_path1norm2rL   mlpls2
drop_path2r@   r5   rT   rU   rV   rW   rX   rY   rZ   r[   r\   r6   r]   r^   r_   r`   rA   rC   rD   r:   p   s4   
#



$zBlock.__init__rE   	attn_maskc              
   C   sH   ||  | | j| ||d }|| | | | | }|S Nrs   )rm   rl   rj   ri   rq   rp   ro   rn   r@   rE   rs   rC   rC   rD   rG      s   $ zBlock.forwardNrH   rI   rJ   rK   r;   GELUr   r   rL   rM   rN   r   r
   Moduler:   r=   rO   rG   rP   rC   rC   rA   rD   rQ   m   sb    	
*?rQ   c                !       s   e Zd Zddddddddddejeefdededed	e	d
e	de	de	de	dedede
e dedeej deej deej ddf  fddZdddZddejde
ej dejfddZ  ZS ) ResPostBlockrR   FTrS   Nr5   rT   rU   rV   rW   rX   rY   rZ   r[   r\   r6   r]   r^   r_   r`   r8   c                    s   t    || _t|||||||
|	|d	| _||| _|dkr#t|nt | _	||t
|| ||r4|nd ||	d| _||| _|dkrHt|nt | _|   d S )Nra   rS   rd   )r9   r:   r6   r   rj   ri   r   r;   rk   rm   rL   ro   rn   rq   init_weightsrr   rA   rC   rD   r:      s4   




zResPostBlock.__init__c                 C   s:   | j d urtj| jj| j  tj| jj| j  d S d S rw   )r6   r;   init	constant_ri   weightrn   r@   rC   rC   rD   r|      s   
zResPostBlock.init_weightsrE   rs   c              	   C   s<   ||  | | j||d }|| | | | }|S rt   )rm   ri   rj   rq   rn   ro   rv   rC   rC   rD   rG      s   zResPostBlock.forwardr8   Nrw   )rH   rI   rJ   r;   ry   r   r   rL   rM   rN   r   r
   rz   r:   r|   r=   rO   rG   rP   rC   rC   rA   rD   r{      sb    	

0*r{   c                !       s   e Zd ZU dZee ed< ddddddddddeje	dfde
d	e
d
ededededededededee dedeej deej deeej  ddf  fddZddejdeej dejfddZ  ZS )ParallelScalingBlockz Parallel ViT block (MLP & Attention in parallel)
    Based on:
      'Scaling Vision Transformers to 22 Billion Parameters` - https://arxiv.org/abs/2302.05442
    
fused_attnrR   FTrS   Nr5   rT   rU   rV   rW   rX   rY   rZ   r[   r\   r6   r]   r^   r_   r`   r8   c                    s  t    || dksJ d|s|rJ d|| _|| | _| jd | _t | _t|| }|d|  }||| _t	j
|||d| _|g|gd  | _|r[| dd  | dd  n| jdtd| d	d
 t	t|| _|rx|| jnt	 | _|r|| jnt	 | _t	|
| _t	j
|||d| _t	|	| _| | _t	j
|||d| _|d urt||dnt	 | _|dkrt|| _d S t	 | _d S )Nr   z$dim should be divisible by num_headszScale norms not supportedg         )rg   rV   mlp_biasF)
persistentrc   rS   ) r9   r:   rT   head_dimscaler#   r   rL   in_normr;   Linearin_projin_splitregister_bufferregister_parameterr=   zerosr<   r   rk   q_normk_normDropoutr\   attn_out_projmlp_dropmlp_actmlp_out_projr3   lsr   r]   )r@   r5   rT   rU   rV   rW   rX   rY   rZ   r[   r\   r6   r]   r^   r_   r`   mlp_hidden_dimin_proj_out_dimrA   rC   rD   r:      s4   


$zParallelScalingBlock.__init__rE   rs   c                 C   s  |j \}}}| |}| jd ur!t|| jjt| j	| jf}n| |}tj
|| jdd\}}}	}
| |||| j| jdd}| |	||| j| jdd}	|
||| j| jdd}
| jrxtj||	|
|| jrs| jjndd}n!|| j }||	dd }t||}|jdd}| |}||
 }|dd|||}| |}| |}| |}| |}| |  || }|| }|S )Nr5   r(      rS   )rs   	dropout_p)!shaper   r   Flinearr   r   r=   catrV   splitr   r   viewrT   r   	transposer   r   scaled_dot_product_attentiontrainingr\   pr   r&   softmaxreshaper   r   r   r   r]   r   )r@   rE   rs   BNCyx_mlpqkvx_attnrj   rC   rC   rD   rG   +  s:   

"
""






zParallelScalingBlock.forwardrw   )rH   rI   rJ   rK   r   rN   __annotations__r;   ry   r   rL   rM   r   r
   rz   r:   r=   rO   rG   rP   rC   rC   rA   rD   r      sd   
 	
*2r   c                #       s   e Zd ZdZdddddddddddejeefded	ed
ede	de
de
de
de
de
dee	 de	de	de	deej deej deej ddf" fddZddejdeej dejfddZ  ZS ) ParallelThingsBlockz Parallel ViT block (N parallel attention followed by N parallel MLP)
    Based on:
      `Three things everyone should know about Vision Transformers` - https://arxiv.org/abs/2203.09795
    r   rR   FTNrS   r5   rT   num_parallelrU   rV   rW   rX   rY   rZ   r6   r[   r\   r]   r^   r_   r`   r8   c                    s   t    || _t | _t | _t|D ]w}| jt	t
d||fdt||||||	|||d	fd|
r:t||
dnt fd|dkrGt|nt fg | jt	t
d||fd||t|| ||rh|nd |	|d	fd|
rwt||
dnt fd|dkrt|nt fg qd S )
Nnormrj   ra   r   rc   r]   rS   ro   )rf   r^   r_   rg   rh   )r9   r:   r   r;   
ModuleListattnsffnsrangeappend
Sequentialr   r   r3   rk   r   rL   )r@   r5   rT   r   rU   rV   rW   rX   rY   rZ   r6   r[   r\   r]   r^   r_   r`   _rA   rC   rD   r:   ]  sF   







zParallelThingsBlock.__init__rE   rs   c                    s   |d ur3g }| j D ]}| }|j||d}||}||}|| q	 t|jdd  n t fdd| j D jdd   t fdd| j	D jdd   S )Nru   r   r   c                       g | ]}| qS rC   rC   ).0rj   rE   rC   rD   
<listcomp>      z/ParallelThingsBlock.forward.<locals>.<listcomp>c                    r   rC   rC   )r   ffnr   rC   rD   r     r   )
r   r   rj   r   r]   r   r=   stacksumr   )r@   rE   rs   attn_outrj   r   rC   r   rD   rG     s   



&&zParallelThingsBlock.forwardrw   rx   rC   rC   rA   rD   r   X  sh    	
*6r   tokenFrE   	pool_typenum_prefix_tokensreduce_include_prefixc                 C   s   |s| S |dkr| d d df } | S |r| n	| d d |d f } |dkr,| j dd} | S |dkr@d| jdd| j dd  } | S |dkrL| jdd} | S |rUJ d	| | S )
Nr   r   avgr(   r   avgmaxg      ?maxzUnknown pool type )meanamax)rE   r   r   r   rC   rC   rD   global_pool_nlc  s"   r   c                Q       s$  e Zd ZU dZee ed< ddddddd	d	d
ddddddddddddddddddddddddedddee	f'de
eeeef f de
eeeef f dededed dedededededededed ed!ee d"ed#ed$ed%ed&ed'ed(ee d)eded*ed+ed,ed-ed.ed/ed0ed1ed2 d3ed4ed5ee d6ee d7ee d8eej d9eej d:dfP fd;d<Zdxd=d>Zdyd?ed:dfd@dAZdBejd:dfdCdDZej dydEedFed:dfdGdHZejjd:ee fdIdJZejjdzdKed:e ee
ee!f f fdLdMZ"ejjd{dNed:dfdOdPZ#ejjd:ejfdQdRZ$d|dedee d:dfdSdTZ%		d}deeeef  deeeef  d:dfdUdVZ&dWej'd:ej'fdXdYZ(					Z			d~dWej'd[ee
ee!e f  d\ed]ed^ed_ed`edaedbeej' d:e
e!ej' eej'e!ej' f e ee)f f fdcddZ*	e		dd[e
ee!e f dfedged:e!e fdhdiZ+	e				ddWej'dje
ee!e ee f dked\ed]edbeej' d:e!ej' fdldmZ,d|dWej'dbeej' d:ej'fdndoZ-d|dWej'dpee d:ej'fdqdrZ.dzdWej'dsed:ej'fdtduZ/d|dWej'dbeej' d:ej'fdvdwZ0  Z1S )r2   z Vision Transformer

    A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale`
        - https://arxiv.org/abs/2010.11929
    dynamic_img_size      r     r         rR   TFNlearnr   rS    img_size
patch_sizein_chansnum_classesglobal_poolr   r   r   r   r   map	embed_dimdepthrT   rU   rV   rW   rX   rY   rZ   r6   class_token	pos_embedno_embed_class
reg_tokenspre_norm
final_normfc_normpool_include_prefixdynamic_img_pad	drop_ratepos_drop_ratepatch_drop_rateproj_drop_rateattn_drop_ratedrop_path_rateweight_init)skipjaxjax_nlhbmocor   fix_initembed_layerembed_norm_layerr_   r^   block_fnr`   r8   c(           ,         s   t    |dv sJ |s|dksJ |dv sJ |du r!|dv n|}(tp(tt|#}#t p3tj || _|| _ | _	 | _
| _|rGdnd| _|  j|7  _|| _|| _|| _|| _|| _d| _i })|rp|)tdd	d
 |#durx|#|)d< |"d|||| |d|)| _| jj}*t| jdr| j n||rttddnd| _|rttd|nd| _|r|*n|*| j }+|r|dkrd| _nttd|+d | _tj |d| _!|dkrt"|| jd| _#nt$ | _#|rnt$ | _%dd t&d||D tj' 	
fddt(|D  | _)fddt(|D | _*|r<|(s<nt$ | _+|dkrRt,| j	 d| _-nd| _-|r_|(r_nt$ | _.t || _/|dkrvt0| j|nt$ | _1| dkr| 2|  |!r| 3  dS dS )a  
        Args:
            img_size: Input image size.
            patch_size: Patch size.
            in_chans: Number of image input channels.
            num_classes: Number of classes for classification head.
            global_pool: Type of global pooling for final sequence (default: 'token').
            embed_dim: Transformer embedding dimension.
            depth: Depth of transformer.
            num_heads: Number of attention heads.
            mlp_ratio: Ratio of mlp hidden dim to embedding dim.
            qkv_bias: Enable bias for qkv projections if True.
            init_values: Layer-scale init values (layer-scale enabled if not None).
            class_token: Use class token.
            no_embed_class: Don't include position embeddings for class (or reg) tokens.
            reg_tokens: Number of register tokens.
            pre_norm: Enable norm after embeddings, before transformer blocks (standard in CLIP ViT).
            final_norm: Enable norm after transformer blocks, before head (standard in most ViT).
            fc_norm: Move final norm after pool (instead of before), if None, enabled when global_pool == 'avg'.
            drop_rate: Head dropout rate.
            pos_drop_rate: Position embedding dropout rate.
            attn_drop_rate: Attention dropout rate.
            drop_path_rate: Stochastic depth rate.
            weight_init: Weight initialization scheme.
            fix_init: Apply weight initialization fix (scaling w/ layer index).
            embed_layer: Patch embedding layer.
            embed_norm_layer: Normalization layer to use / override in patch embed module.
            norm_layer: Normalization layer.
            act_layer: MLP activation layer.
            block_fn: Transformer block layer.
        r   r   )r   noner   N)r   r   r   r(   r   FNHWC)strict_img_size
output_fmtr_   )r   r   r   r   rg   r   
feat_ratior   {Gz?)r   r   c                 S   s   g | ]}|  qS rC   )item)r   rE   rC   rC   rD   r   E  r   z.VisionTransformer.__init__.<locals>.<listcomp>c                    s6   g | ]}	
|  d qS ))r5   rT   rU   rV   rW   rX   rY   rZ   r6   r[   r\   r]   r_   r^   r`   rC   r   i)r^   r   r   dprr   r6   r`   rU   r_   rT   rZ   r   rW   rV   rX   rY   rC   rD   r   F  s(    c                    s    g | ]}t d |  dqS )blocks.)modulenum_chs	reductiondictr  )r   r	  rC   rD   r   Y  s    r   )rT   rU   r_   r^   r   rC   )4r9   r:   r%   r   r$   r;   ry   r   r   num_featureshead_hidden_sizer   r   num_reg_tokenshas_class_tokenr   r   r   grad_checkpointingupdater  patch_embednum_patcheshasattrr   r<   r=   r   	cls_token	reg_tokenr   randnr   pos_dropr   
patch_droprk   norm_prelinspacer   r   blocksfeature_infor   r   	attn_poolr   	head_dropr   headr|   fix_init_weight),r@   r   r   r   r   r   r   r   rT   rU   rV   rW   rX   rY   rZ   r6   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r_   r^   r   r`   use_fc_norm
embed_argsr  	embed_lenrA   )r^   r   r   r  r   r6   r`   rU   r_   rT   rZ   r   rW   rV   r	  rX   rY   rD   r:     s   
I	

,


"

zVisionTransformer.__init__c                 C   sL   dd }t | jD ]\}}||jjjj|d  ||jjjj|d  q	dS )z9Apply weight initialization fix (scaling w/ layer index).c                 S   s   |  td|  d S )Ng       @)div_mathsqrt)param	_layer_idrC   rC   rD   rescales  s   z2VisionTransformer.fix_init_weight.<locals>.rescaler(   N)	enumerater  rj   projr   dataro   fc2)r@   r*  layer_idlayerrC   rC   rD   r!  q  s
   z!VisionTransformer.fix_init_weightmodec                 C   s   |dv sJ d|v rt | j nd}| jdurt| jdd | jdur-tjj| jdd | j	dur;tjj| j	dd t
t|||  dS )zInitialize model weights.

        Args:
            mode: Weight initialization mode ('jax', 'jax_nlhb', 'moco', or '').
        )r   r   r   r   nlhbrS   Nr   stdư>)r&  logr   r   r   r  r;   r}   normal_r  r+   get_init_weights_vit)r@   r1  	head_biasrC   rC   rD   r|   z  s   


zVisionTransformer.init_weightsmc                 C   s   t | dS )z>Initialize weights for a single module (compatibility method).N)init_weights_vit_timm)r@   r:  rC   rC   rD   _init_weights  s   zVisionTransformer._init_weightscheckpoint_pathprefixc                 C   s   t | || dS )zLoad pretrained weights.

        Args:
            checkpoint_path: Path to checkpoint.
            prefix: Prefix for state dict keys.
        N)_load_weights)r@   r=  r>  rC   rC   rD   load_pretrained  s   z!VisionTransformer.load_pretrainedc                 C   s   h dS )z3Set of parameters that should not use weight decay.>   r  r   
dist_tokenrC   r   rC   rC   rD   no_weight_decay  s   z!VisionTransformer.no_weight_decaycoarsec                 C   s   t dddgdS )zCreate regex patterns for parameter grouping.

        Args:
            coarse: Use coarse grouping.

        Returns:
            Dictionary mapping group names to regex patterns.
        z ^cls_token|pos_embed|patch_embed)z^blocks\.(\d+)N)z^norm)i )stemr  r
  )r@   rC  rC   rC   rD   group_matcher  s   
zVisionTransformer.group_matcherenablec                 C   s&   || _ t| jdr| j| dS dS )zEnable or disable gradient checkpointing.

        Args:
            enable: Whether to enable gradient checkpointing.
        set_grad_checkpointingN)r  r  r  rG  )r@   rF  rC   rC   rD   rG    s   z(VisionTransformer.set_grad_checkpointingc                 C   s   | j S )zGet the classifier head.)r   r   rC   rC   rD   get_classifier  s   z VisionTransformer.get_classifierc                 C   s|   || _ |dur)|dv sJ |dkr| jdu rJ d|dkr&| jdur&d| _|| _|dkr7t| j|| _dS t | _dS )zReset the classifier head.

        Args:
            num_classes: Number of classes for new classifier.
            global_pool: Global pooling type.
        Nr   r   Fz=Cannot currently add attention pooling in reset_classifier().r   )r   r  r   r;   r   r   rk   r   )r@   r   r   rC   rC   rD   reset_classifier  s   *z"VisionTransformer.reset_classifierc              	   C   sz   | j j}| j j||d | jdur9| jrdn| j}| j j| }|| jjd kr;t	t
| j| j j||dd| _dS dS dS )zUpdate the input image resolution and patch size.

        Args:
            img_size: New input resolution, if None current resolution is used.
            patch_size: New patch size, if None existing patch size is used.
        )r   r   Nr   r(   T)new_sizeold_sizer   verbose)r  	grid_sizeset_input_sizer   r   r   r  r   r;   r<   r"   )r@   r   r   prev_grid_sizer   num_new_tokensrC   rC   rD   rN    s   
z VisionTransformer.set_input_sizerE   c           	      C   s  | j du r||jd d|jd S | jr9|j\}}}}| jj}t| j ||f|| jr,dn| jd}||d|}n| j }g }| j	durQ|
| j	|jd dd | jdurd|
| j|jd dd | jrx|| }|rwtj||g dd}n|rtj||g dd}|| }| |S )z$Apply positional embedding to input.Nr   r   )rJ  rK  r   r(   r   )r   r   r   r   r  rM  r"   r   r   r  r   expandr  r=   r   r  )	r@   rE   r   HWr   rO  r   to_catrC   rC   rD   
_pos_embed  s6   



zVisionTransformer._pos_embedNCHWindicesreturn_prefix_tokensr   
stop_earlyr   intermediates_onlyoutput_dictrs   c
                    s  |dv sJ d|dk}
g }t tj|\}}|j\ }}}|}|}|}|}tj	
 s:|s>j}n	jd|d  }t|D ]1\}}|	durZ|||	d}njrhtj	
 sht||}n||}||v r|||ry|n| qKjrfdd|D }fd	d|D }nd}|
rj||f\ fd
d|D }|ri }||d< |dur|r||d< |sɈ|}||d< |S tj	
 s|r|durtt||}|r|S |}||fS )a   Forward features that returns intermediates.

        Args:
            x: Input image tensor
            indices: Take last n blocks if int, all if None, select matching indices if sequence
            return_prefix_tokens: Return both prefix and spatial intermediate tokens
            norm: Apply norm layer to all intermediates
            stop_early: Stop iterating over blocks when last desired intermediate hit
            output_fmt: Shape of intermediate feature outputs
            intermediates_only: Only return intermediate features
            output_dict: Return outputs as a dictionary with 'image_features' and 'image_intermediates' keys
            attn_mask: Optional attention mask for masked attention (e.g., for NaFlex)
        Returns:
            A tuple with (final_features, intermediates), a list of intermediate features, or a dictionary containing
            'image_features' and 'image_intermediates' (and optionally 'image_intermediates_prefix')
        )rV  NLCz)Output format must be one of NCHW or NLC.rV  Nr(   ru   c                    s"   g | ]}|d d d j f qS )Nr   r  r   r   r   rC   rD   r   H     " z;VisionTransformer.forward_intermediates.<locals>.<listcomp>c                    s"   g | ]}|d d  j d f qS rw   r  r]  r   rC   rD   r   I  r^  c                    s,   g | ]}|  d dddd qS )r   r   r   r(   r   )r   permute
contiguousr]  )r   rR  rS  rC   rD   r   P     , image_intermediatesimage_intermediates_prefiximage_features)r*   lenr  r   r  rU  r  r  r=   jitis_scriptingr+  r  r,   r   r   r   dynamic_feat_sizelistzip)r@   rE   rW  rX  r   rY  r   rZ  r[  rs   r   intermediatestake_indices	max_indexr   heightwidthr  r  blkprefix_tokensresult_dictx_finalrC   )r   rR  rS  r@   rD   forward_intermediates  sV   





z'VisionTransformer.forward_intermediatesr(   
prune_norm
prune_headc                 C   sT   t t| j|\}}| jd|d  | _|rt | _|r(t | _| dd |S )aE  Prune layers not required for specified intermediates.

        Args:
            indices: Indices of intermediate layers to keep.
            prune_norm: Whether to prune normalization layer.
            prune_head: Whether to prune the classifier head.

        Returns:
            List of indices that were kept.
        Nr(   r   r   )r*   re  r  r;   rk   r   r   rI  )r@   rW  ru  rv  rl  rm  rC   rC   rD   prune_intermediate_layersm  s   

z+VisionTransformer.prune_intermediate_layersnr   c              	   C   s    | j |||||r
dndd|dS )a  Get intermediate layer outputs (DINO interface compatibility).

        NOTE: This API is for backwards compat, favour using forward_intermediates() directly.

        Args:
            x: Input tensor.
            n: Number or indices of layers.
            reshape: Reshape to NCHW format.
            return_prefix_tokens: Return prefix tokens.
            norm: Apply normalization.

        Returns:
            List of intermediate features.
        rV  r\  T)rX  r   r   rZ  rs   )rt  )r@   rE   rx  r   rX  r   rs   rC   rC   rD   get_intermediate_layers  s   
z)VisionTransformer.get_intermediate_layersc                 C   s   |  |}| |}| |}| |}|dur%| jD ]}|||d}qn| jr4tj s4t	| j|}n| |}| 
|}|S )z\Forward pass through feature layers (embeddings, transformer blocks, post-transformer norm).Nru   )r  rU  r  r  r  r  r=   rf  rg  r-   r   )r@   rE   rs   rp  rC   rC   rD   forward_features  s   






z"VisionTransformer.forward_featuresr   c                 C   s^   | j dur| js|dd| jdf }|  |}|S |du r!| jn|}t||| j| jd}|S )zApply pooling to feature tokens.

        Args:
            x: Feature tensor.
            pool_type: Pooling type override.

        Returns:
            Pooled features.
        N)r   r   r   )r  r   r   r   r   )r@   rE   r   rC   rC   rD   pool  s   


zVisionTransformer.pool
pre_logitsc                 C   s0   |  |}| |}| |}|r|S | |S )zForward pass through classifier head.

        Args:
            x: Feature tensor.
            pre_logits: Return features before final classifier.

        Returns:
            Output tensor.
        )r{  r   r  r   )r@   rE   r|  rC   rC   rD   forward_head  s   



zVisionTransformer.forward_headc                 C   s   | j ||d}| |}|S rt   )rz  r}  rv   rC   rC   rD   rG     s   
zVisionTransformer.forwardr   r   F)Trw   )NN)NFFFrV  FFN)r(   FT)r(   FFFN)2rH   rI   rJ   rK   r   rN   r   r   rQ   r   r   rL   r	   r   rM   r   strr   r'   r
   r;   rz   r:   r!  r|   r<  r=   rf  ignorer@  r   rB  r   r   rE  rG  rH  rI  rN  rO   rU  r   rt  rw  ry  rz  r{  r}  rG   rP   rC   rC   rA   rD   r2     s  
 	
 !"#$%&'() 
.		&

*	
*
a

"  *r   r  namer8   c                 C   sT   t | tjrt| jdd | jdurtj| j dS dS t| dr(| 	  dS dS )zViT weight initialization, original timm impl (for reproducibility).

    Args:
        module: Module to initialize.
        name: Module name for context.
    r   r3  Nr|   )

isinstancer;   r   r   r   rg   r}   zeros_r  r|   )r  r  rC   rC   rD   r;    s   

r;  rS   r9  c                 C   s   t | tjr@|drtj| j tj| j| dS tj	| j | jdur>d|v r5tjj
| jddntj| j dS dS t | tjr[t| j | jdurYtj| j dS dS t| drf|   dS dS )zViT weight initialization, matching JAX (Flax) impl.

    Args:
        module: Module to initialize.
        name: Module name for context.
        head_bias: Bias value for head layer.
    r   Nro   r5  r3  r|   )r  r;   r   
startswithr}   r  r   r~   rg   xavier_uniform_r7  Conv2dr    r  r|   )r  r  r9  rC   rC   rD   init_weights_vit_jax  s$   

& 


r  c                 C   s   t | tjrAd|v r*tdt| jjd d | jjd   }tj	| j| | ntj
| j | jdur?tj| j dS dS t| drL|   dS dS )zViT weight initialization, matching moco-v3 impl minus fixed PatchEmbed.

    Args:
        module: Module to initialize.
        name: Module name for context.
    qkvg      @r   r   r(   Nr|   )r  r;   r   r&  r'  rM   r   r   r}   uniform_r  rg   r  r  r|   )r  r  valrC   rC   rD   init_weights_vit_moco  s   *

r  r   r1  c                 C   s$   d| v r
t t|dS d| v rtS tS )Nr   )r9  r   )r   r  r  r;  )r1  r9  rC   rC   rD   r8  !  s
   r8  rC   bicubicposemb
posemb_newgs_newinterpolation	antialiasc           	   	   C   sb   |j d | }| j d | }tt|gd }t|s&tt|gd }t| |||||ddS )z Rescale the grid of position embeddings when loading from state_dict.
    *DEPRECATED* This function is being deprecated in favour of using resample_abs_pos_embed
    r(   r   T)r   r  r  rL  )r   rL   r&  r'  re  r"   )	r  r  r   r  r  r  ntok_newntok_oldgs_oldrC   rC   rD   resize_pos_embed*  s   r  modelr=  r>  load_bfloat16c                    s  ddl rddlm  ddldNfdd	 r"|n|d}d}d}|sGdv r6d	}nd
v r?d}d}ndv rGd}d}t| jdrI| jj}t|d }|r\|n|j}	|	jj	
t|	jj	jd  | d  |	jj	
 | d  |	jj
 | d  |s?t|jD ]\}
}t|jD ]\}}| d|
d  d|d  d}tdD ]P}t|d|d  j	
 | d|d  d  t|d|d  j	
 | d|d  d  t|d|d  j
 | d|d  d  q|jdur=|jjj	
 | d  |jjj	
 | d  |jjj
 | d   qq | d! }nt| jjj	jd  | d! }|jd"d | jjj	jd"d kr~t|| jjj	jd"d ||dd#}| jjj	
| | jjj
 | d$  | jdur| j
 | d% dd& |r | d' dd&}n | d( dd&}|j| jjkrt| d)drdnt| d*d}t|| jj|||dd+}| j
| | jj	
 | d,  | jj
 | d-  t| jtjrG| d.v rG| jjjd | d. jd/ krG| jj	
 | d0  | jj
 | d.  | jdur2| d1}|d2 | jj 
 | d3 dd& | jj!j	
t"# fd4d5d6D  | jj!j
t"# fd7d5d6D  | jj$j	
  d8 dd&%dj& | jj$j
  d9 dd&'d/ | jjj	
  d: %d | jjj
  d;  | jjj	
 | d<  | jjj
 | d=  td>D ]7}t| jj(d?|d  j	
 | d@| d  t| jj(d?|d  j
 | d@| d  q|r7dAndB\}}}t| j) D ] \}
}| dCv rX| dD}|
n
| dE|
 d}d|dF| d |j*j	
 | d< dG |j*j
 | d= dG |j+j,j	
t"# fdHd5dID  |j+j,j
t"# fdJd5dID  |j+jj	
  d: dG%d |j+jj
  d; dG |j-j	
 | dK| d dG |j-j
 | dK| d dG td>D ]?}t|j(d?|d  j	
 | dL| dM| d dG t|j(d?|d  j
 | dL| dM| d dG qqCdS )OzV Load weights from .npz checkpoints for official Google Brain Flax implementation
    r   NTc                    s   |d ur| | } r|  j j} | } | jdkr<| jd | jd   kr6| jd   kr6dkr<n n|  } |rd| jdkrK| g d} n| jdkrX| g d} n| jdkrd| ddg} t	
| } | S )N   r   r(   r   )r   r   r   r(   r   )r   r   r(   )r   bfloat16astypefloat32arrayndimr   flattenr   r=   
from_numpy)_wtidx)jnpr  	ml_dtypesnprC   rD   _n2pL  s    
>



z_load_weights.<locals>._n2pbilinearFzopt/target/embedding/kernelzopt/target/zparams/embedding/kernelzparams/zparams/img/embedding/kernelzparams/img/backbonerD  r(   zconv_root/kernelzgn_root/scalezgn_root/biasblockz/unit/r   conv/kernelr   gnz/scale/biaszconv_proj/kernelzgn_proj/scalezgn_proj/biaszembedding/kernelr   r  r  rL  zembedding/biasclsr  pos_embeddingz(Transformer/posembed_input/pos_embeddingr   r   rJ  r   r  r  rL  zTransformer/encoder_norm/scalezTransformer/encoder_norm/biasz	head/biasr   zhead/kernelz
MAPHead_0/zMultiHeadDotProductAttention_0/probec                    s.   g | ]}  | d  dd djqS )r  Fr  r(   r  Tr   rx  r  
mha_prefixwrC   rD   r         "z!_load_weights.<locals>.<listcomp>)keyvaluec                    s,   g | ]}  | d  dd dqS )r  Fr  r   r   r  r  rC   rD   r     s     zquery/kernelz
query/biasz
out/kernelzout/biaszLayerNorm_0/scalezLayerNorm_0/biasr   fczMlpBlock_0/Dense_)r   r   r(   )r(   r   r   z*Transformer/encoderblock/LayerNorm_0/scalezTransformer/encoderblock/zTransformer/encoderblock_MultiHeadDotProductAttention_)r  c                    s0   g | ]}  | d  dd djqS )r  Fr  r  r(   r  r  r  r  r  r  rC   rD   r     s    $)queryr  r  c                    s.   g | ]}  | d  dd dqS )r  Fr  r   r  r  r  rC   rD   r     r  
LayerNorm_	MlpBlock_z/Dense_)TN).numpy	jax.numpyr  loadr  r  r  rD  r  r   copy_r.   r   r   rg   r+  stagesr  r   getattr
downsampler,  r!   r  r   r"   rM  r  r   r;   r   r  latentkvr=   r   r   r  r  r   ro   childrenri   rj   r  rn   )r  r=  r>  r  r  r  
big_visionr  	stem_onlyrD  r  stagejr  bprembed_conv_wpos_embed_wr   block_prefixmha_subb_subln1_subrC   )r  r  r  r  r  r  r  r  rD   r?  C  s   
,446
$$
 

*($48
  

("&&  r?  visual.
state_dictc                 C   s   i }g d}|   D ]R\}}||sq
||d}|D ]}||d |d }q|dkr@d}|dd}t|jd |d< n|dkrOd	}|dd}n	|d
krX|d}|||< q
|S )N)
)conv1patch_embed.proj)positional_embeddingr   )ztransformer.resblocks.r  )ln_prer  )ln_postr   )ln_r   )in_proj_zqkv.out_projr,  )zmlp.c_fcmlp.fc1)z
mlp.c_projzmlp.fc2r   r   r(   r,  head.weight	head.biasclass_embeddingr  r   )itemsr  replacer   r=   r   r   	unsqueeze)r  r  r>  out_dictswapsr   r   sprC   rC   rD   _convert_openai_clip  s&   


r  c                 C   s   dd l }i }| dd  d| v r7| d|d< | d| d d d df  |d< | dd d dd f |d< |  D ]&\}}|d|rN|||d	d
< q;|d|r]|||dd< q;|||< q;|S )Nr   
mask_tokenregister_tokensr  r  r   r(   z(blocks\.(\d+)\.mlp\.w12\.(?:weight|bias)w12fc1z'blocks\.(\d+)\.mlp\.w3\.(?:weight|bias)w3r.  )repopr  matchr  )r  r  r  r  r   r   rC   rC   rD   _convert_dinov2  s    "
r  c                 C   s   i }|   D ]8\}}|dd}|dd}|dd}|dd}|d	d
}|dd}|dd}|dd}|||< q|S )Nnorm_1ri   norm_2rn   zpreprocessor.patchifier.patch_embed.zpreprocessor.pos_embedr   ztrunk.r   zpost_trunk_norm.norm.r  z	mlp.fc1_gzmlp.fc3z	mlp.fc1_x)r  r  )r  r  r  r   r   rC   rC   rD   _convert_aimv2  s   
r   c                 C   s:  ddl }| d| } dD ]}| |d qg d}i }|  D ]+\}}d|v r(q|D ]\}}||||}q*|dkrF|dd d|d	< q|||< qi i }	}
|d
}| D ]A\}}||}|si||	|< qY| \}}}|
	||fi }|||< t
|dkrtj|d |d |d gdd|	d| d| < qY|	S )zP
    Turn a BEiT-3 checkpoint into a standard VisionTransformer state-dict.
    r   Nr  )zbeit3.text_embed.weightzbeit3.vision_embed.mask_token))zbeit3\.r   )zvision_embed\.cls_tokenr  )zvision_embed\.r  )zembed_positions\.z
pos_embed.)z	encoder\.r   )zlayers\.r  )zffn_layernorm\.r  )zffn\.zmlp.)zself_attn_layer_norm\.znorm1.)zself_attn\.zattn.)zfinal_layer_norm\.znorm2.)inner_attn_lnr   r  )z\.A\..z.B.zpos_embed.weightr   r   z1blocks\.(\d+)\.attn\.(q|k|v)_proj\.(weight|bias)$r   r   r   r   r   r  z
.attn.qkv.)r  getr  r  subr  compile	fullmatchgroups
setdefaultre  r=   r   )r  r  r  r   rulestmpr   oldnewoutbufpatr:  rp  whichkindstashrC   rC   rD   _convert_beit3-  s<   



r  Tadapt_layer_scalec              	      s@  ddl }i }| d| } | d| } d d| v rt| |} ncd| v r*t| |dd	} nWd
| v r4t| |} nMtdd |  D rEt| |} n<d| v rP| d } d n1d| v sXd| v rxd d| v rwt|jt	j
rw| d |d< t| d jd |d< n	d| v rt| |}  r fdd|  D } |  D ]\}}d|v r|jjjj\}	}
}}t|jdk r|jjjj\}	}
}}||	d||}|jd |ks|jd |krt|||f||dd}nB|dkr|jd |jjd krt|d d!rdnt|d"d}t||jj|||dd#}n|rd$|v r|d%d&|}nd'|v rq|||< q|S )(zJ convert patch embedding weight from manual patchify + linear proj to convr   Nr  r  r   zvisual.class_embeddingzmodule.visual.class_embeddingzmodule.visual.r>  r  c                 s   s    | ]}d |v V  qdS )zbeit3.NrC   )r   r   rC   rC   rD   	<genexpr>}  s    z'checkpoint_filter_fn.<locals>.<genexpr>encoderzmodule.zvisual.trunk.pos_embedz"visual.trunk.blocks.0.norm1.weightzvisual.trunk.zvisual.head.proj.weightr  r  z#preprocessor.patchifier.proj.weightc                    s,   i | ]\}}|  r|t d  |qS rw   )r  re  )r   r   r   r  rC   rD   
<dictcomp>  ra  z(checkpoint_filter_fn.<locals>.<dictcomp>zpatch_embed.proj.weightr  r   r   Tr  r   r(   r   Fr   r  gamma_zgamma_([0-9])z
ls\1.gammar|  )r  r  r  r  anykeysr  r  r   r;   r   r=   r   r   r   r  r  r,  r   re  r   r!   r   r  r"   rM  r  )r  r  r  r  r  r  r  r   r   OIrR  rS  r   rC   r  rD   checkpoint_filter_fni  sr   
"

r  urlc                 K   s    | ddd dddt tddd|S )	Nr   )r   r   r   g?r  Tr  r   )r  r   
input_size	pool_sizecrop_pctr  fixed_input_sizer   r4  
first_conv
classifier)r   r   )r  kwargsrC   rC   rD   _cfg  s   r'  z*vit_base_patch16_224.augreg2_in21k_ft_in1kztimm/)	hf_hub_idz*vit_base_patch16_384.augreg2_in21k_ft_in1kz)vit_base_patch8_224.augreg2_in21k_ft_in1kz)vit_tiny_patch16_224.augreg_in21k_ft_in1kzhttps://storage.googleapis.com/vit_models/augreg/Ti_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_224.npz)r  r(  custom_loadz)vit_tiny_patch16_384.augreg_in21k_ft_in1kzhttps://storage.googleapis.com/vit_models/augreg/Ti_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_384.npz)r     r*        ?)r  r(  r)  r   r"  z*vit_small_patch32_224.augreg_in21k_ft_in1kzhttps://storage.googleapis.com/vit_models/augreg/S_32-i21k-300ep-lr_0.001-aug_light1-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_224.npzz*vit_small_patch32_384.augreg_in21k_ft_in1kzhttps://storage.googleapis.com/vit_models/augreg/S_32-i21k-300ep-lr_0.001-aug_light1-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_384.npzz*vit_small_patch16_224.augreg_in21k_ft_in1kzhttps://storage.googleapis.com/vit_models/augreg/S_16-i21k-300ep-lr_0.001-aug_light1-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_224.npzz*vit_small_patch16_384.augreg_in21k_ft_in1kzhttps://storage.googleapis.com/vit_models/augreg/S_16-i21k-300ep-lr_0.001-aug_light1-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_384.npzz)vit_base_patch32_224.augreg_in21k_ft_in1kzhttps://storage.googleapis.com/vit_models/augreg/B_32-i21k-300ep-lr_0.001-aug_medium1-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_224.npzz)vit_base_patch32_384.augreg_in21k_ft_in1kzhttps://storage.googleapis.com/vit_models/augreg/B_32-i21k-300ep-lr_0.001-aug_light1-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_384.npzz)vit_base_patch16_224.augreg_in21k_ft_in1kzhttps://storage.googleapis.com/vit_models/augreg/B_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.01-res_224.npzz)vit_base_patch16_384.augreg_in21k_ft_in1kzhttps://storage.googleapis.com/vit_models/augreg/B_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.01-res_384.npzz(vit_base_patch8_224.augreg_in21k_ft_in1kzhttps://storage.googleapis.com/vit_models/augreg/B_8-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.01-res_224.npzz*vit_large_patch16_224.augreg_in21k_ft_in1kzhttps://storage.googleapis.com/vit_models/augreg/L_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.1-sd_0.1--imagenet2012-steps_20k-lr_0.01-res_224.npzz*vit_large_patch16_384.augreg_in21k_ft_in1kzhttps://storage.googleapis.com/vit_models/augreg/L_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.1-sd_0.1--imagenet2012-steps_20k-lr_0.01-res_384.npzz'vit_base_patch16_224.orig_in21k_ft_in1kzohttps://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_p16_224-80ecf9dd.pth)r  r(  z'vit_base_patch16_384.orig_in21k_ft_in1kzohttps://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_p16_384-83fb41ba.pth)r  r(  r   r"  z(vit_large_patch32_384.orig_in21k_ft_in1kzphttps://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_large_p32_384-9b920ba8.pthz!vit_small_patch16_224.augreg_in1kzhttps://storage.googleapis.com/vit_models/augreg/S_16-i1k-300ep-lr_0.001-aug_medium2-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.01-res_224.npzz!vit_small_patch16_384.augreg_in1kzhttps://storage.googleapis.com/vit_models/augreg/S_16-i1k-300ep-lr_0.001-aug_medium2-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.01-res_384.npzz vit_base_patch32_224.augreg_in1kzhttps://storage.googleapis.com/vit_models/augreg/B_32-i1k-300ep-lr_0.001-aug_medium2-wd_0.1-do_0.1-sd_0.1--imagenet2012-steps_20k-lr_0.01-res_224.npzz vit_base_patch32_384.augreg_in1kzhttps://storage.googleapis.com/vit_models/augreg/B_32-i1k-300ep-lr_0.001-aug_medium2-wd_0.1-do_0.1-sd_0.1--imagenet2012-steps_20k-lr_0.01-res_384.npzz vit_base_patch16_224.augreg_in1kzhttps://storage.googleapis.com/vit_models/augreg/B_16-i1k-300ep-lr_0.001-aug_strong2-wd_0.1-do_0.1-sd_0.1--imagenet2012-steps_20k-lr_0.01-res_224.npzz vit_base_patch16_384.augreg_in1kzhttps://storage.googleapis.com/vit_models/augreg/B_16-i1k-300ep-lr_0.001-aug_strong2-wd_0.1-do_0.1-sd_0.1--imagenet2012-steps_20k-lr_0.01-res_384.npzzvit_large_patch14_224.untrained)r  zvit_huge_patch14_224.untrainedzvit_giant_patch14_224.untrainedz"vit_gigantic_patch14_224.untrainedzvit_base_patch32_224.orig_in21k)r(  r   zvit_base_patch16_224.orig_in21kz vit_large_patch32_224.orig_in21kz vit_large_patch16_224.orig_in21kzvit_huge_patch14_224.orig_in21kz!vit_tiny_patch16_224.augreg_in21kzmhttps://storage.googleapis.com/vit_models/augreg/Ti_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0.npziSU  )r  r(  r)  r   z"vit_small_patch32_224.augreg_in21kznhttps://storage.googleapis.com/vit_models/augreg/S_32-i21k-300ep-lr_0.001-aug_light1-wd_0.03-do_0.0-sd_0.0.npzz"vit_small_patch16_224.augreg_in21kznhttps://storage.googleapis.com/vit_models/augreg/S_16-i21k-300ep-lr_0.001-aug_light1-wd_0.03-do_0.0-sd_0.0.npzz!vit_base_patch32_224.augreg_in21kzohttps://storage.googleapis.com/vit_models/augreg/B_32-i21k-300ep-lr_0.001-aug_medium1-wd_0.03-do_0.0-sd_0.0.npzz!vit_base_patch16_224.augreg_in21kznhttps://storage.googleapis.com/vit_models/augreg/B_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.0-sd_0.0.npzz vit_base_patch8_224.augreg_in21kzmhttps://storage.googleapis.com/vit_models/augreg/B_8-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.0-sd_0.0.npzz"vit_large_patch16_224.augreg_in21kznhttps://storage.googleapis.com/vit_models/augreg/L_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.1-sd_0.1.npzzvit_base_patch32_224.sam_in1kz:https://storage.googleapis.com/vit_models/sam/ViT-B_32.npz)r  r)  r(  zvit_base_patch16_224.sam_in1kz:https://storage.googleapis.com/vit_models/sam/ViT-B_16.npzzvit_small_patch16_224.dinoz[https://dl.fbaipublicfiles.com/dino/dino_deitsmall16_pretrain/dino_deitsmall16_pretrain.pth)r  r(  r   r4  r   zvit_small_patch8_224.dinozYhttps://dl.fbaipublicfiles.com/dino/dino_deitsmall8_pretrain/dino_deitsmall8_pretrain.pthzvit_base_patch16_224.dinozWhttps://dl.fbaipublicfiles.com/dino/dino_vitbase16_pretrain/dino_vitbase16_pretrain.pthzvit_base_patch8_224.dinozUhttps://dl.fbaipublicfiles.com/dino/dino_vitbase8_pretrain/dino_vitbase8_pretrain.pthz vit_small_patch14_dinov2.lvd142mzNhttps://dl.fbaipublicfiles.com/dinov2/dinov2_vits14/dinov2_vits14_pretrain.pthz
apache-2.0)r     r,  )r  r(  licenser   r4  r   r   r"  zvit_base_patch14_dinov2.lvd142mzNhttps://dl.fbaipublicfiles.com/dinov2/dinov2_vitb14/dinov2_vitb14_pretrain.pthz vit_large_patch14_dinov2.lvd142mzNhttps://dl.fbaipublicfiles.com/dinov2/dinov2_vitl14/dinov2_vitl14_pretrain.pthz vit_giant_patch14_dinov2.lvd142mzNhttps://dl.fbaipublicfiles.com/dinov2/dinov2_vitg14/dinov2_vitg14_pretrain.pthz%vit_small_patch14_reg4_dinov2.lvd142mzShttps://dl.fbaipublicfiles.com/dinov2/dinov2_vits14/dinov2_vits14_reg4_pretrain.pthz$vit_base_patch14_reg4_dinov2.lvd142mzShttps://dl.fbaipublicfiles.com/dinov2/dinov2_vitb14/dinov2_vitb14_reg4_pretrain.pthz%vit_large_patch14_reg4_dinov2.lvd142mzShttps://dl.fbaipublicfiles.com/dinov2/dinov2_vitl14/dinov2_vitl14_reg4_pretrain.pthz%vit_giant_patch14_reg4_dinov2.lvd142mzShttps://dl.fbaipublicfiles.com/dinov2/dinov2_vitg14/dinov2_vitg14_reg4_pretrain.pthzvit_base_patch16_224_miil.in21kz}https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/vit_base_patch16_224_in21k_miil-887286df.pth)rS   rS   rS   )r+  r+  r+  g      ?r  i+  )r  r(  r   r4  r"  r  r   z'vit_base_patch16_224_miil.in21k_ft_in1kzhttps://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/vit_base_patch16_224_1k_miil_84_4-2deb18e3.pth)r  r(  r   r4  r"  r  z vit_base_patch16_rpn_224.sw_in1kz}https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/vit_base_patch16_rpn_224-sw-3b07e89d.pthz#vit_medium_patch16_gap_240.sw_in12k)r      r.  gffffff?i-.  )r(  r   r"  r   z+vit_medium_patch16_gap_256.sw_in12k_ft_in1k)r      r/  )r(  r   r"  z+vit_medium_patch16_gap_384.sw_in12k_ft_in1ksquash)r(  r   r"  	crop_modevit_base_patch16_gap_224z/vit_base_patch32_clip_224.laion2b_ft_in12k_in1k)r(  r   r4  z/vit_base_patch32_clip_384.laion2b_ft_in12k_in1k)r(  r   r4  r"  r   z/vit_base_patch32_clip_448.laion2b_ft_in12k_in1k)r     r3  z/vit_base_patch16_clip_224.laion2b_ft_in12k_in1k)r(  r   r4  r"  z/vit_base_patch16_clip_384.laion2b_ft_in12k_in1k)r(  r   r4  r"  r   r1  z0vit_large_patch14_clip_224.laion2b_ft_in12k_in1kz0vit_large_patch14_clip_336.laion2b_ft_in12k_in1k)r   P  r4  z/vit_huge_patch14_clip_224.laion2b_ft_in12k_in1kz/vit_huge_patch14_clip_336.laion2b_ft_in12k_in1kz.vit_base_patch32_clip_224.openai_ft_in12k_in1k)r   r4  z.vit_base_patch32_clip_384.openai_ft_in12k_in1kz.vit_base_patch16_clip_224.openai_ft_in12k_in1kz.vit_base_patch16_clip_384.openai_ft_in12k_in1kz/vit_large_patch14_clip_224.openai_ft_in12k_in1kz/vit_large_patch14_clip_336.openai_ft_in12k_in1kz)vit_base_patch32_clip_224.laion2b_ft_in1kz)vit_base_patch16_clip_224.laion2b_ft_in1kz)vit_base_patch16_clip_384.laion2b_ft_in1kz*vit_large_patch14_clip_224.laion2b_ft_in1kz*vit_large_patch14_clip_336.laion2b_ft_in1kz)vit_huge_patch14_clip_224.laion2b_ft_in1kz)vit_huge_patch14_clip_336.laion2b_ft_in1kz(vit_base_patch32_clip_224.openai_ft_in1kz(vit_base_patch16_clip_224.openai_ft_in1kz(vit_base_patch16_clip_384.openai_ft_in1kz)vit_large_patch14_clip_224.openai_ft_in1kz*vit_base_patch16_clip_224.laion2b_ft_in12k)r(  r   r4  r   z+vit_large_patch14_clip_224.laion2b_ft_in12k)r(  r   r4  r"  r   z*vit_huge_patch14_clip_224.laion2b_ft_in12kz)vit_base_patch16_clip_224.openai_ft_in12kz*vit_large_patch14_clip_224.openai_ft_in12kz!vit_base_patch32_clip_224.laion2b   z!vit_base_patch16_clip_224.laion2bz"vit_large_patch14_clip_224.laion2br   z!vit_huge_patch14_clip_224.laion2b   z"vit_giant_patch14_clip_224.laion2bz%vit_gigantic_patch14_clip_224.laion2b   z'vit_base_patch32_clip_224.laion400m_e32)zDnatively QuickGELU, use quickgelu model variant for original results)r(  notesr   r4  r   z'vit_base_patch16_clip_224.laion400m_e32z,vit_base_patch16_plus_clip_240.laion400m_e32  )r(  r   r4  r   r"  r   z(vit_large_patch14_clip_224.laion400m_e32z$vit_base_patch32_clip_224.datacompxlz$vit_base_patch32_clip_256.datacompxl)r(  r   r4  r"  r   r   z$vit_base_patch16_clip_224.datacompxlz%vit_large_patch14_clip_224.datacompxlzvit_base_patch16_clip_224.dfn2bz
apple-ascl)r(  r-  r   r4  r"  r   z%vit_large_patch14_clip_224.dfn2b_s39bz vit_large_patch14_clip_224.dfn2b)r(  r-  r8  r   r4  r"  r   zvit_huge_patch14_clip_224.dfn5bzvit_huge_patch14_clip_378.dfn5b)r   z  r:  )r(  r   r4  r-  r8  r"  r   r   z(vit_base_patch32_clip_224.metaclip_2pt5bzcc-by-nc-4.0z(vit_base_patch16_clip_224.metaclip_2pt5bz)vit_large_patch14_clip_224.metaclip_2pt5bz(vit_huge_patch14_clip_224.metaclip_2pt5bz-vit_huge_patch14_clip_224.metaclip_altogetherz,vit_gigantic_patch14_clip_224.metaclip_2pt5bz'vit_base_patch32_clip_224.metaclip_400mz'vit_base_patch16_clip_224.metaclip_400mz(vit_large_patch14_clip_224.metaclip_400mz vit_base_patch32_clip_224.openaiz vit_base_patch16_clip_224.openaiz!vit_large_patch14_clip_224.openai)r(  r8  r   r4  r"  r   z!vit_large_patch14_clip_336.openai)r(  r8  r   r4  r"  r   r   z#vit_base_patch32_plus_256.untrained)r  r   r"  z#vit_base_patch16_plus_240.untrainedz$vit_small_patch16_36x1_224.untrainedz$vit_small_patch16_18x2_224.untrainedz#vit_base_patch16_18x2_224.untrainedz)eva_large_patch14_196.in22k_ft_in22k_in1kmit)r      r<  )r(  r-  r   r4  r   r"  z)eva_large_patch14_336.in22k_ft_in22k_in1k)r(  r-  r   r4  r   r"  r1  z#eva_large_patch14_196.in22k_ft_in1kz#eva_large_patch14_336.in22k_ft_in1kzflexivit_small.1200ep_in1kzEhttps://storage.googleapis.com/big_vision/flexivit/flexivit_s_i1k.npz)r  r)  r(  r   r"  zflexivit_small.600ep_in1kzKhttps://storage.googleapis.com/big_vision/flexivit/flexivit_s_i1k_600ep.npzzflexivit_small.300ep_in1kzKhttps://storage.googleapis.com/big_vision/flexivit/flexivit_s_i1k_300ep.npzzflexivit_base.1200ep_in1kzEhttps://storage.googleapis.com/big_vision/flexivit/flexivit_b_i1k.npzzflexivit_base.600ep_in1kzKhttps://storage.googleapis.com/big_vision/flexivit/flexivit_b_i1k_600ep.npzzflexivit_base.300ep_in1kzKhttps://storage.googleapis.com/big_vision/flexivit/flexivit_b_i1k_300ep.npzzflexivit_base.1000ep_in21kzMhttps://storage.googleapis.com/big_vision/flexivit/flexivit_b_i21k_1000ep.npz)r  r)  r(  r   r"  r   zflexivit_base.300ep_in21kzLhttps://storage.googleapis.com/big_vision/flexivit/flexivit_b_i21k_300ep.npzzflexivit_large.1200ep_in1kzEhttps://storage.googleapis.com/big_vision/flexivit/flexivit_l_i1k.npzzflexivit_large.600ep_in1kzKhttps://storage.googleapis.com/big_vision/flexivit/flexivit_l_i1k_600ep.npzzflexivit_large.300ep_in1kzKhttps://storage.googleapis.com/big_vision/flexivit/flexivit_l_i1k_300ep.npzzflexivit_base.patch16_in21kzIhttps://storage.googleapis.com/big_vision/flexivit/vit_b16_i21k_300ep.npzzflexivit_base.patch30_in21kzIhttps://storage.googleapis.com/big_vision/flexivit/vit_b30_i21k_300ep.npzz!vit_base_patch16_xp_224.untrainedz"vit_large_patch14_xp_224.untrainedz!vit_huge_patch14_xp_224.untrainedzvit_base_patch16_224.maezEhttps://dl.fbaipublicfiles.com/mae/pretrain/mae_pretrain_vit_base.pth)r  r(  r-  r   r4  r   zvit_large_patch16_224.maezFhttps://dl.fbaipublicfiles.com/mae/pretrain/mae_pretrain_vit_large.pthzvit_huge_patch14_224.maezEhttps://dl.fbaipublicfiles.com/mae/pretrain/mae_pretrain_vit_huge.pthz#vit_huge_patch14_gap_224.in1k_ijepaz?https://dl.fbaipublicfiles.com/ijepa/IN1K-vit.h.14-300e.pth.tar)r  r-  r   r4  r   z$vit_huge_patch14_gap_224.in22k_ijepaz@https://dl.fbaipublicfiles.com/ijepa/IN22K-vit.h.14-900e.pth.tarz#vit_huge_patch16_gap_448.in1k_ijepazEhttps://dl.fbaipublicfiles.com/ijepa/IN1K-vit.h.16-448px-300e.pth.tar)r  r-  r   r"  r   r4  r   z%vit_giant_patch16_gap_224.in22k_ijepaz@https://dl.fbaipublicfiles.com/ijepa/IN22K-vit.g.16-600e.pth.tarz$vit_base_patch32_siglip_256.v2_webli)r(  r   r   z$vit_base_patch16_siglip_224.v2_webliz!vit_base_patch16_siglip_224.webliz$vit_base_patch16_siglip_256.v2_webliz!vit_base_patch16_siglip_256.webliz&vit_base_patch16_siglip_256.webli_i18nz$vit_base_patch16_siglip_384.v2_webliz!vit_base_patch16_siglip_384.webliz$vit_base_patch16_siglip_512.v2_webli)r   r5  r5  z!vit_base_patch16_siglip_512.webliz%vit_large_patch16_siglip_256.v2_webliz"vit_large_patch16_siglip_256.webliz%vit_large_patch16_siglip_384.v2_webliz"vit_large_patch16_siglip_384.webliz%vit_large_patch16_siglip_512.v2_webliz&vit_so400m_patch14_siglip_224.v2_webliz#vit_so400m_patch14_siglip_224.webliz&vit_so400m_patch14_siglip_378.v2_webliz#vit_so400m_patch14_siglip_378.webliz#vit_so400m_patch14_siglip_384.webliz&vit_so400m_patch16_siglip_256.v2_webliz(vit_so400m_patch16_siglip_256.webli_i18nz&vit_so400m_patch16_siglip_384.v2_webliz&vit_so400m_patch16_siglip_512.v2_webliz(vit_giantopt_patch16_siglip_256.v2_webliz(vit_giantopt_patch16_siglip_384.v2_webliz(vit_base_patch32_siglip_gap_256.v2_webliz(vit_base_patch16_siglip_gap_224.v2_webliz%vit_base_patch16_siglip_gap_224.webliz(vit_base_patch16_siglip_gap_256.v2_webliz%vit_base_patch16_siglip_gap_256.webliz*vit_base_patch16_siglip_gap_256.webli_i18nz(vit_base_patch16_siglip_gap_384.v2_webliz%vit_base_patch16_siglip_gap_384.webliz(vit_base_patch16_siglip_gap_512.v2_webliz%vit_base_patch16_siglip_gap_512.webliz)vit_large_patch16_siglip_gap_256.v2_webliz&vit_large_patch16_siglip_gap_256.webliz)vit_large_patch16_siglip_gap_384.v2_webliz&vit_large_patch16_siglip_gap_384.webliz)vit_large_patch16_siglip_gap_512.v2_webliz*vit_so400m_patch14_siglip_gap_224.v2_webliz'vit_so400m_patch14_siglip_gap_224.webliz*vit_so400m_patch14_siglip_gap_224.pali_mixz)vit_so400m_patch14_siglip_gap_224.pali_ptz-vit_so400m_patch14_siglip_gap_224.pali2_3b_ptz.vit_so400m_patch14_siglip_gap_224.pali2_10b_ptz*vit_so400m_patch14_siglip_gap_378.v2_webliz'vit_so400m_patch14_siglip_gap_378.webliz'vit_so400m_patch14_siglip_gap_384.webliz*vit_so400m_patch14_siglip_gap_448.pali_mixz)vit_so400m_patch14_siglip_gap_448.pali_ptz2vit_so400m_patch14_siglip_gap_448.pali_refcoco_segz-vit_so400m_patch14_siglip_gap_448.pali_ocrvqaz-vit_so400m_patch14_siglip_gap_448.pali2_3b_ptz.vit_so400m_patch14_siglip_gap_448.pali2_10b_ptz0vit_so400m_patch14_siglip_gap_448.pali2_3b_docciz1vit_so400m_patch14_siglip_gap_448.pali2_10b_docciz)vit_so400m_patch14_siglip_gap_896.pali_pt)r     r=  z2vit_so400m_patch14_siglip_gap_896.pali_refcoco_segz-vit_so400m_patch14_siglip_gap_896.pali_ocrvqaz-vit_so400m_patch14_siglip_gap_896.pali2_3b_ptz.vit_so400m_patch14_siglip_gap_896.pali2_10b_ptz*vit_so400m_patch16_siglip_gap_256.v2_webliz,vit_so400m_patch16_siglip_gap_256.webli_i18nz*vit_so400m_patch16_siglip_gap_384.v2_webliz*vit_so400m_patch16_siglip_gap_512.v2_webliz,vit_giantopt_patch16_siglip_gap_256.v2_webliz,vit_giantopt_patch16_siglip_gap_384.v2_webliz+vit_so400m_patch14_siglip_378.webli_ft_in1kz/vit_so400m_patch14_siglip_gap_378.webli_ft_in1kz,vit_xsmall_patch16_clip_224.tinyclip_yfcc15m)r(  r-  r   r4  r   z.vit_medium_patch32_clip_224.tinyclip_laion400mz,vit_medium_patch16_clip_224.tinyclip_yfcc15mz/vit_betwixt_patch32_clip_224.tinyclip_laion400mz%vit_wee_patch16_reg1_gap_256.sbb_in1kz&vit_pwee_patch16_reg1_gap_256.sbb_in1kz1vit_little_patch16_reg1_gap_256.sbb_in12k_ft_in1kz)vit_little_patch16_reg1_gap_256.sbb_in12k)r(  r   r   r"  z(vit_little_patch16_reg4_gap_256.sbb_in1kz(vit_medium_patch16_reg1_gap_256.sbb_in1kz1vit_medium_patch16_reg4_gap_256.sbb_in12k_ft_in1kz(vit_medium_patch16_reg4_gap_256.sbb_in1kz)vit_medium_patch16_reg4_gap_256.sbb_in12kz8vit_mediumd_patch16_reg4_gap_256.sbb2_e200_in12k_ft_in1kz2vit_mediumd_patch16_reg4_gap_256.sbb_in12k_ft_in1kz0vit_mediumd_patch16_reg4_gap_256.sbb2_e200_in12kz*vit_mediumd_patch16_reg4_gap_256.sbb_in12kz8vit_mediumd_patch16_reg4_gap_384.sbb2_e200_in12k_ft_in1kz)vit_betwixt_patch16_reg1_gap_256.sbb_in1kz8vit_betwixt_patch16_reg4_gap_256.sbb2_e200_in12k_ft_in1kz2vit_betwixt_patch16_reg4_gap_256.sbb_in12k_ft_in1kz)vit_betwixt_patch16_reg4_gap_256.sbb_in1kz0vit_betwixt_patch16_reg4_gap_256.sbb2_e200_in12kz*vit_betwixt_patch16_reg4_gap_256.sbb_in12kz8vit_betwixt_patch16_reg4_gap_384.sbb2_e200_in12k_ft_in1kz'vit_base_patch16_reg4_gap_256.untrained)r   z6vit_so150m_patch16_reg4_gap_256.sbb_e250_in12k_ft_in1kz.vit_so150m_patch16_reg4_gap_256.sbb_e250_in12kz6vit_so150m_patch16_reg4_gap_384.sbb_e250_in12k_ft_in1kz)vit_so150m_patch16_reg4_map_256.untrainedz7vit_so150m2_patch16_reg1_gap_256.sbb_e200_in12k_ft_in1kz/vit_so150m2_patch16_reg1_gap_256.sbb_e200_in12kz7vit_so150m2_patch16_reg1_gap_384.sbb_e200_in12k_ft_in1kz7vit_so150m2_patch16_reg1_gap_448.sbb_e200_in12k_ft_in1kz$vit_intern300m_patch14_448.ogvl_distz$vit_intern300m_patch14_448.ogvl_2pt5z aimv2_large_patch14_224.apple_pt)r(  r   r4  r-  r"  r   z%aimv2_large_patch14_224.apple_pt_distzaimv2_huge_patch14_224.apple_ptzaimv2_1b_patch14_224.apple_ptzaimv2_3b_patch14_224.apple_ptz aimv2_large_patch14_336.apple_pt)r(  r   r4  r-  r   r"  r   z%aimv2_large_patch14_336.apple_pt_distzaimv2_huge_patch14_336.apple_ptzaimv2_1b_patch14_336.apple_ptzaimv2_3b_patch14_336.apple_ptz aimv2_large_patch14_448.apple_ptzaimv2_huge_patch14_448.apple_ptzaimv2_1b_patch14_448.apple_ptzaimv2_3b_patch14_448.apple_ptztest_vit.r160_in1k)r      r>  ztest_vit2.r160_in1kztest_vit3.r160_in1kztest_vit4.r160_in1k)r   r"  z$beit3_base_patch16_224.in22k_ft_in1kz-beit3_base_patch16_224.indomain_in22k_ft_in1kz%beit3_large_patch16_224.in22k_ft_in1kz.beit3_large_patch16_224.indomain_in22k_ft_in1kz!beit3_giant_patch14_224.untrained)r  r   r4  r"  z!beit3_giant_patch14_336.untrained)r  r   r   r4  r"  zbeit3_base_patch16_224.pt)z"beit3_base_patch16_224.indomain_ptzbeit3_large_patch16_224.ptz#beit3_large_patch16_224.indomain_ptc                 C   s0   g | ]\}}| d drd|d  d v r|qS )r8  rC   	quickgelur   )r  )r   rx  crC   rC   rD   r   O
  s   0 r   r(  _clip__clip_quickgelu_TIMM_USE_NAFLEXVITfalsetruevariant
pretrained
use_naflex	NaFlexVitc                 K   s   |d u rt }|rddlm} || |fi |S |dd}d| v r)ttddd}nt}|d	d
}d| v r?|dd dkr?d}tt| |f||t	|ddd|S )Nr(   )_create_naflexvit_from_classicout_indicesr   flexir  F)r  r  pretrained_strictTsiglipr   r   getter)rK  feature_cls)pretrained_filter_fnrM  feature_cfg)
_USE_NAFLEX_DEFAULT	naflexvitrJ  r  r   r  r  r)   r2   r  )rF  rG  rH  r&  rJ  rK  
_filter_fnstrictrC   rC   rD   _create_vision_transformer\
  s.   
rW  c                 K   2   t ddddd}td	d| it |fi |}|S )
z ViT-Tiny (Vit-Ti/16)
    r      r   r   r   r   r   rT   vit_tiny_patch16_224rG  N)r[  r  rW  rG  r&  
model_argsr  rC   rC   rD   r[  
     r[  c                 K   rX  )
z% ViT-Tiny (Vit-Ti/16) @ 384x384.
    r   rY  r   r   rZ  vit_tiny_patch16_384rG  N)r`  r\  r]  rC   rC   rD   r`  
  r_  r`  c                 K   rX  )
z ViT-Small (ViT-S/32)
        r*  r      rZ  vit_small_patch32_224rG  N)rc  r\  r]  rC   rC   rD   rc  
  r_  rc  c                 K   rX  )
z& ViT-Small (ViT-S/32) at 384x384.
    ra  r*  r   rb  rZ  vit_small_patch32_384rG  N)rd  r\  r]  rC   rC   rD   rd  
  r_  rd  c                 K   rX  )
 ViT-Small (ViT-S/16)
    r   r*  r   rb  rZ  vit_small_patch16_224rG  N)rf  r\  r]  rC   rC   rD   rf  
  r_  rf  c                 K   rX  )
re  r   r*  r   rb  rZ  vit_small_patch16_384rG  N)rg  r\  r]  rC   rC   rD   rg  
  r_  rg  c                 K   rX  )
z ViT-Small (ViT-S/8)
       r*  r   rb  rZ  vit_small_patch8_224rG  N)ri  r\  r]  rC   rC   rD   ri  
  r_  ri  c                 K   2   t ddddd}tdd| it |fi |}|S )	z ViT-Base (ViT-B/32) from original paper (https://arxiv.org/abs/2010.11929).
    ImageNet-1k weights fine-tuned from in21k, source https://github.com/google-research/vision_transformer.
    ra  r   r   rZ  vit_base_patch32_224rG  N)rk  r\  r]  rC   rC   rD   rk  
     rk  c                 K   rj  )	z ViT-Base model (ViT-B/32) from original paper (https://arxiv.org/abs/2010.11929).
    ImageNet-1k weights fine-tuned from in21k @ 384x384, source https://github.com/google-research/vision_transformer.
    ra  r   r   rZ  vit_base_patch32_384rG  N)rm  r\  r]  rC   rC   rD   rm  
  rl  rm  c                 K   rj  )	z ViT-Base (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929).
    ImageNet-1k weights fine-tuned from in21k @ 224x224, source https://github.com/google-research/vision_transformer.
    r   r   r   rZ  vit_base_patch16_224rG  N)rn  r\  r]  rC   rC   rD   rn  
  rl  rn  c                 K   rj  )	z ViT-Base model (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929).
    ImageNet-1k weights fine-tuned from in21k @ 384x384, source https://github.com/google-research/vision_transformer.
    r   r   r   rZ  vit_base_patch16_384rG  N)ro  r\  r]  rC   rC   rD   ro  
  rl  ro  c                 K   rj  )	z ViT-Base (ViT-B/8) from original paper (https://arxiv.org/abs/2010.11929).
    ImageNet-1k weights fine-tuned from in21k @ 224x224, source https://github.com/google-research/vision_transformer.
    rh  r   r   rZ  vit_base_patch8_224rG  N)rp  r\  r]  rC   rC   rD   rp  
  rl  rp  c                 K   rX  )
zo ViT-Large model (ViT-L/32) from original paper (https://arxiv.org/abs/2010.11929). No pretrained weights.
    ra  r6     r   rZ  vit_large_patch32_224rG  N)rr  r\  r]  rC   rC   rD   rr  
  r_  rr  c                 K   rX  )
z ViT-Large model (ViT-L/32) from original paper (https://arxiv.org/abs/2010.11929).
    ImageNet-1k weights fine-tuned from in21k @ 384x384, source https://github.com/google-research/vision_transformer.
    ra  r6  rq  r   rZ  vit_large_patch32_384rG  N)rs  r\  r]  rC   rC   rD   rs  
  rl  rs  c                 K   2   t ddddd}tdd| it |fi |}|S )	z ViT-Large model (ViT-L/16) from original paper (https://arxiv.org/abs/2010.11929).
    ImageNet-1k weights fine-tuned from in21k @ 224x224, source https://github.com/google-research/vision_transformer.
    r   r6  rq  rZ  vit_large_patch16_224rG  N)ru  r\  r]  rC   rC   rD   ru    rl  ru  c                 K   rt  )	z ViT-Large model (ViT-L/16) from original paper (https://arxiv.org/abs/2010.11929).
    ImageNet-1k weights fine-tuned from in21k @ 384x384, source https://github.com/google-research/vision_transformer.
    r   r6  rq  rZ  vit_large_patch16_384rG  N)rv  r\  r]  rC   rC   rD   rv    rl  rv  c                 K   rX  )
z  ViT-Large model (ViT-L/14)
       r6  rq  r   rZ  vit_large_patch14_224rG  N)rx  r\  r]  rC   rC   rD   rx    r_  rx  c                 K   rX  )
zW ViT-Huge model (ViT-H/14) from original paper (https://arxiv.org/abs/2010.11929).
    rw  r7  ra  r   rZ  vit_huge_patch14_224rG  N)ry  r\  r]  rC   rC   rD   ry  #  r_  ry  c                 K   4   t dddddd}td
d| it |fi |}|S )zq ViT-Giant (little-g) model (ViT-g/14) from `Scaling Vision Transformers` - https://arxiv.org/abs/2106.04560
    rw    tE]t@(   r   r   r   rU   r   rT   vit_giant_patch14_224rG  N)r  r\  r]  rC   rC   rD   r  ,     r  c                 K   6   t dddddd}t	d
d| it |fi |}|S )zq ViT-Gigantic (big-G) model (ViT-G/14) from `Scaling Vision Transformers` - https://arxiv.org/abs/2106.04560
    rw    ;;@0   r   r~  vit_gigantic_patch14_224rG  N)r  r\  r]  rC   rC   rD   r  5     r  c                 K   6   t dddddd}t	d	d| it |fi |}|S )
z ViT-Base (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929).
    Weights taken from: https://github.com/Alibaba-MIIL/ImageNet21K
    r   r   r   F)r   r   r   rT   rV   vit_base_patch16_224_miilrG  N)r  r\  r]  rC   rC   rD   r  ?  s   r  c                 K   >   t dddddddddd	}t		dd
| it |fi |}|S )zB ViT-Medium (ViT-M/16) w/o class token, w/ avg-pool @ 240x240
    r   r5  r   rh  Fr   r5  	r   r   r   rT   r   r   rV   r6   r   vit_medium_patch16_gap_240rG  N)r  r\  r]  rC   rC   rD   r  J     
r  c                 K   r  )zB ViT-Medium (ViT-M/16) w/o class token, w/ avg-pool @ 256x256
    r   r5  r   rh  Fr   r5  r  vit_medium_patch16_gap_256rG  Nr  r\  r]  rC   rC   rD   r  V  r  r  c                 K   r  )zB ViT-Medium (ViT-M/16) w/o class token, w/ avg-pool @ 384x384
    r   r5  r   rh  Fr   r5  r  vit_medium_patch16_gap_384rG  N)r  r\  r]  rC   rC   rD   r  b  r  r  c                 K   r  )zC ViT-Betwixt (ViT-b/16) w/o class token, w/ avg-pool @ 256x256
    r   r9  r   
   Fr   r5  r  r  rG  Nr  r\  r]  rC   rC   rD   vit_betwixt_patch16_gap_256n  r  r  c              	   K   :   t dddddddd}t	d
d| it |fi |}|S )z@ ViT-Base (ViT-B/16) w/o class token, w/ avg-pool @ 224x224
    r   r   r   Fr   r   r   r   rT   r   r   r   r2  rG  N)r2  r\  r]  rC   rC   rD   r2  z     c              	   K   s:   t dddddddd}t	dd	| it |fi |}|S )z; ViT-Huge model (ViT-H/14) w/ no class token, avg pool
    rw  r7  ra  r   Fr   r  vit_huge_patch14_gap_224rG  N)r  r\  r]  rC   rC   rD   r    r  r  c              	   K   r  )zE ViT-Huge model (ViT-H/16) w/ no class token, avg pool @ 448x448
    r   r7  ra  Fr   r  vit_huge_patch16_gap_448rG  N)r  r\  r]  rC   rC   rD   r    r  r  c              
   K   s<   t ddddddddd}t	dd	| it |fi |}|S )zH ViT-Giant (little-gg) model (ViT-g/16) w/ no class token, avg pool
    r   r{  r}  r|  Fr   r   r   r   rT   rU   r   r   r   vit_giant_patch16_gap_224rG  N)r  r\  r]  rC   rC   rD   r    s   
r  c              	   K   >   t ddddttddd}t	d
d	| it |fi |}|S )Nr/  r  r  Tr4   epsr   r   rT   r   r_   vit_xsmall_patch16_clip_224rG  )r  r  r   r   rW  r]  rC   rC   rD   r       r  c              
   K   @   t dddddttddd}t		dd
| it |fi |}|S )Nra  r5  r   rh  Tr4   r  r   r   r   rT   r   r_   vit_medium_patch32_clip_224rG  )r  r  r]  rC   rC   rD   r       r  c              	   K   r  )Nr5  r   rh  Tr4   r  r  vit_medium_patch16_clip_224rG  )r  r  r]  rC   rC   rD   r    r  r  c              
   K   r  )Nra  r9  r   r  Tr4   r  r  vit_betwixt_patch32_clip_224rG  )r  r  r]  rC   rC   rD   r    r  r  c              
   K   @   t dddddttddd}t	dd	| it |fi |}|S )) ViT-B/32 CLIP image tower @ 224x224
    ra  r   r   Tr4   r  r  vit_base_patch32_clip_224rG  N)r  r  r]  rC   rC   rD   r       r  c              
   K   r  )z) ViT-B/32 CLIP image tower @ 256x256
    ra  r   r   Tr4   r  r  vit_base_patch32_clip_256rG  N)r  r  r]  rC   rC   rD   r    r  r  c              
   K   r  )z) ViT-B/32 CLIP image tower @ 384x384
    ra  r   r   Tr4   r  r  vit_base_patch32_clip_384rG  N)r  r  r]  rC   rC   rD   r    r  r  c              
   K   r  )z) ViT-B/32 CLIP image tower @ 448x448
    ra  r   r   Tr4   r  r  vit_base_patch32_clip_448rG  N)r  r  r]  rC   rC   rD   r    r  r  c              
   K   r  )z ViT-B/16 CLIP image tower
    r   r   r   Tr4   r  r  vit_base_patch16_clip_224rG  N)r  r  r]  rC   rC   rD   r    r  r  c              
   K   r  )z) ViT-B/16 CLIP image tower @ 384x384
    r   r   r   Tr4   r  r  vit_base_patch16_clip_384rG  N)r  r  r]  rC   rC   rD   r    r  r  c              
   K   @   t dddddttddd}t		dd
| it |fi |}|S )z5 ViT-Base (ViT-B/16+) CLIP image tower @ 240x240
    r   r=  r   rw  Tr4   r  r  vit_base_patch16_plus_clip_240rG  N)r  r  r]  rC   rC   rD   r    r  r  c              
   K   r  )z1 ViT-Large model (ViT-L/14) CLIP image tower
    rw  r6  rq  r   Tr4   r  r  vit_large_patch14_clip_224rG  N)r  r  r]  rC   rC   rD   r    r  r  c              
   K   r  )z; ViT-Large model (ViT-L/14) CLIP image tower @ 336x336
    rw  r6  rq  r   Tr4   r  r  vit_large_patch14_clip_336rG  N)r  r  r]  rC   rC   rD   r  %  r  r  c              
   K   r  )z1 ViT-Huge model (ViT-H/14) CLIP image tower.
    rw  r7  ra  r   Tr4   r  r  vit_huge_patch14_clip_224rG  N)r  r  r]  rC   rC   rD   r  0  r  r  c              
   K   r  )z: ViT-Huge model (ViT-H/14) CLIP image tower @ 336x336
    rw  r7  ra  r   Tr4   r  r  vit_huge_patch14_clip_336rG  N)r  r  r]  rC   rC   rD   r  ;  r  r  c              
   K   r  )z: ViT-Huge model (ViT-H/14) CLIP image tower @ 378x378
    rw  r7  ra  r   Tr4   r  r  vit_huge_patch14_clip_378rG  N)r  r  r]  rC   rC   rD   r  F  r  r  c                 K   B   t ddddddttddd	}t	
dd| it |fi |}|S )z ViT-Giant (little-g) model (ViT-g/14) from `Scaling Vision Transformers` - https://arxiv.org/abs/2106.04560
    Pretrained weights from CLIP image tower.
    rw  r{  r|  r}  r   Tr4   r  r   r   rU   r   rT   r   r_   vit_giant_patch14_clip_224rG  N)r  r  r]  rC   rC   rD   r  Q     
r  c                 K   r  )z ViT-bigG model (ViT-G/14) from `Scaling Vision Transformers` - https://arxiv.org/abs/2106.04560
    Pretrained weights from CLIP image tower.
    rw  r  r  r  r   Tr4   r  r  vit_gigantic_patch14_clip_224rG  N)r  r  r]  rC   rC   rD   r  _  r  r  c              
   K   B   t dddddttdddd}t		dd
| it |fi |}|S )r  ra  r   r   Tr4   r  
quick_gelur   r   r   rT   r   r_   r^   #vit_base_patch32_clip_quickgelu_224rG  N)r  r  r]  rC   rC   rD   r  m     
r  c              
   K   r  )z0 ViT-B/16 CLIP image tower w/ QuickGELU act
    r   r   r   Tr4   r  r  r  #vit_base_patch16_clip_quickgelu_224rG  N)r  r  r]  rC   rC   rD   r  z  r  r  c              
   K   B   t dddddttdddd	}t	
dd| it |fi |}|S )zB ViT-Large model (ViT-L/14) CLIP image tower w/ QuickGELU act
    rw  r6  rq  r   Tr4   r  r  r  $vit_large_patch14_clip_quickgelu_224rG  N)r  r  r]  rC   rC   rD   r    r  r  c              
   K   r  )zL ViT-Large model (ViT-L/14) CLIP image tower @ 336x336 w/ QuickGELU act
    rw  r6  rq  r   Tr4   r  r  r  $vit_large_patch14_clip_quickgelu_336rG  N)r  r  r]  rC   rC   rD   r    r  r  c              
   K   r  )zB ViT-Huge model (ViT-H/14) CLIP image tower w/ QuickGELU act.
    rw  r7  ra  r   Tr4   r  r  r  #vit_huge_patch14_clip_quickgelu_224rG  N)r  r  r]  rC   rC   rD   r    r  r  c              
   K   r  )zK ViT-Huge model (ViT-H/14) CLIP image tower @ 378x378 w/ QuickGELU act
    rw  r7  ra  r   Tr4   r  r  r  #vit_huge_patch14_clip_quickgelu_378rG  N)r  r  r]  rC   rC   rD   r    r  r  c                 K   sD   t ddddddttddd	d
}t	dd| it |fi |}|S )z0 ViT-bigG model (ViT-G/14) w/ QuickGELU act
    rw  r  r  r  r   Tr4   r  r  )r   r   rU   r   rT   r   r_   r^   'vit_gigantic_patch14_clip_quickgelu_224rG  N)r  r  r]  rC   rC   rD   r    s   r  c                 K   r  )z ViT-Base (ViT-B/32+)
    ra  r=  r   rw  r4   r   r   r   rT   r6   vit_base_patch32_plus_256rG  N)r  r\  r]  rC   rC   rD   r    r  r  c                 K   r  )z ViT-Base (ViT-B/16+)
    r   r=  r   rw  r4   r  vit_base_patch16_plus_240rG  N)r  r\  r]  rC   rC   rD   r    r  r  c                 K   s>   t dddddddtdd	}t	dd	| it |fi |}|S )z/ ViT-Base (ViT-B/16) w/ residual post-norm
    r   r   r   Fr4   r   )	r   r   r   rT   rV   r6   r   r   r   vit_base_patch16_rpn_224rG  N)r  )r  r{   rW  r]  rC   rC   rD   r    s   r  c                 K   r  )a   ViT-Base w/ LayerScale + 36 x 1 (36 block serial) config. Experimental, may remove.
    Based on `Three things everyone should know about Vision Transformers` - https://arxiv.org/abs/2203.09795
    Paper focuses on 24x2 + 48x1 for 'Small' width but those are extremely slow.
    r   r*  $   rb  r4   r  vit_small_patch16_36x1_224rG  N)r  r\  r]  rC   rC   rD   r    s   r  c                 K   8   t dddddtd}t	d
d| it |fi |}|S )a   ViT-Small w/ LayerScale + 18 x 2 (36 block parallel) config. Experimental, may remove.
    Based on `Three things everyone should know about Vision Transformers` - https://arxiv.org/abs/2203.09795
    Paper focuses on 24x2 + 48x1 for 'Small' width but those are extremely slow.
    r   r*     rb  r4   r   r   r   rT   r6   r   vit_small_patch16_18x2_224rG  N)r  r  r   rW  r]  rC   rC   rD   r    s   r  c                 K   r  )z ViT-Base w/ LayerScale + 18 x 2 (36 block parallel) config. Experimental, may remove.
    Based on `Three things everyone should know about Vision Transformers` - https://arxiv.org/abs/2203.09795
    r   r   r  r   r4   r  vit_base_patch16_18x2_224rG  N)r  r  r]  rC   rC   rD   r    s   r  c                 K   r  )zG EVA-large model https://arxiv.org/abs/2211.07636 /via MAE MIM pretrainrw  r6  rq  r   r   r   r   r   rT   r   eva_large_patch14_196rG  N)r  r\  r]  rC   rC   rD   r    s   r  c                 K   rz  )zF EVA-large model https://arxiv.org/abs/2211.07636 via MAE MIM pretrainrw  r6  rq  r   r   r  eva_large_patch14_336rG  N)r  r\  r]  rC   rC   rD   r    s   r  c                 K   rz  )z FlexiViT-Small
    r   r*  r   rb  Tr   r   r   rT   r   flexivit_smallrG  N)r  r\  r]  rC   rC   rD   r     r  r  c                 K   s4   t dddddd}td	d| it |fi |}|S )
z FlexiViT-Base
    r   r   r   Tr  flexivit_baserG  N)r  r\  r]  rC   rC   rD   r  )  r  r  c                 K   s4   t dddddd}td	d| it |fi |}|S )
z FlexiViT-Large
    r   r6  rq  Tr  flexivit_largerG  N)r  r\  r]  rC   rC   rD   r  2  r  r  c                 K   s@   t ddddddttddd
}t	d
d| it |fi |}|S )H ViT-Large model (ViT-L/14) w/ parallel blocks and qk norm enabled.
    r   r   r   TF
r   r   r   rT   r   r   r_   r   rV   rW   vit_base_patch16_xp_224rG  N)r  r  r   r   rW  r]  rC   rC   rD   r  ;     r  c                 K   @   t ddddddttddd
}t	dd	| it |fi |}|S )r  rw  r6  rq  r   TFr  vit_large_patch14_xp_224rG  N)r  r  r]  rC   rC   rD   r  H  r  r  c                 K   r  )zG ViT-Huge model (ViT-H/14) w/ parallel blocks and qk norm enabled.
    rw  r7  ra  r   TFr  vit_huge_patch14_xp_224rG  N)r  r  r]  rC   rC   rD   r  U  r  r  c                 K   r  )z ViT-S/14 for DINOv2
    rw  r*  r   rb  r4   r  vit_small_patch14_dinov2rG  N)r  r\  r]  rC   rC   rD   r  b  r  r  c                 K   r  )
z ViT-B/14 for DINOv2
    rw  r   r   r4   r  vit_base_patch14_dinov2rG  N)r  r\  r]  rC   rC   rD   r  l  r  r  c                 K   r  )z ViT-L/14 for DINOv2
    rw  r6  rq  r   r4   r  vit_large_patch14_dinov2rG  N)r  r\  r]  rC   rC   rD   r  v  r  r  c              
   K   s>   t ddddddttjd}t	dd	| it |fi |}|S ) ViT-G/14 for DINOv2
    rw     r}  rq  r4   h˹WU@)r   r   r   rT   r6   rU   r`   r^   vit_giant_patch14_dinov2rG  N)r  r  r   r;   SiLUrW  r]  rC   rC   rD   r    s   
r  c              	   K   :   t dddddddd}t		dd
| it |fi |}|S )z( ViT-S/14 for DINOv2 w/ 4 registers
    rw  r*  r   rb  r4   r  Tr   r   r   rT   r6   r   r   vit_small_patch14_reg4_dinov2rG  N)r  r\  r]  rC   rC   rD   r       
r  c              	   K   s:   t dddddddd}t	dd	| it |fi |}|S )z( ViT-B/14 for DINOv2 w/ 4 registers
    rw  r   r   r4   r  Tr  vit_base_patch14_reg4_dinov2rG  N)r  r\  r]  rC   rC   rD   r    r  r  c              	   K   r  )z( ViT-L/14 for DINOv2 w/ 4 registers
    rw  r6  rq  r   r4   r  Tr  vit_large_patch14_reg4_dinov2rG  N)r  r\  r]  rC   rC   rD   r    r  r  c                 K   sB   t ddddddttjddd	
}t	
dd| it |fi |}|S )r  rw  r  r}  rq  r4   r  r  T)
r   r   r   rT   r6   rU   r`   r^   r   r   vit_giant_patch14_reg4_dinov2rG  N)r  r  r]  rC   rC   rD   r    s   
r  c              	   K   s:   t dddddddd}t	d
d	| it |fi |}|S )Nra  r   r   Fr   	gelu_tanhr   r   r   rT   r   r   r^   vit_base_patch32_siglip_256rG  )r  r\  r]  rC   rC   rD   r       r  c                 K   8   t ddddddd}t	d	d| it |fi |}|S )
Nr   r   r   Fr   r   r   r   rT   r   r   vit_base_patch16_siglip_224rG  )r  r\  r]  rC   rC   rD   r       r  c                 K   r  )
Nr   r   r   Fr   r  vit_base_patch16_siglip_256rG  )r  r\  r]  rC   rC   rD   r    r  r  c                 K   r  )
Nr   r   r   Fr   r  vit_base_patch16_siglip_384rG  )r  r\  r]  rC   rC   rD   r    r  r  c                 K   r  )
Nr   r   r   Fr   r  vit_base_patch16_siglip_512rG  )r  r\  r]  rC   rC   rD   r    r  r  c                 K   8   t ddddddd}t	d	d| it |fi |}|S )
Nr   r6  rq  Fr   r  vit_large_patch16_siglip_256rG  )r  r\  r]  rC   rC   rD   r    r  r  c                 K   r  )
Nr   r6  rq  Fr   r  vit_large_patch16_siglip_384rG  )r  r\  r]  rC   rC   rD   r    r  r  c              	   K   :   t dddddddd}t	d
d	| it |fi |}|S )Nr   r6  rq  Fr   r  r  vit_large_patch16_siglip_512rG  )r  r\  r]  rC   rC   rD   r    r  r  c              	   K   :   t dddddddd}t		dd
| it |fi |}|S )Nrw       r   爅ZӼ@Fr   r   r   r   rT   rU   r   r   vit_so400m_patch14_siglip_224rG  )r  r\  r]  rC   rC   rD   r       r  c              	   K   r  )Nrw  r  r  r   r  Fr   r  vit_so400m_patch14_siglip_378rG  )r	  r\  r]  rC   rC   rD   r	  %     r	  c              	   K   r  )Nrw  r  r  r   r  Fr   r  vit_so400m_patch14_siglip_384rG  )r  r\  r]  rC   rC   rD   r  0  r  r  c              
   K   <   t ddddddddd}t		dd
| it |fi |}|S )Nr   r  r  r  Fr   r  r   r   r   rT   rU   r   r   r^   vit_so400m_patch16_siglip_256rG  )r  r\  r]  rC   rC   rD   r  :     r  c              
   K   r  )Nr   r  r  r  Fr   r  r  vit_so400m_patch16_siglip_384rG  )r  r\  r]  rC   rC   rD   r  E  r  r  c              
   K   r  )Nr   r  r  r  Fr   r  r  vit_so400m_patch16_siglip_512rG  )r  r\  r]  rC   rC   rD   r  P  r  r  c              	   K   r   )Nr   r  r}  Fr   r  r  vit_giantopt_patch16_siglip_256rG  )r  r\  r]  rC   rC   rD   r  [  r  r  c              	   K   r   )Nr   r  r}  Fr   r  r  vit_giantopt_patch16_siglip_384rG  )r  r\  r]  rC   rC   rD   r  f  r  r  c              
   K   s<   t ddddddddd}t	d
d	| it |fi |}|S )Nra  r   r   Fr   r  r   r   r   rT   r   r   r   r^   vit_base_patch32_siglip_gap_256rG  )r  r\  r]  rC   rC   rD   r  q  r  r  c              	   K   :   t dddddddd}t	d
d| it |fi |}|S )^ A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP).r   r   r   Fr   r  vit_base_patch16_siglip_gap_224rG  N)r  r\  r]  rC   rC   rD   r  |  r
  r  c              	   K   r  )r  r   r   r   Fr   r  vit_base_patch16_siglip_gap_256rG  N)r  r\  r]  rC   rC   rD   r    r
  r  c              	   K   r  )r  r   r   r   Fr   r  vit_base_patch16_siglip_gap_384rG  N)r  r\  r]  rC   rC   rD   r    r
  r  c              	   K   r  )r  r   r   r   Fr   r  vit_base_patch16_siglip_gap_512rG  N)r  r\  r]  rC   rC   rD   r    r
  r  c              	   K   r  )r  r   r6  rq  Fr   r   vit_large_patch16_siglip_gap_256rG  N)r  r\  r]  rC   rC   rD   r    r
  r  c              	   K   r  )r  r   r6  rq  Fr   r   vit_large_patch16_siglip_gap_384rG  N)r  r\  r]  rC   rC   rD   r    r
  r  c              
   K   <   t ddddddddd}t	d
d	| it |fi |}|S )Nr   r6  rq  Fr   r  r   vit_large_patch16_siglip_gap_512rG  )r  r\  r]  rC   rC   rD   r       
r  c              
   K   <   t ddddddddd}t		dd
| it |fi |}|S )r  rw  r  r  r   r  Fr   r  !vit_so400m_patch14_siglip_gap_224rG  N)r"  r\  r]  rC   rC   rD   r"       
r"  c              
   K   r!  )r  rw  r  r  r   r  Fr   r  !vit_so400m_patch14_siglip_gap_378rG  N)r$  r\  r]  rC   rC   rD   r$    r#  r$  c              
   K   r!  )r  rw  r  r  r   r  Fr   r  !vit_so400m_patch14_siglip_gap_384rG  N)r%  r\  r]  rC   rC   rD   r%    r#  r%  c              
   K   r!  )r  rw  r  r  r   r  Fr   r  !vit_so400m_patch14_siglip_gap_448rG  N)r&  r\  r]  rC   rC   rD   r&    r#  r&  c              
   K   r!  )r  rw  r  r  r   r  Fr   r  !vit_so400m_patch14_siglip_gap_896rG  N)r'  r\  r]  rC   rC   rD   r'    r#  r'  c                 K   s>   t dddddddddd	}t		dd
| it |fi |}|S )r  r   r  r  r  Fr   r  	r   r   r   rT   rU   r   r   r   r^   !vit_so400m_patch16_siglip_gap_256rG  N)r)  r\  r]  rC   rC   rD   r)       
r)  c                 K   >   t dddddddddd	}t		dd
| it |fi |}|S )Nr   r  r  r  Fr   r  r(  !vit_so400m_patch16_siglip_gap_384rG  )r,  r\  r]  rC   rC   rD   r,       r,  c                 K   r+  )Nr   r  r  r  Fr   r  r(  !vit_so400m_patch16_siglip_gap_512rG  )r.  r\  r]  rC   rC   rD   r.    r-  r.  c              
   K   r  )Nr   r  r}  Fr   r  r  #vit_giantopt_patch16_siglip_gap_256rG  )r/  r\  r]  rC   rC   rD   r/  '  r   r/  c              
   K   r  )Nr   r  r}  Fr   r  r  #vit_giantopt_patch16_siglip_gap_384rG  )r0  r\  r]  rC   rC   rD   r0  2  r   r0  c                 K   @   t ddddddddd	d
d
}t	dd| it |fi |}|S )Nr   r/  rw  r  r4      FTr(   r   
r   r   r   rT   r6   rU   r   r   r   r   vit_wee_patch16_reg1_gap_256rG  )r4  r\  r]  rC   rC   rD   r4  =     r4  c                 K   sB   t dddddddddd	td
}t	dd| it |fi |}|S )Nr   r/  r  r4   r2  FTr(   r   )r   r   r   rT   r6   rU   r   r   r   r   r   vit_pwee_patch16_reg1_gap_256rG  )r6  )r  r   rW  r]  rC   rC   rD   r6  H  s   
r6  c                 K   r1  )Nr   @  rw  r2  r4   ffffff@FTr(   r   r3  vit_little_patch16_reg1_gap_256rG  )r9  r\  r]  rC   rC   rD   r9  S  r5  r9  c                 K   r1  )Nr   r7  rw  r2  r4   r8  FTr  r   r3  vit_little_patch16_reg4_gap_256rG  )r:  r\  r]  rC   rC   rD   r:  ^  r5  r:  c                 K   >   t ddddddddd	d
	}t	dd| it |fi |}|S )Nr   r5  r   rh  r4   FTr(   r   	r   r   r   rT   r6   r   r   r   r   vit_medium_patch16_reg1_gap_256rG  )r=  r\  r]  rC   rC   rD   r=  i     
r=  c                 K   r;  )Nr   r5  r   rh  r4   FTr  r   r<  vit_medium_patch16_reg4_gap_256rG  )r?  r\  r]  rC   rC   rD   r?  t  r>  r?  c                 K   r;  )Nr   r5     rh  r4   FTr  r   r<   vit_mediumd_patch16_reg4_gap_256rG  )rA  r\  r]  rC   rC   rD   rA    r>  rA  c                 K   r;  )Nr   r5  r@  rh  r4   FTr  r   r<   vit_mediumd_patch16_reg4_gap_384rG  )rB  r\  r]  rC   rC   rD   rB    r>  rB  c                 K   r;  )Nr   r9  r   r  r4   FTr(   r   r<   vit_betwixt_patch16_reg1_gap_256rG  )rC  r\  r]  rC   rC   rD   rC    r>  rC  c                 K   r;  )Nr   r9  r   r  r4   FTr  r   r<   vit_betwixt_patch16_reg4_gap_256rG  )rD  r\  r]  rC   rC   rD   rD    r>  rD  c                 K   r;  )Nr   r9  r   r  r4   FTr  r   r<   vit_betwixt_patch16_reg4_gap_384rG  )rE  r\  r]  rC   rC   rD   rE    r>  rE  c              
   K   s<   t ddddddddd}t		dd
| it |fi |}|S )Nr   r   r   FTr   r  )r   r   r   rT   r   r   r   r   vit_base_patch16_reg4_gap_256rG  )rF  r\  r]  rC   rC   rD   rF    r   rF  c              
   K   s<   t ddddddddd	}t	
dd| it |fi |}|S )F SO150M (shape optimized, but diff than paper def, optimized for GPU) r   r=  r  rw  ~jt@Fr  r   )r   r   r   rT   rU   r   r   r   vit_so150m_patch16_reg4_map_256rG  N)rI  r\  r]  rC   rC   rD   rI    r#  rI  c                 K   >   t dddddddddd		}t	
dd| it |fi |}|S )rG  r   r=  r  rw  rH  Fr  r   	r   r   r   rT   rU   r   r   r   r   vit_so150m_patch16_reg4_gap_256rG  N)rL  r\  r]  rC   rC   rD   rL    r*  rL  c                 K   rJ  )rG  r   r=  r  rw  rH  Fr  r   rK  vit_so150m_patch16_reg4_gap_384rG  N)rM  r\  r]  rC   rC   rD   rM    r*  rM  c                 K   @   t dddddddddd	d

}t	dd| it |fi |}|S )I SO150M v2 (shape optimized, but diff than paper def, optimized for GPU) r   @        NN@r4   Fr(   r   
r   r   r   rT   rU   r6   rV   r   r   r    vit_so150m2_patch16_reg1_gap_256rG  N)rU  r\  r]  rC   rC   rD   rU       rU  c                 K   rN  )rO  r   rP  rQ  rR  rS  r4   Fr(   r   rT   vit_so150m2_patch16_reg1_gap_384rG  N)rW  r\  r]  rC   rC   rD   rW    rV  rW  c                 K   rN  )rO  r   rP  rQ  rR  rS  r4   Fr(   r   rT   vit_so150m2_patch16_reg1_gap_448rG  N)rX  r\  r]  rC   rC   rD   rX    rV  rX  c              	   K   r  )Nrw  r6  rq  r   g?FT)r   r   r   rT   r6   r   r   vit_intern300m_patch14_448rG  )rY  r\  r]  rC   rC   rD   rY  	  s   rY  c                 K   X   t dddddddddddttd	d
ttd	d
td}t	dd| it |fi |}|S ) ViT Large AIM-v2 model
    rw  r6  rq  rh  F      @r   silur4   r  r   r   r   rT   r   r   rU   r   rV   rZ   r^   r_   r   r`   aimv2_large_patch14_224rG  N)r_  r  r   r   r   rW  r]  rC   rC   rD   r_       
r_  c                 K   rZ  ) ViT Huge AIM-v2 model
    rw  r  rq  r   FAfU@r   r]  r4   r  r^  aimv2_huge_patch14_224rG  N)rd  r`  r]  rC   rC   rD   rd  "  s   
rd  c                 K   rZ  ) ViT 1B AIM-v2 model
    rw     rq  r   Fr\  r   r]  r4   r  r^  aimv2_1b_patch14_224rG  N)rg  r`  r]  rC   rC   rD   rg  1  ra  rg  c                 K   X   t dddddddddddttdd	ttdd	td
}t	dd| it |fi |}|S ) ViT 3B AIM-v2 model
    rw     rq  Frc  r   r]  r4   r  r^  aimv2_3b_patch14_224rG  N)rk  r`  r]  rC   rC   rD   rk  ?  ra  rk  c                 K   rZ  )r[  rw  r6  rq  rh  Fr\  r   r]  r4   r  r^  aimv2_large_patch14_336rG  N)rl  r`  r]  rC   rC   rD   rl  M  ra  rl  c                 K   rZ  )rb  rw  r  rq  r   Frc  r   r]  r4   r  r^  aimv2_huge_patch14_336rG  N)rm  r`  r]  rC   rC   rD   rm  [  ra  rm  c                 K   rZ  )re  rw  rf  rq  r   Fr\  r   r]  r4   r  r^  aimv2_1b_patch14_336rG  N)rn  r`  r]  rC   rC   rD   rn  i  ra  rn  c                 K   rh  )ri  rw  rj  rq  Frc  r   r]  r4   r  r^  aimv2_3b_patch14_336rG  N)ro  r`  r]  rC   rC   rD   ro  w  ra  ro  c                 K   rZ  )r[  rw  r6  rq  rh  Fr\  r   r]  r4   r  r^  aimv2_large_patch14_448rG  N)rp  r`  r]  rC   rC   rD   rp    ra  rp  c                 K   rZ  )rb  rw  r  rq  r   Frc  r   r]  r4   r  r^  aimv2_huge_patch14_448rG  N)rq  r`  r]  rC   rC   rD   rq    ra  rq  c                 K   rZ  )re  rw  rf  rq  r   Fr\  r   r]  r4   r  r^  aimv2_1b_patch14_448rG  N)rr  r`  r]  rC   rC   rD   rr    ra  rr  c                 K   rh  )ri  rw  rj  rq  Frc  r   r]  r4   r  r^  aimv2_3b_patch14_448rG  N)rs  r`  r]  rC   rC   rD   rs    ra  rs  c                 K   s6   t ddddddd}tdd	| it |fi |}|S ) ViT Test
    r   @   rb  r   r   T)r   r   r   rT   rU   r   test_vitrG  N)rv  r\  r]  rC   rC   rD   rv    s   rv  c                 K   >   t ddddddddd	d
d
}tdd| it |fi |}|S )rt  r   ru  rh  r   r   Fr(   r   r4   T)
r   r   r   rT   rU   r   r   r   r6   r   	test_vit2rG  N)rx  r\  r]  rC   rC   rD   rx       

rx  c                 K   rw  )rt  r   `   	   r   r   Fr(   r   Tr4   )
r   r   r   rT   rU   r   r   r   r   r6   	test_vit3rG  N)r|  r\  r]  rC   rC   rD   r|    ry  r|  c                 K   s@   t dddddddddd	d
d}tdd| it |fi |}|S )rt  r   rz  r{  r   Fr(   r   r4   Trmsnorm)r   r   r   rT   rU   r   r   r   r6   r   r_   	test_vit4rG  N)r~  r\  r]  rC   rC   rD   r~    s   

r~  c                 K   sF   t dddddddddttddd	
}tdd| it |fi |}|S )zu BEiT3 Base model (ViT-Base size) with patch size 16x16.
    Remapped to VisionTransformer with scale_norm=True.
    r   r   r   r  Tr   r4   r  
r   r   r   rT   rU   rX   rY   r   r   r_   beit3_base_patch16_224rG  N)r  r  r]  rC   rC   rD   r       

r  c                 K   sF   t dddddddddttddd	
}tdd| it |fi |}|S )zw BEiT3 Large model (ViT-Large size) with patch size 16x16.
    Remapped to VisionTransformer with scale_norm=True.
    r   r6  rq  r  Tr   r4   r  r  beit3_large_patch16_224rG  N)r  r  r]  rC   rC   rD   r    r  r  c                 K   sF   t dddddddddttdd	d

}tdd| it |fi |}|S )zf BEiT3 Giant model with patch size 14x14.
    Remapped to VisionTransformer with scale_norm=True.
    rw  r{  r}  r   8mt@Tr   r4   r  r  beit3_giant_patch14_224rG  N)r  r  r]  rC   rC   rD   r    r  r  c                 K   sH   t ddddddddddttd	d
d}tdd| it |fi |}|S )z} BEiT3 Giant model with patch size 14x14 and image size 336x336.
    Remapped to VisionTransformer with scale_norm=True.
    r4  rw  r{  r}  r   r  Tr   r4   r  )r   r   r   r   rT   rU   rX   rY   r   r   r_   beit3_giant_patch14_336rG  N)r  r  r]  rC   rC   rD   r    s   
r  vit_tiny_patch16_224_in21kvit_small_patch32_224_in21kvit_small_patch16_224_in21kvit_base_patch32_224_in21kvit_base_patch16_224_in21kvit_base_patch8_224_in21kvit_large_patch32_224_in21kvit_large_patch16_224_in21kvit_huge_patch14_224_in21kvit_base_patch32_224_samzvit_base_patch32_224.samvit_base_patch16_224_samzvit_base_patch16_224.samvit_small_patch16_224_dinovit_small_patch8_224_dinovit_base_patch16_224_dinovit_base_patch8_224_dinovit_base_patch16_224_miil_in21k!vit_base_patch32_224_clip_laion2b)"vit_large_patch14_224_clip_laion2b!vit_huge_patch14_224_clip_laion2b"vit_giant_patch14_224_clip_laion2b)r   r(   Fr~  )r   rS   )r   rS   )r(   rC   r  F)r   F)r  )Fr  T)FNr  (  rK   copyloggingr&  oscollectionsr   	functoolsr   typingr   r   r   r   r   r	   r
   r   r   r   ImportErrortyping_extensionsr=   torch.nnr;   torch.nn.functional
functionalr   	torch.jitr   	timm.datar   r   r   r   r   r   timm.layersr   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   _builderr)   	_featuresr*   _manipulater+   r,   r-   r.   	_registryr/   r0   r1   __all__	getLoggerrH   _loggerrz   r3   rQ   r{   r   r   rO   r  rL   rN   r   r2   r;  rM   r  r  r8  r  no_gradr?  r  r  r   r  r  r  r'  default_cfgsr  _quick_gelu_cfgsrx  deepcopyr@  r  environr  lowerrS  rW  r[  r`  rc  rd  rf  rg  ri  rk  rm  rn  ro  rp  rr  rs  ru  rv  rx  ry  r  r  r  r  r  r  r  r2  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r	  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r"  r$  r%  r&  r'  r)  r,  r.  r/  r0  r4  r6  r9  r:  r=  r?  rA  rB  rC  rD  rE  rF  rI  rL  rM  rU  rW  rX  rY  r_  rd  rg  rk  rl  rm  rn  ro  rp  rq  rr  rs  rv  rx  r|  r~  r  r  r  r  rC   rC   rC   rD   <module>   s"   , T
H=fM
    . 

$ 
&

?
M
"&*.26:@CGMQUY]a
f
g
h
ilptx|     
             #  (  ,  0  4  :  @  F  L  T  Z  `  f  n
  r  x  {  ~                                           "    '    *    .    1    5    8    =    @    C    G    J    N    Q    V    Y    \    `    d    g    j    n    q    u    x
    {
    ~
     
      
      	      
            
      
            
      "
      &
      *
      .
      3
      8
      ?
      D
      I
      N
      S
      W
      \
      a
      f
      l      p      t
      x                                                                                     !        %        *        .        2        6        :        ?        C        G        L        P        U        V        W        Y        ^        c        i        n        s        y                                     
                                                            "          &          *          .          2          6          :          =          @          D          H          L          P          T          X          \          `          e          i          l          o          s          w          {                                                                                                                      #            &            )            ,            /            7            ;            ?            C            G            K            O            S            W            a            e            i            m            q            u            y                                                                                                               !              &              *              .              2              7              :              =              @              D              G              J              M              P              T              W              Z              ^              b              e              h              k              n              q              u              y              |                                                             
                                                                                                                %                )                -                1                5                9                =                A                E                I                M                Q                U                Y                ^                a                d                g                k                n                q                t                w                y
                {                          
         &									



		











					
						
	
	






























 
     	
