o
    پiu                    @   s  d Z ddlZddlmZ ddlmZmZmZmZm	Z	m
Z
mZmZ ddlZddlmZ ddlm  mZ ddlmZmZmZmZ ddlmZmZmZmZmZmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z* ddl+m,Z, dd	l-m.Z. dd
l/m0Z0 ddl1m2Z2m3Z3 dgZ4G dd dej5Z6G dd dej5Z7G dd dej5Z8G dd dej5Z9	ddee:ej;f dej5de:dee:ej;f fddZ<		ddee:ej;f dej5de:de=dee:ej;f f
dd Z>dd"e:d#e=de9fd$d%Z?dd'e:dee:ef fd(d)Z@dd'e:dee:ef fd*d+ZAe2i d,e@d-d.d/e@d-d0d1d2d3d4e@d-eed0d1d2d5d6e@d-eed7d1d2d5d8e@d-d9d1d2d3d:e@d-d9d1d2d3d;e@d-d9d1d2d3d<e@d-d0d1d=d>e@d-d0d1d=d?e@d-d9d1d=d@e@d-d9d1d=dAe@d-d9d1d=dBe@d-d9d1d2dCdDdEe@d-d9d1d2dCdDdFe@d-d9d1d2dCdDdGe@d-ddHdIe@d-ddHi dJe@d-ddHdKe@d-ddHdLe@d-ddHdMe@d-dNdHdOe@d-dNdHdPe@d-dQdHdRe@d-dSdHdTe@d-d0d1dSdUdVe@d-dNdHdWe@d-dNdHdXe@ddYdZe@d-d[d\d]d]d^d_e@d-d[d\d]d]d^d`e@d-d[d\d=dae@d-d[d\d]d]d^dbeAd-dcdNdddeeAd-d0dNddeAd-d9dfddeAd-d9dddeAd-d9dddeAd-d9ddddgZBe3dd#e=de9fdhdiZCe3dd#e=de9fdjdkZDe3dd#e=de9fdldmZEe3dd#e=de9fdndoZFe3dd#e=de9fdpdqZGe3dd#e=de9fdrdsZHe3dd#e=de9fdtduZIe3dd#e=de9fdvdwZJe3dd#e=de9fdxdyZKe3dd#e=de9fdzd{ZLe3dd#e=de9fd|d}ZMe3dd#e=de9fd~dZNe3dd#e=de9fddZOe3dd#e=de9fddZPe3dd#e=de9fddZQe3dd#e=de9fddZRe3dd#e=de9fddZSe3dd#e=de9fddZTe3dd#e=de9fddZUe3dd#e=de9fddZVe3dd#e=de9fddZWe3dd#e=de9fddZXe3dd#e=de9fddZYe3dd#e=de9fddZZe3dd#e=de9fddZ[e3dd#e=de9fddZ\dS )a   EVA

EVA from https://github.com/baaivision/EVA , paper: https://arxiv.org/abs/2211.07636

@article{EVA,
  title={EVA: Exploring the Limits of Masked Visual Representation Learning at Scale},
  author={Fang, Yuxin and Wang, Wen and Xie, Binhui and Sun, Quan and Wu, Ledell and Wang, Xinggang and Huang,
  Tiejun and Wang, Xinlong and Cao, Yue},
  journal={arXiv preprint arXiv:2211.07636},
  year={2022}
}

EVA-02: A Visual Representation for Neon Genesis - https://arxiv.org/abs/2303.11331
@article{EVA02,
  title={EVA-02: A Visual Representation for Neon Genesis},
  author={Fang, Yuxin and Sun, Quan and Wang, Xinggang and Huang, Tiejun and Wang, Xinlong and Cao, Yue},
  journal={arXiv preprint arXiv:2303.11331},
  year={2023}
}

This file contains EVA & EVA02 model implementations evolved from BEiT, additional models in vision_transformer.py.

Modifications by / Copyright 2023 Ross Wightman, original copyrights below
    N)partial)AnyCallableDictListOptionalSetTupleUnion)IMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STDOPENAI_CLIP_MEANOPENAI_CLIP_STD)
PatchEmbedMlpGluMlpSwiGLU	LayerNormDropPathPatchDropoutRotaryEmbeddingCatapply_rot_embed_catapply_keep_indices_nlctrunc_normal_resample_patch_embedresample_abs_pos_embedglobal_pool_nlc	to_2tupleuse_fused_attnAttentionRopeAttentionPoolLatent   )build_model_with_cfg)feature_take_indices)
checkpoint)generate_default_cfgsregister_modelEvac                       s   e Zd ZU dZejje ed< 											dd	e	d
e	dededede	de
de
dee	 dee dedef fddZ		ddeej deej fddZ  ZS )EvaAttentionzG EVA Attention with ROPE, no k-bias, and fused/unfused qkv options
    
fused_attn   TFr!           Ndim	num_headsqkv_bias	qkv_fusedqkv_bias_separatenum_prefix_tokens	attn_drop	proj_dropattn_head_dim
norm_layerqk_norm
scale_normc                    s  t    |s	|r|
dusJ d|| _|| }|	dur|	}|| j }|d | _|| _t | _|| _|rrtj	||d dd| _
d | _ | _| _|rhtt|| _| jdt|dd tt|| _n1d | _ | _| _n'tj	|||d| _tj	||dd| _tj	|||d| _d| _
d | _ | _| _|r|
| jnt | _|r|
| jnt | _t|| _|r|
|nt | _t	||| _t|| _dS )	a,  
        Args:
            dim: Input dimension of the token embeddings
            num_heads: Number of attention heads
            qkv_bias: Whether to add a bias term to the query, key, and value projections
            qkv_fused: Whether qkv projections are fused into one projection or separate
            qkv_bias_separate: Whether to apply bias to qkv as a separate addition or part of F.linear() call
            num_prefix_tokens: Number of reg/cls tokens at the beginning of the sequence that
                should not have position embeddings applied
            attn_drop: Dropout rate for attention weights
            proj_drop: Dropout rate for the output projection
            attn_head_dim: Dimension of each attention head (if None, computed as dim // num_heads)
            norm_layer: Normalization layer constructor to use for QK and scale normalization
            qk_norm: Enable normalization of query (Q) and key (K) vectors with norm_layer
            scale_norm: Enable normalization (scaling) of attention output with norm_layer
        Nz<norm_layer must be provided if qk_norm or scale_norm is Trueg         F)biask_bias)
persistent)super__init__r-   scaler1   r   r)   r0   nnLinearqkvq_projk_projv_proj	Parametertorchzerosq_biasregister_bufferv_biasr:   head_dimIdentityq_normk_normDropoutr2   normprojr3   )selfr,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   rK   attn_dim	__class__ C/home/ubuntu/.local/lib/python3.10/site-packages/timm/models/eva.pyr=   5   s>   


zEvaAttention.__init__rope	attn_maskc              	   C   s  |j \}}}| jdurR| jdu r| |}n"t| j| j| jf}| jr.| |}||7 }n
tj	|| jj
|d}|||d| jdddddd}|d\}	}
}n0| |||| jddd}	| |||| jddd}
| |||| jddd}| |	| |
}	}
|dur| j}tj|	ddddd|ddf t|	dddd|dddf |gdd	|}	tj|
ddddd|ddf t|
dddd|dddf |gdd	|}
| jrtj|	|
||| jr| jjnd
d}n:|	| j }	|	|
dd }|dur*|tj}| |ddddddf  t!d}|j"dd	}| |}|| }|dd|||}| #|}| $|}| %|}|S )a  Forward pass for the attention module.

        Args:
            x: Input tensor of shape (batch_size, sequence_length, embedding_dim)
            rope: Rotary position embeddings tensor for position-aware attention
            attn_mask: Optional attention mask to apply during attention computation

        Returns:
            Tensor of shape (batch_size, sequence_length, embedding_dim)
        N)weightr9   r8      r   r!      r,   r+   )rY   	dropout_pz-inf)&shaperA   rH   rF   catr:   rJ   r0   FlinearrZ   reshaper-   permuteunbindrB   	transposerC   rD   rM   rN   r1   r   type_asr)   scaled_dot_product_attentiontrainingr2   pr>   toboolmasked_fillfloatsoftmaxrP   rQ   r3   )rR   xrX   rY   BNCrA   r.   qkvnptattnrV   rV   rW   forwardw   sL   



"   VV

&



zEvaAttention.forward)r*   TTFr!   r+   r+   NNFTNN)__name__
__module____qualname____doc__rF   jitFinalrn   __annotations__intrp   r   r   r=   Tensorr{   __classcell__rV   rV   rT   rW   r(   0   s\   
 	
Er(   c                $       s   e Zd Zddddddddddddejedfded	ed
ededededededede	dededede
e dedede
e f" fddZdde
ej de
ej fddZ  ZS ) EvaBlockT      @Fr!   evar+   Nr,   r-   r.   r/   	mlp_ratio
swiglu_mlp	scale_mlpscale_attn_innerr1   	attn_typer3   r2   	drop_pathinit_values	act_layerr5   r4   c                    s@  t    ||| _|
dkrtnt}||||||	|||||d
| _|dur/t|t	| nd| _
|dkr:t|nt | _||| _t|| }|ro|r\t|||rU|nd|d| _n!t||d |re|ndtjd|d| _nt||||rw|nd|d	| _|durt|t	| nd| _|dkrt|| _dS t | _dS )
a   Initialize the EVA transformer block.

        Args:
          dim: Input dimension of the token embeddings
            num_heads: Number of attention heads
            qkv_bias: Whether to use bias terms in query, key, value projections
            qkv_fused: Whether to use a single projection for query, key, value
            mlp_ratio: Ratio of MLP hidden dimension to input dimension
            swiglu_mlp: Whether to use SwiGLU activation in the MLP
            scale_mlp: Whether to use normalization in the MLP
            scale_attn_inner: Whether to use normalization within the attention mechanism
            num_prefix_tokens: Number of tokens at the beginning of the sequence (class tokens, etc.)
            attn_type: Type of attention module to use ('eva' or 'rope')
            proj_drop: Dropout rate for projection layers
            attn_drop: Dropout rate for attention matrix
            drop_path: Stochastic depth rate
            init_values: Initial value for LayerScale, None = no LayerScale
            act_layer: Activation layer constructor
            norm_layer: Normalization layer constructor
            attn_head_dim: Dimension of each attention head (if None, computed as dim // num_heads)
        rX   	r-   r.   r/   r1   r2   r3   r4   r5   r7   Nr+   in_featureshidden_featuresr5   dropr\   Fr   r   r5   r   	gate_lastr   r   r   r   r5   r   )r<   r=   norm1r   r(   rz   r?   rE   rF   onesgamma_1r   rL   
drop_path1norm2r   r   mlpr   SiLUr   gamma_2
drop_path2)rR   r,   r-   r.   r/   r   r   r   r   r1   r   r3   r2   r   r   r   r5   r4   attn_clsr   rT   rV   rW   r=      sX   
)
"




	
"$zEvaBlock.__init__rX   rY   c              	   C   s   | j d u r$|| | j| |||d }|| | | | }|S || | j | j| |||d  }|| | j| | |  }|S N)rX   rY   )r   r   rz   r   r   r   r   r   rR   rr   rX   rY   rV   rV   rW   r{     s   
 & zEvaBlock.forwardr|   )r}   r~   r   r?   GELUr   r   rn   rp   strr   r   r=   rF   r   r{   r   rV   rV   rT   rW   r      sh    	
([r   c                $       s   e Zd ZdZddddddddddddejejdfd	ed
ededede	de
dedededede	de	de	dee	 dededee f" fddZd deej deej fddZ  ZS )!EvaBlockPostNormzF EVA block w/ post-norm and support for swiglu, MLP norm scale, ROPE. Tr   r   Fr!   r+   Nr,   r-   r.   r/   r   r   r   r   r   r1   r3   r2   r   r   r   r5   r4   c                    s   t    |dkrtnt}||||||
|||||	d
| _||| _|dkr)t|nt | _	t
|| }|rY|rFt|||r?|nd|d| _n!t||d |rO|ndtjd|d| _nt||||ra|nd|d	| _||| _|dkrwt|| _dS t | _dS )
a   Initialize the post-norm EVA transformer block.

        Args:
          dim: Input dimension of the token embeddings
            num_heads: Number of attention heads
            qkv_bias: Whether to use bias terms in query, key, value projections
            qkv_fused: Whether to use a single projection for query, key, value
            mlp_ratio: Ratio of MLP hidden dimension to input dimension
            swiglu_mlp: Whether to use SwiGLU activation in the MLP
            scale_mlp: Whether to use normalization in the MLP
            scale_attn_inner: Whether to use normalization within the attention mechanism
            num_prefix_tokens: Number of tokens at the beginning of the sequence (class tokens, etc.)
            attn_type: Type of attention module to use ('eva' or 'rope')
            proj_drop: Dropout rate for projection layers
            attn_drop: Dropout rate for attention matrix
            drop_path: Stochastic depth rate
            init_values: Initial value for LayerScale, None = no LayerScale (NOTE: ignored for post-norm block)
            act_layer: Activation layer constructor
            norm_layer: Normalization layer constructor
            attn_head_dim: Dimension of each attention head (if None, computed as dim // num_heads)
        rX   r   r+   Nr   r\   Fr   r   )r<   r=   r   r(   rz   r   r   r?   rL   r   r   r   r   r   r   r   r   r   )rR   r,   r-   r.   r/   r   r   r   r   r   r1   r3   r2   r   r   r   r5   r4   r   r   rT   rV   rW   r=   #  sT   
)




	

$zEvaBlockPostNorm.__init__rX   rY   c              
   C   s>   ||  | | j|||d }|| | | | }|S r   )r   r   rz   r   r   r   r   rV   rV   rW   r{   |  s    zEvaBlockPostNorm.forwardr|   )r}   r~   r   r   r?   r   r   r   rn   rp   r   r   r   r=   rF   r   r{   r   rV   rV   rT   rW   r   !  sj    	
(Yr   c                Q       s  e Zd ZdZddddddddd	d	d
ddddddddddedd	dd	dddddddddddddf'deeeeef f deeeeef f dedededededede	de	de
de	de	de	d ed!e
d"e
d#e
d$e
d%e
d&e
d'ed(ee
 d)e	d*ed+e	d,e	d-e
d.ed/e	d0e	d1ee	 d2ee	 d3ee d4ee
 d5e	d6e	d7eeeeef ef  d8e
fN fd9d:Zdhd<d=Zd>ejd;dfd?d@Zejjd;ee fdAdBZejjdidCe	d;dfdDdEZejjdjdFe	d;eeef fdGdHZejjd;ejfdIdJZdkdedee d;dfdKdLZd;eejeej f fdMdNZ					O	dldPejdQeeeee f  dRe	dSe	dTe	dUedVe	d;eeej eejeej f f fdWdXZ 	Y			dmdQeeee f dZe	d[e	fd\d]Z!dkdPejd^ee d;ejfd_d`Z"dPejd;ejfdadbZ#djdPejdce	d;ejfdddeZ$dPejd;ejfdfdgZ%  Z&S )nr'   a!   Eva Vision Transformer w/ Abs & Rotary Pos Embed

    This class implements the EVA and EVA02 models that were based on the BEiT ViT variant
      * EVA - abs pos embed, global avg pool
      * EVA02 - abs + rope pos embed, global avg pool, SwiGLU, scale Norm in MLP (ala normformer)
          r8     avg      Tr   Fr   r+   Nr   ijgMbP?img_size
patch_sizein_chansnum_classesglobal_pool	embed_dimdepthr-   r.   r/   r   r   r   r   r   	drop_ratepos_drop_ratepatch_drop_rateproj_drop_rateattn_drop_ratedrop_path_rater5   r   class_tokennum_reg_tokensuse_abs_pos_embuse_rot_pos_embrope_grid_offsetrope_grid_indexinguse_post_normuse_pre_transformer_normuse_post_transformer_normuse_fc_normattn_pool_num_headsattn_pool_mlp_ratiodynamic_img_sizedynamic_img_padref_feat_shapehead_init_scalec(           -         s  t    |dv sJ |_|_ _ __|rdnd| _|$_d_	|}(|!dur2|!})n|dk})| dur=| }*n|) }*i }+|$rM|+
tddd td||||%| d	|+_jj},tjd
rlj n||rzttddnd_|rttd|nd_|ojdu _|rttd|,j nd_tj|d_|dkrt|jdd_nd_|r|&durt|&nd}&t d|$rdnjj|&||d_ nd_ |(rnt! _"dd t#d||D |rt$nt%t& 	
fddt'|D _(fddt'|D _)|*r0nt! _*|dkrQ|"p>}"|#pC}#t+j|"|#tj,d_-nd_-|)r[nt! _.t|_/|dkrqt0|nt! _12j3 jdurt4jdd jdurt4jdd jdurt4jdd 5  t6j1tj0rt4j1j7dd j1j7j89|' j1j:j89|' dS dS )a  Initialize the EVA Vision Transformer model.

        Args:
            img_size: Input image size (single int for square, or tuple for rectangular)
            patch_size: Patch size to divide image into tokens (single int for square, or tuple)
            in_chans: Number of input image channels
            num_classes: Number of classes (output dim) for classification head (final projection), 0 for pass-through
            global_pool: Type of global pooling for final sequence ('avg', 'token', 'map', etc.)
            embed_dim: Embedding dimension for tokens
            depth: Number of transformer blocks
            num_heads: Number of attention heads
            qkv_bias: Enable bias for query, key, value projections
            qkv_fused: Use a single projection for query, key, value
            mlp_ratio: Ratio of mlp hidden dim to embedding dim
            swiglu_mlp: Use SwiGLU activation in MLP
            scale_mlp: Apply scaling normalization in MLP (normformer style)
            scale_attn_inner: Apply scaling normalization inside attention
            attn_type: Type of attention module to use
            drop_rate: Dropout rate after final projection and pooling
            pos_drop_rate: Dropout rate for positional embeddings
            patch_drop_rate: Rate of dropping patches during training
            proj_drop_rate: Dropout rate for projections
            attn_drop_rate: Dropout rate for attention
            drop_path_rate: Stochastic depth rate
            norm_layer: Normalization layer constructor
            init_values: Initial layer-scale values
            class_token: Use class token
            num_reg_tokens: Number of additional learnable 'register' tokens to add to the sequence
            use_abs_pos_emb: Use absolute (learned) positional embeddings
            use_rot_pos_emb: Use rotary position embeddings
            rope_grid_offset: Offset for rotary position embedding grid
            rope_grid_indexing: Indexing mode for rotary position embeddings ('ij' or 'xy')
            use_post_norm: Use post-norm transformer block type
            use_pre_transformer_norm: Use normalization layer before transformer blocks
            use_post_transformer_norm: Use normalization layer after transformer blocks
            use_fc_norm: Use normalization layer after pooling, before final classifier
            attn_pool_num_heads: Number of heads in attention pooling
            attn_pool_mlp_ratio: MLP ratio in attention pooling
            dynamic_img_size: Support dynamic image sizes in forward pass
            dynamic_img_pad: Apply dynamic padding for irregular image sizes
            ref_feat_shape: Reference feature shape for rotary position embedding scale
            head_init_scale: Initialization scale for classification head weights
        ) r   avgmaxmaxtokenmapr!   r   FNr   NHWC)strict_img_size
output_fmt)r   r   r   r   r   r9   
feat_ratio)rl   T)r1   return_indices)	in_pixels
feat_shaper   grid_offsetgrid_indexingc                 S   s   g | ]}|  qS rV   )item).0rr   rV   rV   rW   
<listcomp>"  s    z Eva.__init__.<locals>.<listcomp>c                    s8   g | ]}
j 	 | d qS ))r,   r-   r.   r/   r   r   r   r   r   r1   r3   r2   r   r5   r   r1   r   i)r   r   block_fndprr   r   r   r5   r-   r   r.   r/   r   r   rR   r   rV   rW   r   $  s(    c                    s    g | ]}t d |  dqS )blocks.)modulenum_chs	reductiondictr   )r   rrV   rW   r   7  s    r   )r-   r   r5   r   {Gz?stdrV   );r<   r=   r   r   num_featureshead_hidden_sizer   r1   r   grad_checkpointingupdater   r   patch_embednum_patcheshasattrr   r?   rE   rF   rG   	cls_token	reg_token	cls_embed	pos_embedrO   pos_dropr   
patch_dropr   r   	grid_sizerX   rL   norm_prelinspacer   r   
ModuleListrangeblocksfeature_inforP   r    r   	attn_poolfc_norm	head_dropr@   headapply_init_weightsr   fix_init_weight
isinstancerZ   datamul_r9   )-rR   r   r   r   r   r   r   r   r-   r.   r/   r   r   r   r   r   r   r   r   r   r   r   r5   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   activate_pre_normactivate_fc_normactivate_post_norm
embed_argsr   rT   )r   r   r   r   r   r   r   r5   r-   r   r.   r/   r   r   r   rR   r   rW   r=     s   
U	

	,




 zEva.__init__returnc                 C   sL   dd }t | jD ]\}}||jjjj|d  ||jjjj|d  q	dS )z=Fix initialization weights by rescaling based on layer depth.c                 S   s   |  td|  d S )Ng       @)div_mathsqrt)paramlayer_idrV   rV   rW   rescale\  s   z$Eva.fix_init_weight.<locals>.rescaler!   N)	enumerater   rz   rQ   rZ   r	  r   fc2)rR   r  r  layerrV   rV   rW   r  Z  s
   zEva.fix_init_weightmc                 C   s>   t |tjrt|jdd |jdurtj|j dS dS dS )zbInitialize weights for Linear layers.

        Args:
            m: Module to initialize.
        r   r   N)r  r?   r@   r   rZ   r9   initzeros_)rR   r  rV   rV   rW   r  c  s   
zEva._init_weightsc                 C   s   ddh}|S )z(Parameters to exclude from weight decay.r   r   rV   )rR   nwdrV   rV   rW   no_weight_decayn  s   zEva.no_weight_decayenablec                 C   s
   || _ dS )z)Enable or disable gradient checkpointing.N)r   )rR   r  rV   rV   rW   set_grad_checkpointingt  s   
zEva.set_grad_checkpointingcoarsec                 C   s   t dddgd}|S )z(Create layer groupings for optimization.z ^cls_token|pos_embed|patch_embed)z^blocks\.(\d+)N)z^norm)i )stemr   r   )rR   r   matcherrV   rV   rW   group_matchery  s
   zEva.group_matcherc                 C   s   | j S N)r  rR   rV   rV   rW   get_classifier  s   zEva.get_classifierc                 C   s>   || _ |dur
|| _|dkrt| j|| _dS t | _dS )zReset the classifier head.

        Args:
            num_classes: Number of output classes.
            global_pool: Global pooling type.
        Nr   )r   r   r?   r@   r   rL   r  )rR   r   r   rV   rV   rW   reset_classifier  s   *zEva.reset_classifierc                 C   sz  | j r:|j\}}}}| jd ur | jj}t| j||f|| jd}nd }||d|}| jd ur7| jj	||fdnd }n| j}| jd urG| j	 nd }| j
d ur`tj| j
|jd dd|fdd}|d urh|| }| jd urg }	| j
d ur|	| j
|jd dd |	| j|jd dd tj|	|g dd}| |}| jd ur| |\}}
|d ur|
d urt|||
}||fS )N)new_sizeold_sizer1   r[   )ra   r   r!   r^   )r   ra   r   r   r   r   r1   viewrX   	get_embedr   rF   rb   expandr   appendr   r   r   )rR   rr   rs   HWru   prev_grid_sizer   rot_pos_embedto_catkeep_indicesrV   rV   rW   
_pos_embed  s>   
"
$



zEva._pos_embedNCHWrr   indicesreturn_prefix_tokensrP   
stop_earlyr   intermediates_onlyc                    st  |dv sJ d|dk}g }	t tj|\}
}|j\ }}}|}|\}}|}tj	 s7|s;j}n	jd|d  }t
|D ]*\}}jr\tj	 s\t|||d}n|||d}||
v rr|	|ro|n| qHjrfdd|	D }fd	d|	D }	|rj||f\ fd
d|	D }	tj	 s|rtt|	|}	|r|	S |}||	fS )a)   Forward features that returns intermediates.
        Args:
            x: Input image tensor
            indices: Take last n blocks if an int, if is a sequence, select by matching indices
            return_prefix_tokens: Return both prefix and spatial intermediate tokens
            norm: Apply norm layer to all intermediates
            stop_early: Stop iterating over blocks when last desired intermediate hit
            output_fmt: Shape of intermediate feature outputs
            intermediates_only: Only return intermediate features
        )r5  NLCz>Output format for EVA-ViT features must be one of NCHW or NLC.r5  Nr!   rX   c                    s"   g | ]}|d d d j f qS )Nr   r   r   yr%  rV   rW   r        " z-Eva.forward_intermediates.<locals>.<listcomp>c                    s"   g | ]}|d d  j d f qS r$  r   r<  r%  rV   rW   r     r>  c                    s,   g | ]}|  d dddd qS )r[   r   r8   r!   r\   )re   rf   
contiguousr<  )rs   r.  r/  rV   rW   r     s   , )r#   lenr   ra   r   r4  r   rF   r   is_scriptingr  r   r$   r-  rP   r1   dynamic_feat_sizelistzip)rR   rr   r6  r7  rP   r8  r   r9  re   intermediatestake_indices	max_index_heightwidthr1  r   r   blkprefix_tokensrV   )rs   r.  r/  rR   rW   forward_intermediates  s<   


zEva.forward_intermediatesr!   
prune_norm
prune_headc                 C   sZ   t t| j|\}}| jd|d  | _|rt | _|r+d| _t | _| dd |S )z@ Prune layers not required for specified intermediates.
        Nr!   r   r   )	r#   r@  r   r?   rL   rP   r  r  r'  )rR   r6  rN  rO  rF  rG  rV   rV   rW   prune_intermediate_layers  s   

zEva.prune_intermediate_layers	pool_typec                 C   s>   | j d ur|  |}|S |d u r| jn|}t||| jd}|S )N)rQ  r1   )r  r   r   r1   )rR   rr   rQ  rV   rV   rW   pool  s   

zEva.poolc                 C   sh   |  |}| |\}}| |}| jD ]}| jr&tj s&t|||d}q|||d}q| 	|}|S )zForward pass through feature extraction layers.

        Args:
            x: Input tensor.

        Returns:
            Feature tensor.
        r;  )
r   r4  r   r   r   rF   r   rA  r$   rP   )rR   rr   r1  rK  rV   rV   rW   forward_features  s   
	


zEva.forward_features
pre_logitsc                 C   s0   |  |}| |}| |}|r|S | |S )zForward pass through classifier head.

        Args:
            x: Feature tensor.
            pre_logits: Return pre-logits if True.

        Returns:
            Output tensor.
        )rR  r  r  r  )rR   rr   rT  rV   rV   rW   forward_head'  s   



zEva.forward_headc                 C   s   |  |}| |}|S )zoForward pass.

        Args:
            x: Input tensor.

        Returns:
            Output tensor.
        )rS  rU  )rR   rr   rV   rV   rW   r{   6  s   
	
zEva.forward)r  N)TFr$  )NFFFr5  F)r!   FT)'r}   r~   r   r   r   r
   r   r	   r   rn   rp   r   r   r=   r  r?   Moduler  rF   r   ignorer   r  r  r   r   r#  r&  r'  r   r4  r   rM  rP  rR  rS  rU  r{   r   rV   rV   rT   rW   r'     sT   		
 !"#$%&'( 
Q	, 	
@
 visual.
state_dictmodelprefixr  c           
      C   s  |  d| } dd |  D } i }g d}t|}|  D ]\}}|r/||s)q||d }|D ]}||d |d }q1|dr|d	d}|d
d}|dd}|dr|jd d }	|drw|d|	 |d< ||	d |d< n|dr|d|	 |d< ||	d |d< qn/|dkrd}|dd}t	|jd |d< n|dkrd}|
d
d}n	|dkr|
d}|||< q|S )zConvert Perception Encoder weights.

    Args:
        state_dict: State dictionary to convert.
        model: Target model instance.
        prefix: Prefix to strip from keys.

    Returns:
        Converted state dictionary.
    r[  c                 S   s   i | ]\}}| d d|qS )zmodule.r   )replace)r   rw   rx   rV   rV   rW   
<dictcomp>T  s    z_convert_pe.<locals>.<dictcomp>))conv1patch_embed.proj)positional_embeddingr   )ztransformer.resblocks.r   )ln_prer   )ln_postrP   )ln_rP   )z
ls_1.gammar   )z
ls_2.gammar   )in_proj_zqkv.)out_projrQ   )zmlp.c_fcmlp.fc1)z
mlp.c_projmlp.fc2Nr   r!   r  zattn_pool.attnzattn_pool.layernormzattn_pool.normzattn_pool.probezattn_pool.latentzattn_pool.qkvr8   rZ   zattn_pool.q.weightzattn_pool.kv.weightr9   zattn_pool.q.biaszattn_pool.kv.biasrQ   zhead.weightz	head.biasclass_embeddingr   r   )getitemsr@  
startswithr]  ra   endswithrh   rF   rG   	unsqueeze)
rZ  r[  r\  out_dictswaps
len_prefixrw   rx   spr,   rV   rV   rW   _convert_peD  sJ   




	

rs  bicubicTinterpolation	antialiasc              	   C   s  i }|  d| } |  d| } |  d| } |  d| } d| v r#t| |S d| v r.t| |ddS d	| v r5d
}n	d| v r<d}nd}|d | v }|d | v }t|}|  D ]\}	}
|rd|	|s^qR|	|d }	d|	v riqRd|	v r|jjjj\}}}}|
jd |ks|
jd |krt	|
||f||dd}
n)|	dkr|
jd |j
jd krt|ddrdnt|dd}t|
|jj|||dd}
|	dd}	|	dd }	|	d!d"}	|	d#d$}	|	d%d&}	|	d'd(}	|r|	d)d*}	|	d+d,}	|r|	d-v r|	d.ks|	d/kr|	d0d1}	nqR|
||	< qR|S )2aZ  Convert patch embedding weight from manual patchify + linear proj to conv.

    Args:
        state_dict: Checkpoint state dictionary.
        model: Target model instance.
        interpolation: Interpolation method for resizing.
        antialias: Whether to use antialiasing when resizing.

    Returns:
        Filtered state dictionary.
    	model_emar[  r   rZ  zvisual.conv1.weightzconv1.weightr   )r\  zvisual.trunk.pos_embedzvisual.trunk.zvisual.pos_embedrY  
mask_tokenzblocks.0.attn.q_proj.weightNrX   zpatch_embed.proj.weightr[   r`   T)ru  rv  verboser   r!   no_embed_classFr   r1   )r(  r1   ru  rv  ry  z
mlp.ffn_lnzmlp.normzattn.inner_attn_lnz	attn.normzmlp.w12rg  zmlp.w1z	mlp.fc1_gzmlp.w2z	mlp.fc1_xzmlp.w3rh  rH   zq_proj.biasrJ   zv_proj.bias)rx  zlm_head.weightzlm_head.biasnorm.weight	norm.biasr{  r|  rP   r  )rj  rs  r@  rk  rl  r   rQ   rZ   ra   r   r   getattrr   r   r]  )rZ  r[  ru  rv  ro  r\  mim_weightsno_qkvrq  rw   rx   rH  r.  r/  r1   rV   rV   rW   checkpoint_filter_fn  sx   

	
r  Fvariant
pretrainedc                 K   s2   | dd}tt| |ftt|ddd|}|S )zCreate an EVA model.

    Args:
        variant: Model variant name.
        pretrained: Load pretrained weights.
        **kwargs: Additional model arguments.

    Returns:
        Instantiated Eva model.
    out_indicesr8   getter)r  feature_cls)pretrained_filter_fnfeature_cfg)popr"   r'   r  r   )r  r  kwargsr  r[  rV   rV   rW   _create_eva  s   
r  r   urlc                 K   s"   | ddddddt tddd	d
|S )zGenerate default configuration for EVA models.

    Args:
        url: Model weights URL.
        **kwargs: Additional configuration parameters.

    Returns:
        Model configuration dictionary.
    r   r8   r   r   Ng?rt  Tr`  r  mitr  r   
input_size	pool_sizecrop_pctru  fixed_input_sizemeanr   
first_conv
classifierlicense)r   r   r  r  rV   rV   rW   _cfg     r  c                 K   s"   | dddddddddd	d
d|S )zGenerate default configuration for Perception Encoder models.

    Args:
        url: Model weights URL.
        **kwargs: Additional configuration parameters.

    Returns:
        Model configuration dictionary.
    r   r  N      ?rt  T      ?r  r  r`  r  customr  rV   r  rV   rV   rW   _pe_cfg  r  r  z"eva_giant_patch14_224.clip_ft_in1kztimm/)	hf_hub_idz"eva_giant_patch14_336.clip_ft_in1k)r8   P  r  r  squash)r  r  r  	crop_modez(eva_giant_patch14_336.m30m_ft_in22k_in1k)r  r  r   r  r  r  z(eva_giant_patch14_560.m30m_ft_in22k_in1k)r8   0  r  z.eva02_base_patch14_448.mim_in22k_ft_in22k_in1k)r8     r  z/eva02_large_patch14_448.mim_in22k_ft_in22k_in1kz.eva02_large_patch14_448.mim_m38m_ft_in22k_in1kz(eva02_tiny_patch14_336.mim_in22k_ft_in1k)r  r  r  z)eva02_small_patch14_336.mim_in22k_ft_in1kz(eva02_base_patch14_448.mim_in22k_ft_in1kz)eva02_large_patch14_448.mim_in22k_ft_in1kz(eva02_large_patch14_448.mim_m38m_ft_in1kz)eva02_base_patch14_448.mim_in22k_ft_in22kiQU  )r  r  r  r  r   z*eva02_large_patch14_448.mim_in22k_ft_in22kz)eva02_large_patch14_448.mim_m38m_ft_in22kz eva02_tiny_patch14_224.mim_in22k)r  r   z!eva02_small_patch14_224.mim_in22kz eva02_base_patch14_224.mim_in22kz!eva02_large_patch14_224.mim_in22kz eva02_large_patch14_224.mim_m38mz$eva_giant_patch14_clip_224.laion400m   z#eva_giant_patch14_clip_224.merged2bz$eva02_base_patch16_clip_224.merged2b   z%eva02_large_patch14_clip_224.merged2br   z%eva02_large_patch14_clip_336.merged2b)r  r  r  r   z'eva02_enormous_patch14_clip_224.laion2bz,eva02_enormous_patch14_clip_224.laion2b_plusz(eva02_enormous_patch14_clip_224.pretrain)r   z-vit_medium_patch16_rope_reg1_gap_256.sbb_in1k)r8      r  gffffff?r  )r  r  r  r  r   z.vit_mediumd_patch16_rope_reg1_gap_256.sbb_in1kz.vit_betwixt_patch16_rope_reg4_gap_256.sbb_in1kz+vit_base_patch16_rope_reg1_gap_256.sbb_in1kzvit_pe_core_base_patch16_224.fbr  )r  r  r   z vit_pe_core_large_patch14_336.fbi   )z#vit_pe_core_gigantic_patch14_448.fbz vit_pe_lang_large_patch14_448.fbz#vit_pe_lang_gigantic_patch14_448.fbz&vit_pe_spatial_gigantic_patch14_448.fbc                 K   4   t dddddd}td
d| it |fi |}|S ). EVA-g model https://arxiv.org/abs/2211.07636      (   r   tE]t@r   r   r   r-   r   eva_giant_patch14_224r  N)r  r   r  r  r  
model_argsr[  rV   rV   rW   r       r  c                 K   r  )r  r  r  r  r   r  r  eva_giant_patch14_336r  N)r  r  r  rV   rV   rW   r    r  r  c                 K   r  )r  r  r  r  r   r  r  eva_giant_patch14_560r  N)r  r  r  rV   rV   rW   r     r  r  c                 K   <   t dddddddddd		}tdd| it |fi |}|S )Nr   r     r   r8   UUUUUU@Tr   r   	r   r   r   r   r-   r   r   r   r   eva02_tiny_patch14_224r  )r  r  r  rV   rV   rW   r  (     r  c                 K   r  )Nr   r    r      r  Tr  r  eva02_small_patch14_224r  )r  r  r  rV   rV   rW   r  9  r  r  c                 K   @   t dddddddddddd	}tdd| it |fi |}|S )Nr   r  r   r   Fr  Tr  r   r   r   r   r-   r/   r   r   r   r   r   eva02_base_patch14_224r  )r  r  r  rV   rV   rW   r  J     r  c                 K   @   t ddddddddddd	d
}tdd| it |fi |}|S )Nr   r  r     r   r  FTr  r   r   r   r   r-   r   r/   r   r   r   r   eva02_large_patch14_224r  )r  r  r  rV   rV   rW   r  ]  r  r  c                 K   r  )Nr  r  r  r   r8   r  Tr  r  eva02_tiny_patch14_336r  )r  r  r  rV   rV   rW   r  p  r  r  c                 K   r  )Nr  r  r  r   r  r  Tr  r  eva02_small_patch14_336r  )r  r  r  rV   rV   rW   r    r  r  c                 K   r  )Nr  r  r   r   Fr  Tr  r  eva02_base_patch14_448r  )r  r  r  rV   rV   rW   r    r  r  c                 K   r  )Nr  r  r  r  r   r  FTr  r  eva02_large_patch14_448r  )r  r  r  rV   rV   rW   r    r  r  c              
   K   s>   t ddddd|ddd}tdd
| it |fi |}|S )zB EVA-g CLIP model (only difference from non-CLIP is the pooling)  r  r  r  r   r  r   r   )r   r   r   r-   r   r   eva_giant_patch14_clip_224r  N)r  r   r  r  r  rV   rV   rW   r    s   

r  c                 K   sL   t dddddddddddd|d	d
d}tdd| it |fi |}|S )zU A EVA-CLIP specific variant that adds additional attn scale layernorm to eva02_base r   r   r   r   Fr  Tr  r   r   )r   r   r   r   r-   r/   r   r   r   r   r   r   r   eva02_base_patch16_clip_224r  N)r  r  r  rV   rV   rW   r    "   
r  c                 K   L   t dddddddddddd	|d
dd}tdd| it |fi |}|S )V A EVA-CLIP specific variant that adds additional attn scale layernorm to eva02_large r   r  r  r  r   r  FTr  r   r   r   r   r   r   r-   r   r/   r   r   r   r   r   r   eva02_large_patch14_clip_224r  N)r  r  r  rV   rV   rW   r    r  r  c                 K   r  )r  r  r  r  r  r   r  FTr  r   r   r  eva02_large_patch14_clip_336r  N)r  r  r  rV   rV   rW   r    r  r  c                 K   sB   t ddddddd|dd	d
}tdd| it |fi |}|S )zD A EVA-CLIP specific variant that uses residual post-norm in blocks r   r  i   @   r   gI$I$!@Tr   r   )r   r   r   r   r-   r   r   r   eva02_enormous_patch14_clip_224r  N)r  r  r  rV   rV   rW   r    s   

r  c                 K   D   t dddddddddd	ddd
d}tdd| it |fi |}|S )Nr  r   r  r   r*   Th㈵>Fr!   r  r   r   r   r   r-   r/   r.   r   r   r   r   r   r   $vit_medium_patch16_rope_reg1_gap_256r  )r  r  r  rV   rV   rW   r    "   r  c                 K   sD   t dddddddddd	ddd
d}tdd| it |fi |}|S )Nr  r   r     r*   TFr  r!   r  r  %vit_mediumd_patch16_rope_reg1_gap_256r  )r  r  r  rV   rV   rW   r  *  r  r  c                 K   r  )Nr  r   i  r   
   Tr  Fr]   r  r  %vit_betwixt_patch16_rope_reg4_gap_256r  )r  r  r  rV   rV   rW   r  ?  r  r  c                 K   sD   t ddddddddddddd	d
}tdd| it |fi |}|S )Nr  r   r   r   Tr  Fr!   r  r  "vit_base_patch16_rope_reg1_gap_256r  )r  r  r  rV   rV   rW   r  T  r  r  c                 K   sL   t ddddddddddd	d
ddttddd}tdd| it |fi |S )Nr   r   r   r   r   rX   T)r  r  r  xyr*   r  epsr   r   r   r-   r   r   r   r   r   r   r   r   r   r   r5   vit_pe_core_base_patch16_224r  )r  r   r   r   r  r  r  r  rV   rV   rW   r  i  $   
r  c                 K   sL   t dddddddddd	d
dddttddd}tdd| it |fi |S )Nr  r  r  r   r   r   rX   T)r  r  r  r  r*   r  r  r  vit_pe_core_large_patch14_336r  )r  r  r  rV   rV   rW   r    r  r  c                 K   sL   t ddddddddd	d	d
dddttddd}tdd| it |fi |S )Nr     2   r   UUUUUU@r   rX   FT    r  r  r*   r   r  r  )r   r   r   r-   r   r   r   r   r   r   r   r   r   r   r5    vit_pe_core_gigantic_patch14_448r  )r  r  r  rV   rV   rW   r    r  r  c                 K   s   t d!i ddddddddd	d
dddddddddddddddddddddttdd}td"d | it |fi |S )#Nr   r  r   r  r      r-   r   r   r   r   rX   r   Tr   r   r  r   r  r   r  r   r   Fr   r   皙?r5   r  r  vit_pe_lang_large_patch14_448r  rV   )r  r  r  rV   rV   rW   r    sD   	
r  c                 K   L   t ddddddddd	d
ddddttddd}tdd| it |fi |S )Nr  r  /   r   r  rX   FTr  r  r  r  r  r   r   r   r-   r   r   r   r   r   r   r   r   r   r   r5    vit_pe_lang_gigantic_patch14_448r  )r  r  r  rV   rV   rW   r    r  r  c                 K   r  )Nr  r  r  r   r  rX   FTr  r  r  r  r  r  #vit_pe_spatial_gigantic_patch14_448r  )r  r  r  rV   rV   rW   r    r  r  )rY  )rt  TrV  )r   )]r   r  	functoolsr   typingr   r   r   r   r   r   r	   r
   rF   torch.nnr?   torch.nn.functional
functionalrc   	timm.datar   r   r   r   timm.layersr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    _builderr"   	_featuresr#   _manipulater$   	_registryr%   r&   __all__rW  r(   r   r   r'   r   r   rs  rn   r  r  r  r  default_cfgsr  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  rV   rV   rV   rW   <module>   s   (P ga   H
I
_#*/49>EJOV[`ejqx         $  )  .  3  7  >  E  L 
 k	