o
    پik                     @   sR  d Z ddlZddlmZ ddlmZmZmZmZm	Z	 ddl
Z
ddlmZ ddlmZmZ ddlmZmZmZmZmZ ddlmZmZmZmZmZ dd	lmZ dd
lmZ ddlm Z  ddl!m"Z"m#Z# dgZ$dddddZ%dddddZ&dddddZ'G dd dej(Z)G dd de
jj(Z*G dd  d e
jj(Z+G d!d" d"e
jj(Z,G d#d$ d$ej(Z-G d%d& d&ej(Z.G d'd( d(ej(Z/G d)d* d*ej(Z0G d+d, d,ej1Z2G d-d. d.ej(Z3G d/d dej(Z4dBd1d2Z5e"e5d3d4e5d3d4e5d3d4e5d3d4d5Z6dCd7d8Z7e#dCd9e4fd:d;Z8e#dCd9e4fd<d=Z9e#dCd9e4fd>d?Z:e#dCd9e4fd@dAZ;dS )DaJ   EfficientFormer-V2

@article{
    li2022rethinking,
    title={Rethinking Vision Transformers for MobileNet Size and Speed},
    author={Li, Yanyu and Hu, Ju and Wen, Yang and Evangelidis, Georgios and Salahi, Kamyar and Wang, Yanzhi and Tulyakov, Sergey and Ren, Jian},
    journal={arXiv preprint arXiv:2212.08059},
    year={2022}
}

Significantly refactored and cleaned up for timm from original at: https://github.com/snap-research/EfficientFormer

Original code licensed Apache 2.0, Copyright (c) 2022 Snap Inc.

Modifications and timm support by / Copyright 2023, Ross Wightman
    N)partial)DictListOptionalTupleUnionIMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STD)create_conv2dcreate_norm_layerget_act_layerget_norm_layerConvNormAct)DropPathtrunc_normal_	to_2tuple	to_ntuplendgrid   )build_model_with_cfg)feature_take_indices)checkpoint_seq)generate_default_cfgsregister_modelEfficientFormerV2)(   P        )    @      i   )r    0   x      )r    r#   `      )LS2S1S0)   r,      
   )   r/         )   r2   	      )   r5   r4   r/   )r/   r/   )r/   r/   r/   r/   r2   r2   r2   r2   r2   r2   r2   r/   r/   r/   r/   )
r/   r/   r/   r2   r2   r2   r2   r/   r/   r/   )r/   r/   )r/   r/   r2   r2   r2   r2   r2   r2   r/   r/   r/   r/   )r/   r/   r2   r2   r2   r2   r/   r/   )r/   r/   )	r/   r/   r2   r2   r2   r2   r/   r/   r/   )r/   r/   r2   r2   r/   r/   )r/   r/   )r/   r2   r2   r2   r/   r/   )r/   r2   r2   r/   c                       s6   e Zd Z								d
 fdd	Zdd	 Z  ZS )ConvNormr    Tbatchnorm2dNc              
      sH   |
pi }
t t|   t||||||||d| _t|	|fi |
| _d S )N)stridepaddingdilationgroupsbias)superr6   __init__r   convr   bn)selfin_channelsout_channelskernel_sizer9   r:   r;   r<   r=   
norm_layernorm_kwargs	__class__ R/home/ubuntu/.local/lib/python3.10/site-packages/timm/models/efficientformer_v2.pyr?   :   s   
zConvNorm.__init__c                 C      |  |}| |}|S N)r@   rA   rB   xrJ   rJ   rK   forwardU      

zConvNorm.forward)r   r   r7   r   r   Tr8   N__name__
__module____qualname__r?   rP   __classcell__rJ   rJ   rH   rK   r6   9   s    r6   c                       sv   e Zd ZU eeejf ed< dddddej	df fdd		Z
e d fdd	ZdejdejfddZdd Z  ZS )Attention2dattention_bias_cacher   r    r1   r/      Nc                    s  t    || _|d | _|| _t|} d ur6t fdd|D }t||d |d| _t	j
 dd| _nd | _d | _|| _| jd | jd	  | _t|| | _t|| | | _|| _| j| j }t||| _t||| _t|| j| _t| j| jd| jd
| _t	j| j| jd	d| _t	j| j| jd	d| _| | _t| j|d	| _ttt| jd t| jd	 d	}	|	dd d d f |	dd d d f    }
|
d | jd	  |
d	  }
tj	!t"|| j| _#| j$dt%|
dd i | _&d S )N      c                       g | ]	}t |  qS rJ   mathceil.0rr9   rJ   rK   
<listcomp>o       z(Attention2d.__init__.<locals>.<listcomp>r2   rE   r9   r<   bilinear)scale_factormoder   r   )rE   r<   )rE   .attention_bias_idxsF
persistent)'r>   r?   	num_headsscalekey_dimr   tupler6   stride_convnnUpsampleupsample
resolutionNintddh
attn_ratioqkvv_localConv2dtalking_head1talking_head2actprojtorchstackr   arangeflattenabs	Parameterzerosattention_biasesregister_buffer
LongTensorrX   )rB   dimrn   rl   ry   rt   	act_layerr9   khposrel_posrH   rb   rK   r?   ^   s>   


.(
zAttention2d.__init__Tc                    (   t  | |r| jri | _d S d S d S rM   r>   trainrX   rB   rh   rH   rJ   rK   r         

zAttention2d.traindevicereturnc                 C   Z   t j s| jr| jd d | jf S t|}|| jvr(| jd d | jf | j|< | j| S rM   r   jit
is_tracingtrainingr   ri   strrX   rB   r   
device_keyrJ   rJ   rK   get_attention_biases      

z Attention2d.get_attention_biasesc                 C   s>  |j \}}}}| jd ur| |}| ||| jd| jdddd}| ||| jd| jdddd}| |}| 	|}	||| jd| jdddd}|| | j
 }
|
| |j }
| |
}
|
jdd}
| |
}
|
| dd}||| j| jd | jd |	 }| jd ur| |}| |}| |}|S Nr   r   r2   r5   r   )shaperp   rz   reshaperl   ru   permuter{   r|   r}   rm   r   r   r   softmaxr   	transposerx   rt   rs   r   r   rB   rO   BCHWrz   r{   r|   r}   attnrJ   rJ   rK   rP      s(   

&&

 

"



zAttention2d.forwardTrS   rT   rU   r   r   r   Tensor__annotations__rq   GELUr?   no_gradr   r   r   rP   rV   rJ   rJ   rH   rK   rW   [   s   
 0	rW   c                       s$   e Zd Z fddZdd Z  ZS )LocalGlobalQueryc                    sD   t    tddd| _tj||ddd|d| _t||d| _d S )Nr   r5   r   r2   )rE   r9   r:   r<   )	r>   r?   rq   	AvgPool2dpoolr~   localr6   r   )rB   in_dimout_dimrH   rJ   rK   r?      s   
zLocalGlobalQuery.__init__c                 C   s*   |  |}| |}|| }| |}|S rM   )r   r   r   )rB   rO   local_qpool_qrz   rJ   rJ   rK   rP      s
   


zLocalGlobalQuery.forwardrR   rJ   rJ   rH   rK   r      s    r   c                       sv   e Zd ZU eeejf ed< ddddddej	f fdd		Z
e d fdd	ZdejdejfddZdd Z  ZS )Attention2dDownsamplerX   r      r1   r/   rY   Nc              	      s  t    || _|d | _|| _t|| _tdd | jD | _| jd | jd  | _	| jd | jd  | _
t|| | _t|| | | _|| _|pL|| _| j| j }t||| _t||d| _t|| jd| _t| j| jdd| jd| _| | _t| j| jd| _tt|| j	| _ttt| jd t| jd  d}	tttjd| jd dd	tjd| jd dd	 d}
|
d
d d d f |	d
d d d f  ! }|d | jd  |d  }| j"d|dd i | _#d S )NrZ   c                 S      g | ]	}t |d  qS r5   r\   r_   rJ   rJ   rK   rc      rd   z2Attention2dDownsample.__init__.<locals>.<listcomp>r   r   r2   r5   re   )step.ri   Frj   )$r>   r?   rl   rm   rn   r   rt   ro   resolution2ru   N2rv   rw   rx   ry   r   r   rz   r6   r{   r|   r}   r   r   rq   r   r   r   r   r   r   r   r   r   r   rX   )rB   r   rn   rl   ry   rt   r   r   r   k_posq_posr   rH   rJ   rK   r?      s>   




.(
zAttention2dDownsample.__init__Tc                    r   rM   r   r   rH   rJ   rK   r      r   zAttention2dDownsample.trainr   r   c                 C   r   rM   r   r   rJ   rJ   rK   r      r   z*Attention2dDownsample.get_attention_biasesc                 C   s  |j \}}}}| ||| jd| jdddd}| ||| jd| jdddd}| |}| 	|}	||| jd| jdddd}|| | j
 }
|
| |j }
|
jdd}
|
| dd}||| j| jd | jd |	 }| |}| |}|S r   )r   rz   r   rl   r   r   r{   ru   r|   r}   rm   r   r   r   r   rx   r   r   r   r   rJ   rJ   rK   rP     s   &&

 "

zAttention2dDownsample.forwardr   r   rJ   rJ   rH   rK   r      s   
 -	r   c                       s8   e Zd Zdddddejejf fdd	Zdd	 Z  ZS )

Downsampler2   r5   r   rY   Fc
           
         sh   t    t|}t|}t|}|	pt }	t||||||	d| _|r/t||||d| _d S d | _d S )N)rE   r9   r:   rF   )r   r   rt   r   )	r>   r?   r   rq   Identityr6   r@   r   r   )
rB   in_chsout_chsrE   r9   r:   rt   use_attnr   rF   rH   rJ   rK   r?     s*   
	
zDownsample.__init__c                 C   s&   |  |}| jd ur| || S |S rM   )r@   r   )rB   rO   outrJ   rJ   rK   rP   @  s   

zDownsample.forward	rS   rT   rU   rq   r   BatchNorm2dr?   rP   rV   rJ   rJ   rH   rK   r     s    %r   c                       s:   e Zd ZdZddejejddf fdd	Zdd Z  Z	S )	ConvMlpWithNormz`
    Implementation of MLP with 1*1 convolutions.
    Input: tensor with shape [B, C, H, W]
    N        Fc              	      s   t    |p|}|p|}t||dd||d| _|r't||d|d||d| _nt | _t|| _t	||d|d| _
t|| _d S )Nr   T)r=   rF   r   r2   )r<   r=   rF   r   )rF   )r>   r?   r   fc1midrq   r   Dropoutdrop1r6   fc2drop2)rB   in_featureshidden_featuresout_featuresr   rF   dropmid_convrH   rJ   rK   r?   M  s    



zConvMlpWithNorm.__init__c                 C   s6   |  |}| |}| |}| |}| |}|S rM   )r   r   r   r   r   rN   rJ   rJ   rK   rP   g  s   




zConvMlpWithNorm.forward)
rS   rT   rU   __doc__rq   r   r   r?   rP   rV   rJ   rJ   rH   rK   r   G  s    r   c                       s&   e Zd Zd fdd	Zdd Z  ZS )LayerScale2dh㈵>Fc                    s*   t    || _t|t| | _d S rM   )r>   r?   inplacerq   r   r   onesgamma)rB   r   init_valuesr   rH   rJ   rK   r?   q  s   
zLayerScale2d.__init__c                 C   s*   | j dddd}| jr||S || S )Nr   r   )r   viewr   mul_)rB   rO   r   rJ   rJ   rK   rP   v  s   zLayerScale2d.forward)r   FrR   rJ   rJ   rH   rK   r   p  s    r   c                	       s<   e Zd Zdejejddddddf	 fdd	Zd	d
 Z  ZS )EfficientFormerV2Block      @r   r   rY   NTc                    s   t    |
r,t||||	d| _|d urt||nt | _|dkr&t|nt | _	n	d | _d | _d | _	t
|t|| |||dd| _|d urMt||nt | _|dkr]t|| _d S t | _d S )N)rt   r   r9   r   T)r   r   r   rF   r   r   )r>   r?   rW   token_mixerr   rq   r   ls1r   
drop_path1r   rv   mlpls2
drop_path2)rB   r   	mlp_ratior   rF   	proj_drop	drop_pathlayer_scale_init_valuert   r9   r   rH   rJ   rK   r?   |  sB   

	$zEfficientFormerV2Block.__init__c                 C   sB   | j d ur|| | |  | }|| | | | }|S rM   )r   r   r   r   r   r   rN   rJ   rJ   rK   rP     s   
zEfficientFormerV2Block.forwardr   rJ   rJ   rH   rK   r   {  s    *r   c                       s&   e Zd Zejejf fdd	Z  ZS )Stem4c              
      sP   t    d| _t||d dddd||d| _t|d |dddd||d| _d S )Nr/   r5   r2   r   T)rE   r9   r:   r=   rF   r   )r>   r?   r9   r   conv1conv2)rB   r   r   r   rF   rH   rJ   rK   r?     s   
zStem4.__init__)rS   rT   rU   rq   r   r   r?   rV   rJ   rJ   rH   rK   r     s    r   c                       sB   e Zd Zddddddddddejejf fd	d
	Zdd Z  ZS )EfficientFormerV2StagerY   TNFr   r   r   r   c                    s   t    d| _t||
}
t|}|r+t||||||d| _|}tdd |D }n||ks1J t	 | _g }t
|D ]#}||	 d }t||||
| |oP||k||| |||d
}||g7 }q<tj| | _d S )NF)r   rt   rF   r   c                 S   r   r   r\   r_   rJ   rJ   rK   rc     rd   z3EfficientFormerV2Stage.__init__.<locals>.<listcomp>r   )	rt   r9   r   r   r   r   r   r   rF   )r>   r?   grad_checkpointingr   r   r   
downsamplero   rq   r   ranger   
Sequentialblocks)rB   r   dim_outdepthrt   r   block_stridedownsample_use_attnblock_use_attnnum_vitr   r   r   r   r   rF   r   	block_idx
remain_idxbrH   rJ   rK   r?     sD   


zEfficientFormerV2Stage.__init__c                 C   s8   |  |}| jrtj st| j|}|S | |}|S rM   )r   r   r   r   is_scriptingr   r   rN   rJ   rJ   rK   rP     s   

zEfficientFormerV2Stage.forwardr   rJ   rJ   rH   rK   r     s    9r   c                       sh  e Zd Z												
	
	
			d6 fdd	Zdd Zejjdd Zejjd7ddZ	ejjd8ddZ
ejjdejfddZd9dedee fddZejjd8dd Z				!	d:d"ejd#eeeee f  d$ed%ed&ed'edeeej eejeej f f fd(d)Z	*		d;d#eeee f d+ed,efd-d.Zd/d0 Zd7d1efd2d3Zd4d5 Z  ZS )<r   r2   r%   avgNr/   r8   r   gelu  r   r   Tc                    s  t    |dv sJ || _|| _g | _t|}tt||	d}t|
}
t	||d |
|d| _
|d }d t|}dd td|t||D }|pUdd	t|d
   }t||}g }t|D ]U}t fdd|D }t||| || ||| |dkrdnd |dk|dk||| ||| ||
|d}|| r d9  || }|  jt| d| dg7  _|| qbtj| | _|d  | _| _||d | _t|| _|dkrt|d |nt | _ || _!| j!r|dkrt|d |nt | _"nd | _"| #| j$ d| _%d S )N)r  r7   )epsr   )r   rF   r/   c                 S   s   g | ]}|  qS rJ   )tolist)r`   rO   rJ   rJ   rK   rc   "  s    z.EfficientFormerV2.__init__.<locals>.<listcomp>Fr   r   c                    r[   rJ   r\   )r`   srb   rJ   rK   rc   '  rd   r5   r2   )r   rt   r   r   r   r   r   r   r   r   r   r   rF   zstages.)num_chs	reductionmoduler   F)&r>   r?   num_classesglobal_poolfeature_infor   r   r   r   r   stemlenr   linspacesumsplitr   r   ro   r   dictappendrq   r   stagesnum_featureshead_hidden_sizenormr   	head_dropLinearr   headdist	head_distapplyinit_weightsdistilled_training)rB   depthsin_chansimg_sizer  
embed_dimsdownsamples
mlp_ratiosrF   norm_epsr   r  	drop_rateproj_drop_ratedrop_path_rater   r   distillationprev_dim
num_stagesdprr  icurr_resolutionstagerH   rb   rK   r?      sf   
" "$
zEfficientFormerV2.__init__c                 C   s@   t |tjrt|jdd |jd urtj|jd d S d S d S )N{Gz?)stdr   )
isinstancerq   r  r   weightr=   init	constant_)rB   mrJ   rJ   rK   r#  O  s   
zEfficientFormerV2.init_weightsc                 C   s   dd |   D S )Nc                 S   s   h | ]
\}}d |v r|qS )r   rJ   )r`   r{   _rJ   rJ   rK   	<setcomp>W  s    z4EfficientFormerV2.no_weight_decay.<locals>.<setcomp>)named_parametersrB   rJ   rJ   rK   no_weight_decayU  s   z!EfficientFormerV2.no_weight_decayFc                 C   s   t dddgd}|S )Nz^stem)z^stages\.(\d+)N)z^norm)i )r  r   )r  )rB   coarsematcherrJ   rJ   rK   group_matcherY  s
   zEfficientFormerV2.group_matcherc                 C   s   | j D ]}||_qd S rM   )r  r   )rB   enabler  rJ   rJ   rK   set_grad_checkpointinga  s   
z(EfficientFormerV2.set_grad_checkpointingr   c                 C   s   | j | jfS rM   r  r!  r@  rJ   rJ   rK   get_classifierf  s   z EfficientFormerV2.get_classifierr  r  c                 C   s^   || _ |d ur
|| _|dkrt| j|nt | _|dkr(t| j|| _d S t | _d S )Nr   )r  r  rq   r  r  r   r  r!  )rB   r  r  rJ   rJ   rK   reset_classifierj  s
    *z"EfficientFormerV2.reset_classifierc                 C   s
   || _ d S rM   )r$  )rB   rE  rJ   rJ   rK   set_distilled_trainingq  s   
z(EfficientFormerV2.set_distilled_trainingNCHWrO   indicesr  
stop_early
output_fmtintermediates_onlyc                 C   s   |dv sJ dg }t t| j|\}}	| |}t| jd }
tj s'|s+| j}n	| jd|	d  }t|D ]$\}}||}||v r\||
krW|rO| |n|}|	| q8|	| q8|ra|S ||
krj| |}||fS )a   Forward features that returns intermediates.

        Args:
            x: Input image tensor
            indices: Take last n blocks if int, all if None, select matching indices if sequence
            norm: Apply norm layer to compatible intermediates
            stop_early: Stop iterating over blocks when last desired intermediate hit
            output_fmt: Shape of intermediate feature outputs
            intermediates_only: Only return intermediate features
        Returns:

        )rK  zOutput shape must be NCHW.r   N)
r   r  r  r  r   r   r  	enumerater  r  )rB   rO   rL  r  rM  rN  rO  intermediatestake_indices	max_indexlast_idxr  feat_idxr5  x_interrJ   rJ   rK   forward_intermediatesu  s*   


z'EfficientFormerV2.forward_intermediatesr   
prune_norm
prune_headc                 C   sJ   t t| j|\}}| jd|d  | _|rt | _|r#| dd |S )z@ Prune layers not required for specified intermediates.
        Nr   r   r7   )r   r  r  rq   r   r  rI  )rB   rL  rX  rY  rR  rS  rJ   rJ   rK   prune_intermediate_layers  s   
z+EfficientFormerV2.prune_intermediate_layersc                 C   s"   |  |}| |}| |}|S rM   )r  r  r  rN   rJ   rJ   rK   forward_features  s   


z"EfficientFormerV2.forward_features
pre_logitsc                 C   sh   | j dkr|jdd}| |}|r|S | || |}}| jr.| jr.tj	 s.||fS || d S )Nr  )r5   r2   r   r5   )
r  meanr  r  r!  r$  r   r   r   r  )rB   rO   r\  x_distrJ   rJ   rK   forward_head  s   

zEfficientFormerV2.forward_headc                 C   rL   rM   )r[  r_  rN   rJ   rJ   rK   rP     rQ   zEfficientFormerV2.forward)r2   r%   r  NNr/   r8   r   r  r  r   r   r   r   r   Tr
  r   rM   )NFFrK  F)r   FT)rS   rT   rU   r?   r#  r   r   ignorerA  rD  rF  rq   ModulerH  rv   r   r   rI  rJ  r   r   r   boolr   rW  rZ  r[  r_  rP   rV   rJ   rJ   rH   rK   r     s~    O
 
5
r7   c                 K   s    | ddd dddt tddd|S )	Nr  )r2   r%   r%   Tgffffff?bicubicrG  zstem.conv1.conv)urlr  
input_size	pool_sizefixed_input_sizecrop_pctinterpolationr]  r7  
classifier
first_convr   )rd  kwargsrJ   rJ   rK   _cfg  s   rm  ztimm/)	hf_hub_id)z#efficientformerv2_s0.snap_dist_in1kz#efficientformerv2_s1.snap_dist_in1kz#efficientformerv2_s2.snap_dist_in1kz"efficientformerv2_l.snap_dist_in1kFc                 K   s0   | dd}tt| |fdtd|di|}|S )Nout_indices)r   r   r5   r2   feature_cfgT)flatten_sequentialro  )popr   r   r  )variant
pretrainedrl  ro  modelrJ   rJ   rK   _create_efficientformerv2  s   
rv  r   c                 K   <   t td td ddtd d}tdd| it |fi |S )Nr+   r5   r   r%  r(  r   r.  r*  efficientformerv2_s0rt  )ry  r  EfficientFormer_depthEfficientFormer_width EfficientFormer_expansion_ratiosrv  rt  rl  
model_argsrJ   rJ   rK   ry       ry  c                 K   rw  )Nr*   r5   r   rx  efficientformerv2_s1rt  )r  rz  r~  rJ   rJ   rK   r    r  r  c                 K   rw  )Nr)   r/   r6  rx  efficientformerv2_s2rt  )r  rz  r~  rJ   rJ   rK   r    r  r  c                 K   rw  )Nr(   r4   g?rx  efficientformerv2_lrt  )r  rz  r~  rJ   rJ   rK   r    r  r  )r7   r
  )<r   r]   	functoolsr   typingr   r   r   r   r   r   torch.nnrq   	timm.datar	   r
   timm.layersr   r   r   r   r   r   r   r   r   r   _builderr   	_featuresr   _manipulater   	_registryr   r   __all__r|  r{  r}  ra  r6   rW   r   r   r   r   r   r   r   r   r   r   rm  default_cfgsrv  ry  r  r  r  rJ   rJ   rJ   rK   <module>   s    "]S-)2D 
T
	