o
    پid                  	   @   s.  d Z dgZddlZddlmZ ddlmZmZmZm	Z	m
Z
 ddlZddlmZ ddlmZmZ ddlmZmZmZmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZmZ G dd dejj Z!G dd dejj Z"G dd dejj#Z$G dd dejj#Z%G dd dejj#Z&G dd dejj#Z'G dd dejj#Z(G dd dejj#Z)G dd dejj#Z*G dd  d ejj Z+G d!d dej#Z,d7d#d$Z-ee-d%d&e-d%d&e-d%d&e-d%d&e-d%d&e-d%d&d'Z.d8d)d*Z/ed8d+d,Z0ed8d-d.Z1ed8d/d0Z2ed8d1d2Z3ed8d3d4Z4ed8d5d6Z5dS )9z EfficientViT (by MSRA)

Paper: `EfficientViT: Memory Efficient Vision Transformer with Cascaded Group Attention`
    - https://arxiv.org/abs/2305.07027

Adapted from official impl at https://github.com/microsoft/Cream/tree/main/EfficientViT
EfficientVitMsra    N)OrderedDict)DictListOptionalTupleUnionIMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STD)SqueezeExciteSelectAdaptivePool2dtrunc_normal__assert   )build_model_with_cfg)feature_take_indices)
checkpointcheckpoint_seq)register_modelgenerate_default_cfgsc                       s.   e Zd Zd fdd	Ze dd Z  ZS )ConvNormr   r   c	           	   
      s^   t    tj|||||||dd| _t|| _tjj	| jj
| tjj	| jjd d S )NFbiasr   )super__init__nnConv2dconvBatchNorm2dbntorchinit	constant_weightr   )	selfin_chsout_chsksstridepaddilationgroupsbn_weight_init	__class__ Q/home/ubuntu/.local/lib/python3.10/site-packages/timm/models/efficientvit_msra.pyr      s
   
zConvNorm.__init__c              	   C   s   | j | j}}|j|j|j d  }|j|d d d d d f  }|j|j|j |j|j d   }tjj	|
d| j j |
d|jdd  | j j| j j| j j| j jd}|jj| |jj| |S )N      ?r   r      )r)   paddingr+   r,   )r   r    r$   running_varepsr   running_meanr!   r   r   sizer,   shaper)   r4   r+   datacopy_)r%   cr    wbmr0   r0   r1   fuse!   s   $zConvNorm.fuse)r   r   r   r   r   r   __name__
__module____qualname__r   r!   no_gradr@   __classcell__r0   r0   r.   r1   r      s    r   c                       s.   e Zd Zd fdd	Ze dd Z  ZS )	
NormLinearT{Gz?        c                    sj   t    t|| _t|| _tj|||d| _t	| jj
|d | jjd ur3tj| jjd d S d S )Nr   )stdr   )r   r   r   BatchNorm1dr    DropoutdropLinearlinearr   r$   r   r"   r#   )r%   in_featuresout_featuresr   rJ   rM   r.   r0   r1   r   1   s   
zNormLinear.__init__c                 C   s   | j | j}}|j|j|j d  }|j| j j| j j |j|j d   }|j|d d d f  }|jd u r=|| jjj }n|j|d d d f  d| jj }t	j
|d|d}|jj| |jj| |S )Nr2   r   r   )r    rO   r$   r5   r6   r   r7   Tviewr!   r   rN   r8   r:   r;   )r%   r    rO   r=   r>   r?   r0   r0   r1   r@   ;   s   

$zNormLinear.fuse)TrH   rI   rA   r0   r0   r.   r1   rG   0   s    
rG   c                       $   e Zd Z fddZdd Z  ZS )PatchMergingc                    sl   t    t|d }t||ddd| _tj | _t||ddd|d| _	t
|d| _t||ddd| _d S )N   r   r      r3   r,   g      ?)r   r   intr   conv1r!   r   ReLUactconv2r   seconv3)r%   dimout_dimhid_dimr.   r0   r1   r   M   s   
zPatchMerging.__init__c                 C   s,   |  | | | | | |}|S N)r`   r_   r]   r^   r[   r%   xr0   r0   r1   forwardV   s   (zPatchMerging.forwardrB   rC   rD   r   rg   rF   r0   r0   r.   r1   rV   L   s    	rV   c                       s&   e Zd Zd fdd	Zdd Z  ZS )ResidualDroprI   c                    s   t    || _|| _d S rd   )r   r   r?   rM   )r%   r?   rM   r.   r0   r1   r   \   s   

zResidualDrop.__init__c              	   C   s`   | j r)| jdkr)|| |tj|dddd|jd| jd| j 	   S || | S )Nr   r   )device)
trainingrM   r?   r!   randr8   rj   ge_divdetachre   r0   r0   r1   rg   a   s   zResidualDrop.forward)rI   rh   r0   r0   r.   r1   ri   [   s    ri   c                       rU   )ConvMlpc                    s6   t    t||| _tj | _t||dd| _d S )Nr   r-   )	r   r   r   pw1r!   r   r\   r]   pw2)r%   edhr.   r0   r1   r   j   s   
zConvMlp.__init__c                 C   s   |  | | |}|S rd   )rs   r]   rr   re   r0   r0   r1   rg   p   s   zConvMlp.forwardrh   r0   r0   r.   r1   rp   i   s    rp   c                       sp   e Zd ZU eeejf ed< 	 				d fdd	Ze	 d fd	d
	Z
dejdejfddZdd Z  ZS )CascadedGroupAttentionattention_bias_cache   rW         r{   r{   r{   c                    s  t    || _|d | _|| _t|| | _|| _g }g }t|D ](}	|	t
|| | jd | j  |	t
| j| j||	 d||	 d | jd q"tj|| _tj|| _tjtj t
| j| |dd| _ttt|t|}
t|
}i }g }|
D ],}|
D ]'}t|d |d  t|d |d  f}||vrt|||< |	||  qqtjt|t|| _| jdt|||dd	 i | _d S )
Ng      r3   r   rY   r   rq   attention_bias_idxsF)
persistent)r   r   	num_headsscalekey_dimrZ   val_dim
attn_ratiorangeappendr   r!   r   
ModuleListqkvsdws
Sequentialr\   projlist	itertoolsproductlenabs	Parameterzerosattention_biasesregister_buffer
LongTensorrT   rw   )r%   ra   r   r~   r   
resolutionkernelsr   r   ipointsNattention_offsetsidxsp1p2offsetr.   r0   r1   r      s>   
	
 .(
zCascadedGroupAttention.__init__Tc                    s(   t  | |r| jri | _d S d S d S rd   )r   trainrw   )r%   moder.   r0   r1   r      s   

zCascadedGroupAttention.trainrj   returnc                 C   sZ   t j s| jr| jd d | jf S t|}|| jvr(| jd d | jf | j|< | j| S rd   )r!   jit
is_tracingrk   r   r|   strrw   )r%   rj   
device_keyr0   r0   r1   get_attention_biases   s   

z+CascadedGroupAttention.get_attention_biasesc                 C   s@  |j \}}}}|jt| jdd}g }|d }| |j}	tt| j| jD ]n\}
\}}|
dkr6|||
  }||}|	|d||j
| j| j| jgdd\}}}||}|d|d|d}}}|| j }|dd| }||	|
  }|jdd}||dd }|	|| j||}|| q&| t|d}|S )Nr   )ra   r   rR   r3   )r9   chunkr   r   r   rj   	enumeratezipr   rT   splitr   r   flattenr   	transposesoftmaxr   r   r!   cat)r%   rf   BCHWfeats_in	feats_outfeat	attn_biashead_idxqkvr   qkvattnr0   r0   r1   rg      s*   ,"
zCascadedGroupAttention.forward)rx   rW   ry   rz   T)rB   rC   rD   r   r   r!   Tensor__annotations__r   rE   r   rj   r   rg   rF   r0   r0   r.   r1   rv   u   s   
 *	rv   c                       s4   e Zd ZdZ					d fdd	Zd	d
 Z  ZS )LocalWindowAttentiona   Local Window Attention.

    Args:
        dim (int): Number of input channels.
        key_dim (int): The dimension for query and key.
        num_heads (int): Number of attention heads.
        attn_ratio (int): Multiplier for the query dim for value dimension.
        resolution (int): Input resolution.
        window_resolution (int): Local window resolution.
        kernels (List[int]): The kernel size of the dw conv on query.
    rx   rW   ry      rz   c                    sV   t    || _|| _|| _|dksJ d|| _t||}t||||||d| _d S )Nr   z"window_size must be greater than 0)r   r   r   )	r   r   ra   r~   r   window_resolutionminrv   r   r%   ra   r   r~   r   r   r   r   r.   r0   r1   r      s   


zLocalWindowAttention.__init__c              	   C   s  | j  }}|j\}}}}t||kd||f d||f  t||kd||f d||f  || jkr?|| jkr?| |}|S |dddd}| j|| j  | j }| j|| j  | j }	tjj	|ddd|	d|f}|| ||	 }
}|
| j }|| j }|
||| j|| j|dd}||| | | j| j|dddd}| |}|dddd
|||| j| j|}|dd||
||}|d d d |d |f  }|dddd}|S )Nz%input feature has wrong size, expect z, got r   r3   rX   r   )r   r9   r   r   r   permuter!   r   
functionalr*   rT   r   reshape
contiguous)r%   rf   r   r   r   r   H_W_pad_bpad_rpHpWnHnWr0   r0   r1   rg      s,   
""


 (
$zLocalWindowAttention.forward)rx   rW   ry   r   rz   rB   rC   rD   __doc__r   rg   rF   r0   r0   r.   r1   r      s    r   c                       s8   e Zd ZdZddddg df fdd	Zd	d
 Z  ZS )EfficientVitBlocka   A basic EfficientVit building block.

    Args:
        dim (int): Number of input channels.
        key_dim (int): Dimension for query and key in the token mixer.
        num_heads (int): Number of attention heads.
        attn_ratio (int): Multiplier for the query dim for value dimension.
        resolution (int): Input resolution.
        window_resolution (int): Local window resolution.
        kernels (List[int]): The kernel size of the dw conv on query.
    rx   rW   ry   r   rz   c              
      s   t    tt||ddd|dd| _tt|t|d | _tt|||||||d| _	tt||ddd|dd| _
tt|t|d | _d S )NrX   r   rI   )r,   r-   r3   )r   r   r   r   )r   r   ri   r   dw0rp   rZ   ffn0r   mixerdw1ffn1r   r.   r0   r1   r      s   


zEfficientVitBlock.__init__c                 C   s"   |  | | | | |S rd   )r   r   r   r   r   re   r0   r0   r1   rg   <  s   "zEfficientVitBlock.forwardr   r0   r0   r.   r1   r     s    r   c                       s8   e Zd Zdddddg ddf fdd		Zd
d Z  ZS )EfficientVitStage r   rx   rW   ry   r   rz   r   c                    s,  t    |d dkri|d |d  d | _g }|dtjtt||ddd|dtt	|t
|d f |dt||f |d	tjtt||ddd|dtt	|t
|d f tt|| _n||ksoJ t | _|| _g }t|
D ]}|t||||| j||	 q}tj| | _d S )
Nr   	subsampler   res1rX   rY   r3   
patchmergeres2)r   r   r   r   r!   r   r   ri   r   rp   rZ   rV   r   
downsampleIdentityr   r   blocks)r%   in_dimrb   r   r   r~   r   r   r   r   depthdown_blocksr   dr.   r0   r1   r   A  s6   

zEfficientVitStage.__init__c                 C      |  |}| |}|S rd   )r   r   re   r0   r0   r1   rg   l     

zEfficientVitStage.forwardrh   r0   r0   r.   r1   r   @  s    +r   c                       s   e Zd Z fddZ  ZS )PatchEmbeddingc              	      s   t    | dt||d ddd | dtj  | dt|d |d ddd | d	tj  | d
t|d |d ddd | dtj  | dt|d |ddd d| _d S )Nr[   rx   rX   r3   r   relu1r^   rW   relu2r`   relu3conv4   )r   r   
add_moduler   r!   r   r\   
patch_size)r%   in_chansra   r.   r0   r1   r   s  s   
  
zPatchEmbedding.__init__)rB   rC   rD   r   rF   r0   r0   r.   r1   r   r  s    r   c                       sF  e Zd Z											
		d4 fdd	Zejjdd Zejjd5ddZejjd6ddZ	ejjde
jfddZd7dedee fddZ					d8d ejd!eeeee f  d"ed#ed$ed%edeeej eejeej f f fd&d'Z	(		d9d!eeee f d)ed*efd+d,Zd-d. Zd5d/efd0d1Zd2d3 Z  ZS ):r      rX     @         r   r   r   r   r3   rX   rW   rW   rW   r   r   r   rz   r   r   r3   r   avgrI   c                    s  t t|   d| _|| _|| _t| d | _| jj}|| jj } fddt	t
 D }g | _g } d }tt ||||
D ]C\}\}}}}}}}t|||||||||	|d
}|}|d dkrq|dkrq||d 9 }|j}|| |  jt||d| d	g7  _qFtj| | _|d
krt|dd| _n|dksJ t | _ d  | _| _|dkrt| j|| jd| _d S tj | _d S )NFr   c                    s$   g | ]} | | |   qS r0   r0   ).0r   	embed_dimr   r~   r0   r1   
<listcomp>  s   $ z-EfficientVitMsra.__init__.<locals>.<listcomp>)
r   rb   r   r   r~   r   r   r   r   r   r   r   zstages.)num_chs	reductionmoduler   T	pool_typer   rR   rM   )r   r   r   grad_checkpointingnum_classes	drop_rater   patch_embedr   r   r   feature_infor   r   r   r   r   dictr   r   stagesr   global_poolr   num_featureshead_hidden_sizerG   r!   head)r%   img_sizer   r
  r  r   r   r~   window_sizer   down_opsr  r  r)   r   r   r  pre_edr   rt   kddpthnharwddostager.   r   r1   r     sX   
"

zEfficientVitMsra.__init__c                 C   s   dd |    D S )Nc                 S   s   h | ]}d |v r|qS )r   r0   )r   rf   r0   r0   r1   	<setcomp>  s    z3EfficientVitMsra.no_weight_decay.<locals>.<setcomp>)
state_dictkeysr%   r0   r0   r1   no_weight_decay  s   z EfficientVitMsra.no_weight_decayFc                 C   s   t d|rdnddgd}|S )Nz^patch_embedz^stages\.(\d+))z^stages\.(\d+).downsample)r   )z^stages\.(\d+)\.\w+\.(\d+)N)stemr   )r  )r%   coarsematcherr0   r0   r1   group_matcher  s   zEfficientVitMsra.group_matcherTc                 C   s
   || _ d S rd   )r	  )r%   enabler0   r0   r1   set_grad_checkpointing  s   
z'EfficientVitMsra.set_grad_checkpointingr   c                 C   s   | j jS rd   )r  rO   r"  r0   r0   r1   get_classifier  s   zEfficientVitMsra.get_classifierNr
  r  c                 C   sl   || _ |d ur|dkrt|dd| _n|dksJ t | _|dkr.t| j|| jd| _	d S tj | _	d S )Nr   Tr  r   r  )
r
  r   r  r   r   rG   r  r  r!   r  )r%   r
  r  r0   r0   r1   reset_classifier  s   

z!EfficientVitMsra.reset_classifierNCHWrf   indicesnorm
stop_early
output_fmtintermediates_onlyc                 C   s   |dv sJ dg }t t| j|\}}	| |}tj s |s$| j}
n	| jd|	d  }
t|
D ]\}}| jrCtj sCt	||}n||}||v rP|
| q1|rU|S ||fS )a   Forward features that returns intermediates.

        Args:
            x: Input image tensor
            indices: Take last n blocks if int, all if None, select matching indices if sequence
            norm: Apply norm layer to compatible intermediates
            stop_early: Stop iterating over blocks when last desired intermediate hit
            output_fmt: Shape of intermediate feature outputs
            intermediates_only: Only return intermediate features
        Returns:

        )r,  zOutput shape must be NCHW.Nr   )r   r   r  r  r!   r   is_scriptingr   r	  r   r   )r%   rf   r-  r.  r/  r0  r1  intermediatestake_indices	max_indexr  feat_idxr  r0   r0   r1   forward_intermediates  s"   

z&EfficientVitMsra.forward_intermediatesr   
prune_norm
prune_headc                 C   s<   t t| j|\}}| jd|d  | _|r| dd |S )z@ Prune layers not required for specified intermediates.
        Nr   r   r   )r   r   r  r+  )r%   r-  r8  r9  r4  r5  r0   r0   r1   prune_intermediate_layers  s
   z*EfficientVitMsra.prune_intermediate_layersc                 C   s8   |  |}| jrtj st| j|}|S | |}|S rd   )r  r	  r!   r   r2  r   r  re   r0   r0   r1   forward_features  s   

z!EfficientVitMsra.forward_features
pre_logitsc                 C   s   |  |}|r	|S | |S rd   )r  r  )r%   rf   r<  r0   r0   r1   forward_head#  s   
zEfficientVitMsra.forward_headc                 C   r   rd   )r;  r=  re   r0   r0   r1   rg   '  r   zEfficientVitMsra.forward)r   rX   r   r   r   r   r   r   rz   r   r   rI   Fr   rd   )NFFr,  F)r   FT)rB   rC   rD   r   r!   r   ignorer#  r'  r)  r   Moduler*  rZ   r   r   r+  r   r   r   boolr   r7  r:  r;  r=  rg   rF   r0   r0   r.   r1   r     sp    =

 
0
r   c              	   K   s   | dt tddddd|S )Nr   zpatch_embed.conv1.convzhead.linearT)rW   rW   )urlr
  meanrJ   
first_conv
classifierfixed_input_size	pool_sizer	   )rB  kwargsr0   r0   r1   _cfgO  s   	rI  ztimm/)	hf_hub_id)zefficientvit_m0.r224_in1kzefficientvit_m1.r224_in1kzefficientvit_m2.r224_in1kzefficientvit_m3.r224_in1kzefficientvit_m4.r224_in1kzefficientvit_m5.r224_in1kFc                 K   s0   | dd}tt| |fdtd|di|}|S )Nout_indices)r   r   r3   feature_cfgT)flatten_sequentialrK  )popr   r   r  )variant
pretrainedrH  rK  modelr0   r0   r1   _create_efficientvit_msray  s   
rR  c                 K   F   t dg dg dg dg dg dd}td
d	| it |fi |S )Nr   r   r   r   r   rz   r  r  r   r~   r  r   efficientvit_m0rP  )rU  r  rR  rP  rH  
model_argsr0   r0   r1   rU       rU  c                 K   rS  )Nr   )r      r   r   )r3   rX   rX   r   r   r{   rX   rX   rT  efficientvit_m1rP  )r\  rV  rW  r0   r0   r1   r\    rY  r\  c                 K   rS  )Nr   )r   r   r   r   )rW   rX   r3   r   r[  rT  efficientvit_m2rP  )r]  rV  rW  r0   r0   r1   r]    rY  r]  c                 K   rS  )Nr   )r      i@  r   )rW   rX   rW   r   rz   rT  efficientvit_m3rP  )r_  rV  rW  r0   r0   r1   r_    rY  r_  c                 K   rS  )Nr   )r        r   r   r   r[  rT  efficientvit_m4rP  )rb  rV  rW  r0   r0   r1   rb    rY  rb  c                 K   rS  )Nr   )r   i   ra  )r   rX   rW   )rX   rX   rW   r   r[  rT  efficientvit_m5rP  )rc  rV  rW  r0   r0   r1   rc    rY  rc  )r   r>  )6r   __all__r   collectionsr   typingr   r   r   r   r   r!   torch.nnr   	timm.datar
   r   timm.layersr   r   r   r   _builderr   	_featuresr   _manipulater   r   	_registryr   r   r   r   rG   r@  rV   ri   rp   rv   r   r   r   r   r   rI  default_cfgsrR  rU  r\  r]  r_  rb  rc  r0   r0   r0   r1   <module>   sv    ^A,2 
Q
