o
    پi_f                     @   sp  d Z dgZddlZddlmZ ddlmZmZmZm	Z	m
Z
 ddlZddlmZ ddlm  mZ ddlmZmZ ddlmZmZmZmZmZmZ dd	lmZ dd
lmZ ddlm Z  ddl!m"Z"m#Z# ddl$m%Z%m&Z& G dd dejj'Z(G dd dej)Z*G dd dej)Z+G dd dej)Z,G dd dej)Z-G dd dej)Z.G dd dejj)Z/G dd dej)Z0e e0 G dd dej)Z1G d d dej)Z2d!d" Z3d@d$d%Z4e&e4d&d'd(e4d&d)e4d&d)e4d&d'd(e4d&d)e4d&d)e4d&d'd(e4d&d)e4d&d)e4d&d*d+d,d-e4d&d.d/d,d0d1d2Z5dAd4d5Z6e%dAd6d7Z7e%dAd8d9Z8e%dAd:d;Z9e%dAd<d=Z:e%dAd>d?Z;dS )Bz TinyViT

Paper: `TinyViT: Fast Pretraining Distillation for Small Vision Transformers`
    - https://arxiv.org/abs/2207.10666

Adapted from official impl at https://github.com/microsoft/Cream/tree/main/TinyViT
TinyVit    N)partial)DictListOptionalTupleUnionIMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STD)LayerNorm2dNormMlpClassifierHeadDropPathtrunc_normal_resize_rel_pos_bias_table_levituse_fused_attn   )build_model_with_cfg)feature_take_indices)register_notrace_module)
checkpointcheckpoint_seq)register_modelgenerate_default_cfgsc                       s.   e Zd Zd fdd	Ze dd Z  ZS )ConvNormr   r   c	           	   
      s^   t    tj|||||||dd| _t|| _tjj	| jj
| tjj	| jjd d S )NF)biasr   )super__init__nnConv2dconvBatchNorm2dbntorchinit	constant_weightr   )	selfin_chsout_chsksstridepaddilationgroupsbn_weight_init	__class__ H/home/ubuntu/.local/lib/python3.10/site-packages/timm/models/tiny_vit.pyr      s
   
zConvNorm.__init__c              	   C   s   | j | j}}|j|j|j d  }|j|d d d d d f  }|j|j|j |j|j d   }tjj	|
d| j j |
d|jdd  | j j| j j| j j| j jd}|jj| |jj| |S )Ng      ?r   r      )r+   paddingr-   r.   )r    r"   r&   running_varepsr   running_meanr#   r   r   sizer.   shaper+   r5   r-   datacopy_)r'   cr"   wbmr2   r2   r3   fuse%   s   $zConvNorm.fuse)r   r   r   r   r   r   )__name__
__module____qualname__r   r#   no_gradrA   __classcell__r2   r2   r0   r3   r      s    r   c                       $   e Zd Z fddZdd Z  ZS )
PatchEmbedc                    sH   t    d| _t||d ddd| _| | _t|d |ddd| _d S )N   r4      r   )r   r   r+   r   conv1actconv2)r'   r(   r)   	act_layerr0   r2   r3   r   5   s
   
zPatchEmbed.__init__c                 C   s"   |  |}| |}| |}|S N)rK   rL   rM   r'   xr2   r2   r3   forward<   s   


zPatchEmbed.forwardrB   rC   rD   r   rR   rF   r2   r2   r0   r3   rH   4   s    rH   c                       rG   )MBConvc                    s   t    t|| }t||dd| _| | _t||ddd|d| _| | _t||ddd| _| | _	|dkr>t
|| _d S t | _d S )Nr   )r*   rJ   r*   r+   r,   r.           )r*   r/   )r   r   intr   rK   act1rM   act2conv3act3r   r   Identity	drop_path)r'   r(   r)   expand_ratiorN   r]   mid_chsr0   r2   r3   r   D   s   
$zMBConv.__init__c                 C   sV   |}|  |}| |}| |}| |}| |}| |}||7 }| |}|S rO   )rK   rX   rM   rY   rZ   r]   r[   )r'   rQ   shortcutr2   r2   r3   rR   O   s   






zMBConv.forwardrS   r2   r2   r0   r3   rT   C   s    rT   c                       rG   )PatchMergingc                    sX   t    t||ddd| _| | _t||ddd|d| _| | _t||ddd| _d S )Nr   r   rJ   r4   )r.   )r   r   r   rK   rX   rM   rY   rZ   )r'   dimout_dimrN   r0   r2   r3   r   ]   s   
zPatchMerging.__init__c                 C   s6   |  |}| |}| |}| |}| |}|S rO   )rK   rX   rM   rY   rZ   rP   r2   r2   r3   rR   e   s   




zPatchMerging.forwardrS   r2   r2   r0   r3   ra   \   s    ra   c                       s*   e Zd Z		d fdd	Zdd Z  ZS )	ConvLayerrV         @c                    s>   t    | _|| _tj fddt|D  | _d S )Nc              
      s.   g | ]}t  ttr| nqS r2   )rT   
isinstancelist.0irN   conv_expand_ratiorb   r]   r2   r3   
<listcomp>z   s    z&ConvLayer.__init__.<locals>.<listcomp>)r   r   rb   depthr   
Sequentialrangeblocks)r'   rb   rn   rN   r]   rl   r0   rk   r3   r   o   s   
zConvLayer.__init__c                 C   s   |  |}|S rO   )rq   rP   r2   r2   r3   rR      s   
zConvLayer.forward)rV   re   rS   r2   r2   r0   r3   rd   n   s
    rd   c                       s4   e Zd Zddejejdf fdd	Zdd Z  ZS )NormMlpNrV   c                    sd   t    |p|}|p|}||| _t||| _| | _t|| _t||| _	t|| _
d S rO   )r   r   normr   Linearfc1rL   Dropoutdrop1fc2drop2)r'   in_featureshidden_featuresout_features
norm_layerrN   dropr0   r2   r3   r      s   
	
zNormMlp.__init__c                 C   s@   |  |}| |}| |}| |}| |}| |}|S rO   )rs   ru   rL   rw   rx   ry   rP   r2   r2   r3   rR      s   





zNormMlp.forward)	rB   rC   rD   r   	LayerNormGELUr   rR   rF   r2   r2   r0   r3   rr      s    rr   c                       s|   e Zd ZU ejje ed< ee	ej
f ed< 			d fdd	Ze d fd	d
	Zdejdej
fddZdd Z  ZS )	Attention
fused_attnattention_bias_cache   rI      r   c                    sp  t    t|trt|dksJ || _|d | _|| _t|| | _	| j	| | _
|| _|| _t | _t|| _t||| j	d|   | _t| j
|| _ttt|d t|d }t|}i }g }	|D ],}
|D ]'}t|
d |d  t|
d |d  f}||vrt|||< |	||  qnqjtjt|t|| _| jdt |	!||dd i | _"d S )Nr4   g      r   r   attention_bias_idxsF)
persistent)#r   r   rf   tuplelen	num_headsscalekey_dimrW   val_dimrc   
attn_ratio
resolutionr   r   r   r   rs   rt   qkvprojrg   	itertoolsproductrp   absappendr#   	Parameterzerosattention_biasesregister_buffer
LongTensorviewr   )r'   rb   r   r   r   r   pointsNattention_offsetsidxsp1p2offsetr0   r2   r3   r      s6   

 (
zAttention.__init__Tc                    s(   t  | |r| jri | _d S d S d S rO   )r   trainr   )r'   moder0   r2   r3   r      s   

zAttention.traindevicereturnc                 C   sZ   t j s| jr| jd d | jf S t|}|| jvr(| jd d | jf | j|< | j| S rO   )r#   jit
is_tracingtrainingr   r   strr   )r'   r   
device_keyr2   r2   r3   get_attention_biases   s   

zAttention.get_attention_biasesc                 C   s   |  |j}|j\}}}| |}| |}|||| jdj| j| j| j	gdd\}}}	|
dddd}|
dddd}|	
dddd}	| jrRtj|||	|d}n|| j }||dd }
|
| }
|
jdd}
|
|	 }|dd||| j}| |}|S )	NrJ   )rb   r   r4   r   )	attn_mask)r   r   r:   rs   r   r   r   splitr   r   permuter   Fscaled_dot_product_attentionr   	transposesoftmaxreshaperc   r   )r'   rQ   	attn_biasBr   _r   qkvattnr2   r2   r3   rR      s$   

.

zAttention.forward)r   rI   r   T)rB   rC   rD   r#   r   Finalbool__annotations__r   r   Tensorr   rE   r   r   r   rR   rF   r2   r2   r0   r3   r      s   
 %	r   c                       sF   e Zd ZdZdddddejf fdd	Zdd	 Zd
efddZ	  Z
S )TinyVitBlocka5   TinyViT Block.

    Args:
        dim (int): Number of input channels.
        num_heads (int): Number of attention heads.
        window_size (int): Window size.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        drop (float, optional): Dropout rate. Default: 0.0
        drop_path (float, optional): Stochastic depth rate. Default: 0.0
        local_conv_size (int): the kernel size of the convolution between
                               Attention and MLP. Default: 3
        act_layer: the activation function. Default: nn.GELU
       re   rV   rJ   c	                    s   t    || _|| _|dksJ d|| _|| _|| dks#J d|| }	||f}
t||	|d|
d| _|dkr=t|nt	
 | _t|t|| ||d| _|dkrWt|nt	
 | _|d }t|||d||d	| _d S )
Nr   z"window_size must be greater than 0z"dim must be divisible by num_headsr   )r   r   rV   )rz   r{   rN   r~   r4   rU   )r   r   rb   r   window_size	mlp_ratior   r   r   r   r\   
drop_path1rr   rW   mlp
drop_path2r   
local_conv)r'   rb   r   r   r   r~   r]   local_conv_sizerN   head_dimwindow_resolutionr,   r0   r2   r3   r     s(   

zTinyVitBlock.__init__c              	   C   s  |j \}}}}|| }|}|| jkr,|| jkr,||||}| |}|||||}n| j|| j  | j }| j|| j  | j }	|dkpI|	dk}
|
rXt|ddd|	d|f}|| ||	 }}|| j }|| j }|||| j|| j|dd|| | | j| j |}| |}||||| j| j|dd||||}|
r|d d d |d |f  }|| 	| }|
dddd}| |}||||dd}|| | | }|||||S Nr   r4   rJ   r   )r:   r   r   r   r   r   r,   r   
contiguousr   r   r   r   r   )r'   rQ   r   HWCLr`   pad_bpad_rr5   pHpWnHnWr2   r2   r3   rR   +  s8   


 
,
zTinyVitBlock.forwardr   c                 C   s$   d| j  d| j d| j d| j S )Ndim=z, num_heads=z, window_size=z, mlp_ratio=)rb   r   r   r   r'   r2   r2   r3   
extra_reprS  s
   zTinyVitBlock.extra_reprrB   rC   rD   __doc__r   r   r   rR   r   r   rF   r2   r2   r0   r3   r      s    %(r   c                       sF   e Zd ZdZdddddejf fdd	Zdd	 Zd
efddZ	  Z
S )TinyVitStagea   A basic TinyViT layer for one stage.

    Args:
        dim (int): Number of input channels.
        out_dim: the output dimension of the layer
        depth (int): Number of blocks.
        num_heads (int): Number of attention heads.
        window_size (int): Local window size.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        drop (float, optional): Dropout rate. Default: 0.0
        drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
        downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
        local_conv_size: the kernel size of the depthwise convolution between attention and MLP. Default: 3
        act_layer: the activation function. Default: nn.GELU
    re   rV   NrJ   c              	      sv   t    || _| _|	d ur|	| d| _nt | _|ks#J tj fddt|D  | _	d S )N)rb   rc   rN   c                    s6   g | ]}t ttr| n d qS ))rb   r   r   r   r~   r]   r   rN   )r   rf   rg   rh   rN   r~   r]   r   r   r   rc   r   r2   r3   rm     s    z)TinyVitStage.__init__.<locals>.<listcomp>)
r   r   rn   rc   
downsampler   r\   ro   rp   rq   )r'   rb   rc   rn   r   r   r   r~   r]   r   r   rN   r0   r   r3   r   l  s   


zTinyVitStage.__init__c                 C   s8   |  |}|dddd}| |}|dddd}|S r   )r   r   rq   rP   r2   r2   r3   rR     s
   

zTinyVitStage.forwardr   c                 C   s   d| j  d| j S )Nr   z, depth=)rc   rn   r   r2   r2   r3   r     s   zTinyVitStage.extra_reprr   r2   r2   r0   r3   r   [  s    ,r   c                       sd  e Zd Zddddddddd	d
dddejf fdd	Zdd Zejj	dd Z
ejj	dd Zejj	d6ddZejj	d7ddZejj	dejfddZd8dedee fdd Z				!	d9d"ejd#eeeee f  d$ed%ed&ed'edeeej eejeej f f fd(d)Z	*		d:d#eeee f d+ed,efd-d.Zd/d0 Zd6d1efd2d3Zd4d5 Z  ZS );r   rJ     avg)`        i   r4   r4      r4   )rJ   r         r   r   r   r   re   rV   皙?Fc                    s  t    || _|| _t|| _|| _|| _t||d |d| _	dd t
d|
t|D }t | _| j	j}|d }g | _t| jD ]f}|dkr[t||| ||d ||  |d}n8|| }|t|d | t|d |d   }
t||d  ||| || || | j|	||
t|d}|}|d9 }| j| |  jt||d	| d
g7  _qC|d  | _| _ttdd}t| j|||d| _| | j d S )Nr   )r(   r)   rN   c                 S   s   g | ]}|  qS r2   )itemri   rQ   r2   r2   r3   rm     s    z$TinyVit.__init__.<locals>.<listcomp>)rb   rn   rN   r]   rl   r   )rb   rc   rn   r   r   r   r~   r   r]   r   rN   r4   zstages.)num_chs	reductionmoduler   gh㈵>)r7   )	pool_typer}   ) r   r   num_classesdepthsr   
num_stagesr   grad_checkpointingrH   patch_embedr#   linspacesumr   ro   stagesr+   feature_inforp   rd   r   ra   r   dictnum_featureshead_hidden_sizer   r   r   headapply_init_weights)r'   in_chansr   global_pool
embed_dimsr   r   window_sizesr   	drop_ratedrop_path_rateuse_checkpointmbconv_expand_ratior   rN   dprr+   prev_dim	stage_idxstagerc   norm_layer_cfr0   r2   r3   r     sj   


(
"zTinyVit.__init__c                 C   sP   t |tjr"t|jdd t |tjr$|jd ur&tj|jd d S d S d S d S )Ng{Gz?)stdr   )rf   r   rt   r   r&   r   r$   r%   )r'   r@   r2   r2   r3   r    s   zTinyVit._init_weightsc                 C   s   dhS )Nr   r2   r   r2   r2   r3   no_weight_decay_keywords  s   z TinyVit.no_weight_decay_keywordsc                 C   s   dd |    D S )Nc                 S   s   h | ]}d |v r|qS )r   r2   r   r2   r2   r3   	<setcomp>  s    z*TinyVit.no_weight_decay.<locals>.<setcomp>)
state_dictkeysr   r2   r2   r3   no_weight_decay  s   zTinyVit.no_weight_decayc                 C   s   t d|rdnddgd}|S )Nz^patch_embedz^stages\.(\d+))z^stages\.(\d+).downsample)r   )z^stages\.(\d+)\.\w+\.(\d+)N)stemrq   )r   )r'   coarsematcherr2   r2   r3   group_matcher  s   zTinyVit.group_matcherTc                 C   s
   || _ d S rO   )r   )r'   enabler2   r2   r3   set_grad_checkpointing  s   
zTinyVit.set_grad_checkpointingr   c                 C   s   | j jS rO   )r   fcr   r2   r2   r3   get_classifier  s   zTinyVit.get_classifierNr   r  c                 C   s   || _ | jj||d d S )N)r   )r   r   reset)r'   r   r  r2   r2   r3   reset_classifier  s   zTinyVit.reset_classifierNCHWrQ   indicesrs   
stop_early
output_fmtintermediates_onlyc                 C   s   |dv sJ dg }t t| j|\}}	| |}tj s |s$| j}
n	| jd|	d  }
t|
D ]\}}| jrCtj sCt	||}n||}||v rP|
| q1|rU|S ||fS )a   Forward features that returns intermediates.

        Args:
            x: Input image tensor
            indices: Take last n blocks if int, all if None, select matching indices if sequence
            norm: Apply norm layer to compatible intermediates
            stop_early: Stop iterating over blocks when last desired intermediate hit
            output_fmt: Shape of intermediate feature outputs
            intermediates_only: Only return intermediate features
        Returns:

        )r   zOutput shape must be NCHW.Nr   )r   r   r   r   r#   r   is_scripting	enumerater   r   r   )r'   rQ   r!  rs   r"  r#  r$  intermediatestake_indices	max_indexr   feat_idxr  r2   r2   r3   forward_intermediates  s"   

zTinyVit.forward_intermediatesr   
prune_norm
prune_headc                 C   s<   t t| j|\}}| jd|d  | _|r| dd |S )z@ Prune layers not required for specified intermediates.
        Nr   r    )r   r   r   r  )r'   r!  r,  r-  r(  r)  r2   r2   r3   prune_intermediate_layersI  s
   z!TinyVit.prune_intermediate_layersc                 C   s8   |  |}| jrtj st| j|}|S | |}|S rO   )r   r   r#   r   r%  r   r   rP   r2   r2   r3   forward_featuresW  s   

zTinyVit.forward_features
pre_logitsc                 C   s$   |r| j ||d}|S |  |}|S )N)r1  )r   )r'   rQ   r1  r2   r2   r3   forward_head_  s   
zTinyVit.forward_headc                 C   s   |  |}| |}|S rO   )r0  r2  rP   r2   r2   r3   rR   c  s   

zTinyVit.forwardFr   rO   )NFFr   F)r   FT)rB   rC   rD   r   r   r   r  r#   r   ignorer  r  r  r  Moduler  rW   r   r   r  r   r   r   r   r   r+  r/  r0  r2  rR   rF   r2   r2   r0   r3   r     sz    S


 
/
c                 C   sp   d|   v r
| d } | }i }|  D ]!\}}|drqd|v r1t|j|| jd d d j}|||< q|S )Nmodelr   r   r   )r  r  itemsendswithr   Tr:   )r  r6  	target_sdout_dictr   r   r2   r2   r3   checkpoint_filter_fni  s   

r<  r.  c              
   K   s   | dt tdddddd	|S )Nr   zpatch_embed.conv1.convzhead.fc)r   r   )rJ      r=  gffffff?)	urlr   meanr  
first_conv
classifier	pool_size
input_sizecrop_pctr	   )r>  kwargsr2   r2   r3   _cfgx  s   
rF  ztimm/iQU  )	hf_hub_idr   )rG  )rJ   r   r   )r   r   g      ?)rG  rC  rB  rD  )rJ      rH  )   rI  squash)rG  rC  rB  rD  	crop_mode)ztiny_vit_5m_224.dist_in22kz"tiny_vit_5m_224.dist_in22k_ft_in1kztiny_vit_5m_224.in1kztiny_vit_11m_224.dist_in22kz#tiny_vit_11m_224.dist_in22k_ft_in1kztiny_vit_11m_224.in1kztiny_vit_21m_224.dist_in22kz#tiny_vit_21m_224.dist_in22k_ft_in1kztiny_vit_21m_224.in1kz#tiny_vit_21m_384.dist_in22k_ft_in1kz#tiny_vit_21m_512.dist_in22k_ft_in1kFc                 K   s2   | dd}tt| |ftd|dtd|}|S )Nout_indices)r   r   r4   rJ   T)flatten_sequentialrL  )feature_cfgpretrained_filter_fn)popr   r   r   r<  )variant
pretrainedrE  rL  r6  r2   r2   r3   _create_tiny_vit  s   
rS  c                 K   >   t g dg dg dg ddd}|| td| fi |S )N)@         i@  r   )r4   rI      
   r   rV   r  r   r   r  r  tiny_vit_5m_224r   updaterS  rR  rE  model_kwargsr2   r2   r3   r[       
r[  c                 K   rT  )N)rU  rV     i  r   )r4   rI   r   r   r   r   rZ  tiny_vit_11m_224r\  r^  r2   r2   r3   rb    r`  rb  c                 K   rT  )Nr   r   r   i@  r   rJ   r   r      r   g?rZ  tiny_vit_21m_224r\  r^  r2   r2   r3   rf    r`  rf  c                 K   rT  )Nrc  r   rd  )r   r   r   r   r   rZ  tiny_vit_21m_384r\  r^  r2   r2   r3   rg    r`  rg  c                 K   rT  )Nrc  r   rd  )rI  rI      rI  r   rZ  tiny_vit_21m_512r\  r^  r2   r2   r3   ri    r`  ri  )r.  r3  )<r   __all__r   	functoolsr   typingr   r   r   r   r   r#   torch.nnr   torch.nn.functional
functionalr   	timm.datar
   r   timm.layersr   r   r   r   r   r   _builderr   	_featuresr   _features_fxr   _manipulater   r   	_registryr   r   ro   r   r5  rH   rT   ra   rd   rr   r   r   r   r   r<  rF  default_cfgsrS  r[  rb  rf  rg  ri  r2   r2   r2   r3   <module>   s     RaH G

5