o
    پiW                  	   @   sd  d Z ddlZddlmZmZmZmZmZmZm	Z	 ddl
Z
ddlmZ ddlm  mZ ddlmZmZ ddlmZmZmZmZmZ ddlmZ ddlmZ dd	lmZ dd
lm Z m!Z! dgZ"G dd dej#Z$G dd dej#Z%G dd dej#Z&G dd dej#Z'G dd dej#Z(G dd dej#Z)G dd dej#Z*G dd dej#Z+G dd dej#Z,dee-e
j.f dej#dee-e
j.f fd d!Z/d7d#e-d$edee-ef fd%d&Z0e e0d'd(e0d'd(e0d'd(e0d'd(d)Z1d8d+e-d,e2d$ede,fd-d.Z3e!d8d,e2d$ede,fd/d0Z4e!d8d,e2d$ede,fd1d2Z5e!d8d,e2d$ede,fd3d4Z6e!d8d,e2d$ede,fd5d6Z7dS )9aj  SwiftFormer
SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications
Code: https://github.com/Amshaker/SwiftFormer
Paper: https://arxiv.org/pdf/2303.15446

@InProceedings{Shaker_2023_ICCV,
    author    = {Shaker, Abdelrahman and Maaz, Muhammad and Rasheed, Hanoona and Khan, Salman and Yang, Ming-Hsuan and Khan, Fahad Shahbaz},
    title     = {SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications},
    booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},
    year      = {2023},
}
    N)AnyDictListOptionalSetTupleUnionIMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STD)DropPathLinear	LayerType	to_2tupletrunc_normal_   )build_model_with_cfg)feature_take_indices)checkpoint_seq)generate_default_cfgsregister_modelSwiftFormerc                       sB   e Zd Zddededef fddZdejd	ejfd
dZ	  Z
S )LayerScale2dh㈵>Fdiminit_valuesinplacec                    s2   t    || _tj|t|dd dd| _d S )Nr   T)requires_grad)super__init__r   nn	Parametertorchonesgamma)selfr   r   r   	__class__ K/home/ubuntu/.local/lib/python3.10/site-packages/timm/models/swiftformer.pyr      s
   
zLayerScale2d.__init__xreturnc                 C   s   | j r	|| jS || j S N)r   mul_r$   r%   r*   r(   r(   r)   forward%   s   zLayerScale2d.forward)r   F)__name__
__module____qualname__intfloatboolr   r"   Tensorr/   __classcell__r(   r(   r&   r)   r      s    r   c                       s`   e Zd ZdZdddddejfdededed	ed
edef fddZde	j
de	j
fddZ  ZS )	Embeddingz
    Patch Embedding that is implemented by a layer of conv.
    Input: tensor in shape [B, C, H, W]
    Output: tensor in shape [B, C, H/stride, W/stride]
       i      r   in_chans	embed_dim
patch_sizestridepadding
norm_layerc                    sV   t    t|}t|}t|}t|||||| _|r$||| _d S t | _d S r,   )r   r   r   r    Conv2dprojIdentitynorm)r%   r;   r<   r=   r>   r?   r@   r&   r(   r)   r   /   s   
	 zEmbedding.__init__r*   r+   c                 C      |  |}| |}|S r,   )rB   rD   r.   r(   r(   r)   r/   ?      

zEmbedding.forward)r0   r1   r2   __doc__r    BatchNorm2dr3   r   r   r"   r6   r/   r7   r(   r(   r&   r)   r8   )   s,    r8   c                       sf   e Zd ZdZdddejejdfdededed	ed
e	de	de
f fddZdejdejfddZ  ZS )ConvEncoderz
    Implementation of ConvEncoder with 3*3 and 1*1 convolutions.
    Input: tensor with shape [B, C, H, W]
    Output: tensor with shape [B, C, H, W]
    @   r9           Tr   
hidden_dimkernel_size	drop_path	act_layerr@   use_layer_scalec                    s   t    tj||||d |d| _||| _t||d| _| | _t||d| _|dkr3t	|nt
 | _|rBt|d| _d S t
 | _d S )N   r?   groupsr   rK   r   r   r    rA   dwconvrD   pwconv1actpwconv2r   rC   rN   r   layer_scale)r%   r   rL   rM   rN   rO   r@   rP   r&   r(   r)   r   K   s   


"zConvEncoder.__init__r*   r+   c                 C   R   |}|  |}| |}| |}| |}| |}| |}|| | }|S r,   rU   rD   rV   rW   rX   rY   rN   )r%   r*   inputr(   r(   r)   r/   ^      





zConvEncoder.forwardr0   r1   r2   rG   r    GELUrH   r3   r4   r   r5   r   r"   r6   r/   r7   r(   r(   r&   r)   rI   E   s0    rI   c                       sh   e Zd ZdZddejejdfdedee dee de	de	d	e
f fd
dZdejdejfddZ  ZS )Mlpz
    Implementation of MLP layer with 1*1 convolutions.
    Input: tensor with shape [B, C, H, W]
    Output: tensor with shape [B, C, H, W]
    NrK   in_featureshidden_featuresout_featuresrO   r@   dropc                    s\   t    |p|}|p|}||| _t||d| _| | _t||d| _t|| _	d S )Nr   )
r   r   norm1r    rA   fc1rW   fc2Dropoutrd   )r%   ra   rb   rc   rO   r@   rd   r&   r(   r)   r   p   s   
	
zMlp.__init__r*   r+   c                 C   s@   |  |}| |}| |}| |}| |}| |}|S r,   )re   rf   rW   rd   rg   r.   r(   r(   r)   r/      s   





zMlp.forward)r0   r1   r2   rG   r    r_   rH   r3   r   r   r4   r   r"   r6   r/   r7   r(   r(   r&   r)   r`   j   s*    r`   c                       sF   e Zd ZdZddededef fdd	Zd
ejdejfddZ  Z	S )EfficientAdditiveAttentionz
    Efficient Additive Attention module for SwiftFormer.
    Input: tensor in shape [B, C, H, W]
    Output: tensor in shape [B, C, H, W]
          r   in_dims	token_dim	num_headsc                    s|   t    |d | _t||| | _t||| | _tt	|| d| _
t|| || | _t|| || _d S )Ng      r   )r   r   scale_factorr    r   to_queryto_keyr!   r"   randnw_grB   final)r%   rl   rm   rn   r&   r(   r)   r      s   

z#EfficientAdditiveAttention.__init__r*   r+   c           
      C   s   |j \}}}}|dddd}tj| |dd}tj| |dd}tj|| j | j dd}t	j
|| ddd}| || | }	| |	ddd|d||}	|	S )NrQ   r   r   r   T)r   keepdim)shapeflattenpermuteF	normalizerp   rq   rs   ro   r"   sumrB   rt   reshape)
r%   r*   B_HWquerykeyattnoutr(   r(   r)   r/      s    z"EfficientAdditiveAttention.forward)rj   rk   r   )
r0   r1   r2   rG   r3   r   r"   r6   r/   r7   r(   r(   r&   r)   ri      s    ri   c                       s`   e Zd ZdZdddejejfdededede	d	e
d
e
f fddZdejdejfddZ  ZS )LocalRepresentationz
    Local Representation module for SwiftFormer that is implemented by 3*3 depth-wise and point-wise convolutions.
    Input: tensor in shape [B, C, H, W]
    Output: tensor in shape [B, C, H, W]
    r9   rK   Tr   rM   rN   rP   rO   r@   c                    s   t    tj||||d |d| _||| _tj||dd| _| | _tj||dd| _|dkr5t	|nt
 | _|rDt|d| _d S t
 | _d S )NrQ   rR   r   )rM   rK   rT   )r%   r   rM   rN   rP   rO   r@   r&   r(   r)   r      s   
	
"zLocalRepresentation.__init__r*   r+   c                 C   rZ   r,   r[   )r%   r*   skipr(   r(   r)   r/      r]   zLocalRepresentation.forward)r0   r1   r2   rG   r    r_   rH   r3   r4   r5   r   r   r"   r6   r/   r7   r(   r(   r&   r)   r      s*    r   c                       sl   e Zd ZdZdddejejddfdededed	ed
e	de	de
def fddZdejdejfddZ  ZS )Blockz
    SwiftFormer Encoder Block for SwiftFormer. It consists of :
    (1) Local representation module, (2) EfficientAdditiveAttention, and (3) MLP block.
    Input: tensor in shape [B, C, H, W]
    Output: tensor in shape [B, C, H, W]
          @rK   Tr   r   	mlp_ratio	drop_raterN   rO   r@   rP   layer_scale_init_valuec	           	         s   t    t||||d| _t||d| _t|t|| |||d| _|dkr+t	|nt
 | _|r7t||nt
 | _|rFt||| _d S t
 | _d S )N)r   rP   rO   r@   )rl   rm   )ra   rb   rO   r@   rd   rK   )r   r   r   local_representationri   r   r`   r3   linearr   r    rC   rN   r   layer_scale_1layer_scale_2)	r%   r   r   r   rN   rO   r@   rP   r   r&   r(   r)   r      s0   

zBlock.__init__r*   r+   c                 C   sB   |  |}|| | | | }|| | | | }|S r,   )r   rN   r   r   r   r   r.   r(   r(   r)   r/      s   
zBlock.forwardr^   r(   r(   r&   r)   r      s6    		 r   c                       s   e Zd ZdZdejejdddddfdeded	ee d
e	de
de
de	de	dede	dee
 f fddZdejdejfddZ  ZS )Stagez
    Implementation of each SwiftFormer stages. Here, SwiftFormerEncoder used as the last block in all stages, while ConvEncoder used in the rest of the blocks.
    Input: tensor in shape [B, C, H, W]
    Output: tensor in shape [B, C, H, W]
    r   rK   Tr   Nr   indexlayersr   rO   r@   r   drop_path_raterP   r   
downsamplec                    s   t    d| _|d ur|nt | _g }t|| D ]>}||t|d |   t|d  }|| | dkrG|t	|||||||	|
d q|t
|t|| d||||	d qtj| | _d S )NFr   )r   r   rN   rO   r@   rP   r   r9   )r   rL   rM   rN   rO   r@   rP   )r   r   grad_checkpointingr    rC   r   ranger}   appendr   rI   r3   
Sequentialblocks)r%   r   r   r   r   rO   r@   r   r   rP   r   r   r   	block_idx	block_dprr&   r(   r)   r     s6   
$


	zStage.__init__r*   r+   c                 C   s8   |  |}| jrtj st| j|}|S | |}|S r,   )r   r   r"   jitis_scriptingr   r   r.   r(   r(   r)   r/   0  s   

zStage.forward)r0   r1   r2   rG   r    r_   rH   r3   r   r4   r   r5   r   r   r"   r6   r/   r7   r(   r(   r&   r)   r      sD    
	
,r   c                !       s  e Zd Zg dg ddg dejddddd	d	d
ddddfdee dee dedee dededededede	de	dede	de
dedef  fddZd d! Zejjd"efd#d$ZejjdGd&ed"ee
ef fd'd(ZejjdHd)efd*d+Zejjd"eejejf fd,d-ZdIdedee
 fd/d0ZejjdHd)efd1d2Z	.	%	%	3	%dJd4ejd5eeeee f  d6ed7ed8e
d9ed"eeej eejeej f f fd:d;Z		%	
dKd5eeee f d<ed=efd>d?Zd4ejd"ejfd@dAZ dGd4ejdBefdCdDZ!d4ejfdEdFZ"  Z#S )Lr   r9   r9         0   8   p      r   )FTTTr9   rQ   r     rK   Tr   avg    r   
embed_dims
mlp_ratiosdownsamplesrO   down_patch_sizedown_stridedown_padnum_classesr   r   rP   r   global_pooloutput_strider;   c                    s  t    |dksJ |	| _|| _g | _tt||d d dddt|d d t	 t|d d |d dddt|d t	 | _
|d }g }tt|D ]D}|| rdt||| |||dnt }t|| |||||
||||d
}|| }|| |  jt|| d|d  d| d	g7  _qStj| | _|d
  | _ | _}t|| _t|
| _|	dkrt||	nt | _|	dkrt||	nt | _d| _|   d S )Nr   r   rQ   r9   r   )r;   r<   r=   r>   r?   )
r   r   r   r   rO   r   r   rP   r   r   stages.)num_chs	reductionmoduleru   F)r   r   r   r   feature_infor    r   rA   rH   ReLUstemr   lenr8   rC   r   r   dictstagesnum_featureshead_hidden_sizerD   rh   	head_dropr   head	head_distdistilled_training_initialize_weights)r%   r   r   r   r   rO   r   r   r   r   r   r   rP   r   r   r   r;   kwargsprev_dimr   ir   stageout_chsr&   r(   r)   r   :  sb   

.zSwiftFormer.__init__c                 C   s   |   D ]9\}}t|tjr#t|jdd |jd ur"tj|jd qt|tj	r=t|jdd |jd ur=tj|jd qd S )Ng{Gz?)stdr   )
named_modules
isinstancer    r   r   weightbiasinit	constant_rA   )r%   namemr(   r(   r)   r     s   

zSwiftFormer._initialize_weightsr+   c                 C   s   t  S r,   )setr%   r(   r(   r)   no_weight_decay  s   zSwiftFormer.no_weight_decayFcoarsec                 C   s   t d|rdng dd}|S )Nz^stemz^stages\.(\d+)))z^stages\.(\d+).downsample)r   )z^stages\.(\d+)\.blocks\.(\d+)N)z^norm)i )r   r   )r   )r%   r   matcherr(   r(   r)   group_matcher  s
   zSwiftFormer.group_matcherenablec                 C   s   | j D ]}||_qd S r,   )r   r   )r%   r   sr(   r(   r)   set_grad_checkpointing  s   
z"SwiftFormer.set_grad_checkpointingc                 C   s   | j | jfS r,   r   r   r   r(   r(   r)   get_classifier  s   zSwiftFormer.get_classifierNc                 C   sZ   || _ |d ur
|| _|dkrt| j|nt | _|dkr&t| j|| _d S t | _d S )Nr   )r   r   r   r   r    rC   r   r   )r%   r   r   r(   r(   r)   reset_classifier  s
   (zSwiftFormer.reset_classifierc                 C   s
   || _ d S r,   )r   )r%   r   r(   r(   r)   set_distilled_training  s   
z"SwiftFormer.set_distilled_trainingNCHWr*   indicesrD   
stop_early
output_fmtintermediates_onlyc                 C   s   |dv sJ dg }t t| j|\}}	t| jd }
| |}tj s'|s+| j}n	| jd|	d  }t|D ]\}}||}||v rW|rP||
krP| |}n|}|	| q8|r\|S ||
kre| |}||fS )a   Forward features that returns intermediates.

        Args:
            x: Input image tensor
            indices: Take last n blocks if int, all if None, select matching indices if sequence
            norm: Apply norm layer to compatible intermediates
            stop_early: Stop iterating over blocks when last desired intermediate hit
            output_fmt: Shape of intermediate feature outputs
            intermediates_only: Only return intermediate features
        Returns:

        )r   zOutput shape must be NCHW.r   N)
r   r   r   r   r"   r   r   	enumeraterD   r   )r%   r*   r   rD   r   r   r   intermediatestake_indices	max_indexlast_idxr   feat_idxr   x_interr(   r(   r)   forward_intermediates  s*   


z!SwiftFormer.forward_intermediates
prune_norm
prune_headc                 C   sJ   t t| j|\}}| jd|d  | _|rt | _|r#| dd |S )z@ Prune layers not required for specified intermediates.
        Nr   r    )r   r   r   r    rC   rD   r   )r%   r   r   r   r   r   r(   r(   r)   prune_intermediate_layers  s   
z%SwiftFormer.prune_intermediate_layersc                 C   s"   |  |}| |}| |}|S r,   )r   r   rD   r.   r(   r(   r)   forward_features  s   


zSwiftFormer.forward_features
pre_logitsc                 C   sh   | j dkr|jdd}| |}|r|S | || |}}| jr.| jr.tj	 s.||fS || d S )Nr   )rQ   r9   rv   rQ   )
r   meanr   r   r   r   trainingr"   r   r   )r%   r*   r   x_distr(   r(   r)   forward_head  s   

zSwiftFormer.forward_headc                 C   rE   r,   )r   r   r.   r(   r(   r)   r/     rF   zSwiftFormer.forwardF)Tr,   )NFFr   F)r   FT)$r0   r1   r2   r    r_   r   r3   r5   r   r4   strr   r   r"   r   ignorer   r   r   r   r   r   r   Moduler   r   r   r   r6   r   r   r   r   r   r/   r7   r(   r(   r&   r)   r   9  s    	
H 
4

state_dictmodelr+   c           	      C   s   |  d| } d| v r| S i }|  D ]_\}}|dd}|dd}|dd}|d	d
}|dd}tdd|}td|}|rmt|d|d}}|d }|d dkrcd| d| }n
d|d  d| }|||< q|S )Nr   zstem.0.weightzpatch_embed.zstem.z
dist_head.z
head_dist.z
attn.Proj.z
attn.proj.z.layer_scale_1z.layer_scale_1.gammaz.layer_scale_2z.layer_scale_2.gammaz\.layer_scale(?=$|\.)z.layer_scale.gammaz^network\.(\d+)\.(.*)r   rQ   r   r   z.blocks.z.downsample.)getitemsreplaceresubmatchr3   group)	r   r   out_dictkvr   n_idxrest	stage_idxr(   r(   r)   checkpoint_filter_fn  s(   
r
  r   urlr   c                 K   s&   | ddd dddt tdddd	d
d|S )Nr   )r9      r  Tgffffff?bicubiczstem.0r   zarXiv:2303.15446zdSwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applicationsz'https://github.com/Amshaker/SwiftFormer)r  r   
input_size	pool_sizefixed_input_sizecrop_pctinterpolationr   r   
first_conv
classifier	paper_ids
paper_name
origin_urlr	   )r  r   r(   r(   r)   _cfg'  s   	r  ztimm/)	hf_hub_id)zswiftformer_xs.dist_in1kzswiftformer_s.dist_in1kzswiftformer_l1.dist_in1kzswiftformer_l3.dist_in1kFvariant
pretrainedc                 K   s&   t t| |fttdddd|}|S )N)r   r   rQ   r9   T)out_indicesflatten_sequential)pretrained_filter_fnfeature_cfg)r   r   r
  r   )r  r  r   r   r(   r(   r)   _create_swiftformerE  s   
r   c                 K   2   t g dg dd}tdd| it |fi |S )Nr   r   r   r   swiftformer_xsr  )r#  r   r   r  r   
model_argsr(   r(   r)   r#  O     r#  c                 K   r!  )N)r9   r9   	   r   )r   rJ      r  r"  swiftformer_sr  )r*  r$  r%  r(   r(   r)   r*  U  r'  r*  c                 K   r!  )N)r   r9   
      )r   `      i  r"  swiftformer_l1r  )r/  r$  r%  r(   r(   r)   r/  Z  r'  r/  c                 K   r!  )N)r   r      r   )rJ      i@  rj   r"  swiftformer_l3r  )r2  r$  r%  r(   r(   r)   r2  `  r'  r2  )r   r   )8rG   r   typingr   r   r   r   r   r   r   r"   torch.nnr    torch.nn.functional
functionalr{   	timm.datar
   r   timm.layersr   r   r   r   r   _builderr   	_featuresr   _manipulater   	_registryr   r   __all__r   r   r8   rI   r`   ri   r   r   r   r   r   r6   r
  r  default_cfgsr5   r   r#  r*  r/  r2  r(   r(   r(   r)   <module>   s`    $%" $.; ,U 
