o
    پiI                  	   @   s^  d Z ddlmZmZmZmZmZmZmZ ddl	Z	ddl
mZ ddlm  mZ ddlmZmZ ddlmZmZmZmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZm Z  dgZ!G dd dej"Z#G dd dej$Z%G dd dej$Z&G dd dej"Z'G dd dej"Z(G dd dej"Z)G dd dej"Z*G dd dej"Z+G dd dej"Z,dee-e	j.f dej"dee-e	j.f fd d!Z/d9d#e-d$edee-ef fd%d&Z0e e0d'd(e0d'd(e0d'd(e0d'd)d*d+Z1d:d-e-d.e2d$ede,fd/d0Z3ed:d.e2d$ede,fd1d2Z4ed:d.e2d$ede,fd3d4Z5ed:d.e2d$ede,fd5d6Z6ed:d.e2d$ede,fd7d8Z7dS );a  SHViT
SHViT: Single-Head Vision Transformer with Memory Efficient Macro Design
Code: https://github.com/ysj9909/SHViT
Paper: https://arxiv.org/abs/2401.16456

@inproceedings{yun2024shvit,
  author={Yun, Seokju and Ro, Youngmin},
  title={SHViT: Single-Head Vision Transformer with Memory Efficient Macro Design},
  booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
  pages={5756--5767},
  year={2024}
}
    )AnyDictListOptionalSetTupleUnionNIMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STD)
GroupNorm1SqueezeExciteSelectAdaptivePool2d	LayerTypetrunc_normal_   )build_model_with_cfg)feature_take_indices)checkpoint_seq)register_modelgenerate_default_cfgsSHViTc                       sR   e Zd Zdejf fddZdejdejfddZe	 dejfdd	Z
  ZS )
Residualmc                    s   t    || _d S N)super__init__r   )selfr   	__class__ E/home/ubuntu/.local/lib/python3.10/site-packages/timm/models/shvit.pyr      s   

zResidual.__init__xreturnc                 C   s   ||  | S r   )r   r   r"   r    r    r!   forward#   s   zResidual.forwardc                 C   sv   t | jtr9| j }|j|jksJ t|jj	d |jj	d dd}t
|g d}| j||jj7  _|S | S )Nr   r   )r   r   r   r   )
isinstancer   
Conv2dNormfusegroupsin_channelstorchonesweightshapeFpadtodevice)r   r   identityr    r    r!   r(   &   s   
 zResidual.fuse)__name__
__module____qualname__nnModuler   r+   Tensorr%   no_gradr(   __classcell__r    r    r   r!   r      s
    r   c                       sX   e Zd Z				ddedededededef fd	d
Ze dejfddZ	  Z
S )r'   r   r   r*   out_channelskernel_sizestridepaddingbn_weight_initc              	      sj   t    | dtj|||||fddi| | dt| tj| jj	| tj| jj
d d S )NcbiasFbnr   )r   r   
add_moduler7   Conv2dBatchNorm2dinit	constant_rC   r-   rB   )r   r*   r<   r=   r>   r?   r@   kwargsr   r    r!   r   4   s   



zConv2dNorm.__init__r#   c                 C   s   | j  \}}|j|j|j d  }|j|d d d d d f  }|j|j|j |j|j d   }tj|	d| j
j |	d|jdd  | j
j| j
j| j
j| j
j|jj|jjd	}|jj| |jj| |S )N      ?r   r      )	r*   r<   r=   r>   r?   dilationr)   r2   dtype)_modulesvaluesr-   running_varepsrB   running_meanr7   rE   sizerA   r)   r.   r>   r?   rL   r2   rM   datacopy_)r   rA   rC   wbr   r    r    r!   r(   E   s$   "zConv2dNorm.fuse)r   r   r   r   )r4   r5   r6   intr   r+   r:   r7   rE   r(   r;   r    r    r   r!   r'   3   s(    r'   c                	       sL   e Zd Z		ddedededef fddZe d	e	j
fd
dZ  ZS )
NormLinearT{Gz?in_featuresout_featuresrB   stdc                    sb   t    | dt| | dtj|||d t| jj|d |r/tj	
| jjd d S d S )NrC   l)rB   )r]   r   )r   r   rD   r7   BatchNorm1dLinearr   r^   r-   rG   rH   rB   )r   r[   r\   rB   r]   r   r    r!   r   \   s   
zNormLinear.__init__r#   c                 C   s   | j  \}}|j|j|j d  }|j| jj| jj |j|j d   }|j|d d d f  }|jd u r=|| jjj	 }n|j|d d d f  
d| jj }tj|d|d|jj|jjd}|jj| |jj| |S )NrJ   r   r   )r2   rM   )rN   rO   r-   rP   rQ   rB   rC   rR   r^   Tviewr7   r`   rS   r2   rM   rT   rU   )r   rC   r^   rV   rW   r   r    r    r!   r(   j   s   &
$&zNormLinear.fuse)TrZ   )r4   r5   r6   rX   boolfloatr   r+   r:   r7   r`   r(   r;   r    r    r   r!   rY   [   s    rY   c                       F   e Zd Zejfdededef fddZdej	dej	fdd	Z
  ZS )
PatchMergingdimout_dim	act_layerc                    sd   t    t|d }t||| _| | _t||ddd|d| _| | _t|d| _	t||| _
d S )N      rK   r   r)   g      ?)r   r   rX   r'   conv1act1conv2act2r   seconv3)r   rh   ri   rj   hid_dimr   r    r!   r   {   s   
zPatchMerging.__init__r"   r#   c                 C   s@   |  |}| |}| |}| |}| |}| |}|S r   )rn   ro   rp   rq   rr   rs   r$   r    r    r!   r%      s   





zPatchMerging.forwardr4   r5   r6   r7   ReLUrX   r   r   r+   r9   r%   r;   r    r    r   r!   rg   z   s     
rg   c                       rf   )
FFNrh   	embed_dimrj   c                    s2   t    t||| _| | _t||dd| _d S )Nr   r@   )r   r   r'   pw1actpw2)r   rh   rx   rj   r   r    r!   r      s   
zFFN.__init__r"   r#   c                 C   "   |  |}| |}| |}|S r   )rz   r{   r|   r$   r    r    r!   r%         


zFFN.forwardru   r    r    r   r!   rw      s     rw   c                       sT   e Zd ZdZeejfdededededef
 fddZ	d	e
jd
e
jfddZ  ZS )SHSAzSingle-Head Self-Attentionrh   qk_dimpdim
norm_layerrj   c                    sb   t    |d | _|| _|| _|| _||| _t||d | | _t	
| t||dd| _d S )Ng      rK   r   ry   )r   r   scaler   rh   r   pre_normr'   qkvr7   
Sequentialproj)r   rh   r   r   r   rj   r   r    r!   r      s   


zSHSA.__init__r"   r#   c                 C   s   |j \}}}}tj|| j| j| j gdd\}}| |}| |}tj|| j| j| jgdd\}	}
}|	d|
d|d}	}
}|		dd|
 | j
 }|jdd}||	dd || j||}| tj||gdd}|S )Nr   )rh   rK   ra   )r.   r+   splitr   rh   r   r   r   flatten	transposer   softmaxreshaper   cat)r   r"   B_HWx1x2r   qkvattnr    r    r!   r%      s   "

""zSHSA.forward)r4   r5   r6   __doc__r   r7   rv   rX   r   r   r+   r9   r%   r;   r    r    r   r!   r      s     r   c                       sT   e Zd Zeejfdedededededef fddZ	d	e
jd
e
jfddZ  ZS )
BasicBlockrh   r   r   typer   rj   c              
      sl   t    tt||ddd|dd| _|dkr#tt|||||| _nt | _tt	|t
|d | _d S )Nrl   r   r   )r)   r@   srK   )r   r   r   r'   convr   mixerr7   Identityrw   rX   ffn)r   rh   r   r   r   r   rj   r   r    r!   r      s   
	
zBasicBlock.__init__r"   r#   c                 C   r}   r   )r   r   r   r$   r    r    r!   r%      r~   zBasicBlock.forwardr4   r5   r6   r   r7   rv   rX   strr   r   r+   r9   r%   r;   r    r    r   r!   r      s"    r   c                       s\   e Zd Zeejfdedededededededef fd	d
Z	de
jde
jfddZ  ZS )
StageBlockprev_dimrh   r   r   r   depthr   rj   c	           	         s   t    d| _|krAttt||ddd|dtt|t|d  t	| ttddddtttd  nt
 | _tj fddt|D  | _d S )NFrl   r   rm   rK   c              	      s   g | ]}t  qS r    )r   ).0r   rj   rh   r   r   r   r   r    r!   
<listcomp>   s    z'StageBlock.__init__.<locals>.<listcomp>)r   r   grad_checkpointingr7   r   r   r'   rw   rX   rg   r   
downsamplerangeblocks)	r   r   rh   r   r   r   r   r   rj   r   r   r!   r      s   

zStageBlock.__init__r"   r#   c                 C   s8   |  |}| jrtj st| j|}|S | |}|S r   )r   r   r+   jitis_scriptingr   r   r$   r    r    r!   r%      s   

zStageBlock.forwardr   r    r    r   r!   r      s*    		r   c                       s  e Zd Zddddddddd	eejfd
edededeeeef deeeef deeeef deeeef deeeef de	de
de
f fddZejjdefddZejjd=dedeeef fddZejjd>dd Zejjdejfd!d"Zd?dedefd#d$Z	%			&	d@d'ejd(eeeee f  d)ed*ed+ed,edeeej eejeej f f fd-d.Z	/		dAd(eeee f d0ed1efd2d3Zd'ejdejfd4d5Zd=d'ejd6edejfd7d8Z d'ejdejfd9d:Z!e" d;d< Z#  Z$S )Br   rl     avg)      i  )    @   `   )   r   r   )r   rK   rl   )r   r   r           in_chansnum_classesglobal_poolrx   partial_dimr   r   types	drop_rater   rj   c                    sj  t    || _|	| _g | _|d }tt||d ddd| t|d |d ddd| t|d |d ddd| t|d |ddd| _g }|}t	t
|D ]1}|t||| || || || || |
|d || }| jt|d|d  d| d	 qOtj| | _|d
  | _| _t|d| _|rtdnt | _|dkrt| j|| _d S t | _d S )Nr      rl   rK   r   rk   )r   rh   r   r   r   r   r   rj   zstages.)num_chs	reductionmodulera   	pool_type)r   r   r   r   feature_infor7   r   r'   patch_embedr   lenappendr   dictstagesnum_featureshead_hidden_sizer   r   Flattenr   r   rY   head)r   r   r   r   rx   r   r   r   r   r   r   rj   stem_chsr   prev_chsir   r    r!   r      sD   

&(zSHViT.__init__r#   c                 C   s   t  S r   )setr   r    r    r!   no_weight_decay5  s   zSHViT.no_weight_decayFcoarsec                 C   s   t d|rdnddgd}|S )Nz^patch_embedz^stages\.(\d+))z^stages\.(\d+).downsample)r   )z^stages\.(\d+)\.blocks\.(\d+)N)stemr   )r   )r   r   matcherr    r    r!   group_matcher9  s   zSHViT.group_matcherTc                 C   s   | j D ]}||_qd S r   )r   r   )r   enabler   r    r    r!   set_grad_checkpointingD  s   
zSHViT.set_grad_checkpointingc                 C   s   | j jS r   )r   r^   r   r    r    r!   get_classifierI  s   zSHViT.get_classifierc                 C   sR   || _ t|d| _|rtdnt | _|dkr"t| j|| _	d S t | _	d S )Nr   r   r   )
r   r   r   r7   r   r   r   rY   r   r   )r   r   r   r    r    r!   reset_classifierM  s   (zSHViT.reset_classifierNNCHWr"   indicesnorm
stop_early
output_fmtintermediates_onlyc                 C   s   |dv sJ dg }t t| j|\}}	| |}tj s |s$| j}
n	| jd|	d  }
t|
D ]\}}||}||v rB|| q1|rG|S ||fS )a   Forward features that returns intermediates.

        Args:
            x: Input image tensor
            indices: Take last n blocks if int, all if None, select matching indices if sequence
            norm: Apply norm layer to compatible intermediates
            stop_early: Stop iterating over blocks when last desired intermediate hit
            output_fmt: Shape of intermediate feature outputs
            intermediates_only: Only return intermediate features
        Returns:

        )r   zOutput shape must be NCHW.Nr   )	r   r   r   r   r+   r   r   	enumerater   )r   r"   r   r   r   r   r   intermediatestake_indices	max_indexr   feat_idxstager    r    r!   forward_intermediatesT  s   

zSHViT.forward_intermediatesr   
prune_norm
prune_headc                 C   s<   t t| j|\}}| jd|d  | _|r| dd |S )z@ Prune layers not required for specified intermediates.
        Nr   r    )r   r   r   r   )r   r   r   r   r   r   r    r    r!   prune_intermediate_layers~  s
   zSHViT.prune_intermediate_layersc                 C      |  |}| |}|S r   )r   r   r$   r    r    r!   forward_features     

zSHViT.forward_features
pre_logitsc                 C   sD   |  |}| |}| jdkrtj|| j| jd}|r|S | |S )Nr   )ptraining)r   r   r   r/   dropoutr   r   )r   r"   r   r    r    r!   forward_head  s
   


zSHViT.forward_headc                 C   r   r   )r   r   r$   r    r    r!   r%     r   zSHViT.forwardc                    s    fdd  |  d S )Nc                    sF   |   D ]\}}t|dr| }t| ||  | q | qd S )Nr(   )named_childrenhasattrr(   setattr)net
child_namechildfusedfuse_childrenr    r!   r     s   


z!SHViT.fuse.<locals>.fuse_childrenr    r   r    r   r!   r(     s   	z
SHViT.fuseF)T)r   )NFFr   F)r   FT)%r4   r5   r6   r   r7   rv   rX   r   r   re   r   r   r+   r   ignorer   r   rd   r   r   r   r   r8   r   r   r9   r   r   r   r   r   r   r   r%   r:   r(   r;   r    r    r   r!   r      s    	
7

 
,

state_dictmodelr#   c                 C   s   |  d| } | S )Nr  )get)r   r  r    r    r!   checkpoint_filter_fn  s   r  r   urlrI   c                 K   s$   | dddddt tdddd	d
d|S )Nr   )rl      r  )rk   rk   g      ?bicubiczpatch_embed.0.czhead.lzarXiv:2401.16456zHSHViT: Single-Head Vision Transformer with Memory Efficient Macro Designz https://github.com/ysj9909/SHViT)r  r   
input_size	pool_sizecrop_pctinterpolationmeanr]   
first_conv
classifier	paper_ids
paper_name
origin_urlr	   )r  rI   r    r    r!   _cfg  s   r  ztimm/)	hf_hub_id)rl   r   r   )r  r  )zshvit_s1.in1kzshvit_s2.in1kzshvit_s3.in1kzshvit_s4.in1kFvariant
pretrainedc                 K   s&   t t| |fttdddd|}|S )N)r   r   rK   T)out_indicesflatten_sequential)pretrained_filter_fnfeature_cfg)r   r   r  r   )r  r  rI   r  r    r    r!   _create_shvit  s   
r  c                 K   .   t ddddd}tdd| it |fi |S )	N)r   r  i@  rK   rk      )r   0   D   r   r   r   rx   r   r   r   shvit_s1r  )r!  r   r  r  rI   
model_argsr    r    r!   r!       r!  c                 K   r  )	N)r   i4    r  )r   B   r   r  r   shvit_s2r  )r(  r"  r#  r    r    r!   r(    r%  r(  c                 K   r  )	N)   i`  r&  )rl   r  r  )r  K   r   r  r   shvit_s3r  )r+  r"  r#  r    r    r!   r+    r%  r+  c                 K   r  )	N)r  iP  r&  )rk         )r  H   r   r  r   shvit_s4r  )r/  r"  r#  r    r    r!   r/    r%  r/  )r   r   )8r   typingr   r   r   r   r   r   r   r+   torch.nnr7   torch.nn.functional
functionalr/   	timm.datar
   r   timm.layersr   r   r   r   r   _builderr   	_featuresr   _manipulater   	_registryr   r   __all__r8   r   r   r'   rY   rg   rw   r   r   r   r   r   r9   r  r  default_cfgsrd   r  r!  r(  r+  r/  r    r    r    r!   <module>   s`    $($# ,/ !
