o
    پiU                  	   @   s  d Z ddlZddlmZ ddlmZmZmZmZ ddl	Z	ddl
mZ ddlm  mZ ddlmZmZ ddlmZmZmZmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZm Z  ddl!m"Z" dgZ#ee$e$f Z%eG dd dej&Z'G dd dej&Z(G dd dej&Z)G dd dej&Z*G dd dej&Z+G dd dej&Z,d/ddZ-d0ddZ.e e.dd e.dd e.dd e.dd e.dd e.dd d!Z/ed/d"e,fd#d$Z0ed/d"e,fd%d&Z1ed/d"e,fd'd(Z2ed/d"e,fd)d*Z3ed/d"e,fd+d,Z4ed/d"e,fd-d.Z5dS )1z Twins
A PyTorch impl of : `Twins: Revisiting the Design of Spatial Attention in Vision Transformers`
    - https://arxiv.org/pdf/2104.13840.pdf

Code/weights from https://github.com/Meituan-AutoML/Twins, original copyright/license info below

    N)partial)ListOptionalTupleUnionIMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STD)MlpDropPath	to_2tupletrunc_normal_use_fused_attn   )build_model_with_cfg)feature_take_indices)register_notrace_module)register_modelgenerate_default_cfgs)	AttentionTwinsc                       B   e Zd ZU dZejje ed< d fdd	Z	de
fd	d
Z  ZS )LocallyGroupedAttnz( LSA: self attention within a group
    
fused_attn           r   c                    s   |dksJ t t|   || dksJ d| d| d|| _|| _|| }|d | _t | _tj	||d dd	| _
t|| _t	||| _t|| _|| _d S )
Nr   r   dim   should be divided by num_heads .         Tbias)superr   __init__dim	num_headsscaler   r   nnLinearqkvDropout	attn_dropproj	proj_dropws)selfr%   r&   r,   r.   r/   head_dim	__class__ E/home/ubuntu/.local/lib/python3.10/site-packages/timm/models/twins.pyr$   )   s   "

zLocallyGroupedAttn.__init__sizec              	   C   s   |j \}}}|\}}|||||}d }}	| j|| j  | j }
| j|| j  | j }t|dd||
|	|f}|j \}}}}|| j || j }}|||| j|| j|dd}| |||| | j| j d| j|| j 	dddddd}|
d\}}}| jrtj|||| jr| jjndd}n|| j }||d	d
 }|jd
d}| |}|| }|dd|||| j| j|}|dd||| j || j |}|
dks|dkr|d d d |d |d d f  }||||}| |}| |}|S )Nr      r    r         r   	dropout_pr%   )shapeviewr/   Fpadreshape	transposer*   r&   permuteunbindr   scaled_dot_product_attentiontrainingr,   pr'   softmax
contiguousr-   r.   )r0   xr6   BNCHWpad_lpad_tpad_rpad_b_HpWp_h_wr*   qkvattnr4   r4   r5   forward:   sD    
 

 $$

zLocallyGroupedAttn.forwardr   r   r   r   __name__
__module____qualname____doc__torchjitFinalbool__annotations__r$   Size_r_   __classcell__r4   r4   r2   r5   r   #   s
   
 r   c                       r   )GlobalSubSampleAttnzQ GSA: using a  key to summarize the information for a group to be efficient.
    r   r   r   r   c                    s   t    || dksJ d| d| d|| _|| _|| }|d | _t | _tj||dd| _	tj||d dd| _
t|| _t||| _t|| _|| _|d	kritj||||d
| _t|| _d S d | _d | _d S )Nr   r   r   r   r   Tr!   r7   r   kernel_sizestride)r#   r$   r%   r&   r'   r   r   r(   r)   r[   kvr+   r,   r-   r.   sr_ratioConv2dsr	LayerNormnorm)r0   r%   r&   r,   r.   rr   r1   r2   r4   r5   r$      s$   
"

zGlobalSubSampleAttn.__init__r6   c                 C   sT  |j \}}}| |||| j|| j dddd}| jd urC|dddj||g|R  }| |||dddd}| |}| ||dd| j|| j ddddd}|d\}}	| j	rwt
jjj|||	| jrr| jjndd}n|| j }||d	d }
|
jdd
}
| |
}
|
|	 }|dd|||}| |}| |}|S )Nr   r7   r   r    r=   r8   r   r:   r<   r>   )r?   r[   rC   r&   rE   rt   rv   rq   rF   r   rf   r(   
functionalrG   rH   r,   rI   r'   rD   rJ   r-   r.   )r0   rL   r6   rM   rN   rO   r[   rq   r\   r]   r^   r4   r4   r5   r_      s,   *

.



zGlobalSubSampleAttn.forwardr`   ra   r4   r4   r2   r5   rm      s
   
 rm   c                       s@   e Zd Zddddejejddf fdd	Zdefdd	Z  Z	S )
Blockg      @r   r   Nc                    s   t    ||| _|
d u rt||dd ||| _n|
dkr't|||||	| _n	t|||||
| _|dkr8t|nt	 | _
||| _t|t|| ||d| _|dkrZt|| _d S t	 | _d S )NFr   r   )in_featureshidden_features	act_layerdrop)r#   r$   norm1r   r^   rm   r   r   r(   Identity
drop_path1norm2r
   intmlp
drop_path2)r0   r%   r&   	mlp_ratior.   r,   	drop_pathr{   
norm_layerrr   r/   r2   r4   r5   r$      s    



$zBlock.__init__r6   c                 C   s:   ||  | | || }|| | | | }|S N)r   r^   r}   r   r   r   )r0   rL   r6   r4   r4   r5   r_      s   zBlock.forward)
rb   rc   rd   r(   GELUru   r$   rk   r_   rl   r4   r4   r2   r5   rx      s     rx   c                       s4   e Zd Zd
 fdd	ZdefddZdd	 Z  ZS )PosConv   r   c                    s8   t t|   ttj||d|dd|d| _|| _d S )Nr    r   T)r"   groups)r#   r   r$   r(   
Sequentialrs   r-   rp   )r0   in_chans	embed_dimrp   r2   r4   r5   r$      s
   
zPosConv.__init__r6   c                 C   sZ   |j \}}}|ddj||g|R  }| |}| jdkr"||7 }|ddd}|S )Nr   r7   )r?   rD   r@   r-   rp   flatten)r0   rL   r6   rM   rN   rO   cnn_feat_tokenr4   r4   r5   r_      s   

zPosConv.forwardc                 C   s   dd t dD S )Nc                 S   s   g | ]}d | qS )zproj.%d.weightr4   .0ir4   r4   r5   
<listcomp>       z+PosConv.no_weight_decay.<locals>.<listcomp>r8   )ranger0   r4   r4   r5   no_weight_decay   s   zPosConv.no_weight_decay)r   r   )rb   rc   rd   r$   rk   r_   r   rl   r4   r4   r2   r5   r      s    	r   c                       s:   e Zd ZdZd fdd	Zdeejef fd	d
Z	  Z
S )
PatchEmbedz Image to Patch Embedding
          r    r   c                    s   t    t|}t|}|| _|| _|d |d  dkr'|d |d  dks2J d| d| d|d |d  |d |d  | _| _| j| j | _tj	||||d| _
t|| _d S )Nr   r   z	img_size z! should be divided by patch_size r   rn   )r#   r$   r   img_size
patch_sizerP   rQ   num_patchesr(   rs   r-   ru   rv   )r0   r   r   r   r   r2   r4   r5   r$     s   
*&zPatchEmbed.__init__returnc                 C   sT   |j \}}}}| |ddd}| |}|| jd  || jd  f}||fS )Nr7   r   r   )r?   r-   r   rD   rv   r   )r0   rL   rM   rO   rP   rQ   out_sizer4   r4   r5   r_     s
   
zPatchEmbed.forward)r   r   r    r   )rb   rc   rd   re   r$   r   rf   Tensorrk   r_   rl   r4   r4   r2   r5   r      s     r   c                       sh  e Zd ZdZdddddddd	d
dddddddeejddef fdd	Ze	j
jdd Ze	j
jd8ddZe	j
jd9ddZe	j
jdejfddZd:dedee fdd Zd!d" Z				#	d;d$e	jd%eeeee f  d&ed'ed(ed)edeee	j ee	jee	j f f fd*d+Z	,		d<d%eeee f d-ed.efd/d0Zd1d2 Zd8d3efd4d5Zd6d7 Z  Z S )=r   z Twins Vision Transformer (Revisiting Spatial Attention)

    Adapted from PVT (PyramidVisionTransformer) class at https://github.com/whai362/PVT.git
    r   r8   r      avg@            )r   r7   r8   r   r8   r8   r8   r8   r    r8      r    r   r8   r7   r   Nr   gư>)epsc                    s  t    || _|| _|	| _| _d  | _| _d| _t	|}|}t
 | _t
 | _tt|	D ]*}| jt|	||  | jt
j|d | }t	fdd|D }d	q2t
 | _g | _dd td	|t|	D d	tt|	D ]?t
 
fd
dt|	 D }| j| |  jtd  dd  dg7  _|	 7 q{t
dd D | _| j| _t
|| _|d	krt
| j|nt
 | _|  | j! d S )Nr=   F)rI   c                 3   s    | ]}|  V  qd S r   r4   )r   t)r   r4   r5   	<genexpr>D  s    z!Twins.__init__.<locals>.<genexpr>r7   c                 S   s   g | ]}|  qS r4   )item)r   rL   r4   r4   r5   r   I  r   z"Twins.__init__.<locals>.<listcomp>r   c                    sZ   g | ])}   	 |  
 d u s#|d dkr%dn d	qS )Nr7   r   )	r%   r&   r   r.   r,   r   r   rr   r/   r4   r   )attn_drop_rate	block_clscurdpr
embed_dimsr\   
mlp_ratiosr   r&   proj_drop_rate	sr_ratioswssr4   r5   r   L  s    	

zblock.)modulenum_chs	reductionc                 S   s   g | ]}t ||qS r4   )r   )r   r   r4   r4   r5   r   [  s    )"r#   r$   num_classesglobal_pooldepthsr   num_featureshead_hidden_sizegrad_checkpointingr   r(   
ModuleListpatch_embeds	pos_dropsr   lenappendr   r+   tupleblocksfeature_inforf   linspacesumdict	pos_blockrv   	head_dropr)   r~   headapply_init_weights)r0   r   r   r   r   r   r   r&   r   r   r   r   	drop_ratepos_drop_rater   r   drop_path_rater   r   prev_chsr   _blockr2   )r   r   r   r   r   r\   r   r   r&   r   r   r   r   r5   r$     sB   



$
	, zTwins.__init__c                 C   s   t dd | j D S )Nc                 S   s   g | ]\}}d | qS )z
pos_block.r4   )r   nrI   r4   r4   r5   r   h  s    z)Twins.no_weight_decay.<locals>.<listcomp>)setr   named_parametersr   r4   r4   r5   r   f  s   zTwins.no_weight_decayFc                 C   s    t d|rddgng dd}|S )Nz^patch_embeds.0)z)^(?:blocks|patch_embeds|pos_block)\.(\d+)Nz^norm)i ))z^blocks\.(\d+)\.(\d+)N)z"^(?:patch_embeds|pos_block)\.(\d+))r   r   )stemr   )r   )r0   coarsematcherr4   r4   r5   group_matcherj  s   zTwins.group_matcherTc                 C   s   |rJ dd S )Nz$gradient checkpointing not supportedr4   )r0   enabler4   r4   r5   set_grad_checkpointingy  s   zTwins.set_grad_checkpointingr   c                 C   s   | j S r   )r   r   r4   r4   r5   get_classifier}  s   zTwins.get_classifierr   r   c                 C   sJ   || _ |d ur|dv sJ || _|dkrt| j|| _d S t | _d S )N) r   r   )r   r   r(   r)   r   r~   r   )r0   r   r   r4   r4   r5   reset_classifier  s
   *zTwins.reset_classifierc                 C   s   t |tjr&t|jdd t |tjr"|jd ur$tj|jd d S d S d S t |tjr>tj|jd tj|jd d S t |tj	rp|j
d |j
d  |j }||j }|jjdtd|  |jd urr|jj  d S d S d S )Ng{Gz?)stdr   g      ?r   g       @)
isinstancer(   r)   r   weightr"   init	constant_ru   rs   ro   out_channelsr   datanormal_mathsqrtzero_)r0   mfan_outr4   r4   r5   r     s    

zTwins._init_weightsNCHWrL   indicesrv   
stop_early
output_fmtintermediates_onlyc              	   C   sP  |dksJ dg }t t| j|\}}	|j\}
}}}tt| j| j| j| jD ]r\}\}}}}||\}}||}t|D ]\}}|||}|dkrP|||}q>|t| j	d k rw|j
|
g|dR  dddd }||v rv|| q(||v r|r| |n|}||j
|
g|dR  dddd  q(|r|S | |}||fS )a   Forward features that returns intermediates.
        Args:
            x: Input image tensor
            indices: Take last n blocks if int, all if None, select matching indices if sequence
            norm: Apply norm layer to all intermediates
            stop_early: Stop iterating over blocks when last desired intermediate hit
            output_fmt: Shape of intermediate feature outputs
            intermediates_only: Only return intermediate features
        Returns:

        r   z$Output shape for Twins must be NCHW.r   r   r=   r    r7   )r   r   r   r?   	enumeratezipr   r   r   r   rC   rE   rK   r   rv   )r0   rL   r   rv   r   r   r   intermediatestake_indices	max_indexrM   rV   heightwidthr   embedr|   r   pos_blkr6   jblkx_featr4   r4   r5   forward_intermediates  s6   

&
,
zTwins.forward_intermediatesr   
prune_norm
prune_headc                 C   s6   t t| j|\}}|rt | _|r| dd |S )z@ Prune layers not required for specified intermediates.
        r   r   )r   r   r   r(   r~   rv   r   )r0   r   r  r  r   r   r4   r4   r5   prune_intermediate_layers  s   
zTwins.prune_intermediate_layersc                 C   s   |j d }tt| j| j| j| jD ]E\}\}}}}||\}}||}t|D ]\}	}
|
||}|	dkr:|||}q(|t| jd k rW|j	|g|dR  
dddd }q| |}|S )Nr   r   r=   r    r7   )r?   r   r   r   r   r   r   r   r   rC   rE   rK   rv   )r0   rL   rM   r   r   r|   r   r   r6   r   r  r4   r4   r5   forward_features  s    


&
zTwins.forward_features
pre_logitsc                 C   s2   | j dkr|jdd}| |}|r|S | |S )Nr   r   r>   )r   meanr   r   )r0   rL   r  r4   r4   r5   forward_head  s   

zTwins.forward_headc                 C   s   |  |}| |}|S r   )r  r
  )r0   rL   r4   r4   r5   r_     s   

zTwins.forwardF)Tr   )NFFr   F)r   FT)!rb   rc   rd   re   r   r(   ru   rx   r$   rf   rg   ignorer   r   r   Moduler   r   r   strr   r   r   r   r   ri   r   r  r  r  r
  r_   rl   r4   r4   r2   r5   r     s    G
 
9
Fc                 K   s0   | dd}tt| |fdt|ddi|}|S )Nout_indicesr8   feature_cfggetter)r  feature_cls)popr   r   r   )variant
pretrainedkwargsr  modelr4   r4   r5   _create_twins  s   
r  r   c                 K   s    | ddd dddt tddd|S )	Nr   )r    r   r   g?bicubicTzpatch_embeds.0.projr   )urlr   
input_size	pool_sizecrop_pctinterpolationfixed_input_sizer	  r   
first_conv
classifierr   )r  r  r4   r4   r5   _cfg  s   r"  ztimm/)	hf_hub_id)ztwins_pcpvt_small.in1kztwins_pcpvt_base.in1kztwins_pcpvt_large.in1kztwins_svt_small.in1kztwins_svt_base.in1kztwins_svt_large.in1kr   c                 K   F   t dg dg dg dg dg dd}td
d	| it |fi |S )Nr8   r   r   i@  r   r   r7   r9   r   r   r   r8   r8   r   r   r   r   r&   r   r   r   twins_pcpvt_smallr  )r)  r   r  r  r  
model_argsr4   r4   r5   r)    
   r)  c                 K   r$  )Nr8   r%  r&  r'  )r    r8      r    r   r(  twins_pcpvt_baser  )r/  r*  r+  r4   r4   r5   r/     r-  r/  c                 K   r$  )Nr8   r%  r&  r'  )r    r      r    r   r(  twins_pcpvt_larger  )r1  r*  r+  r4   r4   r5   r1  (  r-  r1  c              	   K   L   t dg dg dg dg dg dg dd}tdd
| it |fi |S )Nr8   r   )r7   r8   r   r   r   )r7   r7   
   r8      r5  r5  r5  r   r   r   r&   r   r   r   r   twins_svt_smallr  )r7  r*  r+  r4   r4   r5   r7  0  
   r7  c              	   K   r2  )Nr8   )`      i  r   )r    r         r   r7   r7   r.  r7   r4  r   r6  twins_svt_baser  )r>  r*  r+  r4   r4   r5   r>  8  r8  r>  c              	   K   r2  )Nr8   )r   r   r   i   )r8   r   r       r   r=  r4  r   r6  twins_svt_larger  )r@  r*  r+  r4   r4   r5   r@  @  r8  r@  r  )r   )6re   r   	functoolsr   typingr   r   r   r   rf   torch.nnr(   torch.nn.functionalrw   rA   	timm.datar   r	   timm.layersr
   r   r   r   r   _builderr   	_featuresr   _features_fxr   	_registryr   r   vision_transformerr   __all__r   rk   r  r   rm   rx   r   r   r   r  r"  default_cfgsr)  r/  r1  r7  r>  r@  r4   r4   r4   r5   <module>   s\    a;( 
`


