o
    پiL                  
   @   s  d Z ddlZddlmZmZmZmZmZ ddlZddl	m
Z
 ddlm
  mZ ddlmZmZ ddlmZmZmZmZmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZm Z  dgZ!G dd de
j"Z#G dd de
j"Z$G dd de
j"Z%G dd de
j"Z&G dd de
j"Z'G dd de
j"Z(dd Z)d1ddZ*d2ddZ+e e+dd e+dd e+dd e+dd e+dd e+dd e+dd d!Z,ed1d"e(fd#d$Z-ed1d"e(fd%d&Z.ed1d"e(fd'd(Z/ed1d"e(fd)d*Z0ed1d"e(fd+d,Z1ed1d"e(fd-d.Z2ed1d"e(fd/d0Z3dS )3a   Pyramid Vision Transformer v2

@misc{wang2021pvtv2,
      title={PVTv2: Improved Baselines with Pyramid Vision Transformer},
      author={Wenhai Wang and Enze Xie and Xiang Li and Deng-Ping Fan and Kaitao Song and Ding Liang and
        Tong Lu and Ping Luo and Ling Shao},
      year={2021},
      eprint={2106.13797},
      archivePrefix={arXiv},
      primaryClass={cs.CV}
}

Based on Apache 2.0 licensed code at https://github.com/whai362/PVT

Modifications and timm support by / Copyright 2022, Ross Wightman
    N)CallableListOptionalTupleUnionIMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STD)DropPath	to_2tuple	to_ntupletrunc_normal_	LayerNormuse_fused_attn   )build_model_with_cfg)feature_take_indices)
checkpoint)register_modelgenerate_default_cfgsPyramidVisionTransformerV2c                       s<   e Zd Zddejddf fdd	Zdee fddZ  Z	S )	MlpWithDepthwiseConvN        Fc              	      s~   t    |p|}|p|}t||| _|rt nt | _tj||dddd|d| _	| | _
t||| _t|| _d S )N   r   T)biasgroups)super__init__nnLinearfc1ReLUIdentityreluConv2ddwconvactfc2Dropoutdrop)selfin_featureshidden_featuresout_features	act_layerr)   
extra_relu	__class__ F/home/ubuntu/.local/lib/python3.10/site-packages/timm/models/pvt_v2.pyr   $   s   
	zMlpWithDepthwiseConv.__init__	feat_sizec                 C   s   |  |}|j\}}}|dd|||d |d }| |}| |}|ddd}| |}| |}| 	|}| |}|S )Nr      r   )
r    shape	transposeviewr#   r%   flattenr&   r)   r'   )r*   xr4   BNCr2   r2   r3   forward7   s   
 





zMlpWithDepthwiseConv.forward)
__name__
__module____qualname__r   GELUr   r   intr>   __classcell__r2   r2   r0   r3   r   #   s    r   c                       sN   e Zd ZU ejje ed< 						d fdd	Zd	e	e
 fd
dZ  ZS )	Attention
fused_attn   r   FTr   c                    s(  t    || dksJ d| d| d|| _|| _|| | _| jd | _t | _tj	|||d| _
tj	||d |d| _t|| _t	||| _t|| _|swd | _|dkrltj||||d	| _t|| _nd | _d | _d | _d S td
| _tj||ddd	| _t|| _t | _d S )Nr   zdim z  should be divided by num_heads .g      )r   r5   r   )kernel_sizestride   )r   r   dim	num_headshead_dimscaler   rF   r   r   qkvr(   	attn_dropproj	proj_droppoolr$   srr   normr&   AdaptiveAvgPool2drB   )r*   rL   rM   sr_ratiolinear_attnqkv_biasrR   rT   r0   r2   r3   r   H   s0   

"

zAttention.__init__r4   c                 C   s  |j \}}}|\}}| |||| jddddd}| jd ur`|ddd||||}| | |||dddd}| |}| |}| 	||dd| j| j
ddddd}	nQ| jd ur|ddd||||}| |||dddd}| |}| 	||dd| j| j
ddddd}	n| 	||dd| j| j
ddddd}	|	d\}
}| jrtj||
|| jr| jjndd}n|| j }||
d	d }|jdd
}| |}|| }|dd|||}| |}| |}|S )Nr   r5   r   r      r   )	dropout_prL   )r6   rP   reshaperM   permuterU   rV   rW   r&   rQ   rN   unbindrF   Fscaled_dot_product_attentiontrainingrR   prO   r7   softmaxrS   rT   )r*   r:   r4   r;   r<   r=   HWrP   rQ   kvattnr2   r2   r3   r>   p   s6   $
$

,

,*"



zAttention.forward)rG   r   FTr   r   )r?   r@   rA   torchjitFinalbool__annotations__r   r   rC   r>   rD   r2   r2   r0   r3   rE   E   s   
 (rE   c                	       sD   e Zd Zdddddddejef	 fdd	Zdee fdd	Z	  Z
S )
Block      @r   Fr   c              	      s   t    ||| _t|||||||d| _|	dkrt|	nt | _||| _	t
|t|| |
||d| _|	dkrAt|	| _d S t | _d S )N)rM   rY   rZ   r[   rR   rT   r   )r+   r,   r.   r)   r/   )r   r   norm1rE   rm   r
   r   r"   
drop_path1norm2r   rC   mlp
drop_path2)r*   rL   rM   	mlp_ratiorY   rZ   r[   rT   rR   	drop_pathr.   
norm_layerr0   r2   r3   r      s*   

	

$zBlock.__init__r4   c                 C   s<   ||  | | || }|| | | || }|S N)rv   rm   ru   ry   rx   rw   )r*   r:   r4   r2   r2   r3   r>      s   zBlock.forward)r?   r@   rA   r   rB   r   r   r   rC   r>   rD   r2   r2   r0   r3   rs      s    %rs   c                       s*   e Zd ZdZd
 fdd	Zdd	 Z  ZS )OverlapPatchEmbedz Image to Patch Embedding
    rK   r]   r      c                    sf   t    t|}t||ksJ d|| _tj|||||d d |d d fd| _t|| _	d S )Nz!Set larger patch_size than strider   r5   r   )rJ   padding)
r   r   r   max
patch_sizer   r$   rS   r   rW   )r*   r   rJ   in_chans	embed_dimr0   r2   r3   r      s   
zOverlapPatchEmbed.__init__c                 C   s(   |  |}|dddd}| |}|S )Nr   r5   r   r   )rS   rb   rW   r*   r:   r2   r2   r3   r>      s   

zOverlapPatchEmbed.forward)rK   r]   r   r   )r?   r@   rA   __doc__r   r>   rD   r2   r2   r0   r3   r~      s    
r~   c                       s|   e Zd Zdddddddddef
deded	ed
ededededededededeee ef de	f fddZ
dd Z  ZS )PyramidVisionTransformerStageTrG   r   Frt   r   rL   dim_outdepth
downsamplerM   rY   rZ   rz   r[   rT   rR   r{   r|   c                    sx   t    d| _|rtdd|d| _n	|ksJ d | _t 	f
ddt|D | _| _	d S )NFr   r5   r   rJ   r   r   c                    s:   g | ]}t 	 ttr| nd 
qS ))
rL   rM   rY   rZ   rz   r[   rT   rR   r{   r|   )rs   
isinstancelist).0i
rR   r   r{   rZ   rz   r|   rM   rT   r[   rY   r2   r3   
<listcomp>   s    
z:PyramidVisionTransformerStage.__init__.<locals>.<listcomp>)
r   r   grad_checkpointingr~   r   r   
ModuleListrangeblocksrW   )r*   rL   r   r   r   rM   rY   rZ   rz   r[   rT   rR   r{   r|   r0   r   r3   r      s   

 
z&PyramidVisionTransformerStage.__init__c                 C   s   | j d ur
|  |}|j\}}}}||f}||d|}| jD ]}| jr0tj s0t|||}q|||}q| 	|}|||d |d d
dddd }|S )Nr\   r   r   r   r5   )r   r6   ra   r   r   rn   ro   is_scriptingr   rW   rb   
contiguous)r*   r:   r;   ri   rj   r=   r4   blkr2   r2   r3   r>     s   



(z%PyramidVisionTransformerStage.forward)r?   r@   rA   r   rC   rq   floatr   r   r   r   r>   rD   r2   r2   r0   r3   r      sN    	
-r   c                       sT  e Zd Zddddddddd	d
ddddef fdd	Zdd Zdd Zejj	dd Z
ejj	d5ddZejj	d6ddZdejfddZd7dedee fddZ		
	
	 	
d8d!ejd"eeeee f  d#ed$ed%ed&edeeej eejeej f f fd'd(Z	)	
		d9d"eeee f d*ed+efd,d-Zd.d/ Zd5d0efd1d2Zd3d4 Z  ZS ):r   r     avgr   r]      r   )@            )r   r5   r]   rG   )rG   r]   r5   r   )       @r   rt   rt   TFr   c                    s  t    || _|dv sJ || _|| _t|}t||}t||}t||}t||ks2J g | _tdd||d d| _	dd t
d|t||D }d}|d }g }t|D ]B}|t||| || |dk|| || || |
|	|||| |dg7 }|| }||| 7 }|  jt|dd	|  d
| dg7  _q]tj| | _|d  | _| _t|| _|dkrt|d |nt | _| | j d S )Nr    rK   r]   r   r   c                 S   s   g | ]}|  qS r2   )tolist)r   r:   r2   r2   r3   r   <  s    z7PyramidVisionTransformerV2.__init__.<locals>.<listcomp>)rL   r   r   r   rM   rY   rz   rZ   r[   rT   rR   r{   r|   r5   stages.)num_chs	reductionmoduler\   )r   r   num_classesglobal_pooldepthslenr   feature_infor~   patch_embedrn   linspacesumsplitr   r   dictr   
Sequentialstagesnum_featureshead_hidden_sizer(   	head_dropr   r"   headapply_init_weights)r*   r   r   r   r   
embed_dimsrM   	sr_ratios
mlp_ratiosr[   linear	drop_rateproj_drop_rateattn_drop_ratedrop_path_rater|   
num_stagesdprcurprev_dimr   r   r0   r2   r3   r     sZ   
"
*"z#PyramidVisionTransformerV2.__init__c                 C   s   t |tjr&t|jdd t |tjr"|jd ur$tj|jd d S d S d S t |tjrX|j	d |j	d  |j
 }||j }|jjdtd|  |jd urZ|jj  d S d S d S )Ng{Gz?)stdr   r   g       @)r   r   r   r   weightr   init	constant_r$   rI   out_channelsr   datanormal_mathsqrtzero_)r*   mfan_outr2   r2   r3   r   \  s   

z(PyramidVisionTransformerV2._init_weightsc                 C   s   d| j _d S )NF)r   requires_gradr*   r2   r2   r3   freeze_patch_embh  s   z+PyramidVisionTransformerV2.freeze_patch_embc                 C   s   i S r}   r2   r   r2   r2   r3   no_weight_decayk  s   z*PyramidVisionTransformerV2.no_weight_decayc                 C   s   t ddd}|S )Nz^patch_embedz^stages\.(\d+))stemr   )r   )r*   coarsematcherr2   r2   r3   group_matchero  s
   z(PyramidVisionTransformerV2.group_matcherc                 C   s   | j D ]}||_qd S r}   )r   r   )r*   enablesr2   r2   r3   set_grad_checkpointingw  s   
z1PyramidVisionTransformerV2.set_grad_checkpointingreturnc                 C   s   | j S r}   )r   r   r2   r2   r3   get_classifier|  s   z)PyramidVisionTransformerV2.get_classifierNr   r   c                 C   sJ   || _ |d ur|dv sJ || _|dkrt| j|| _d S t | _d S )Nr   r   )r   r   r   r   r   r"   r   )r*   r   r   r2   r2   r3   reset_classifier  s
   *z+PyramidVisionTransformerV2.reset_classifierNCHWr:   indicesrW   
stop_early
output_fmtintermediates_onlyc                 C   s   |dv sJ dg }t t| j|\}}	| |}tj s |s$| j}
n	| jd|	d  }
t|
D ]\}}||}||v rB|| q1|rG|S ||fS )a   Forward features that returns intermediates.

        Args:
            x: Input image tensor
            indices: Take last n blocks if int, all if None, select matching indices if sequence
            norm: Apply norm layer to compatible intermediates
            stop_early: Stop iterating over blocks when last desired intermediate hit
            output_fmt: Shape of intermediate feature outputs
            intermediates_only: Only return intermediate features
        Returns:

        )r   zOutput shape must be NCHW.Nr   )	r   r   r   r   rn   ro   r   	enumerateappend)r*   r:   r   rW   r   r   r   intermediatestake_indices	max_indexr   feat_idxstager2   r2   r3   forward_intermediates  s   

z0PyramidVisionTransformerV2.forward_intermediatesr   
prune_norm
prune_headc                 C   s<   t t| j|\}}| jd|d  | _|r| dd |S )z@ Prune layers not required for specified intermediates.
        Nr   r   r   )r   r   r   r   )r*   r   r   r   r   r   r2   r2   r3   prune_intermediate_layers  s
   z4PyramidVisionTransformerV2.prune_intermediate_layersc                 C      |  |}| |}|S r}   )r   r   r   r2   r2   r3   forward_features     

z+PyramidVisionTransformerV2.forward_features
pre_logitsc                 C   s.   | j r	|jdd}| |}|r|S | |S )N)r\   r_   r`   )r   meanr   r   )r*   r:   r   r2   r2   r3   forward_head  s   
z'PyramidVisionTransformerV2.forward_headc                 C   r   r}   )r   r   r   r2   r2   r3   r>     r   z"PyramidVisionTransformerV2.forwardF)Tr}   )NFFr   F)r   FT)r?   r@   rA   r   r   r   r   rn   ro   ignorer   r   r   r   Moduler   rC   r   strr   Tensorr   r   rq   r   r   r   r   r   r>   rD   r2   r2   r0   r3   r     sx    E

 
,
c                 C   s   d| v r| S i }ddl }|  D ]=\}}|dr1|dd}|dd}|dd	}|d
d}|dd}|ddd |}|ddd |}|||< q|S )z$ Remap original checkpoints -> timm zpatch_embed.proj.weightr   Nr   patch_embed1patch_embed2zstages.1.downsamplepatch_embed3zstages.2.downsamplepatch_embed4zstages.3.downsamplezdwconv.dwconvr%   zblock(\d+).(\d+)c                 S   s$   dt | dd  d| d S )Nr   r   z.blocks.r5   rC   groupr:   r2   r2   r3   <lambda>  s   $ z&checkpoint_filter_fn.<locals>.<lambda>z
^norm(\d+)c                 S   s   dt | dd  dS )Nr   r   z.normr  r  r2   r2   r3   r    s    )reitems
startswithreplacesub)
state_dictmodelout_dictr  rk   rl   r2   r2   r3   checkpoint_filter_fn  s   

r  Fc                 K   s>   t td}|d|}tt| |fttd|dd|}|S )Nr]   out_indicesT)flatten_sequentialr  )pretrained_filter_fnfeature_cfg)tupler   popr   r   r  r   )variant
pretrainedkwargsdefault_out_indicesr  r  r2   r2   r3   _create_pvt2  s   
r  r   c                 K   s    | dddddt tdddd	|S )
Nr   )r      r  )rK   rK   g?bicubiczpatch_embed.projr   F)urlr   
input_size	pool_sizecrop_pctinterpolationr   r   
first_conv
classifierfixed_input_sizer   )r  r  r2   r2   r3   _cfg  s   r#  ztimm/)	hf_hub_id)zpvt_v2_b0.in1kzpvt_v2_b1.in1kzpvt_v2_b2.in1kzpvt_v2_b3.in1kzpvt_v2_b4.in1kzpvt_v2_b5.in1kzpvt_v2_b2_li.in1kr   c                 K   ,   t dddd}tdd| it |fi |S )Nr5   r5   r5   r5   )    r      r   r   r5      rG   r   r   rM   	pvt_v2_b0r  )r,  r   r  r  r  
model_argsr2   r2   r3   r,       r,  c                 K   r%  )Nr&  r   r   i@  r   r)  r+  	pvt_v2_b1r  )r2  r-  r.  r2   r2   r3   r2    r0  r2  c                 K   r%  )Nr   r1  r)  r+  	pvt_v2_b2r  )r3  r-  r.  r2   r2   r3   r3    r0  r3  c                 K   r%  )N)r   r]      r   r1  r)  r+  	pvt_v2_b3r  )r5  r-  r.  r2   r2   r3   r5    r0  r5  c                 K   r%  )N)r   rG      r   r1  r)  r+  	pvt_v2_b4r  )r7  r-  r.  r2   r2   r3   r7    r0  r7  c                 K   .   t ddddd}tdd| it |fi |S )	N)r   r   (   r   r1  r)  )r]   r]   r]   r]   )r   r   rM   r   	pvt_v2_b5r  )r:  r-  r.  r2   r2   r3   r:  $     r:  c                 K   r8  )	Nr   r1  r)  T)r   r   rM   r   pvt_v2_b2_lir  )r<  r-  r.  r2   r2   r3   r<  +  r;  r<  r   )r   )4r   r   typingr   r   r   r   r   rn   torch.nnr   torch.nn.functional
functionalrd   	timm.datar   r	   timm.layersr
   r   r   r   r   r   _builderr   	_featuresr   _manipulater   	_registryr   r   __all__r   r   rE   rs   r~   r   r   r  r  r#  default_cfgsr,  r2  r3  r5  r7  r:  r<  r2   r2   r2   r3   <module>   s\     "O.@ :


