o
    پiWJ                     @   s  d Z ddlZddlmZ ddlmZmZ ddlmZm	Z	m
Z
mZmZmZmZ ddlmZ ddlmZ ddlmZmZ d	gZG d
d dejZG dd dejZG dd dejZG dd	 d	ejZdddZd ddZeeddedddZed!defddZed!defddZ dS )"z Visformer

Paper: Visformer: The Vision-friendly Transformer - https://arxiv.org/abs/2104.12533

From original at https://github.com/danczs/Visformer

Modifications and additions for timm hacked together by / Copyright 2021, Ross Wightman
    NIMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STD)	to_2tupletrunc_normal_DropPath
PatchEmbedLayerNorm2dcreate_classifieruse_fused_attn   )build_model_with_cfg)checkpoint_seq)register_modelgenerate_default_cfgs	Visformerc                       s4   e Zd Zddejdddf fdd	Zdd Z  ZS )	
SpatialMlpN           Fc           	   	      s   t    |p|}|p|}t|}|| _|| _|| _| jr,|dk r(|d d }n|d }|| _|| _tj	||ddddd| _
| | _t|d | _| jr`tj	||ddd| jdd	| _| | _nd | _d | _tj	||ddddd| _t|d | _d S )
N         r   r   Fstridepaddingbias   )r   r   groupsr   )super__init__r   in_featuresout_featuresspatial_convhidden_featuresgroupnnConv2dconv1act1Dropoutdrop1conv2act2conv3drop3)	selfr    r#   r!   	act_layerdropr$   r"   
drop_probs	__class__ I/home/ubuntu/.local/lib/python3.10/site-packages/timm/models/visformer.pyr      s2   


zSpatialMlp.__init__c                 C   sT   |  |}| |}| |}| jd ur| |}| |}| |}| |}|S N)r'   r(   r*   r+   r,   r-   r.   r/   xr5   r5   r6   forward=   s   







zSpatialMlp.forward)__name__
__module____qualname__r%   GELUr   r:   __classcell__r5   r5   r3   r6   r      s    &r   c                       s8   e Zd ZU ejje ed< d	 fdd	Zdd Z	  Z
S )
	Attention
fused_attnr         ?r   c                    s   t    || _|| _t|| | }|| _|d | _tdd| _t	j
||| d ddddd| _t	|| _t	j
| j| j |ddddd| _t	|| _d S )	Ng      T)experimentalr   r   r   Fr   )r   r   dim	num_headsroundhead_dimscaler   rA   r%   r&   qkvr)   	attn_dropproj	proj_drop)r/   rD   rE   head_dim_ratiorJ   rL   rG   r3   r5   r6   r   L   s   

  zAttention.__init__c           
      C   s   |j \}}}}| ||d| j| jdddddd}|d\}}}| jr@tj	j
j| | | | jr;| jjndd}n||d	d | j }	|	jdd
}	| |	}	|	| }|dddd|d||}| |}| |}|S )Nr   r   r   r      r   )	dropout_p)rD   )shaperI   reshaperE   rG   permuteunbindrA   torchr%   
functionalscaled_dot_product_attention
contiguoustrainingrJ   p	transposerH   softmaxrK   rL   )
r/   r9   BCHWqkvattnr5   r5   r6   r:   Z   s    *


zAttention.forward)r   rB   r   r   )r;   r<   r=   rV   jitFinalbool__annotations__r   r:   r?   r5   r5   r3   r6   r@   I   s   
 r@   c                
       s<   e Zd Zdddddejedddf
 fdd	Zdd	 Z  ZS )
BlockrB         @r   r   Fc                    s   t    || _|dkrt|nt | _|rd | _d | _n|	|| _t	|||||d| _|	|| _
t|t|| |||
|d| _d S )Nr   )rE   rM   rJ   rL   )r    r#   r0   r1   r$   r"   )r   r   r"   r   r%   Identity	drop_pathnorm1re   r@   norm2r   intmlp)r/   rD   rE   rM   	mlp_ratiorL   rJ   rm   r0   
norm_layerr$   attn_disabledr"   r3   r5   r6   r   q   s.   



zBlock.__init__c                 C   sB   | j d ur|| |  | | }|| | | | }|S r7   )re   rm   rn   rq   ro   r8   r5   r5   r6   r:      s   
zBlock.forward)	r;   r<   r=   r%   r>   r	   r   r:   r?   r5   r5   r3   r6   rj   p   s    )rj   c                       s   e Zd Zddddddddd	d
d
d
d
d
eddddddddf fdd	Zdd Zejjd'ddZ	ejjd(ddZ
ejjdejfddZd)dedefddZd d! Zd'd"efd#d$Zd%d& Z  ZS )*r         r              r   rk   r   111TFr   avgNc                    s  t    t|}|| _| _|| _|| _|| _|| _t	|t
tfr/|\| _| _| _t|}n|d  | _| _|| j | j | _|| _d| _dd td||D | jrmd | _t|||dd| _fdd|D }n[| jd u rd | _t|d |d |dd| _fd	d|D }n:ttj|| jd
ddddt| jtjdd| _dd |D }t|d | jd |dd| _fdd|D }| jr| jrttjdg|R  | _nttjdd g|R  | _tj|d| _ nd | _tj 	
f
ddt!| jD  | _"| jsDt|d d |dd| _#fdd|D }| jr@ttjdg|R  | _$nd | _$nd | _#tj 	
f
ddt!| j| j| j D  | _%| jst|d d |dd| _&fdd|D }| jrttjdd g|R  | _'nd | _'nd | _&tj 	
f
ddt!| j| j |D  | _(| jrnd  | _)| _*| j)| _+t,| j)| j|d\}}|| _-t|
| _.|| _/| jrt0| jdd | jst0| j$dd t0| j'dd | 1| j2 d S )Nr   Fc                 S   s   g | ]}|  qS r5   )item.0r9   r5   r5   r6   
<listcomp>       z&Visformer.__init__.<locals>.<listcomp>r   )img_size
patch_sizein_chans	embed_dimrs   flattenc                    s   g | ]}|  qS r5   r5   r~   r   r5   r6   r      r   r   c                       g | ]}| d   qS r   r5   r~   r   r5   r6   r             r   T)inplacec                 S   s   g | ]}|d  qS r   r5   r~   r5   r5   r6   r      r   rO   c                    r   )rO   r5   r~   r   r5   r6   r      r   r   )r[   c                    sB   g | ]}t d  d | d dk	d dkdqS )r   g      ?r   01rD   rE   rM   rr   rL   rJ   rm   rs   r$   rt   r"   rj   r   i
attn_drop_rate
attn_stagedprr   r$   rr   rs   rE   proj_drop_rater"   r5   r6   r           

r   c                    r   r   r5   r~   r   r5   r6   r     r   c                    s>   g | ]}t d  | d dk	d dkdqS )rB   r   r   r   r   r   r   r   r5   r6   r   "  s     

c                    r   r   r5   r~   r   r5   r6   r   =  r   c                    sB   g | ]}t d  d | d  dk	d  dkdqS )r   rB   r   r   r   r   r   r   r5   r6   r   D  r   	pool_type{Gz?std)3r   r   r   num_classesr   init_channelsr   vit_stem	conv_init
isinstancelisttuple
stage_num1
stage_num2
stage_num3sumuse_pos_embedgrad_checkpointingrV   linspacestemr   patch_embed1r%   
Sequentialr&   BatchNorm2dReLU	Parameterzeros
pos_embed1r)   pos_droprangestage1patch_embed2
pos_embed2stage2patch_embed3
pos_embed3stage3num_featureshead_hidden_sizenormr
   global_pool	head_dropheadr   apply_init_weights)r/   r   r   r   r   r   r   depthrE   rr   	drop_ratepos_drop_rater   r   drop_path_raters   r   r   r"   r   r$   r   r   
embed_normr   r3   )r   r   r   r   r$   rr   rs   rE   r   r   r"   r6   r      s   




  
 
" 
zVisformer.__init__c                 C   s   t |tjrt|jdd |jd urtj|jd d S d S t |tjrH| j	r2tjj
|jddd nt|jdd |jd urJtj|jd d S d S d S )Nr   r   r   fan_outrelu)modenonlinearityr   )r   r%   Linearr   weightr   init	constant_r&   r   kaiming_normal_)r/   mr5   r5   r6   r   f  s   

zVisformer._init_weightsc                 C   s   t d|rdndd fddgdS )Nz^patch_embed1|pos_embed1|stemz^stage(\d+)\.(\d+))z^(?:patch_embed|pos_embed)(\d+))r   )z^norm)i )r   blocks)dict)r/   coarser5   r5   r6   group_matchers  s   zVisformer.group_matcherc                 C   s
   || _ d S r7   )r   )r/   enabler5   r5   r6   set_grad_checkpointing~  s   
z Visformer.set_grad_checkpointingreturnc                 C   s   | j S r7   )r   )r/   r5   r5   r6   get_classifier  s   zVisformer.get_classifierr   r   c                 C   s$   || _ t| j| j |d\| _| _d S )Nr   )r   r
   r   r   r   )r/   r   r   r5   r5   r6   reset_classifier  s   zVisformer.reset_classifierc                 C   s  | j d ur
|  |}| |}| jd ur| || j }| jr+tj s+t| j	|}n| 	|}| j
d urG| 
|}| jd urG| || j }| jrVtj sVt| j|}n| |}| jd urr| |}| jd urr| || j }| jrtj st| j|}n| |}| |}|S r7   )r   r   r   r   r   rV   rf   is_scriptingr   r   r   r   r   r   r   r   r   r8   r5   r5   r6   forward_features  s0   













zVisformer.forward_features
pre_logitsc                 C   s&   |  |}| |}|r|S | |S r7   )r   r   r   )r/   r9   r   r5   r5   r6   forward_head  s   

zVisformer.forward_headc                 C   s   |  |}| |}|S r7   )r   r   r8   r5   r5   r6   r:     s   

zVisformer.forwardF)T)r|   )r;   r<   r=   r	   r   r   rV   rf   ignorer   r   r%   Moduler   rp   strr   r   rh   r   r:   r?   r5   r5   r3   r6   r      sJ     E
$Fc                 K   s,   | dd r
tdtt| |fi |}|S )Nfeatures_onlyz<features_only not implemented for Vision Transformer models.)getRuntimeErrorr   r   )variant
pretraineddefault_cfgkwargsmodelr5   r5   r6   _create_visformer  s   r    c                 K   s    | ddddddt tddd	|S )
Nrw   )r   ru   ru   )r   r   g?bicubicTzstem.0r   )urlr   
input_size	pool_sizecrop_pctinterpolationfixed_input_sizemeanr   
first_conv
classifierr   )r   r   r5   r5   r6   _cfg  s   r   ztimm/)	hf_hub_id)zvisformer_tiny.in1kzvisformer_small.in1kr   c                 K   D   t ddddddddtjd	tjd
}tdd| it |fi |}|S )Nrv      r   rO   rO   r   rk   r   011100Tr   r   r   rE   rr   r$   r   r"   rs   r   r   visformer_tinyr   )r  r   r%   r   r   r   r   	model_cfgr   r5   r5   r6   r       
r  c                 K   r   )Nrx   ry   r   r   rk   r   r   r   Tr  visformer_smallr   )r  r  r  r5   r5   r6   r    r  r  )FN)r   r   )!__doc__rV   torch.nnr%   	timm.datar   r   timm.layersr   r   r   r   r	   r
   r   _builderr   _manipulater   	_registryr   r   __all__r   r   r@   rj   r   r   r   default_cfgsr  r  r5   r5   r5   r6   <module>   s2    	$3'1  

	