o
    پi;                     @   sF  d Z 	 ddlmZ ddlZddlmZ ddlmZmZ ddl	m
Z
mZmZmZmZmZ ddlmZ ddlmZ dd	lmZmZ d
gZeG dd dejZG dd dejZG dd dejZG dd
 d
ejZd"ddZd#ddZeeddeddedddZed"defddZ ed"defddZ!ed"defd d!Z"dS )$a   ConViT Model

@article{d2021convit,
  title={ConViT: Improving Vision Transformers with Soft Convolutional Inductive Biases},
  author={d'Ascoli, St{'e}phane and Touvron, Hugo and Leavitt, Matthew and Morcos, Ari and Biroli, Giulio and Sagun, Levent},
  journal={arXiv preprint arXiv:2103.10697},
  year={2021}
}

Paper link: https://arxiv.org/abs/2103.10697
Original code: https://github.com/facebookresearch/convit, original copyright below

Modifications and additions for timm hacked together by / Copyright 2021, Ross Wightman
    )OptionalNIMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STD)DropPathtrunc_normal_
PatchEmbedMlp	LayerNormHybridEmbed   )build_model_with_cfg)register_notrace_module)register_modelgenerate_default_cfgsConVitc                       s^   e Zd Z					d fdd	Zdd Zd	d
 ZdddZdd Zdede	j
fddZ  ZS )GPSA   F              ?c                    s   t    || _|| _|| }|d | _|| _tj||d |d| _tj|||d| _	t
|| _t||| _td|| _t
|| _tt| j| _tdddd| _d S )N         bias   r   )super__init__	num_headsdimscalelocality_strengthnnLinearqkvDropout	attn_dropprojpos_proj	proj_drop	Parametertorchonesgating_paramzerosrel_indices)selfr   r   qkv_biasr&   r)   r    head_dim	__class__ F/home/ubuntu/.local/lib/python3.10/site-packages/timm/models/convit.pyr   )   s   
	
zGPSA.__init__c                 C   s   |j \}}}| jd u s| jj d |kr| || _| |}| |||| j|| j dddd}|| dd|||}| 	|}| 
|}|S )Nr   r   r   r   )shaper/   get_rel_indicesget_attentionr$   reshaper   permute	transposer'   r)   )r0   xBNCattnr$   r5   r5   r6   forwardC   s   
*

zGPSA.forwardc                 C   s   |j \}}}| |||d| j|| j ddddd}|d |d }}| j|ddd}| |dddd}||dd | j	 }	|	j
dd}	|j
dd}| jdddd}
d	t|
 |	 t|
|  }||jddd }| |}|S )
Nr   r   r   r      r   r   )r7   r#   r:   r   r;   r/   expandr(   r<   r   softmaxr-   viewr+   sigmoidsum	unsqueezer&   )r0   r=   r>   r?   r@   r#   qk	pos_scorepatch_scoregatingrA   r5   r5   r6   r9   N   s   . 
zGPSA.get_attentionc                 C   sZ   |  |d}| j d d d d df d }td||f|d }|r+||fS |S )Nr   rD         ?	nm,hnm->h)r9   meanr/   squeezer+   einsumsize)r0   r=   
return_mapattn_map	distancesdistr5   r5   r6   get_attention_map^   s    zGPSA.get_attention_mapc                 C   s   | j jjt| j d}t| jd }|d dkr!|d d n|d }t	|D ]6}t	|D ]/}|||  }d| j
jj|df< d||  | | j
jj|df< d||  | | j
jj|df< q/q)| j
j j| j9  _d S )Nr   rR   r   r   rD   )r$   weightdatacopy_r+   eyer   intr   ranger(   r    )r0   locality_distancekernel_sizecenterh1h2positionr5   r5   r6   
local_initg   s     zGPSA.local_initnum_patchesreturnc           	      C   s   t |d }td||d}t|ddt|dd }|||}|j|ddj|dd}|d |d  }|d|d d d d d d df< |d|d d d d d d df< |d|d d d d d d df< | jj	j
}||S )NrR   r   r   rD   r   rF   r   )ra   r+   r.   arangerI   repeatrepeat_interleaverL   r#   r]   deviceto)	r0   rj   img_sizer/   indindxindyinddro   r5   r5   r6   r8   u   s   $"""

zGPSA.get_rel_indices)r   Fr   r   r   F)__name__
__module____qualname__r   rB   r9   r\   ri   ra   r+   Tensorr8   __classcell__r5   r5   r3   r6   r   '   s    
	r   c                       s8   e Zd Z				d
 fdd	ZdddZdd	 Z  ZS )MHSAr   Fr   c                    sb   t    || _|| }|d | _tj||d |d| _t|| _t||| _	t|| _
d S )Nr   r   r   )r   r   r   r   r!   r"   qkvr%   r&   r'   r)   )r0   r   r   r1   r&   r)   r2   r3   r5   r6   r      s   

zMHSA.__init__c                 C   s  |j \}}}| |||d| j|| j ddddd}|d |d |d }}}	||dd | j }
|
jddd}
t	|d	 }t
|ddt
|dd }|||}|j|ddj|dd}|d |d  }|d	 }||j}t
d
||
f| }|r||
fS |S )Nr   r   r   r   rC   rE   rD   rF   rR   rS   )r7   r}   r:   r   r;   r<   r   rH   rT   ra   r+   rl   rI   rm   rn   rp   ro   rV   )r0   r=   rX   r>   r?   r@   r}   rM   rN   r$   rY   rq   rr   rs   rt   ru   rZ   r[   r5   r5   r6   r\      s    .$zMHSA.get_attention_mapc           
      C   s   |j \}}}| |||d| j|| j ddddd}|d\}}}||dd | j }	|	jdd}	| 	|	}	|	| dd|||}| 
|}| |}|S )	Nr   r   r   r   rC   rE   rD   rF   )r7   r}   r:   r   r;   unbindr<   r   rH   r&   r'   r)   )
r0   r=   r>   r?   r@   r}   rM   rN   r$   rA   r5   r5   r6   rB      s   .


zMHSA.forward)r   Fr   r   rv   )rw   rx   ry   r   r\   rB   r{   r5   r5   r3   r6   r|      s    
r|   c                	       s:   e Zd Zdddddejeddf	 fdd	Zdd	 Z  ZS )
Block      @Fr   Tr   c                    s   t    |	|| _|
| _| jrt||||||d| _n
t|||||d| _|dkr.t|nt	 | _
|	|| _t|| }t||||d| _d S )N)r   r1   r&   r)   r    )r   r1   r&   r)   r   )in_featureshidden_features	act_layerdrop)r   r   norm1use_gpsar   rA   r|   r   r!   Identity	drop_pathnorm2ra   r	   mlp)r0   r   r   	mlp_ratior1   r)   r&   r   r   
norm_layerr   r    mlp_hidden_dimr3   r5   r6   r      s8   


	
zBlock.__init__c                 C   s8   ||  | | | }||  | | | }|S N)r   rA   r   r   r   r0   r=   r5   r5   r6   rB      s   zBlock.forward)	rw   rx   ry   r!   GELUr
   r   rB   r{   r5   r5   r3   r6   r      s    ,r   c                       s   e Zd ZdZddddddddd	d
ddddddedddf fdd	Zdd Zejj	dd Z
ejj	d'ddZejj	d(ddZejj	dejfddZd)dedee fddZd d! Zd'd"efd#d$Zd%d& Z  ZS )*r   zI Vision Transformer with support for patch or hybrid CNN input stage
          r     tokeni      r   Fr   Nr   Tc                    s  t    |dv sJ 9 || _|| _| _ | _ | _| _| _|| _	|d ur5t
|||d| _n	t|||d| _| jj}|| _ttdd| _tj|d| _| j	rlttd|| _t| jdd dd	 td
||D t 	f
dd	t|D | _| _td
ddg| _t|| _|d
krt|nt  | _!t| jdd | "| j# | $ D ]\}}t%|dr|&  qd S )N) avgr   )rq   in_chans	embed_dim)rq   
patch_sizer   r   r   )p{Gz?stdc                 S   s   g | ]}|  qS r5   )item).0r=   r5   r5   r6   
<listcomp>'  s    z#ConVit.__init__.<locals>.<listcomp>r   c                    s0   g | ]}t 	 | |k d 
qS ))
r   r   r   r1   r)   r&   r   r   r   r    )r   )r   i
attn_drop_ratedprr   local_up_to_layerr    r   r   r   proj_drop_rater1   r5   r6   r   (  s    head)num_chs	reductionmoduleri   )'r   r   num_classesglobal_poolr   num_featureshead_hidden_sizer   r    use_pos_embedr   patch_embedr   rj   r!   r*   r+   r.   	cls_tokenr%   pos_drop	pos_embedr   linspace
ModuleListrb   blocksnormdictfeature_info	head_dropr"   r   r   apply_init_weightsnamed_moduleshasattrri   )r0   rq   r   r   r   r   r   depthr   r   r1   	drop_ratepos_drop_rater   r   drop_path_ratehybrid_backboner   r   r    r   rj   nmr3   r   r6   r      sR   

 


zConVit.__init__c                 C   s   t |tjr&t|jdd t |tjr"|jd ur$tj|jd d S d S d S t |tjr>tj|jd tj|jd d S d S )Nr   r   r   r   )	
isinstancer!   r"   r   r]   r   init	constant_r
   )r0   r   r5   r5   r6   r   B  s   zConVit._init_weightsc                 C   s   ddhS )Nr   r   r5   r0   r5   r5   r6   no_weight_decayK  s   zConVit.no_weight_decayc                 C   s   t dddgdS )Nz ^cls_token|pos_embed|patch_embed)z^blocks\.(\d+)N)z^norm)i )stemr   )r   )r0   coarser5   r5   r6   group_matcherO  s   zConVit.group_matcherc                 C   s   |rJ dd S )Nz$gradient checkpointing not supportedr5   )r0   enabler5   r5   r6   set_grad_checkpointingV  s   zConVit.set_grad_checkpointingrk   c                 C   s   | j S r   )r   r   r5   r5   r6   get_classifierZ  s   zConVit.get_classifierr   r   c                 C   sJ   || _ |d ur|dv sJ || _|dkrt| j|| _d S t | _d S )N)r   r   r   r   )r   r   r!   r"   r   r   r   )r0   r   r   r5   r5   r6   reset_classifier^  s
   *zConVit.reset_classifierc                 C   s   |  |}| jr|| j }| |}| j|jd dd}t| jD ]\}}|| j	kr4t
j||fdd}||}q"| |}|S )Nr   rD   r   rF   )r   r   r   r   r   rG   r7   	enumerater   r   r+   catr   )r0   r=   
cls_tokensublkr5   r5   r6   forward_featurese  s   





zConVit.forward_features
pre_logitsc                 C   sX   | j r| j dkr|d d dd f jddn|d d df }| |}|r'|S | |S )Nr   r   rF   r   )r   rT   r   r   )r0   r=   r   r5   r5   r6   forward_headr  s   6
zConVit.forward_headc                 C   s   |  |}| |}|S r   )r   r   r   r5   r5   r6   rB   x  s   

zConVit.forwardrv   )Tr   )rw   rx   ry   __doc__r
   r   r   r+   jitignorer   r   r   r!   Moduler   ra   r   strr   r   boolr   rB   r{   r5   r5   r3   r6   r      sH    P	
Fc                 K   s(   | dd r
tdtt| |fi |S )Nfeatures_onlyz<features_only not implemented for Vision Transformer models.)getRuntimeErrorr   r   )variant
pretrainedkwargsr5   r5   r6   _create_convit~  s   r   r   c              
   K   s   | ddd t tdddd	|S )Nr   )r   r   r   Tzpatch_embed.projr   )	urlr   
input_size	pool_sizerT   r   fixed_input_size
first_conv
classifierr   )r   r   r5   r5   r6   _cfg  s   r   ztimm/)	hf_hub_id)zconvit_tiny.fb_in1kzconvit_small.fb_in1kzconvit_base.fb_in1krk   c                 K   4   t ddddd}tdd| dt |fi |}|S )	N
   r   0   rC   r   r    r   r   convit_tinyr   r   r5   r   r   r   r   
model_argsmodelr5   r5   r6   r     
    r   c                 K   r   )	Nr   r   r   	   r   convit_smallr   r5   r   r   r5   r5   r6   r     r   r   c                 K   r   )	Nr   r   r   r   r   convit_baser   r5   r   r   r5   r5   r6   r     r   r   rv   )r   )#r   typingr   r+   torch.nnr!   	timm.datar   r   timm.layersr   r   r   r	   r
   r   _builderr   _features_fxr   	_registryr   r   __all__r   r   r|   r   r   r   r   default_cfgsr   r   r   r5   r5   r5   r6   <module>   s<     [74 


