o
    پio                     @   s  d Z ddlZddlZddlmZ ddlmZmZmZm	Z	m
Z
 zddlmZ W n ey5   ddlmZ Y nw ddlZddlmZ ddlmZ ddlmZmZ ddlmZmZmZmZmZmZmZ d	d
lmZ d	dl m!Z! d	dl"m#Z#m$Z$ d	dl%m&Z&m'Z' d	dl(m)Z) dgZ*e+e,Z-G dd dej.Z/G dd dej.Z0G dd dej.Z1G dd dej.Z2G dd dej.Z3dJddZ4dKddZ5e&e5dd d!d"e5dd#d$e5d%d d&e5d'd d&e5d(d d&e5d)d d&e5d*d d&e5d+d d&e5 e5d,d d&e5 e5d-d d&e5 d.Z6e'dJd/e3fd0d1Z7e'dJd/e3fd2d3Z8e'dJd/e3fd4d5Z9e'dJd/e3fd6d7Z:e'dJd/e3fd8d9Z;e'dJd/e3fd:d;Z<e'dJd/e3fd<d=Z=e'dJd/e3fd>d?Z>e'dJd/e3fd@dAZ?e'dJd/e3fdBdCZ@e'dJd/e3fdDdEZAe'dJd/e3fdFdGZBe'dJd/e3fdHdIZCdS )Lz Relative Position Vision Transformer (ViT) in PyTorch

NOTE: these models are experimental / WIP, expect changes

Hacked together by / Copyright 2022, Ross Wightman
    N)partial)ListOptionalTupleTypeUnion)Literal)FinalIMAGENET_INCEPTION_MEANIMAGENET_INCEPTION_STD)
PatchEmbedMlpDropPath	RelPosMlp
RelPosBiasuse_fused_attn	LayerType   )build_model_with_cfg)feature_take_indices)named_apply
checkpoint)generate_default_cfgsregister_model)get_init_weights_vitVisionTransformerRelPosc                       sR   e Zd ZU ee ed< ddddddejf fdd	Zdde	e
j fd	d
Z  ZS )RelPosAttention
fused_attn   FN        c	           	         s   t    || dksJ d|| _|| | _| jd | _t | _tj||d |d| _	|r3|| jnt
 | _|r?|| jnt
 | _|rK||dnd | _t|| _t||| _t|| _d S )Nr   z$dim should be divisible by num_headsg         )bias	num_heads)super__init__r$   head_dimscaler   r   nnLinearqkvIdentityq_normk_normrel_posDropout	attn_dropproj	proj_drop)	selfdimr$   qkv_biasqk_normrel_pos_clsr1   r3   
norm_layer	__class__ Y/home/ubuntu/.local/lib/python3.10/site-packages/timm/models/vision_transformer_relpos.pyr&   %   s   

zRelPosAttention.__init__shared_rel_posc                 C   s@  |j \}}}| |||d| j| jddddd}|d\}}}	| |}| |}| j	rX| j
d ur;| j
 }
n	|d urB|}
nd }
tjjj|||	|
| jrS| jjndd}n1|| j }||dd	 }| j
d urr| j
||d
}n|d urz|| }|jd	d}| |}||	 }|dd|||}| |}| |}|S )Nr!      r   r      r    )	attn_mask	dropout_pr>   r5   )shaper+   reshaper$   r'   permuteunbindr-   r.   r   r/   get_biastorchr)   
functionalscaled_dot_product_attentiontrainingr1   pr(   	transposesoftmaxr2   r3   )r4   xr>   BNCr+   qkv	attn_biasattnr<   r<   r=   forward?   s:   *







zRelPosAttention.forwardN)__name__
__module____qualname__r	   bool__annotations__r)   	LayerNormr&   r   rL   Tensorr\   __classcell__r<   r<   r:   r=   r   "   s   
 r   c                       s&   e Zd Zd fdd	Zdd Z  ZS )
LayerScaleh㈵>Fc                    s*   t    || _t|t| | _d S r]   )r%   r&   inplacer)   	ParameterrL   onesgamma)r4   r5   init_valuesrh   r:   r<   r=   r&   e   s   
zLayerScale.__init__c                 C   s   | j r	|| jS || j S r]   )rh   mul_rk   r4   rS   r<   r<   r=   r\   j   s   zLayerScale.forward)rg   F)r^   r_   r`   r&   r\   re   r<   r<   r:   r=   rf   d   s    rf   c                
       sL   e Zd Zddddddddejejf
 fdd	Zd
deej	 fdd	Z
  ZS )RelPosBlock      @FNr    c              	      s   t    ||| _t||||||	|d| _|rt||dnt | _|
dkr+t	|
nt | _
||| _t|t|| ||d| _|rJt||dnt | _|
dkrZt	|
| _d S t | _d S )Nr6   r7   r8   r1   r3   )rl   r    in_featureshidden_features	act_layerdrop)r%   r&   norm1r   r[   rf   r)   r,   ls1r   
drop_path1norm2r   intmlpls2
drop_path2r4   r5   r$   	mlp_ratior6   r7   r8   rl   r3   r1   	drop_pathru   r9   r:   r<   r=   r&   p   s,   

	

$zRelPosBlock.__init__r>   c              
   C   sH   ||  | | j| ||d }|| | | | | }|S NrE   )ry   rx   r[   rw   r~   r}   r|   rz   r4   rS   r>   r<   r<   r=   r\      s   $ zRelPosBlock.forwardr]   )r^   r_   r`   r)   GELUrc   r&   r   rL   rd   r\   re   r<   r<   r:   r=   ro   n   s    (ro   c                
       sT   e Zd Zddddddddejejf
 fdd	Zdd Zdd	ee	j
 fd
dZ  ZS )ResPostRelPosBlockrp   FNr    c              	      s   t    || _t||||||	|d| _||| _|
dkr!t|
nt | _	t
|t|| ||d| _||| _|
dkr@t|
nt | _|   d S )Nrq   r    rr   )r%   r&   rl   r   r[   rw   r   r)   r,   ry   r   r{   r|   rz   r~   init_weightsr   r:   r<   r=   r&      s,   

	

zResPostRelPosBlock.__init__c                 C   s:   | j d urtj| jj| j  tj| jj| j  d S d S r]   )rl   r)   init	constant_rw   weightrz   r4   r<   r<   r=   r      s   
zResPostRelPosBlock.init_weightsr>   c              	   C   s<   ||  | | j||d }|| | | | }|S r   )ry   rw   r[   r~   rz   r|   r   r<   r<   r=   r\      s   zResPostRelPosBlock.forwardr]   )r^   r_   r`   r)   r   rc   r&   r   r   rL   rd   r\   re   r<   r<   r:   r=   r      s    )r   c                8       s2  e Zd ZdZddddddddd	d
dddddddddddddeddefdeeeeef f deeeeef f dedede	d dededede
dededee
 deded ed!ee d"ed#e
d$e
d%e
d&e
d'e	d( d)ed*eej d+ee d,ee d-eej f6 fd.d/ZdVd1d2Zd3d4 Zejjd5d6 ZejjdWd7d8ZejjdXd9d:Zejjd;ejfd<d=ZdYdedee fd>d?Z					@	dZdAejdBeeeee f  dCedDedEedFedGed;eeej eejeej f f fdHdIZ	J		
d[dBeeee f dKedLefdMdNZ dOdP Z!dWdQefdRdSZ"dTdU Z#  Z$S )\r   ah   Vision Transformer w/ Relative Position Bias

    Differing from classic vit, this impl
      * uses relative position index (swin v1 / beit) or relative log coord + mlp (swin v2) pos embed
      * defaults to no class token (can be enabled)
      * defaults to global avg pool for head (can be changed)
      * layer-scale (residual branch gain) enabled
          r!     avg      rp   TFư>r|   Nr    skipimg_size
patch_sizein_chansnum_classesglobal_pool) r   tokenmap	embed_dimdepthr$   r   r6   r7   rl   class_tokenfc_normrel_pos_typerel_pos_dimr>   	drop_rateproj_drop_rateattn_drop_ratedrop_path_rateweight_init)r   jaxmocor   fix_initembed_layerr9   ru   block_fnc                    s"  t    |dv sJ |s|dksJ pttjdd p tj || _|| _ | _ | _	| _
|r4dnd| _d| _||||d| _| jj}t| jd	rR| j n|t|| jd
}|drw|rf||d< d|v rnd|d< ttfi |nttfi |d| _|rd| _d|rttd| jnd| _dd td||D t 	
fddt|D | _fddt|D | _|sֈnt | _ |rnt | _!t"|| _#|dkrt$| j
|nt | _%|dkr| &| |r| '  dS dS )aE  
        Args:
            img_size: input image size
            patch_size: patch size
            in_chans: number of input channels
            num_classes: number of classes for classification head
            global_pool: type of global pooling for final sequence (default: 'avg')
            embed_dim: embedding dimension
            depth: depth of transformer
            num_heads: number of attention heads
            mlp_ratio: ratio of mlp hidden dim to embedding dim
            qkv_bias: enable bias for qkv if True
            qk_norm: Enable normalization of query and key in attention
            init_values: layer-scale init values
            class_token: use class token (default: False)
            fc_norm: use pre classifier norm instead of pre-pool
            rel_pos_type: type of relative position
            shared_rel_pos: share relative pos across all blocks
            drop_rate: dropout rate
            proj_drop_rate: projection dropout rate
            attn_drop_rate: attention dropout rate
            drop_path_rate: stochastic depth rate
            weight_init: weight init scheme
            fix_init: apply weight initialization fix (scaling w/ layer index)
            embed_layer: patch embedding layer
            norm_layer: normalization layer
            act_layer: MLP activation layer
        r   r   r   r   r   )epsr   r   F)r   r   r   r   
feat_ratio)window_sizeprefix_tokensr|   
hidden_dimswinmodeNr#   c                 S   s   g | ]}|  qS r<   )item).0rS   r<   r<   r=   
<listcomp>?  s    z4VisionTransformerRelPos.__init__.<locals>.<listcomp>c                    s0   g | ]}
	|  d qS ))r5   r$   r   r6   r7   r8   rl   r3   r1   r   r9   ru   r<   r   i)ru   r   r   dprr   rl   r   r9   r$   r   r7   r6   r8   r<   r=   r   @  s"    c                    s    g | ]}t d |  dqS )zblocks.)modulenum_chs	reductiondictr   )r   rr<   r=   r   P  s    r   )(r%   r&   r   r)   rc   r   r   r   num_featureshead_hidden_sizer   num_prefix_tokensgrad_checkpointingpatch_embed	grid_sizehasattrr   r   
startswithr   r   r>   ri   rL   zeros	cls_tokenlinspace
ModuleListrangeblocksfeature_infor,   normr   r0   	head_dropr*   headr   fix_init_weight)r4   r   r   r   r   r   r   r   r$   r   r6   r7   rl   r   r   r   r   r>   r   r   r   r   r   r   r   r9   ru   r   	feat_sizerel_pos_argsr:   )ru   r   r   r   r   rl   r   r9   r$   r   r7   r6   r   r8   r=   r&      s^   
:

 &
 

z VisionTransformerRelPos.__init__r   c                 C   s:   |dv sJ | j d urtjj| j dd tt||  d S )N)r   r   r   r   )std)r   r)   r   normal_r   r   )r4   r   r<   r<   r=   r   ^  s   
z$VisionTransformerRelPos.init_weightsc                 C   sL   dd }t | jD ]\}}||jjjj|d  ||jjjj|d  q	d S )Nc                 S   s   |  td|  d S )Ng       @)div_mathsqrt)param	_layer_idr<   r<   r=   rescalee  s   z8VisionTransformerRelPos.fix_init_weight.<locals>.rescaler   )	enumerater   r[   r2   r   datar|   fc2)r4   r   layer_idlayerr<   r<   r=   r   d  s
   z'VisionTransformerRelPos.fix_init_weightc                 C   s   dhS )Nr   r<   r   r<   r<   r=   no_weight_decayl     z'VisionTransformerRelPos.no_weight_decayc                 C   s   t dddgdS )Nz^cls_token|patch_embed)z^blocks\.(\d+)N)z^norm)i )stemr   r   )r4   coarser<   r<   r=   group_matcherp  s   z%VisionTransformerRelPos.group_matcherc                 C   s
   || _ d S r]   )r   )r4   enabler<   r<   r=   set_grad_checkpointingw  s   
z.VisionTransformerRelPos.set_grad_checkpointingreturnc                 C   s   | j S r]   )r   r   r<   r<   r=   get_classifier{  r   z&VisionTransformerRelPos.get_classifierc                 C   sJ   || _ |d ur|dv sJ || _|dkrt| j|| _d S t | _d S )Nr   r   )r   r   r)   r*   r   r,   r   )r4   r   r   r<   r<   r=   reset_classifier  s
   *z(VisionTransformerRelPos.reset_classifierNCHWrS   indicesreturn_prefix_tokensr   
stop_early
output_fmtintermediates_onlyc                    s  |dv sJ d|dk}g }	t tj|\}
}|j\ }}}|}jdur;tjj|jd dd|fdd}j	durEj	
 nd}tj sN|sRj}n	jd|d  }t|D ]*\}}jrstj sst|||d	}n|||d	}||
v r|	|r|n| q_jrfd
d|	D }fdd|	D }	|rj||f\ fdd|	D }	tj s|rtt|	|}	|r|	S |}||	fS )a=   Forward features that returns intermediates.

        Args:
            x: Input image tensor
            indices: Take last n blocks if int, all if None, select matching indices if sequence
            return_prefix_tokens: Return both prefix and spatial intermediate tokens
            norm: Apply norm layer to all intermediates
            stop_early: Stop iterating over blocks when last desired intermediate hit
            output_fmt: Shape of intermediate feature outputs
            intermediates_only: Only return intermediate features
        Returns:

        )r   NLCz)Output format must be one of NCHW or NLC.r   Nr   rD   r   rF   rE   c                    s"   g | ]}|d d d j f qS )Nr   r   r   yr   r<   r=   r        " zAVisionTransformerRelPos.forward_intermediates.<locals>.<listcomp>c                    s"   g | ]}|d d  j d f qS r]   r   r   r   r<   r=   r     r   c                    s,   g | ]}|  d dddd qS )rD   r   r!   r   r?   )rH   rI   
contiguousr   )rT   HWr<   r=   r     s   , )r   lenr   rG   r   r   rL   catexpandr>   rK   jitis_scriptingr   r   r   appendr   r   dynamic_feat_sizelistzip)r4   rS   r   r   r   r   r   r   rH   intermediatestake_indices	max_index_heightwidthr>   r   r   blkr   r<   )rT   r   r   r4   r=   forward_intermediates  s>   

$
z-VisionTransformerRelPos.forward_intermediatesr   
prune_norm
prune_headc                 C   sT   t t| j|\}}| jd|d  | _|rt | _|r(t | _| dd |S )z@ Prune layers not required for specified intermediates.
        Nr   r   r   )r   r   r   r)   r,   r   r   r   )r4   r   r
  r  r  r  r<   r<   r=   prune_intermediate_layers  s   

z1VisionTransformerRelPos.prune_intermediate_layersc                 C   s   |  |}| jd urtj| j|jd dd|fdd}| jd ur&| j nd }| jD ]}| j	r=tj
 s=t|||d}q+|||d}q+| |}|S )Nr   rD   r   rF   rE   )r   r   rL   r   r   rG   r>   rK   r   r   r   r   r   r   )r4   rS   r>   r  r<   r<   r=   forward_features  s   

$

z(VisionTransformerRelPos.forward_features
pre_logitsc                 C   sd   | j r| j dkr|d d | jd f jddn|d d df }| |}| |}|r-|S | |S )Nr   r   rF   r   )r   r   meanr   r   r   )r4   rS   r  r<   r<   r=   forward_head  s
   8

z$VisionTransformerRelPos.forward_headc                 C   s   |  |}| |}|S r]   )r  r  rn   r<   r<   r=   r\     s   

zVisionTransformerRelPos.forwardr   F)Tr]   )NFFFr   F)r   FT)%r^   r_   r`   __doc__r   ro   r   r{   r   r   floatra   r   strr   r)   Moduler   r&   r   r   rL   r   ignorer   r   r   r   r   rd   r   r	  r  r  r  r\   re   r<   r<   r:   r=   r      s   	



 	
F
Fc                 K   s0   | dd}tt| |fdt|ddi|}|S )Nout_indicesr!   feature_cfggetter)r  feature_cls)popr   r   r   )variant
pretrainedkwargsr  modelr<   r<   r=   !_create_vision_transformer_relpos  s   
r!  r   c                 K   s    | ddd dddt tddd|S )	Nr   )r!   r   r   g?bicubicTzpatch_embed.projr   )urlr   
input_size	pool_sizecrop_pctinterpolationfixed_input_sizer  r   
first_conv
classifierr
   )r#  r  r<   r<   r=   _cfg   s   r+  zhttps://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/vit_replos_base_patch32_plus_rpn_256-sw-dd486f51.pthztimm/)r!      r,  )r#  	hf_hub_idr$  )r!      r.  )r#  r$  zhttps://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/vit_relpos_small_patch16_224-sw-ec2778b4.pth)r#  r-  zhttps://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/vit_relpos_medium_patch16_224-sw-11c174af.pthzhttps://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/vit_relpos_base_patch16_224-sw-49049aed.pthzhttps://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/vit_srelpos_small_patch16_224-sw-6cdb8849.pthzhttps://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/vit_srelpos_medium_patch16_224-sw-ad702b8c.pthzhttps://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/vit_relpos_medium_patch16_cls_224-sw-cfe8e259.pthzhttps://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/vit_relpos_base_patch16_gapcls_224-sw-1a341d6c.pthzhttps://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/vit_relpos_medium_patch16_rpn_224-sw-5d2befd8.pth)z,vit_relpos_base_patch32_plus_rpn_256.sw_in1kz*vit_relpos_base_patch16_plus_240.untrainedz$vit_relpos_small_patch16_224.sw_in1kz%vit_relpos_medium_patch16_224.sw_in1kz#vit_relpos_base_patch16_224.sw_in1kz%vit_srelpos_small_patch16_224.sw_in1kz&vit_srelpos_medium_patch16_224.sw_in1kz)vit_relpos_medium_patch16_cls_224.sw_in1kz)vit_relpos_base_patch16_cls_224.untrainedz*vit_relpos_base_patch16_clsgap_224.sw_in1kz*vit_relpos_small_patch16_rpn_224.untrainedz)vit_relpos_medium_patch16_rpn_224.sw_in1kz)vit_relpos_base_patch16_rpn_224.untrainedr   c                 K   s6   t ddddtd}t	d	d| it |fi |}|S )
z` ViT-Base (ViT-B/32+) w/ relative log-coord position and residual post-norm, no class token
          r      )r   r   r   r$   r   $vit_relpos_base_patch32_plus_rpn_256r  N)r2  r   r   r!  r  r  
model_argsr   r<   r<   r=   r2  3  s   r2  c                 K   s4   t ddddd}t	d	d| it |fi |}|S )
zI ViT-Base (ViT-B/16+) w/ relative log-coord position, no class token
    r   r0  r   r1  )r   r   r   r$    vit_relpos_base_patch16_plus_240r  N)r6  r   r!  r4  r<   r<   r=   r6  =  s   r6  c                 K   8   t ddddddd}t	dd	| it |fi |}|S )H ViT-Base (ViT-B/16) w/ relative log-coord position, no class token
    r     r      FTr   r   r   r$   r6   r   vit_relpos_small_patch16_224r  N)r=  r7  r4  r<   r<   r=   r=  G  s   r=  c                 K   r8  )r9  r      r   r   FTr<  vit_relpos_medium_patch16_224r  N)r?  r7  r4  r<   r<   r=   r?  Q     r?  c                 K   s8   t ddddddd}t	d
d| it |fi |}|S )r9  r   r   r   FTr<  vit_relpos_base_patch16_224r  N)rA  r7  r4  r<   r<   r=   rA  \  r@  rA  c              
   K   <   t ddddddddd}t	dd	| it |fi |}|S )O ViT-Base (ViT-B/16) w/ shared relative log-coord position, no class token
    r   r:  r   r;  FTr   r   r   r$   r6   r   r   r>   vit_srelpos_small_patch16_224r  N)rE  r7  r4  r<   r<   r=   rE  g     rE  c              
   K   rB  )rC  r   r>  r   r   FTrD  vit_srelpos_medium_patch16_224r  N)rG  r7  r4  r<   r<   r=   rG  s  rF  rG  c                 K   s>   t dddddddddd		}t	
dd| it |fi |}|S )zM ViT-Base (ViT-M/16) w/ relative log-coord position, class token present
    r   r>  r   r   Fr,  Tr   )	r   r   r   r$   r6   r   r   r   r   !vit_relpos_medium_patch16_cls_224r  N)rH  r7  r4  r<   r<   r=   rH    s   rH  c              	   K   s:   t dddddddd}t	dd	| it |fi |}|S )zM ViT-Base (ViT-B/16) w/ relative log-coord position, class token present
    r   r   r   FTr   )r   r   r   r$   r6   r   r   vit_relpos_base_patch16_cls_224r  N)rI  r7  r4  r<   r<   r=   rI    s   rI  c              	   K   s:   t dddddddd}t	d
d| it |fi |}|S )a   ViT-Base (ViT-B/16) w/ relative log-coord position, class token present
    NOTE this config is a bit of a mistake, class token was enabled but global avg-pool w/ fc-norm was not disabled
    Leaving here for comparisons w/ a future re-train as it performs quite well.
    r   r   r   FT)r   r   r   r$   r6   r   r   "vit_relpos_base_patch16_clsgap_224r  N)rJ  r7  r4  r<   r<   r=   rJ    s   rJ  c                 K   8   t dddddtd}t	d
d| it |fi |}|S )_ ViT-Base (ViT-B/16) w/ relative log-coord position and residual post-norm, no class token
    r   r:  r   r;  Fr   r   r   r$   r6   r    vit_relpos_small_patch16_rpn_224r  N)rN  r3  r4  r<   r<   r=   rN    r@  rN  c                 K   rK  )rL  r   r>  r   r   FrM  !vit_relpos_medium_patch16_rpn_224r  N)rO  r3  r4  r<   r<   r=   rO    r@  rO  c                 K   s8   t dddddtd}t	d	d| it |fi |}|S )
rL  r   r   r   FrM  vit_relpos_base_patch16_rpn_224r  N)rP  r3  r4  r<   r<   r=   rP    r@  rP  r  r  )Dr  loggingr   	functoolsr   typingr   r   r   r   r   r   ImportErrortyping_extensionsrL   torch.nnr)   	torch.jitr	   	timm.datar   r   timm.layersr   r   r   r   r   r   r   _builderr   	_featuresr   _manipulater   r   	_registryr   r   vision_transformerr   __all__	getLoggerr^   _loggerr  r   rf   ro   r   r   r!  r+  default_cfgsr2  r6  r=  r?  rA  rE  rG  rH  rI  rJ  rN  rO  rP  r<   r<   r<   r=   <module>   s    $
B
07  
#


(			




