o
    پi o                     @   s  d Z ddlZddlmZ ddlmZmZmZmZm	Z	 ddl
Z
ddlmZ ddlm  mZ ddlmZmZmZmZ ddlmZmZmZmZmZmZmZmZmZmZm Z m!Z!m"Z" ddl#m$Z$ dd	l%m&Z& dd
l'm(Z( ddl)m*Z* ddl+m,Z,m-Z- ddl.m/Z/m0Z0 dgZ1e2e3Z4de5de5de
j6de
j6fddZ7e*e7 de
j6de
j6de
j6dee5e5f dee5e5f de
j6fddZ8G dd dej9Z:G dd dej9Z;G dd dej9Z<d e
j6d!e5dee
j6ee5e5f f fd"d#Z=	dGd$e
j6d!e5d%ee5e5f d&eee5e5f  de
j6f
d'd(Z>G d)d dej9Z?d*d+ Z@dHd-d.ZAe/eAd/d0d1eedd2d3d4eAd5d0d1eedd2d3d4eAd6d0d1eedd2d3d4eAeed7d8d9d:d;ZBdId=d>ZCe0dIde?fd?d@ZDe0dIde?fdAdBZEe0dIde?fdCdDZFe0dIde?fdEdFZGdS )Ja+   Vision Transformer (ViT) in PyTorch

A PyTorch implement of Vision Transformers as described in:

'Exploring Plain Vision Transformer Backbones for Object Detection'
    - https://arxiv.org/abs/2203.16527

'Segment Anything Model (SAM)'
    - https://github.com/facebookresearch/segment-anything/

    N)partial)CallableListOptionalTupleUnion)IMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STDIMAGENET_INCEPTION_MEANIMAGENET_INCEPTION_STD)
PatchEmbedMlpDropPathPatchDropoutLayerNorm2dClassifierHeadNormMlpClassifierHeadFormatresample_abs_pos_embed_nhwcRotaryEmbeddingCatapply_rot_embed_cat	to_2tupleuse_fused_attn)Final   )build_model_with_cfg)feature_take_indices)register_notrace_function)
checkpointcheckpoint_seq)generate_default_cfgsregister_modelVisionTransformerSAMq_sizek_sizerel_posreturnc                 C   s   t dt| | d }|jd |kr2tj|d|jd dddd|dd}|d|dd}n|}t| dddf t||  d }t|dddf t| | d }|| |d t| | d  }||	  S )	a\  
    Get relative positional embeddings according to the relative positions of
        query and key sizes.
    Args:
        q_size (int): size of query q.
        k_size (int): size of key k.
        rel_pos (Tensor): relative position embeddings (L, C).

    Returns:
        Extracted positional embeddings according to relative positions.
       r   r   linear)sizemodeN      ?)
intmaxshapeFinterpolatereshapepermutetorcharangelong)r#   r$   r%   max_rel_distrel_pos_resizedq_coordsk_coordsrelative_coords r<   V/home/ubuntu/.local/lib/python3.10/site-packages/timm/models/vision_transformer_sam.pyget_rel_pos%   s   $$r>   q	rel_pos_h	rel_pos_wc                 C   s   |\}}|\}}t |||}	t |||}
| j\}}}| ||||}td||	}td||
}|dddddddddf |dddddddddf  }|d|| || S )a  
    Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
    https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py
    Args:
        q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C).
        rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis.
        rel_pos_w (Tensor): relative position embeddings (Lw, C) for width axis.
        q_size (Tuple): spatial sequence size of query q with (q_h, q_w).
        k_size (Tuple): spatial sequence size of key k with (k_h, k_w).

    Returns:
        bias (Tensor): attention bias to add to attention map
    zbhwc,hkc->bhwkzbhwc,wkc->bhwkNr(   )r>   r/   r2   r4   einsum)r?   r@   rA   r#   r$   q_hq_wk_hk_wRhRwB_dimr_qrel_hrel_w	attn_biasr<   r<   r=   get_decomposed_rel_pos_biasH   s   DrP   c                	       sh   e Zd ZU ee ed< dddddejdddf	dedee	e
e
f  d	eej f fd
dZdd Z  ZS )	Attention
fused_attn   TF        Nuse_rel_pos
input_sizeropec                    s$  t    || dksJ d|| _|| | _| jd | _t | _tj||d |d| _	|r3|| jnt
 | _|r?|| jnt
 | _t|| _t||| _t|| _|| _| jr|
d u scJ |	d uskJ dttd|	d  d | j| _ttd|	d  d | j| _|
| _d S )	Nr   z$dim should be divisible by num_headsg         )biaszBInput size must be provided if using relative positional encoding.r'   r   )super__init__	num_headshead_dimscaler   rR   nnLinearqkvIdentityq_normk_normDropout	attn_dropproj	proj_droprU   	Parameterr4   zerosr@   rA   rW   )selfrK   r\   qkv_biasqk_normrf   rh   
norm_layerrU   rV   rW   	__class__r<   r=   r[   m   s2   



zAttention.__init__c                 C   s  |j \}}}}|| }|||d}| |||d| jdddddd}|d|| j |dd\}}	}
| || |	}}	| j	rTt
|| j| j||f||f}nd }| jd urp| j }t|||
}t|	||
}	| jrtjjj||	|
|| jr| jjndd}n$|| j }||	d	d }|d ur|| }|jdd
}| |}||
 }||| j|ddd||d}| |}| |}||||d}|S )Nr(   rX   r'   r   r      rT   )	attn_mask	dropout_p)rK   )r/   r2   ra   viewr\   r3   unbindrc   rd   rU   rP   r@   rA   rW   	get_embedr   type_asrR   r4   r_   
functionalscaled_dot_product_attentiontrainingrf   pr^   	transposesoftmaxrg   rh   )rk   xrI   HWrJ   Nra   r?   kvrO   rW   attnr<   r<   r=   forward   s>   ("



$

zAttention.forward)__name__
__module____qualname__r   bool__annotations__r_   	LayerNormr   r   r-   Moduler[   r   __classcell__r<   r<   ro   r=   rQ   j   s&   
 	
'rQ   c                       s&   e Zd Zd fdd	Zdd Z  ZS )
LayerScaleh㈵>Fc                    s*   t    || _t|t| | _d S N)rZ   r[   inplacer_   ri   r4   onesgamma)rk   rK   init_valuesr   ro   r<   r=   r[      s   
zLayerScale.__init__c                 C   s   | j r	|| jS || j S r   )r   mul_r   rk   r   r<   r<   r=   r         zLayerScale.forward)r   F)r   r   r   r[   r   r   r<   r<   ro   r=   r      s    r   c                       sF   e Zd Zdddddddejejeddddf fdd	Zd	d
 Z  Z	S )Block      @TFrT   Nr   c                    s   t    || _||| _t|||||||||dkr|n||f|d
| _|r,t||dnt | _	|	dkr9t
|	nt | _||| _||t|| |
|d| _|rXt||dnt | _|	dkrht
|	| _d S t | _d S )Nr   )	r\   rl   rm   rf   rh   rn   rU   rV   rW   )r   rT   )in_featureshidden_features	act_layerdrop)rZ   r[   window_sizenorm1rQ   r   r   r_   rb   ls1r   
drop_path1norm2r-   mlpls2
drop_path2)rk   rK   r\   	mlp_ratiorl   rm   rh   rf   r   	drop_pathr   rn   	mlp_layerrU   r   rV   rW   ro   r<   r=   r[      s4   



$zBlock.__init__c              
   C   s   |j \}}}}|}| |}d }| jdkrt|| j\}}| | | |}| jdkr7t|| j||f|}|| }|||| d}|| 	| 
| | | }||||d}|S )Nr   r(   )r/   r   r   window_partitionr   r   r   window_unpartitionr2   r   r   r   r   )rk   r   rI   r   r   rJ   shortcutpad_hwr<   r<   r=   r      s   


 zBlock.forward)
r   r   r   r_   GELUr   r   r[   r   r   r<   r<   ro   r=   r      s"    /r   r   r   c              	   C   s   | j \}}}}|||  | }|||  | }t| ddd|d|f} || || }}	| ||| ||	| ||} | dddddd d|||}
|
||	ffS )aU  
    Partition into non-overlapping windows with padding if needed.
    Args:
        x (tensor): input tokens with [B, H, W, C].
        window_size (int): window size.

    Returns:
        windows: windows after partition with [B * num_windows, window_size, window_size, C].
        (Hp, Wp): padded height and width before partition
    r   r   rX   r'   rq      r(   )r/   r0   padru   r3   
contiguous)r   r   rI   r   r   Cpad_hpad_wHpWpwindowsr<   r<   r=   r     s   $r   r   hwr   c           
      C   s   |dur|n|\}}|\}}| j d || | |  }| ||| || ||d}	|	dddddd |||d}	|	ddd|d|ddf  }	|	S )	a  
    Window unpartition into original sequences and removing padding.
    Args:
        windows (tensor): input tokens with [B * num_windows, window_size, window_size, C].
        window_size (int): window size.
        pad_hw (Tuple): padded height and width (Hp, Wp).
        hw (Tuple): original height and width (H, W) before padding.

    Returns:
        x: unpartitioned sequences with [B, H, W, C].
    Nr   r(   r   rX   r'   rq   r   )r/   ru   r3   r   )
r   r   r   r   r   r   r   r   rI   r   r<   r<   r=   r   (  s   $$r   c                H       sB  e Zd ZdZdddddddddd	d
d	dddddddeeejd	dej	ej
eedd	d	ddddd
d
f!dededededededededededee dededed ed!ed"ed#ed$ed%ed&ee d'ee d(ed)ed*ed+ed,ed-ed.eed/f d0ed1ed2ee d3eeeeef eeef f  fB fd4d5Zejjd6d7 ZejjdUd8d9ZejjdVd:d;Zejjd<ejfd=d>ZdWded1ee fd?d@Z	
					A		dXdBejdCeeee e f  dDedEedFedGed<ee ej eeje ej f f fdHdIZ!	
			dYdCeeee e f  dJedKefdLdMZ"dNdO Z#dUdPefdQdRZ$dSdT Z%  Z&S )Zr"   z Vision Transformer for Segment-Anything Model(SAM)

    A PyTorch impl of : `Exploring Plain Vision Transformer Backbones for Object Detection` or `Segment Anything Model (SAM)`
        - https://arxiv.org/abs/2010.11929
          rX         r   TFNrT    )
output_fmtstrict_img_size   r<      avgimg_size
patch_sizein_chansnum_classes	embed_dimdepthr\   r   rl   rm   r   pre_norm	drop_ratepos_drop_ratepatch_drop_rateproj_drop_rateattn_drop_ratedrop_path_rateweight_initembed_layerrn   r   block_fnr   use_abs_posrU   use_roper   global_attn_indexes.
neck_chansglobal_poolhead_hidden_sizeref_feat_shapec"           $         s  t    
pttjdd
 ptj |_|_ _ _	_
d_||||| d_jjtjdr?j n||rTttdd d _nd_tj|d	_|dkrjt|dd
_nt _|ru
nt _|rrJ d|!durt|!dksJ t|!d }"t|!d }#nd }"}#t d|"d_t dt|#d_nd_d_dd td||D tj  	
fddt!|D  _"fddt!|D _#|rt tj$|dddt%|tj$||ddddt%|_&|_n| r#t _&nt%_&}| r9t'||| ||d_(dS t)||||d_(dS )a  
        Args:
            img_size: Input image size.
            patch_size: Patch size.
            in_chans: Number of image input channels.
            num_classes: Number of classes for classification head.
            global_pool: Type of global pooling for final sequence (default: 'token').
            embed_dim: Transformer embedding dimension.
            depth: Depth of transformer.
            num_heads: Number of attention heads.
            mlp_ratio: Ratio of mlp hidden dim to embedding dim.
            qkv_bias: Enable bias for qkv projections if True.
            init_values: Layer-scale init values (layer-scale enabled if not None).
            drop_rate: Head dropout rate.
            pos_drop_rate: Position embedding dropout rate.
            attn_drop_rate: Attention dropout rate.
            drop_path_rate: Stochastic depth rate.
            weight_init: Weight initialization scheme.
            embed_layer: Patch embedding layer.
            norm_layer: Normalization layer.
            act_layer: MLP activation layer.
            block_fn: Transformer block layer.
            use_abs_pos: If True, use absolute positional embeddings.
            use_rel_pos: If True, add relative positional embeddings to the attention map.
            use_rope: If True, add rotary position embeddings to q/k in attention block.
            window_size: Window size for window attention blocks. If 0, not use window attention.
            global_attn_indexes: Indexes for blocks using global attention. Used when window_size > 0.
            global_pool: Global pooling type.
            head_hidden_size: If set, use NormMlpHead
            ref_feat_shape: Tuple of reference feature shapes for ROPE, (global, local)
        gư>)epsF)r   r   r   r   rY   
feat_ratior   r   N)r|   )num_prefix_tokenszCROPE and relative pos embeddings should not be enabled at same timer'   )	in_pixels
feat_shaper   c                 S   s   g | ]}|  qS r<   )item).0r   r<   r<   r=   
<listcomp>  s    z1VisionTransformerSAM.__init__.<locals>.<listcomp>c                    s   g | ]G}di d dd	dddddd| d	
d
 ddd|vr7nddd|vrDj njqS )rK   r\   r   rl   rm   r   rh   rf   r   rn   r   r   rU   r   r   rV   rW   r<   )rope_windowrope_globalr   i)r   r   r   dprr   r   	grid_sizer   r   r   rn   r\   r   rm   rl   rk   rU   r   r<   r=   r     sL    	
c                    s    g | ]}t d |  dqS )zblocks.)modulenum_chs	reductiondictr   )r   rr<   r=   r     s    )kernel_sizerY   rX   )r   paddingrY   )hidden_size	pool_typer   )r   r   )*rZ   r[   r   r_   r   r   r   r   num_featuresr   r   grad_checkpointingpatch_embedr   hasattrr   ri   r4   rj   	pos_embedre   pos_dropr   
patch_droprb   norm_prelenr   r   r   r   linspace
Sequentialrangeblocksfeature_infoConv2dr   neckr   headr   )$rk   r   r   r   r   r   r   r\   r   rl   rm   r   r   r   r   r   r   r   r   r   r   rn   r   r   r   r   rU   r   r   r   r   r   r   r   ref_feat_shape_globalref_feat_shape_windowro   )r   r   r   r   r   r   r   r   r   r   rn   r\   r   rm   rl   r   rk   rU   r   r=   r[   F  s   
C
"


0

zVisionTransformerSAM.__init__c                 C   s   ddhS )Nr   
dist_tokenr<   rk   r<   r<   r=   no_weight_decay  s   z$VisionTransformerSAM.no_weight_decayc                 C   s   t dddgdS )Nz^pos_embed|patch_embed)z^blocks\.(\d+)N)z^norm)i )stemr   r   )rk   coarser<   r<   r=   group_matcher  s   z"VisionTransformerSAM.group_matcherc                 C   s
   || _ d S r   )r   )rk   enabler<   r<   r=   set_grad_checkpointing  s   
z+VisionTransformerSAM.set_grad_checkpointingr&   c                 C   s   | j S r   r  r  r<   r<   r=   get_classifier  s   z#VisionTransformerSAM.get_classifierc                 C   s   || _ | j|| d S r   )r   r  reset)rk   r   r   r<   r<   r=   reset_classifier  s   z%VisionTransformerSAM.reset_classifierNCHWr   indicesnorm
stop_earlyr   intermediates_onlyc                 C   s8  |dksJ dg }t t| j|\}}	| |}| jdur+|t| j|jdd  }| |}| |}| 	|}t
j sA|sE| j}
n	| jd|	d  }
t|
D ]6\}}| jrdt
j sdt||}n||}||v r|r}|| |dddd qR||dddd qR|r|S | |dddd}||fS )a   Forward features that returns intermediates.

        Args:
            x: Input image tensor
            indices: Take last n blocks if int, all if None, select matching indices if sequence
            norm: Apply norm layer to all intermediates
            stop_early: Stop iterating over blocks when last desired intermediate hit
            output_fmt: Shape of intermediate feature outputs
            intermediates_only: Only return intermediate features
        Returns:

        r  z&Output shape for ViT-SAM must be NCHW.Nr   rX   r   r'   )r   r   r   r   r   r   r/   r   r   r   r4   jitis_scripting	enumerater   r   appendr  r3   )rk   r   r  r  r  r   r  intermediatestake_indices	max_indexr   r   blkr<   r<   r=   forward_intermediates  s2   




z*VisionTransformerSAM.forward_intermediates
prune_norm
prune_headc                 C   sJ   t t| j|\}}| jd|d  | _|rt | _|r#| dd |S )z@ Prune layers not required for specified intermediates.
        Nr   r   r   )r   r   r   r_   rb   r  r  )rk   r  r  r   r  r  r<   r<   r=   prune_intermediate_layersZ  s   
z.VisionTransformerSAM.prune_intermediate_layersc                 C   s   |  |}| jd ur|t| j|jdd  }| |}| |}| |}| jr5tj	
 s5t| j|}n| |}| |dddd}|S )Nr   rX   r   r'   )r   r   r   r/   r   r   r   r   r4   r  r  r   r   r  r3   r   r<   r<   r=   forward_featuresk  s   





z%VisionTransformerSAM.forward_features
pre_logitsc                 C   s   |r	| j |ddS |  |S )NT)r#  r  )rk   r   r#  r<   r<   r=   forward_headz  r   z!VisionTransformerSAM.forward_headc                 C   s   |  |}| |}|S r   )r"  r$  r   r<   r<   r=   r   }  s   

zVisionTransformerSAM.forwardF)Tr   )NFFr  F)NFT)'r   r   r   __doc__r   r   r   NHWCr_   r   r   r   r   r-   floatr   r   strr   r   r[   r4   r  ignorer  r
  r  r   r  r  Tensorr   r   r  r!  r"  r$  r   r   r<   r<   ro   r=   r"   ?  s"   	

 !" C
 
=
c                 C   sT   d| v }i }|   D ]\}}|dr |dd }|dd}n|r#q
|||< q
|S )z Remap SAM checkpoints -> timm z%image_encoder.patch_embed.proj.weightzimage_encoder.r   Nzmlp.linzmlp.fc)items
startswithreplace)
state_dictmodelsam_checkpointout_dictr   r   r<   r<   r=   checkpoint_filter_fn  s   

r3  r   c                 K   s    | ddd dddt tddd|S )	N  rX   r   r   ?bicubicTzpatch_embed.projzhead.fc)urlr   rV   	pool_sizecrop_pctinterpolationfixed_input_sizemeanstd
first_conv
classifier)r
   r   )r8  kwargsr<   r<   r=   _cfg  s   rB  zDhttps://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pthztimm/z
apache-2.0r5  r,   )r8  	hf_hub_idlicenser=  r>  r   rV   r:  zDhttps://dl.fbaipublicfiles.com/segment_anything/sam_vit_l_0b3195.pthzDhttps://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pthr4  )rX      rE  r6  )r=  r>  r   rV   r:  )zsamvit_base_patch16.sa1bzsamvit_large_patch16.sa1bzsamvit_huge_patch16.sa1bsamvit_base_patch16_224Fc                 K   s.   | dd}tt| |ftt|ddd|S )Nout_indicesrX   getter)rG  feature_cls)pretrained_filter_fnfeature_cfg)popr   r"   r3  r   )variant
pretrainedrA  rG  r<   r<   r=   _create_vision_transformer  s   
rO  c              
   K   s@   t ddddg ddddd}t		dd
| it |fi |}|S )z# ViT-B/16 for Segment-Anything
    r   r   r   r'   r   rS      r   Tr   r   r   r   r\   r   r   rU   r   samvit_base_patch16rN  N)rS  r   rO  rN  rA  
model_argsr0  r<   r<   r=   rS       rS  c              
   K   s@   t ddddg ddddd}t	dd	| it |fi |}|S )z# ViT-L/16 for Segment-Anything
    r   r      )r   rQ        r   TrR  samvit_large_patch16rN  N)r[  rT  rU  r<   r<   r=   r[    rW  r[  c              
   K   s@   t ddddg ddddd}t		dd
| it |fi |}|S )z# ViT-H/16 for Segment-Anything
    r   i       )      rZ     r   Tr   rR  samvit_huge_patch16rN  N)r`  rT  rU  r<   r<   r=   r`    rW  r`  c                 K   sD   t ddddg dddddd	d

}t	dd| it |fi |}|S )z# ViT-B/16 based on samvit arch
    r   r   r   rP  r   TFrE  N)
r   r   r   r\   r   r   rU   r   r   r   rF  rN  )rF  rT  rU  r<   r<   r=   rF    s   
rF  r   )r   r%  )Hr&  logging	functoolsr   typingr   r   r   r   r   r4   torch.nnr_   torch.nn.functionalry   r0   	timm.datar   r	   r
   r   timm.layersr   r   r   r   r   r   r   r   r   r   r   r   r   	torch.jitr   _builderr   	_featuresr   _features_fxr   _manipulater   r   	_registryr    r!   __all__	getLoggerr   _loggerr-   r+  r>   rP   r   rQ   r   r   r   r   r"   r3  rB  default_cfgsrO  rS  r[  r`  rF  r<   r<   r<   r=   <module>   s    <
 


"S
*J

  F

