o
    پi_                     @   sj  d dl Z d dlmZ d dlmZ d dlmZmZmZm	Z	m
Z
 d dlZd dlmZ d dlm  mZ d dlmZmZ d dlmZmZmZmZmZmZmZmZmZmZmZ ddl m!Z! dd	l"m#Z# dd
l$m%Z%m&Z& ddl'm(Z(m)Z) de	e*e*f fddZ+dej,de	e*e*f de	e*e*f fddZ-de*de*de	e*e*f de	e*e*e*e*f fddZ.G dd dej/Z0G dd dej/Z1G dd dej/Z2G dd dej/Z3d>d!d"Z4e(e4d#d$e4d#d$e4d#d$e4d#d$e4d#d$e4d#d$e4d#d%d&d'd(e4d#d%d&d'd(e4d)d%d*d+d,	Z5d?d-d.Z6d@d0e7d1e8de3fd2d3Z9e)d@d4d5Z:e)d@d6d7Z;e)d@d8d9Z<e)d@d:d;Z=e)d@d<d=Z>dS )A    N)deepcopy)partial)DictListOptionalTupleUnionIMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STD)
PatchEmbedMlpDropPathClNormMlpClassifierHead
LayerScaleget_norm_layerget_act_layerinit_weight_jaxinit_weight_vit	to_2tupleuse_fused_attn   )build_model_with_cfg)feature_take_indices)named_apply
checkpoint)generate_default_cfgsregister_modelwindow_sizec                 C   sj   | j \}}}}| |||d  |d ||d  |d |} | dddddd d|d |d |}|S )aT  
    Partition into non-overlapping windows with padding if needed.
    Args:
        x (tensor): input tokens with [B, H, W, C].
        window_size (int): window size.
    Returns:
        windows: windows after partition with [B * num_windows, window_size, window_size, C].
        (Hp, Wp): padded height and width before partition
    r   r               shapeviewpermute
contiguous)xr   BHWCwindows r/   M/home/ubuntu/.local/lib/python3.10/site-packages/timm/models/hieradet_sam2.pywindow_partition   s   
,,r1   r.   hwc                 C   s~   |\}}| j d || |d  |d   }| |||d  ||d  |d |d d}|dddddd |||d}|S )aZ  
    Window unpartition into original sequences and removing padding.
    Args:
        x (tensor): input tokens with [B * num_windows, window_size, window_size, C].
        window_size (int): window size.
        hw (Tuple): original height and width (H, W) before padding.
    Returns:
        x: unpartitioned sequences with [B, H, W, C].
    r   r   r#   r   r    r!   r"   r$   )r.   r   r2   r+   r,   r*   r)   r/   r/   r0   window_unpartition$   s
   
",$r3   r+   r,   returnc                 C   sV   |d | |d   |d  }|d ||d   |d  }| | || }}||||fS )Nr   r   r/   )r+   r,   r   pad_hpad_wHpWpr/   r/   r0   	_calc_pad5   s   r9   c                	       s\   e Zd ZU ejje ed< 	ddededede	j
f fddZd	ejd
ejfddZ  ZS )MultiScaleAttention
fused_attnNdimdim_out	num_headsq_poolc                    s`   t    || _|| _|| _|| }|d | _t | _|| _t	
||d | _t	
||| _d S )Ng      r   )super__init__r<   r=   r>   scaler   r;   r?   nnLinearqkvproj)selfr<   r=   r>   r?   head_dim	__class__r/   r0   rA   ?   s   

zMultiScaleAttention.__init__r)   r4   c                 C   s0  |j \}}}}| |||| d| jd}t|d\}}}	| jd urQ||||ddddd}| |dddd}|j dd \}}|||| | jd}|dd}|dd}|	dd}	| j	rnt
|||	}n|| j }||dd }
|
jdd}
|
|	 }|dd|||d}| |}|S )Nr   r#   r    r   r   )r<   )r%   rE   reshaper>   torchunbindr?   r'   	transposer;   Fscaled_dot_product_attentionrB   softmaxrF   )rG   r)   r*   r+   r,   _rE   qkvattnr/   r/   r0   forwardR   s(   


zMultiScaleAttention.forwardN)__name__
__module____qualname__rM   jitFinalbool__annotations__intrC   ModulerA   TensorrX   __classcell__r/   r/   rI   r0   r:   <   s   
 r:   c                       s   e Zd Z							ddeded	ed
edeeeef  deej	e
f deej	e
f dedee def fddZdejdejfddZ  ZS )MultiScaleBlock      @N	LayerNormGELUr           r<   r=   r>   	mlp_ratioq_stride
norm_layer	act_layerr   init_values	drop_pathc                    s<  t    t|}t|}t|| _t| j| _|| _|| _	|| _
||kr-t||| _nt | _d | _| j
rAtj||dd| _||| _t|||t| jd| _|	d ur[t||	nt | _|
dkrht|
nt | _||| _t|t|| |d| _|	d urt||	nt | _|
dkrt|
| _d S t | _d S )NF)kernel_sizestride	ceil_mode)r>   r?   ri   )rm   )r@   rA   r   r   r   r   anyis_windowedr<   r=   rk   rC   rD   rF   Identitypool	MaxPool2dnorm1r:   r   rW   r   ls1r   
drop_path1norm2r   ra   mlpls2
drop_path2)rG   r<   r=   r>   rj   rk   rl   rm   r   rn   ro   rI   r/   r0   rA   v   sF   





$zMultiScaleBlock.__init__r)   r4   c           
   
   C   s  |}|  |}| j| jkr*| |}| jd ur*|dddd}| |dddd}| j}|jdd \}}||}}| jrYt	|||\}}}}	t
|ddd|	d|f}t||}| |}| jd ur| jd | jd  | jd | jd  f}|jdd \}}t	|||\}}}}	| jrt||||f}|d d d |d |d d f  }|| | | }|| | | | | }|S )Nr   r   r   r    )rx   r<   r=   rF   rv   r'   r   r%   rt   r9   rP   padr1   rW   rk   r3   r(   rz   ry   r~   r}   r|   r{   )
rG   r)   shortcutr   r+   r,   r7   r8   r5   r6   r/   r/   r0   rX      s2   






($ zMultiScaleBlock.forward)rf   Nrg   rh   r   Nri   )rZ   r[   r\   ra   floatr   r   r   rC   rb   strrA   rM   rc   rX   rd   r/   r/   rI   r0   re   u   s<    	
6re   c                       sp   e Zd ZdZ					ddeedf d	eedf d
eedf dedef
 fddZdejdejfddZ	  Z
S )HieraPatchEmbedz#
    Image to Patch Embedding.
       r   r!   r!   r   r   r      rp   .rq   paddingin_chans	embed_dimc                    s$   t    tj|||||d| _dS )ab  
        Args:
            kernel_size (Tuple): kernel size of the projection layer.
            stride (Tuple): stride of the projection layer.
            padding (Tuple): padding size of the projection layer.
            in_chans (int): Number of input image channels.
            embed_dim (int):  embed_dim (int): Patch embedding dimension.
        )rp   rq   r   N)r@   rA   rC   Conv2drF   )rG   rp   rq   r   r   r   rI   r/   r0   rA      s   

zHieraPatchEmbed.__init__r)   r4   c                 C   s   |  |}|dddd}|S )Nr   r    r   r   )rF   r'   rG   r)   r/   r/   r0   rX      s   
zHieraPatchEmbed.forward)r   r   r   r   r   )rZ   r[   r\   __doc__r   ra   rA   rM   rc   rX   rd   r/   r/   rI   r0   r      s&    


r   c                5       s  e Zd ZdZ										
									
							dYdedededededeedf deedf deedf deeedf  d ed!eeef d"eedf d#ed$ed%eeef d&eedf d'eedf d(ee d)ed*e	d+ed,ed-ed.e
ejef d/e
ejef f2 fd0d1Zd2ejd3ejfd4d5Zd6d7 Zejjd8d9 ZejjdZd;e	d3efd<d=Zejjd[d>e	d3d
fd?d@ZejjdAdB Zd\dedee dCe	fdDdEZ	
	:		F	:	d]d2ejdGee
eee f  dHe	dIe	dJedKe	d;e	d3e
eej eejeej f f fdLdMZ		:		d^dGe
eee f dNe	dOe	d;e	fdPdQZd2ejd3ejfdRdSZdZdTe	d3ejfdUdVZd2ejd3ejfdWdXZ  Z S )_HieraDetz5
    Reference: https://arxiv.org/abs/2306.00989
    r     avg`   r   r   r   r   Nr    r    r    r      r          @   r!      r      r       TMbP?ri   rg   rh   r   num_classesglobal_poolr   r>   patch_kernel.patch_stridepatch_padding
patch_sizer?   rk   stagesdim_mulhead_mulglobal_pos_sizewindow_specglobal_att_blocksrn   weight_initfix_inithead_init_scale	drop_ratedrop_path_raterl   rm   c           "         s8  t    t|}t|}t t|ksJ d| _|| _|| _d| _t	 }|| _
 fddtdt d D | _d|
  krMt| jd d ksPJ  J dd | jd d D d |
 | _|	d urqtd |	||dd	d
| _n
t|||||d| _|| _|| _ttjd|g| jR  | _ttd|| jd | jd | _dd td||D }d}t | _g | _t|D ]i}|}| j| }| jd ur|| jv rdn|}|d | jv rt|| }t|| }|d7 }t||||| || jv r| j
nd |||d} |}| j|  || jv r%|  jt |d|d  d| j|  dg7  _q| | _!| _"t#|||||d| _$| jd urEtj%j&| jdd | jd urTtj%j&| jdd |dkrm|dkr`t'nt(}!t)|!dd}!t*|!|  |rt| +  t,| j$t#rt,| j$j-tj.r| j$j-j/j01| | j$j-j2j01| d S d S d S )NFNHWCc                    s    g | ]}t  d | d qS )Nr   )sum.0ir   r/   r0   
<listcomp>*  s     z%HieraDet.__init__.<locals>.<listcomp>r   r   r#   c                 S   s   g | ]}|d  qS )r   r/   r   r)   r/   r/   r0   r   ,      T)img_sizer   r   r   
output_fmtdynamic_img_pad)rp   rq   r   r   r   c                 S   s   g | ]}|  qS r/   )itemr   r/   r/   r0   r   H  r   )r<   r=   r>   ro   rk   r   rl   rm   r    zblocks.)num_chs	reductionmodule)	pool_typer   rl   g{Gz?)stdskipjaxhead.fc)classifier_name)3r@   rA   r   r   lengrad_checkpointingr   r   r   r   rk   range
stage_endsq_pool_blocksr   patch_embedr   r   r   rC   	ParameterrM   zeros	pos_embedpos_embed_windowlinspace
Sequentialblocksfeature_infora   re   appenddictnum_featureshead_hidden_sizer   headinittrunc_normal_r   r   r   r   fix_init_weight
isinstancefcrD   weightdatamul_bias)"rG   r   r   r   r   r>   r   r   r   r   r?   rk   r   r   r   r   r   r   rn   r   r   r   r   r   rl   rm   depthdpr	cur_stager   r=   r   blockinit_fnrI   r   r0   rA      s   
'"*"
	$


 	

 zHieraDet.__init__r)   r4   c                 C   s   |j dd \}}| j}tj| j||fdd}|j d |j d  }|j d |j d  }||||f }|dddd}|| S )	Nr   r   bicubic)sizemoderK   r#   r   r    )r%   r   rP   interpolater   tiler'   )rG   r)   hwwindow_embedr   tile_htile_wr/   r/   r0   
_pos_embed  s   zHieraDet._pos_embedc                 C   sL   dd }t | jD ]\}}||jjjj|d  ||jjjj|d  q	d S )Nc                 S   s   |  td|  d S )Nr   )div_mathsqrt)param	_layer_idr/   r/   r0   rescale  s   z)HieraDet.fix_init_weight.<locals>.rescaler   )	enumerater   rW   rF   r   r   r|   fc2)rG   r   layer_idlayerr/   r/   r0   r     s
   zHieraDet.fix_init_weightc                 C   s   ddgS )Nr   r   r/   rG   r/   r/   r0   no_weight_decay     zHieraDet.no_weight_decayFcoarsec                 C   s   t ddgdS )Nz'^pos_embed|pos_embed_window|patch_embed)z^blocks\.(\d+)N)stemr   )r   )rG   r   r/   r/   r0   group_matcher  s   zHieraDet.group_matcherenablec                 C   s
   || _ d S rY   )r   )rG   r  r/   r/   r0   set_grad_checkpointing  s   
zHieraDet.set_grad_checkpointingc                 C   s   | j jS rY   )r   r   r   r/   r/   r0   get_classifier  r   zHieraDet.get_classifierreset_otherc                 C   s   || _ | jj|||d d S )N)r   r  )r   r   reset)rG   r   r   r  r/   r/   r0   reset_classifier  s   zHieraDet.reset_classifierNCHWindicesnorm
stop_earlyr   intermediates_onlyc                    s  |rJ d|dv sJ d|r)t t j|\}}	 fdd|D } j|	 }	n
t t j|\}}	 |} |}g }
tj sF|sJ j}n	 jd|	d  }t	|D ]-\}} j
ritj sit||}n||}||v r|dkr}|d	d
ddn|}|
| qW|r|
S ||
fS )aE   Forward features that returns intermediates.

        Args:
            x: Input image tensor
            indices: Take last n blocks if int, all if None, select matching indices if sequence
            norm: Apply norm layer to all intermediates
            stop_early: Stop iterating over blocks when last desired intermediate hit
            output_fmt: Shape of intermediate feature outputs
            intermediates_only: Only return intermediate features
            coarse: Take coarse features (stage ends) if true, otherwise all block featrures
        Returns:

        z'normalization of features not supported)r  r   z(Output format must be one of NCHW, NHWC.c                    s   g | ]} j | qS r/   )r   r   r   r/   r0   r     s    z2HieraDet.forward_intermediates.<locals>.<listcomp>Nr   r  r   r   r    )r   r   r   r   r   r   rM   r]   is_scriptingr   r   r   r'   r   )rG   r)   r	  r
  r  r   r  r   take_indices	max_indexintermediatesr   r   blkx_outr/   r   r0   forward_intermediates  s0   


zHieraDet.forward_intermediates
prune_norm
prune_headc                 C   sd   |rt t| j|\}}| j| }n
t t| j|\}}| jd|d  | _|r0| jjd|d |S )z@ Prune layers not required for specified intermediates.
        Nr   r   )r  )r   r   r   r   r   r  )rG   r	  r  r  r   r  r  r/   r/   r0   prune_intermediate_layers  s   	z"HieraDet.prune_intermediate_layersc                 C   sH   |  |}| |}| jD ]}| jrtj st||}q||}q|S rY   )r   r   r   r   rM   r]   r  r   )rG   r)   r  r/   r/   r0   forward_features  s   



zHieraDet.forward_features
pre_logitsc                 C   s$   |r| j ||d}|S |  |}|S )N)r  )r   )rG   r)   r  r/   r/   r0   forward_head  s   
zHieraDet.forward_headc                 C   s   |  |}| |}|S rY   )r  r  r   r/   r/   r0   rX     s   

zHieraDet.forward)r   r   r   r   r   r   r   r   Nr   r   r   r   r   r   r   r   Nr   Tr   ri   ri   rg   rh   F)T)NF)NFTr  FT)r   FTT)!rZ   r[   r\   r   ra   r   r   r   r   r_   r   rC   rb   rA   rM   rc   r   r   r]   ignorer   r   r  r  r  r  r   r  r  r  r  rX   rd   r/   r/   rI   r0   r      s   


	





 !"#$% 


 	
8

r   r   c                 K   s    | ddddddt tddd	|S )
Nr   )r     r  )   r  g      ?r   )r      r  zpatch_embed.projr   )urlr   
input_size	pool_sizecrop_pctinterpolationmin_input_sizemeanr   
first_conv
classifierr	   )r  kwargsr/   r/   r0   _cfg  s   r)  ztimm/)	hf_hub_id)r      r+  )r      r,  )    r-  )r*  r$  r   r!  r   )r   r   )r   r   r!  )	zsam2_hiera_tiny.fb_r896zsam2_hiera_tiny.fb_r896_2pt1zsam2_hiera_small.fb_r896zsam2_hiera_small.fb_r896_2pt1zsam2_hiera_base_plus.fb_r896z!sam2_hiera_base_plus.fb_r896_2pt1zsam2_hiera_large.fb_r1024zsam2_hiera_large.fb_r1024_2pt1zhieradet_small.untrainedc                 C   s`   |  d| } i }|  D ]!\}}||r||d}nq|dd}|dd}|||< q|S )Nmodelr   zmlp.layers.0zmlp.fc1zmlp.layers.1zmlp.fc2)getitems
startswithreplace)
state_dictr.  prefixoutputrU   rV   r/   r/   r0   checkpoint_filter_fnN  s   

r6  Fvariant
pretrainedc                 K   s:   | dd}d}tt| |ftt|dt|ddd|S )Nout_indicesr!   r   )r4  getter)r9  feature_cls)pretrained_filter_fnfeature_cfg)popr   r   r   r6  r   )r7  r8  r(  r9  checkpoint_prefixr/   r/   r0   _create_hiera_det]  s   

r@  c                 K   *   t ddd}tdd| it |fi |S )N)r   r    r   r    )r"   r   	   r   r   sam2_hiera_tinyr8  )rD  r   r@  r8  r(  
model_argsr/   r/   r0   rD  o     rD  c                 K   rA  )Nr   r       r    r   
      rC  sam2_hiera_smallr8  )rN  rE  rF  r/   r/   r0   rN  u  rH  rN  c                 K   s,   t dddd}tdd| it |fi |S )Np   r    )r   r   )r   r>   r   sam2_hiera_base_plusr8  )rP  rE  rF  r/   r/   r0   rP  {  s   rP  c                 K   s0   t dddddd}td	d| it |fi |S )
N   r    )r       $   r!   )   !   +   r   r!   r   r   )r   r>   r   r   r   sam2_hiera_larger8  )rX  rE  rF  r/   r/   r0   rX    s   rX  c                 K   s.   t ddddd}tdd| it |fi |S )	NrI  rK  rW  gh㈵>)r   r   r   rn   hieradet_smallr8  )rY  rE  rF  r/   r/   r0   rY    s   rY  )r   )Nr   r  )?r   copyr   	functoolsr   typingr   r   r   r   r   rM   torch.nnrC   torch.nn.functional
functionalrP   	timm.datar
   r   timm.layersr   r   r   r   r   r   r   r   r   r   r   _builderr   	_featuresr   _manipulater   r   	_registryr   r   ra   r1   rc   r3   r9   rb   r:   re   r   r   r)  default_cfgsr6  r   r_   r@  rD  rN  rP  rX  rY  r/   r/   r/   r0   <module>   s    4(.9]!  

4