o
    پi                  	   @   sb  d Z ddlZddlmZ ddlmZmZmZmZm	Z	m
Z
 ddlZddlmZ ddlm  mZ ddlmZmZ ddlmZmZmZmZmZmZmZmZmZmZ ddlm Z m!Z! dd	l"m#Z# dd
l$m%Z% ddl&m'Z' ddl(m)Z)m*Z* dgZ+de,de	ej- fddZ.e'dee, dej/dej/fddZ0dej/dee, dee, dej/fddZ1G dd dej-Z2G dd dej-Z3G dd  d ej-Z4G d!d" d"ej-Z5G d#d$ d$ej-Z6G d%d dej-Z7d[d'd(Z8e i d)e8d*d+d,d-e8d*d+dd.d/e8d*d+d,d0e8d*d+dd.d1e8d*d+d,d2e8d*d+dd.d3e8d*d+d,d4e8d*d+dd.d5e8d*d+d,d6e8d*d+dd.d7e8d*d+d,d8e8d*d+dd.d9e8d*d:d;d<d=e8d*d:d;d<d>e8d*d?d:d;d@dAe8d*d?d:d;d@dBe8d:d;dCZ9d\dDdEZ:d]dGe;dHe<de7fdIdJZ=e!d]dKdLZ>e!d]dMdNZ?e!d]dOdPZ@e!d]dQdRZAe!d]dSdTZBe!d]dUdVZCe!d]dWdXZDe!d]dYdZZEdS )^zr An PyTorch implementation of Hiera

Adapted for timm from originals at https://github.com/facebookresearch/hiera
    N)partial)DictListOptionalTupleTypeUnionIMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STD)
DropPathMlp
LayerScaleClNormMlpClassifierHeaduse_fused_attn_assertget_norm_layer	to_2tupleinit_weight_vitinit_weight_jax   )generate_default_cfgsregister_model)build_model_with_cfg)feature_take_indices)register_notrace_function)named_apply
checkpointHieranreturnc                 C   s   t jt jt jt jg|  S )z
    Returns a conv with nd (e.g., Conv2d for n=2). Work up to n=3.
    If you wanted a 4d Hiera, you could probably just implement this for n=4. (no promises)
    )nnIdentityConv1dConv2dConv3d)r    r&   E/home/ubuntu/.local/lib/python3.10/site-packages/timm/models/hiera.pyconv_nd/   s   r(   target_sizemaskc                 C   sT   |d u r|S t t|jdd  t| kd |jdd  | kr(tj| | dS |S )N   z.mask spatial shape and target_size must match.)size)r   lenshapeFinterpolatefloatr)   r*   r&   r&   r'   get_resized_mask7   s    r3   xr.   mu_shapec              
   C   s   t |}| jd | jd }}dd t||D }| j|g|||R  } dgtdd ttdd| td| dd|  D g  t | jd g }| |j|g||R  } | S )a  
    Restore spatial organization by undoing windowed organization of mask units.

    Args:
        x: organized by mask units windows, e.g. in 2d [B, #MUy*#MUx, MUy, MUx, C]
        shape: current spatial shape, if it were not organized into mask unit
            windows, e.g. in 2d [B, #MUy*MUy, #MUx*MUx, C].
        mu_shape: current mask unit shape, e.g. in 2d [MUy, MUx]
    Returns:
        x: e.g. in 2d, [B, #MUy*MUy, #MUx*MUx, C]
    r   c                 S      g | ]\}}|| qS r&   r&   ).0smur&   r&   r'   
<listcomp>W       z"undo_windowing.<locals>.<listcomp>c                 S      g | ]}t |qS r&   listr8   pr&   r&   r'   r;   ]       r   r+   )r-   r.   zipviewsumrangepermutereshape)r4   r.   r5   DBCnum_MUsrG   r&   r&   r'   undo_windowingD   s   4rM   c                	       s`   e Zd ZdZdeedf deedf deeedf  f fddZdej	d	ej	fd
dZ
  ZS )Unrolla>  
    Reorders the tokens such that patches are contiguous in memory.
    E.g., given [B, (H, W), C] and stride of (Sy, Sx), this will re-order the tokens as
                           [B, (Sy, Sx, H // Sy, W // Sx), C]

    This allows operations like Max2d to be computed as x.view(B, Sx*Sy, -1, C).max(dim=1).
    Not only is this faster, but it also makes it easy to support inputs of arbitrary
    dimensions in addition to patch-wise sparsity.

    Performing this operation multiple times in sequence puts entire windows as contiguous
    in memory. For instance, if you applied the stride (2, 2) 3 times, entire windows of
    size 8x8 would be contiguous in memory, allowing operations like mask unit attention
    computed easily and efficiently, while also allowing max to be applied sequentially.

    Note: This means that intermediate values of the model are not in HxW order, so they
    need to be re-rolled if you want to use the intermediate values as a HxW feature map.
    The last block of the network is fine though, since by then the strides are all consumed.
    
input_size.patch_strideunroll_schedulec                    s*   t    dd t||D | _|| _d S )Nc                 S   r7   r&   r&   r8   ir9   r&   r&   r'   r;      r<   z#Unroll.__init__.<locals>.<listcomp>)super__init__rC   r,   schedule)selfrO   rP   rQ   	__class__r&   r'   rU   y   s   

zUnroll.__init__r4   r    c           
      C   s   |j \}}}| j}|j|g| |g  }| jD ]X}dd t||D }|gtdd t||D g  |g }||}t|}dgttd|d d ttd|d d |d g }	|	|	}|
dt|}|t|9 }q|dt| j|}|S )z
        Input: Flattened patch embeddings [B, N, C]
        Output: Patch embeddings [B, N, C] permuted such that [B, 4, N//4, C].max(1) etc. performs MaxPoolNd
        c                 S   r7   r&   r&   rR   r&   r&   r'   r;      r<   z"Unroll.forward.<locals>.<listcomp>c                 S   s   g | ]\}}||gqS r&   r&   rR   r&   r&   r'   r;      r<   r   r+   r   r6   )r.   r,   rD   rV   rC   rE   r-   r?   rF   rG   flattenmathprodrH   )
rW   r4   rJ   _rK   cur_sizestrides	new_shapeLrG   r&   r&   r'   forward   s   
&
8
zUnroll.forward__name__
__module____qualname____doc__r   intr   rU   torchTensorrb   __classcell__r&   r&   rX   r'   rN   e   s    


rN   c                
       sz   e Zd ZdZdeedf deedf deeedf  dee def
 fdd	Z	
ddej	dedej	dej	fddZ
  ZS )RerollzQ
    Undos the "unroll" operation so that you can use intermediate features.
    rO   .rP   rQ   
stage_endsq_poolc                    s   t    dd t||D | _i | _| j}t|d d D ])}||f| j|< ||d | v rGt|dkrAdd t||d D }|dd  }qd S )Nc                 S   r7   r&   r&   rR   r&   r&   r'   r;      r<   z#Reroll.__init__.<locals>.<listcomp>r6   r   r   c                 S   r7   r&   r&   )r8   r   r9   r&   r&   r'   r;      r<   )rT   rU   rC   r,   rV   rF   r-   )rW   rO   rP   rQ   rm   rn   r,   rS   rX   r&   r'   rU      s   
zReroll.__init__Nr4   	block_idxr*   r    c              
   C   s6  | j | \}}|j\}}}t|}	dg|	 }
|D ]h}|j|g||t| |
|R  }t|j}dd|	 gtdd ttdd|	 td|	 d |d D g  |d g }|	|}t|	D ]}|
|  || 9  < qb|j
|dg|
|R  }|jd }q|j||g|
|R  }|dur|S t|||
}|S )a&  
        Roll the given tensor back up to spatial order assuming it's from the given block.

        If no mask is provided:
            - Returns [B, H, W, C] for 2d, [B, T, H, W, C] for 3d, etc.
        If a mask is provided:
            - Returns [B, #MUs, MUy, MUx, C] for 2d, etc.
        r   r   c                 S   r=   r&   r>   r@   r&   r&   r'   r;      rB   z"Reroll.forward.<locals>.<listcomp>r6   N)rV   r.   r-   rD   r[   r\   rE   rC   rF   rG   rH   rM   )rW   r4   ro   r*   rV   r,   rJ   NrK   rI   cur_mu_shaper_   ra   rG   rS   r&   r&   r'   rb      s.   
(

4
zReroll.forwardNrc   r&   r&   rX   r'   rl      s.    

rl   c                       sj   e Zd ZU dZejje ed< 			dde	de	de	d	e	d
e	def fddZ
dejdejfddZ  ZS )MaskUnitAttentionz
    Computes either Mask Unit or Global Attention. Also is able to perform q pooling.

    Note: this assumes the tokens have already been flattened and unrolled into mask units.
    See `Unroll` for more details.
    
fused_attnr   r   Fdimdim_outheadsq_stridewindow_sizeuse_mask_unit_attnc                    sp   t    || _|| _|| _|| _|| | _| jd | _t | _	t
|d| | _t
||| _|| _|| _dS )a  
        Args:
        - dim, dim_out: The input and output feature dimensions.
        - heads: The number of attention heads.
        - q_stride: If greater than 1, pool q with this stride. The stride should be flattened (e.g., 2x2 = 4).
        - window_size: The current (flattened) size of a mask unit *after* pooling (if any).
        - use_mask_unit_attn: Use Mask Unit or Global Attention.
        g         N)rT   rU   ru   rv   rw   rx   head_dimscaler   rt   r!   Linearqkvprojry   rz   )rW   ru   rv   rw   rx   ry   rz   rX   r&   r'   rU      s   


zMaskUnitAttention.__init__r4   r    c                 C   s   |j \}}}| jr|| j| j  nd}| ||d|d| j| jdddddd}|	d\}}}	| jdkrH|
|| j|| jd| jjdd}| jrSt|||	}n|| j |dd	 }
|
jdd}
|
|	 }|dd|d| j}| |}|S )
z5 Input should be of shape [batch, tokens, channels]. r   r6   r{   r      r+      ru   )r.   rz   rx   ry   r   rH   rw   r|   rG   unbindrD   amaxrt   r/   scaled_dot_product_attentionr}   	transposesoftmaxrv   r   )rW   r4   rJ   rp   r]   num_windowsr   qkvattnr&   r&   r'   rb     s   .
"
zMaskUnitAttention.forward)r   r   F)rd   re   rf   rg   ri   jitFinalbool__annotations__rh   rU   rj   rb   rk   r&   r&   rX   r'   rs      s(   
 !rs   c                       s   e Zd Zdddejejddddf	ded	ed
edededee de	ej
 de	ej
 dedededef fddZdejdejfddZ  ZS )
HieraBlock      @        Nr   r   TFru   rv   rw   	mlp_ratio	drop_pathinit_values
norm_layer	act_layerrx   ry   use_expand_projrz   c                    s  t    || _|| _||| _||kr-d| _|r!t||| _n||d ks)J d | _nd| _d | _t	||||	|
|| _
|d urGt||dnt | _|dkrTt|nt | _||| _t|t|| |d| _|d urtt||dnt | _|dkrt|| _d S t | _d S )NTr+   F)r   r   )r   )rT   rU   ru   rv   norm1	do_expandr!   r~   r   rs   r   r   r"   ls1r   
drop_path1norm2r   rh   mlpls2
drop_path2)rW   ru   rv   rw   r   r   r   r   r   rx   ry   r   rz   rX   r&   r'   rU   7  s4   


$zHieraBlock.__init__r4   r    c              
   C   s   |  |}| jrT| jd ur'| |}||jd | jjd|jd jdd}n-tj	||jd | jjd|jd jdd||jd | jjd|jd j
ddgdd}|| | | | }|| | | | | }|S )Nr   r6   r   r   )r   r   r   rD   r.   r   rx   r   ri   catmeanr   r   r   r   r   r   )rW   r4   x_normr&   r&   r'   rb   e  s   


*&& zHieraBlock.forward)rd   re   rf   r!   	LayerNormGELUrh   r1   r   r   Moduler   rU   ri   rj   rb   rk   r&   r&   rX   r'   r   6  sH    	
.r   c                       sz   e Zd ZdZ	ddededeedf deedf deedf d	ef fd
dZ	ddej	de
ej	 dej	fddZ  ZS )
PatchEmbedzHPatch embed that supports any number of spatial dimensions (1d, 2d, 3d).Tdim_inrv   kernel.stridepaddingrH   c                    s8   t    t|| _|| _t| j|||||d| _d S )N)kernel_sizer   r   )rT   rU   r-   spatial_dimsrH   r(   r   )rW   r   rv   r   r   r   rH   rX   r&   r'   rU   }  s   
	
zPatchEmbed.__init__Nr4   r*   r    c                 C   sl   |d urt |jdd  |d}| ||tj }n| |}| jr4||jd |jd ddd}|S )Nr+   r2   r   r   r6   )r3   r.   r   tori   r   rH   r   rW   r4   r*   r&   r&   r'   rb     s   
"zPatchEmbed.forwardTrr   )rd   re   rf   rg   rh   r   r   rU   ri   rj   r   rb   rk   r&   r&   rX   r'   r   z  s0    	


r   c                =       s  e Zd Z												
																		dcdeedf dedededededeedf d ed!eedf d"eedf d#eedf d$ed%ed&ed'eedf d(eedf d)eedf d*ed+ed,ee d-ed.ed/e	ee
jf d0ed1ed2ed3ed4ed5eeef f: fd6d7Zd8d9 Zejjd:d; Zejjddd<ed=efd>d?Zejjded@ed=dfdAdBZejjdCdD Zdfdedee dEefdFdGZdHejdIed=ejfdJdKZd=ejfdLdMZ					N		dgdHejdOeej dPee	eee f  dQedRedSedTed<ed=e	eej eejeej f f fdUdVZ				dhdPe	eee f dWedXed<efdYdZZ		dfdHejdOeej d[ed=ejfd\d]Zddd^ed=ejfd_d`Z	didHejdOeej d=ejfdadbZ  Z S )jr      r   r{   `   r     avgr+   r{      r{   r+   r+      r   TTFFT       @   r   r   r   r{   r{   r   r   N r   MbP?F   r   img_size.in_chans	embed_dim	num_headsnum_classesglobal_poolstagesrn   rx   mask_unit_sizemask_unit_attnr   dim_mulhead_mulpatch_kernelrP   patch_paddingr   drop_path_rater   fix_initweight_initr   	drop_ratepatch_drop_ratehead_init_scalesep_pos_embedabs_win_pos_embedglobal_pos_sizec           *         s  t    || _d| _t|}t|trt|}|| _dd t	||D | _
t| j
}t|
}t|	} |t k s>J ||	| _| _||
| _| _dd t	| j
| jD | _ fddtdt d D | _|| _t|||||| _d | _d | _d | _d | _|rttd| j
d | j
d  || _ttd| j
d || _n*|rttjd|g|R  | _ttjd|g|
R  | _nttd||| _t |||	gt| jd d	  | _!t"|||	gt| jd d	  | j|| _#d
d | jd | D }!d}"t$ }#dd t%d||#D }$t& | _'g | _(t|#D ]g}%|}&||" }'|%d | jv rHt|| }&t|| }|"d7 }"|%|!v rH||  }t)||&|||$|% |||%|!v rY| nd|||'d}(|&}|%| jv r|  j(t*|&d|"d  d| j|"  dg7  _(| j'+|( q| | _,| _-t.|||||dd| _/|rtj0j1| jdd tj0j1| jdd n| jd urtj0j1| jdd | jd urtj0j1| jdd |dkr|dkrt2nt3})t4|)dd})t5|)|  |r| 6  t| j/j7tj8r	| j/j7j9j:;| | j/j7j<j:;| d S d S )NFc                 S   r7   r&   r&   rR   r&   r&   r'   r;     r<   z"Hiera.__init__.<locals>.<listcomp>c                 S   r7   r&   r&   rR   r&   r&   r'   r;     r<   c                    s    g | ]}t  d | d qS )Nr   )rE   r8   rS   r   r&   r'   r;     s     r   r+   r   r6   c                 S   s   g | ]}|d  qS )r   r&   r8   r4   r&   r&   r'   r;     rB   c                 S   s   g | ]}|  qS r&   )itemr   r&   r&   r'   r;     rB   )ru   rv   rw   r   r   r   r   rx   ry   r   rz   zblocks.)num_chs	reductionmoduleNLC)	pool_typer   r   	input_fmtg{Gz?)stdskipjaxhead.fc)classifier_name)=rT   rU   r   grad_checkpointingr   
isinstancerh   r   rP   rC   tokens_spatial_shaper[   r\   r-   rn   rx   mu_sizer   mask_spatial_shaperF   rm   r   r   patch_embed	pos_embedpos_embed_winpos_embed_spatialpos_embed_temporalr!   	Parameterri   zerosrN   unrollrl   rerollrE   linspace
ModuleListblocksfeature_infor   dictappendnum_featureshead_hidden_sizer   headinittrunc_normal_r   r   r   r   fix_init_weightfcr~   weightdatamul_bias)*rW   r   r   r   r   r   r   r   rn   rx   r   r   r   r   r   r   rP   r   r   r   r   r   r   r   r   r   r   r   r   r   
num_tokensflat_mu_sizeflat_q_strideq_pool_blocks	cur_stagedepthdprrS   rv   rz   blockinit_fnrX   r   r'   rU     s   
!


"

 


zHiera.__init__c                 C   sL   dd }t | jD ]\}}||jjjj|d  ||jjjj|d  q	d S )Nc                 S   s   |  td|  d S )Nr   )div_r[   sqrt)param	_layer_idr&   r&   r'   rescaleI  s   z&Hiera.fix_init_weight.<locals>.rescaler   )	enumerater   r   r   r  r	  r   fc2)rW   r  layer_idlayerr&   r&   r'   r  H  s
   zHiera.fix_init_weightc                 C   s*   | j d urdgS | jd urddgS ddgS )Nr   pos_embed_absr   r   r   )r   r  rW   r&   r&   r'   no_weight_decayP  s
   

zHiera.no_weight_decaycoarser    c                 C   s   t dddgdS )NzW^pos_embed|pos_embed_spatial|pos_embed_temporal|pos_embed_abs|pos_embed_win|patch_embed)z^blocks\.(\d+)N)z^norm)i )stemr   )r   )rW   r!  r&   r&   r'   group_matcherY  s   zHiera.group_matcherenablec                 C   s
   || _ d S rr   )r   )rW   r$  r&   r&   r'   set_grad_checkpointing`  s   
zHiera.set_grad_checkpointingc                 C   s   | j jS rr   )r  r  r  r&   r&   r'   get_classifierd  s   zHiera.get_classifierreset_otherc                 C   s   || _ | jj|||d d S )Nr'  )r   r  reset)rW   r   r   r'  r&   r&   r'   reset_classifierh  s   zHiera.reset_classifierr4   
mask_ratioc           
      C   s   |j d }t| j}t|d|  }tj|||jd}tj|dd}tj|dd}tj	||g|jd}	d|	ddd|f< tj
|	d|d}	|	 S )z
        Generates a random mask, mask_ratio fraction are dropped.
        1 is *keep*, 0 is *remove*. Useful for MAE, FLIP, etc.
        r   r   )devicer   N)ru   index)r.   r[   r\   r   rh   ri   randr,  argsortr   gatherr   )
rW   r4   r+  rJ   r   len_keepnoiseids_shuffleids_restorer*   r&   r&   r'   get_random_maskl  s   
zHiera.get_random_maskc                 C   s   | j d ur)| j | j}tj| j|jdd  ddd}|| }|ddd}n%| jd ur2| j}n| j	
d| jd dtj| j| jd | jd  dd }|| }|S )	Nr   bicubicT)r,   mode	antialiasr+   r   r   r   )r   tiler   r/   r0   r   r.   rZ   r   r   repeatr   ri   repeat_interleaver   )rW   r4   r   r   r&   r&   r'   
_pos_embed  s,   

	zHiera._pos_embedNCHWr*   indicesnorm
stop_early
output_fmtintermediates_onlyc	              	      s  |rJ d|dv sJ d|r)t t j|\}	}
 fdd|	D }	 j|
 }
n
t t j|\}	}
|durF|j|jd dg jR  }nd} j||d	} |} 	|}|durw||d
 
d j|jd  |jd d|jd }g }tj s|s j}n	 jd|
d  }t|D ]3\}} jrtj st||}n||}||	v rĈ j|||d	}||dkr|ddddn| q|r|S ||fS )a   Forward features that returns intermediates.

        Args:
            x: Input image tensor
            indices: Take last n blocks if int, all if None, select matching indices if sequence
            norm: Apply norm layer to all intermediates
            stop_early: Stop iterating over blocks when last desired intermediate hit
            output_fmt: Shape of intermediate feature outputs
            intermediates_only: Only return intermediate features
        Returns:

        z'normalization of features not supported)r=  NHWCz(Output format must be one of NCHW, NHWC.c                    s   g | ]} j | qS r&   )rm   r   r  r&   r'   r;     s    z/Hiera.forward_intermediates.<locals>.<listcomp>Nr   r   r*   .Nr+   r6   r=  r{   )r   r-   rm   r   rD   r.   r   r   r<  r   r9  r   ri   r   is_scriptingr  r   r   r   r   rG   )rW   r4   r*   r>  r?  r@  rA  rB  r!  take_indices	max_index
patch_maskintermediatesr   rS   blkx_intr&   r  r'   forward_intermediates  s<   

4"zHiera.forward_intermediates
prune_norm
prune_headc                 C   sd   |rt t| j|\}}| j| }n
t t| j|\}}| jd|d  | _|r0| jjddd |S )z@ Prune layers not required for specified intermediates.
        Nr   r   Tr(  )r   r-   rm   r   r  r)  )rW   r>  rN  rO  r!  rG  rH  r&   r&   r'   prune_intermediate_layers  s   	zHiera.prune_intermediate_layersreturn_intermediatesc                 C   s$  | j r| jdkr|du sJ | j|| jd}|dur)|j|jd dg| jR  }nd}| j||d}| |}| |}|durZ||d 	d| j
|jd  |jd d|jd }g }t| jD ](\}}| jrstj sst||}n||}|r|| jv r|| j|||d qa|r||fS |S )	z
        mask should be a boolean tensor of shape [B, #MUt*#MUy*#MUx] where #MU are the number of mask units in that dim.
        Note: 1 in mask is *keep*, 0 is *remove*; mask.sum(dim=-1) should be the same across the batch.
        r   N)r+  r   rD  rE  r+   r6   )trainingr   r5  rD   r.   r   r   r<  r   r9  r   r  r   r   ri   r   rF  r   rm   r   r   )rW   r4   r*   rQ  rI  rJ  rS   rK  r&   r&   r'   forward_features  s,   


4zHiera.forward_features
pre_logitsc                 C   s$   |r| j ||d}|S |  |}|S )N)rT  )r  )rW   r4   rT  r&   r&   r'   forward_head  s   
zHiera.forward_headc                 C   s$   | j ||d}|d u r| |}|S )NrD  )rS  rU  r   r&   r&   r'   rb   #  s   
zHiera.forward)r   r{   r   r   r   r   r   r{   r   r   r   Tr   r   r   r   r   r   r   NTr   r   r   r   r   FFr   Fr   )NF)NNFTr=  FT)r   FTTrr   )!rd   re   rf   r   rh   strr   r1   r   r   r!   r   rU   r  ri   r   ignorer   r   r#  r%  r&  r*  rj   r5  r<  r   rM  rP  rS  rU  rb   rk   r&   r&   rX   r'   r     sB   

	







 %

	 

A

-r   c                 K   s    | ddd dddt tddd|S )	Nr   )r{   r   r   g?r6  Tzpatch_embed.projr   )urlr   rO   	pool_sizecrop_pctinterpolationfixed_input_sizer   r   
first_conv
classifierr	   )rY  kwargsr&   r&   r'   _cfg.  s   ra  zhiera_tiny_224.mae_in1k_ft_in1kztimm/zcc-by-nc-4.0)	hf_hub_idlicensezhiera_tiny_224.mae)rb  rc  r   z hiera_small_224.mae_in1k_ft_in1kzhiera_small_224.maezhiera_base_224.mae_in1k_ft_in1kzhiera_base_224.maez$hiera_base_plus_224.mae_in1k_ft_in1kzhiera_base_plus_224.maez hiera_large_224.mae_in1k_ft_in1kzhiera_large_224.maezhiera_huge_224.mae_in1k_ft_in1kzhiera_huge_224.maez.hiera_small_abswin_256.sbb2_e200_in12k_ft_in1k)r{      rd  gffffff?)rb  rO   r[  z1hiera_small_abswin_256.sbb2_pd_e200_in12k_ft_in1kz&hiera_small_abswin_256.sbb2_e200_in12ki-.  )rb  r   rO   r[  z)hiera_small_abswin_256.sbb2_pd_e200_in12kzhiera_base_abswin_256.untrained)rO   r[  c                 C   s|   |  d| } i }|  D ]/\}}d|v r|dd}|dr&|dd}n|dr1|dd}|dkr7d}|||< q|S )	Nmodel_statezhead.projection.zhead.fc.zencoder_norm.z
head.norm.znorm.r  r   )getitemsreplace
startswith)
state_dictmodeloutputr   r   r&   r&   r'   checkpoint_filter_fn  s   



rm  Fvariant
pretrainedc                 K   s.   | dd}tt| |ftt|ddd|S )Nout_indicesr   getter)rp  feature_cls)pretrained_filter_fnfeature_cfg)popr   r   rm  r   )rn  ro  r`  rp  r&   r&   r'   _create_hiera  s   
rv  c                 K   ,   t dddd}tdd| it |fi |S )Nr   r   )r   r+   r   r+   r   r   r   hiera_tiny_224ro  )ry  r   rv  ro  r`  
model_argsr&   r&   r'   ry       ry  c                 K   rw  )Nr   r   r   r+      r+   rx  hiera_small_224ro  )r  rz  r{  r&   r&   r'   r    r}  r  c                 K   rw  )Nr   r   r   rx  hiera_base_224ro  )r  rz  r{  r&   r&   r'   r    r}  r  c                 K   rw  )Np   r+   r   rx  hiera_base_plus_224ro  )r  rz  r{  r&   r&   r'   r    r}  r  c                 K   rw  )N   r+   r+      $   r   rx  hiera_large_224ro  )r  rz  r{  r&   r&   r'   r    r}  r  c                 K   rw  )Nrd  r   r  rx  hiera_huge_224ro  )r  rz  r{  r&   r&   r'   r    r}  r  c              
   K   s6   t ddddddddd	}tdd| it |fi |S )Nr   r   r~  T)r   r   h㈵>r   F)r   r   r   r   r   r   r   r   hiera_small_abswin_256ro  )r  rz  r{  r&   r&   r'   r    s
   
r  c                 K   s2   t ddddddd}td
d	| it |fi |S )Nr   r   r   Tr  r   )r   r   r   r   r   r   hiera_base_abswin_256ro  )r  rz  r{  r&   r&   r'   r    s   r  )r   rr   rV  )Frg   r[   	functoolsr   typingr   r   r   r   r   r   ri   torch.nnr!   torch.nn.functional
functionalr/   	timm.datar
   r   timm.layersr   r   r   r   r   r   r   r   r   r   	_registryr   r   _builderr   	_featuresr   _features_fxr   _manipulater   r   __all__rh   r   r(   rj   r3   rM   rN   rl   rs   r   r   r   ra  default_cfgsrm  rW  r   rv  ry  r  r  r  r  r  r  r  r&   r&   r&   r'   <module>   s(    0 
!>QBD(   
#)-37=AEJO
V