""" Nested Transformer (NesT) in PyTorch

A PyTorch implement of Aggregating Nested Transformers as described in:

'Aggregating Nested Transformers'
    - https://arxiv.org/abs/2105.12723

The official Jax code is released and available at https://github.com/google-research/nested-transformer. The weights
have been converted with convert/convert_nest_flax.py

Acknowledgments:
* The paper authors for sharing their research, code, and model weights
* Ross Wightman's existing code off which I based this

Copyright 2021 Alexander Soare
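
Example (a minimal usage sketch; assumes a standard timm install where the
entrypoints below are registered):

    import timm, torch
    model = timm.create_model('nest_tiny', pretrained=False).eval()
    with torch.no_grad():
        logits = model(torch.randn(1, 3, 224, 224))  # NesT is fixed-input-size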
    N)partial)ListOptionalTupleUnion)nnIMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STD)
PatchEmbedMlpDropPathcreate_classifiertrunc_normal__assert)create_conv2dcreate_pool2d	to_ntupleuse_fused_attn	LayerNorm   )build_model_with_cfg)feature_take_indices)register_notrace_function)checkpoint_seqnamed_apply)register_modelgenerate_default_cfgsregister_model_deprecationsNestc                       s<   e Zd ZU dZejje ed< d
 fdd	Z	dd	 Z
  ZS )	Attentionz
    This is much like `.vision_transformer.Attention` but uses *localised* self attention by accepting an input with
     an extra "image block" dim
    
    """
    fused_attn: torch.jit.Final[bool]

    def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = head_dim ** -0.5
        self.fused_attn = use_fused_attn()

        self.qkv = nn.Linear(dim, 3 * dim, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x):
        """
        x is shape: B (batch_size), T (image blocks), N (seq length per image block), C (embed dim)
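
        Shape sketch (values are illustrative, taken from the nest_tiny defaults at
        224x224 input): at the first level x is (B, 16, 196, 96); attention is computed
        independently within each of the T blocks, so output shape equals input shape.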
        """
        B, T, N, C = x.shape
        # result of next line is (qkv, B, num (H)eads, T, N, (C')hannels per head)
        qkv = self.qkv(x).reshape(B, T, N, 3, self.num_heads, C // self.num_heads).permute(3, 0, 4, 1, 2, 5)
        q, k, v = qkv.unbind(0)

        if self.fused_attn:
            x = F.scaled_dot_product_attention(
                q, k, v,
                dropout_p=self.attn_drop.p if self.training else 0.,
            )
        else:
            q = q * self.scale
            attn = q @ k.transpose(-2, -1)  # (B, H, T, N, N)
            attn = attn.softmax(dim=-1)
            attn = self.attn_drop(attn)
            x = attn @ v

        # (B, H, T, N, C'), permute -> (B, T, N, C', H)
        x = x.permute(0, 2, 3, 4, 1).reshape(B, T, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


class TransformerLayer(nn.Module):
    """
    This is much like `.vision_transformer.Block` but:
        - Called TransformerLayer here to allow for "block" as defined in the paper ("non-overlapping image blocks")
        - Uses modified Attention layer that handles the "block" dimension
          @Fr#   c
                    sn   t    |	|| _t|||||d| _|dkrt|nt | _|	|| _	t
|| }
t||
||d| _d S )N)r(   r2   r-   r/   r#   )in_featureshidden_features	act_layerdrop)r&   r'   norm1r    rP   r   r   Identity	drop_pathnorm2intr   mlp)r0   r1   r(   	mlp_ratior2   r/   r-   rd   r`   
norm_layermlp_hidden_dimr4   r6   r7   r'   \   s$   


zTransformerLayer.__init__c                 C   s<   |  |}|| | | }|| | | | }|S N)rb   rd   rP   rg   re   )r0   rH   yr6   r6   r7   rQ   {   s   
zTransformerLayer.forward)
rR   rS   rT   rU   r   GELUr   r'   rQ   r[   r6   r6   r4   r7   r\   V   s    	r\   c                       s&   e Zd Zd fdd	Zdd Z  ZS )ConvPool c                    s>   t    t||d|dd| _||| _tddd|d| _d S )Nr$   T)kernel_sizepaddingr%   maxr9   )rp   striderq   )r&   r'   r   convnormr   pool)r0   in_channelsout_channelsri   pad_typer4   r6   r7   r'      s   

zConvPool.__init__c                 C   sj   t |jd d dkd t |jd d dkd | |}| |dddddddd}| |}|S )z:
        x is expected to have shape (B, C, H, W)
        """
        _assert(x.shape[-2] % 2 == 0, 'BlockAggregation requires even input spatial dims')
        _assert(x.shape[-1] % 2 == 0, 'BlockAggregation requires even input spatial dims')
        x = self.conv(x)
        # Layer norm done over channel dim only (to NHWC and back)
        x = self.norm(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
        x = self.pool(x)
        return x  # (B, C, H//2, W//2)


def blockify(x, block_size: int):
    """image to blocks
    Args:
        x (Tensor): with shape (B, H, W, C)
        block_size (int): edge length of a single square block in units of H, W
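
    Example (illustrative shapes): a (B, 8, 8, C) input with block_size=4 yields a
    2x2 grid of blocks, i.e. output shape (B, 4, 16, C).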
    """
    B, H, W, C = x.shape
    _assert(H % block_size == 0, '`block_size` must divide input height evenly')
    _assert(W % block_size == 0, '`block_size` must divide input width evenly')
    grid_height = H // block_size
    grid_width = W // block_size
    x = x.reshape(B, grid_height, block_size, grid_width, block_size, C)
    x = x.transpose(2, 3).reshape(B, grid_height * grid_width, -1, C)
    return x  # (B, T, N, C)


@register_notrace_function  # reason: int receives Proxy
def deblockify(x, block_size: int):
    """blocks to image
    Args:
        x (Tensor): with shape (B, T, N, C) where T is number of blocks and N is sequence size per block
        block_size (int): edge length of a single square block in units of desired H, W
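
    Example (illustrative shapes, inverse of the blockify example above): a
    (B, 4, 16, C) input with block_size=4 is restored to (B, 8, 8, C).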
    """
    B, T, _, C = x.shape
    grid_size = int(math.sqrt(T))
    height = width = grid_size * block_size
    x = x.reshape(B, grid_size, grid_size, block_size, block_size, C)
    x = x.transpose(2, 3).reshape(B, height, width, C)
    return x  # (B, H, W, C)


class NestLevel(nn.Module):
    """ Single hierarchical level of a Nested Transformer
    Nr]   Tr#   ro   c              
      s   t    || _d| _ttd||| _|d ur%t	||d| _
nt | _
tr8t|ks8J dtj f	ddt|D  | _d S )NFr   )ri   ry   zDMust provide as many drop path rates as there are transformer layersc                    s*   g | ]}t |  d 	qS ))	r1   r(   rh   r2   r/   r-   rd   ri   r`   )r\   .0i	r`   r-   rd   	embed_dimrh   ri   r(   r/   r2   r6   r7   
<listcomp>   s    z&NestLevel.__init__.<locals>.<listcomp>)r&   r'   r|   grad_checkpointingr   	ParameterrV   zeros	pos_embedrn   rv   rc   len
Sequentialrangetransformer_encoder)r0   
num_blocksr|   
seq_lengthr(   depthr   prev_embed_dimrh   r2   r/   r-   rd   ri   r`   ry   r4   r   r7   r'      s   

zNestLevel.__init__c                 C   st   |  |}|dddd}t|| j}|| j }| jr'tj s't	| j
|}n| 
|}t|| j}|ddddS )z+
        expects x as (B, C, H, W)
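        Pipeline: pool -> permute to NHWC -> blockify -> add pos_embed ->
        transformer encoder -> deblockify -> permute back to NCHW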
        """
        x = self.pool(x)
        x = x.permute(0, 2, 3, 1)  # (B, H', W', C), switch to channels last for transformer
        x = blockify(x, self.block_size)  # (B, T, N, C')
        x = x + self.pos_embed
        if self.grad_checkpointing and not torch.jit.is_scripting():
            x = checkpoint_seq(self.transformer_encoder, x)
        else:
            x = self.transformer_encoder(x)  # (B, T, N, C')
        x = deblockify(x, self.block_size)  # (B, H', W', C')
        # Channel-first for block aggregation, and generally to replicate convnet feature map at each stage
        return x.permute(0, 3, 1, 2)  # (B, C, H', W')


class Nest(nn.Module):
    """ Nested Transformer (NesT)

    A PyTorch impl of : `Aggregating Nested Transformers`
        - https://arxiv.org/abs/2105.12723
    """

    def __init__(
            self,
            img_size=224,
            in_chans=3,
            patch_size=4,
            num_levels=3,
            embed_dims=(128, 256, 512),
            num_heads=(4, 8, 16),
            depths=(2, 2, 20),
            num_classes=1000,
            mlp_ratio=4.,
            qkv_bias=True,
            drop_rate=0.,
            proj_drop_rate=0.,
            attn_drop_rate=0.,
            drop_path_rate=0.5,
            norm_layer=None,
            act_layer=None,
            pad_type='',
            weight_init='',
            global_pool='avg',
    ):
        """
        Args:
            img_size (int, tuple): input image size
            in_chans (int): number of input channels
            patch_size (int): patch size
            num_levels (int): number of block hierarchies (T_d in the paper)
            embed_dims (int, tuple): embedding dimensions of each level
            num_heads (int, tuple): number of attention heads for each level
            depths (int, tuple): number of transformer layers for each level
            num_classes (int): number of classes for classification head
            mlp_ratio (int): ratio of mlp hidden dim to embedding dim for MLP of transformer layers
            qkv_bias (bool): enable bias for qkv if True
            drop_rate (float): dropout rate for MLP of transformer layers, MSA final projection layer, and classifier
            attn_drop_rate (float): attention dropout rate
            drop_path_rate (float): stochastic depth rate
            norm_layer: (nn.Module): normalization layer for transformer layers
            act_layer: (nn.Module): activation layer in MLP of transformer layers
            pad_type: str: Type of padding to use '' for PyTorch symmetric, 'same' for TF SAME
            weight_init: (str): weight init scheme
            global_pool: (str): type of pooling operation to apply to final feature map

        Notes:
            - Default values follow NesT-B from the original Jax code.
            - `embed_dims`, `num_heads`, `depths` should be ints or tuples with length `num_levels`.
            - For those following the paper, Table A1 may have errors!
                - https://github.com/google-research/nested-transformer/issues/2
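
        Example (a construction sketch mirroring the NesT-T config registered below):
            model = Nest(embed_dims=(96, 192, 384), num_heads=(3, 6, 12), depths=(2, 2, 8))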
        
        """
        super().__init__()
        for param_name in ['embed_dims', 'num_heads', 'depths']:
            param_value = locals()[param_name]
            if isinstance(param_value, collections.abc.Sequence):
                assert len(param_value) == num_levels, f'Require `len({param_name}) == num_levels`'

        embed_dims = to_ntuple(num_levels)(embed_dims)
        num_heads = to_ntuple(num_levels)(num_heads)
        depths = to_ntuple(num_levels)(depths)
        self.num_classes = num_classes
        self.num_features = self.head_hidden_size = embed_dims[-1]
        self.feature_info = []
        norm_layer = norm_layer or LayerNorm
        act_layer = act_layer or nn.GELU
        self.drop_rate = drop_rate
        self.num_levels = num_levels
        if isinstance(img_size, collections.abc.Sequence):
            assert img_size[0] == img_size[1], 'Model only handles square inputs'
            img_size = img_size[0]
        assert img_size % patch_size == 0, '`patch_size` must divide `img_size` evenly'
        self.patch_size = patch_size

        # Number of blocks at each level
        self.num_blocks = (4 ** torch.arange(num_levels)).flip(0).tolist()
        assert (img_size // patch_size) % math.sqrt(self.num_blocks[0]) == 0, \
            'First level blocks don\'t fit evenly. Check `img_size`, `patch_size`, and `num_levels`'

        # Block edge size in units of patches
        self.block_size = int((img_size // patch_size) // math.sqrt(self.num_blocks[0]))

        # Patch embedding
        self.patch_embed = PatchEmbed(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dims[0],
            flatten=False,
        )
        self.num_patches = self.patch_embed.num_patches
        self.seq_length = self.num_patches // self.num_blocks[0]

        # Build up each hierarchical level
        levels = []
        dp_rates = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths)).split(depths)]
        prev_dim = None
        curr_stride = 4
        for i in range(len(self.num_blocks)):
            dim = embed_dims[i]
            levels.append(NestLevel(
                self.num_blocks[i],
                self.block_size,
                self.seq_length,
                num_heads[i],
                depths[i],
                dim,
                prev_dim,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                proj_drop=proj_drop_rate,
                attn_drop=attn_drop_rate,
                drop_path=dp_rates[i],
                norm_layer=norm_layer,
                act_layer=act_layer,
                pad_type=pad_type,
            ))
            self.feature_info += [dict(num_chs=dim, reduction=curr_stride, module=f'levels.{i}')]
            prev_dim = dim
            curr_stride *= 2
        self.levels = nn.Sequential(*levels)

        # Final normalization layer
        self.norm = norm_layer(embed_dims[-1])

        # Classifier
        global_pool, head = create_classifier(self.num_features, self.num_classes, pool_type=global_pool)
        self.global_pool = global_pool
        self.head_drop = nn.Dropout(drop_rate)
        self.head = head

        self.init_weights(weight_init)
zNest.__init__c                 C   sZ   |dv sJ d|v rt | j nd}| jD ]}t|jdddd qttt|d|  d S )	N)nlhbro   r   r#   {Gz?r<   r9   stdab)	head_bias)	r   logr   r   r   r   r   r   _init_nest_weights)r0   moder   levelr6   r6   r7   r     s
   
zNest.init_weightsc                 C   s   dd t t| jD S )Nc                 S   s   h | ]}d | dqS )zlevel.z
.pos_embedr6   r   r6   r6   r7   	<setcomp>  s    z'Nest.no_weight_decay.<locals>.<setcomp>)r   r   r   r0   r6   r6   r7   no_weight_decay  s   zNest.no_weight_decayFc                 C   s"   t d|rdndd fddgd}|S )Nz^patch_embedz^levels\.(\d+)z*^levels\.(\d+)\.transformer_encoder\.(\d+))z"^levels\.(\d+)\.(?:pool|pos_embed))r   )z^norm)i )stemblocks)r   )r0   coarsematcherr6   r6   r7   group_matcher  s   zNest.group_matcherc                 C   s   | j D ]}||_qd S rk   )r   r   )r0   enablelr6   r6   r7   set_grad_checkpointing  s   
zNest.set_grad_checkpointingreturnc                 C   s   | j S rk   )r   r   r6   r6   r7   get_classifier  s   zNest.get_classifierr   r   c                 C   s$   || _ t| j| j |d\| _| _d S )Nr   )r   r   r   r   r   )r0   r   r   r6   r6   r7   reset_classifier  s   
zNest.reset_classifierNCHWrH   indicesru   
stop_early
output_fmtintermediates_onlyc              	   C   s  |dv sJ dg }t t| j|\}}	| |}t| jd }
tj s'|s+| j}n	| jd|	d  }t|D ].\}}||}||v rf|ra||
kra| 	|
dddd
dddd}|| q8|| q8|rk|S ||
kr| 	|
dddd
dddd}||fS )a   Forward features that returns intermediates.

        Args:
            x: Input image tensor
            indices: Take last n blocks if int, all if None, select matching indices if sequence
            norm: Apply norm layer to compatible intermediates
            stop_early: Stop iterating over blocks when last desired intermediate hit
            output_fmt: Shape of intermediate feature outputs
            intermediates_only: Only return intermediate features
        Returns:

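        Example (illustrative): `forward_intermediates(x, indices=2,
        intermediates_only=True)` returns NCHW feature maps from the last two levels.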
        """
        assert output_fmt in ('NCHW',), 'Output shape must be NCHW.'
        intermediates = []
        take_indices, max_index = feature_take_indices(len(self.levels), indices)

        # forward pass
        x = self.patch_embed(x)

        last_idx = len(self.levels) - 1
        if torch.jit.is_scripting() or not stop_early:  # can't slice blocks in torchscript
            stages = self.levels
        else:
            stages = self.levels[:max_index + 1]
        for feat_idx, stage in enumerate(stages):
            x = stage(x)
            if feat_idx in take_indices:
                if norm and feat_idx == last_idx:
                    # applying final norm to last intermediate
                    x_inter = self.norm(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
                    intermediates.append(x_inter)
                else:
                    intermediates.append(x)

        if intermediates_only:
            return intermediates

        if feat_idx == last_idx:
            x = self.norm(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)

        return x, intermediates

    def prune_intermediate_layers(
            self,
            indices: Union[int, List[int]] = 1,
            prune_norm: bool = False,
            prune_head: bool = True,
    ):
        """ Prune layers not required for specified intermediates.
        """
        take_indices, max_index = feature_take_indices(len(self.levels), indices)
        self.levels = self.levels[:max_index + 1]  # truncate blocks
        if prune_norm:
            self.norm = nn.Identity()
        if prune_head:
            self.reset_classifier(0, '')
        return take_indices

    def forward_features(self, x):
        x = self.patch_embed(x)
        x = self.levels(x)
        # Layer norm done over channel dim only (to NHWC and back)
        x = self.norm(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
        return x

    def forward_head(self, x, pre_logits: bool = False):
        x = self.global_pool(x)
        x = self.head_drop(x)
        return x if pre_logits else self.head(x)

    def forward(self, x):
        x = self.forward_features(x)
        x = self.forward_head(x)
        return x


def _init_nest_weights(module: nn.Module, name: str = '', head_bias: float = 0.):
    """ NesT weight initialization
    Can replicate Jax implementation. Otherwise follows vision_transformer.py
    """
    if isinstance(module, nn.Linear):
        if name.startswith('head'):
            trunc_normal_(module.weight, std=.02, a=-2, b=2)
            nn.init.constant_(module.bias, head_bias)
        else:
            trunc_normal_(module.weight, std=.02, a=-2, b=2)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
    elif isinstance(module, nn.Conv2d):
        trunc_normal_(module.weight, std=.02, a=-2, b=2)
        if module.bias is not None:
            nn.init.zeros_(module.bias)


def resize_pos_embed(posemb, posemb_new):
    """
    Rescale the grid of position embeddings when loading from state_dict
    Expected shape of position embeddings is (1, T, N, C), and considers only square images
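
    Worked example (illustrative): for nest_base at 224px input, the level-0 embedding
    is (1, 16, 196, 128); deblockified it forms a 56x56 grid, which is interpolated
    bicubically to the new grid size and then re-blockified.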
    """
    _logger.info('Resized position embedding: %s to %s', posemb.shape, posemb_new.shape)
    seq_length_old = posemb.shape[2]
    num_blocks_new, seq_length_new = posemb_new.shape[1:3]
    size_new = int(math.sqrt(num_blocks_new * seq_length_new))
    # First change to (1, C, H, W)
    posemb = deblockify(posemb, int(math.sqrt(seq_length_old))).permute(0, 3, 1, 2)
    posemb = F.interpolate(posemb, size=[size_new, size_new], mode='bicubic', align_corners=False)
    # Now change back to (1, T, N, C)
    posemb = blockify(posemb.permute(0, 2, 3, 1), int(math.sqrt(seq_length_new)))
    return posemb


def checkpoint_filter_fn(state_dict, model):
    """ resize positional embeddings of pretrained weights """
    pos_embed_keys = [k for k in state_dict.keys() if k.startswith('pos_embed_')]
    for k in pos_embed_keys:
        if state_dict[k].shape != getattr(model, k).shape:
            state_dict[k] = resize_pos_embed(state_dict[k], getattr(model, k))
    return state_dict


def _create_nest(variant, pretrained=False, **kwargs):
    model = build_model_with_cfg(
        Nest,
        variant,
        pretrained,
        feature_cfg=dict(out_indices=(0, 1, 2), flatten_sequential=True),
        pretrained_filter_fn=checkpoint_filter_fn,
        **kwargs,
    )
    return model


def _cfg(url='', **kwargs):
    return {
        'url': url,
        'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': [14, 14],
        'crop_pct': .875, 'interpolation': 'bicubic', 'fixed_input_size': True,
        'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD,
        'first_conv': 'patch_embed.proj', 'classifier': 'head',
        **kwargs,
    }


default_cfgs = generate_default_cfgs({
    'nest_base.untrained': _cfg(),
    'nest_small.untrained': _cfg(),
    'nest_tiny.untrained': _cfg(),
    'nest_base_jx.goog_in1k': _cfg(hf_hub_id='timm/'),
    'nest_small_jx.goog_in1k': _cfg(hf_hub_id='timm/'),
    'nest_tiny_jx.goog_in1k': _cfg(hf_hub_id='timm/'),
})


@register_model
def nest_base(pretrained=False, **kwargs) -> Nest:
    """ Nest-B @ 224x224
    """
    model_kwargs = dict(embed_dims=(128, 256, 512), num_heads=(4, 8, 16), depths=(2, 2, 20), **kwargs)
    model = _create_nest('nest_base', pretrained=pretrained, **model_kwargs)
    return model


@register_model
def nest_small(pretrained=False, **kwargs) -> Nest:
    """ Nest-S @ 224x224
    """
    model_kwargs = dict(embed_dims=(96, 192, 384), num_heads=(3, 6, 12), depths=(2, 2, 20), **kwargs)
    model = _create_nest('nest_small', pretrained=pretrained, **model_kwargs)
    return model


@register_model
def nest_tiny(pretrained=False, **kwargs) -> Nest:
    """ Nest-T @ 224x224
    """
    model_kwargs = dict(embed_dims=(96, 192, 384), num_heads=(3, 6, 12), depths=(2, 2, 8), **kwargs)
    model = _create_nest('nest_tiny', pretrained=pretrained, **model_kwargs)
    return model


@register_model
def nest_base_jx(pretrained=False, **kwargs) -> Nest:
    """ Nest-B @ 224x224
    """
    kwargs.setdefault('pad_type', 'same')
    model_kwargs = dict(embed_dims=(128, 256, 512), num_heads=(4, 8, 16), depths=(2, 2, 20), **kwargs)
    model = _create_nest('nest_base_jx', pretrained=pretrained, **model_kwargs)
    return model


@register_model
def nest_small_jx(pretrained=False, **kwargs) -> Nest:
    """ Nest-S @ 224x224
    """
    kwargs.setdefault('pad_type', 'same')
    model_kwargs = dict(embed_dims=(96, 192, 384), num_heads=(3, 6, 12), depths=(2, 2, 20), **kwargs)
    model = _create_nest('nest_small_jx', pretrained=pretrained, **model_kwargs)
    return model


@register_model
def nest_tiny_jx(pretrained=False, **kwargs) -> Nest:
    """ Nest-T @ 224x224
    """
    kwargs.setdefault('pad_type', 'same')
    model_kwargs = dict(embed_dims=(96, 192, 384), num_heads=(3, 6, 12), depths=(2, 2, 8), **kwargs)
    model = _create_nest('nest_tiny_jx', pretrained=pretrained, **model_kwargs)
    return model


register_model_deprecations(__name__, {
    'jx_nest_base': 'nest_base_jx',
    'jx_nest_small': 'nest_small_jx',
    'jx_nest_tiny': 'nest_tiny_jx',
})