o
    پiP                     @   sX  d Z ddlZddlmZmZmZmZmZmZm	Z	 ddl
Z
ddlmZ ddlm  mZ ddlmZmZ ddlmZmZmZmZmZmZ ddlmZ ddlmZ dd	lmZ dd
l m!Z!m"Z" dgZ#G dd dej$Z%G dd dej$Z&G dd dej$Z'G dd dej$Z(G dd dej$Z)G dd dej$Z*de+dedej$fddZ,dqdee-df de.d e-dee-e-e-e-f fd!d"Z/G d#d$ d$ej$Z0G d%d& d&ej$Z1		'		(	)	*		+drd,ed-e-d.e-d/ee- d0e-d1e-d2e-d3e-d4e.d5e2d6e.d7e.d8edej3fd9d:Z4	)	*		+dsd,ed-e-d.e-d/ee- d0e-d4e.d5e2d6e.d7e.d8edej3fd;d<Z5G d=d dej$Z6dtd>e+d?e2d8ede6fd@dAZ7dudCe+d8edee+ef fdDdEZ8e"e8dFdGdHdIe8dFdJdKdLdMe8dFdNdHdIe8dFdOdKdLdMe8dFdPdHdIe8dFdQdKdRdMe8dFdSdHdIe8dFdTdUdRdMe8dFdVdHdIe8dFdWdUdRdMe8dFdXdUdYdMdZZ9e!dtd?e2d8ede6fd[d\Z:e!dtd?e2d8ede6fd]d^Z;e!dtd?e2d8ede6fd_d`Z<e!dtd?e2d8ede6fdadbZ=e!dtd?e2d8ede6fdcddZ>e!dtd?e2d8ede6fdedfZ?e!dtd?e2d8ede6fdgdhZ@e!dtd?e2d8ede6fdidjZAe!dtd?e2d8ede6fdkdlZBe!dtd?e2d8ede6fdmdnZCe!dtd?e2d8ede6fdodpZDdS )va5   Vision OutLOoker (VOLO) implementation

Paper: `VOLO: Vision Outlooker for Visual Recognition` - https://arxiv.org/abs/2106.13112

Code adapted from official impl at https://github.com/sail-sg/volo, original copyright in comment below

Modifications and additions for timm by / Copyright 2022, Ross Wightman
    N)AnyCallableDictListOptionalTupleUnionIMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STD)DropPathMlp	to_2tuple	to_ntupletrunc_normal_use_fused_attn   )build_model_with_cfg)feature_take_indices)
checkpoint)register_modelgenerate_default_cfgsVOLOc                       sf   e Zd ZdZ						ddededed	ed
edededef fddZdej	dej	fddZ
  ZS )OutlookAttentionz,Outlook attention mechanism for VOLO models.   r   F        dim	num_headskernel_sizepaddingstrideqkv_bias	attn_drop	proj_dropc	           
         s   t    || }	|| _|| _|| _|| _|	d | _tj|||d| _	t||d | | _
t|| _t||| _t|| _tj|||d| _tj||dd| _dS )a  Initialize OutlookAttention.

        Args:
            dim: Input feature dimension.
            num_heads: Number of attention heads.
            kernel_size: Kernel size for attention computation.
            padding: Padding for attention computation.
            stride: Stride for attention computation.
            qkv_bias: Whether to use bias in linear layers.
            attn_drop: Attention dropout rate.
            proj_drop: Projection dropout rate.
              ࿩bias   )r   r   r    T)r   r    	ceil_modeN)super__init__r   r   r   r    scalennLinearvattnDropoutr"   projr#   Unfoldunfold	AvgPool2dpool)
selfr   r   r   r   r    r!   r"   r#   head_dim	__class__ D/home/ubuntu/.local/lib/python3.10/site-packages/timm/models/volo.pyr*   *   s   

zOutlookAttention.__init__xreturnc           
      C   sp  |j \}}}}| |dddd}t|| j t|| j }}| ||| j|| j | j	| j	 || ddddd}| 
|dddddddd}	| |	||| | j| j	| j	 | j	| j	 ddddd}	|	| j }	|	jdd}	| |	}	|	| ddddd||| j	 | j	 || }tj|||f| j	| j| jd}| |dddd}| |}|S )	Forward pass.

        Args:
            x: Input tensor of shape (B, H, W, C).

        Returns:
            Output tensor of shape (B, H, W, C).
        r   r   r      r'   r   )output_sizer   r   r    )shaper.   permutemathceilr    r3   reshaper   r   r5   r/   r+   softmaxr"   Ffoldr   r1   r#   )
r6   r<   BHWCr.   hwr/   r:   r:   r;   forwardS   s0   	"
"



0
zOutlookAttention.forward)r   r   r   Fr   r   )__name__
__module____qualname____doc__intboolfloatr*   torchTensorrQ   __classcell__r:   r:   r8   r;   r   '   s4    	)r   c                       sz   e Zd ZdZdddddejejdfdededed	ed
edededede	de	de
f fddZdejdejfddZ  ZS )	Outlookerz9Outlooker block that combines outlook attention with MLP.r         @r   Fr   r   r   r    r   	mlp_ratior"   	drop_path	act_layer
norm_layerr!   c              	      s   t    |
|| _t|||||||d| _|dkrt|nt | _|
|| _	t
|t|| |	d| _|dkr?t|| _dS t | _dS )af  Initialize Outlooker block.

        Args:
            dim: Input feature dimension.
            kernel_size: Kernel size for outlook attention.
            padding: Padding for outlook attention.
            stride: Stride for outlook attention.
            num_heads: Number of attention heads.
            mlp_ratio: Ratio for MLP hidden dimension.
            attn_drop: Attention dropout rate.
            drop_path: Stochastic depth drop rate.
            act_layer: Activation layer type.
            norm_layer: Normalization layer type.
            qkv_bias: Whether to use bias in linear layers.
        )r   r   r    r!   r"   r   in_featureshidden_featuresr`   N)r)   r*   norm1r   r/   r   r,   Identity
drop_path1norm2r   rV   mlp
drop_path2)r6   r   r   r   r    r   r^   r"   r_   r`   ra   r!   r8   r:   r;   r*   y   s&   

	

$zOutlooker.__init__r<   r=   c                 C   8   ||  | | | }|| | | | }|S zoForward pass.

        Args:
            x: Input tensor.

        Returns:
            Output tensor.
        rg   r/   re   rj   ri   rh   r6   r<   r:   r:   r;   rQ         	zOutlooker.forward)rR   rS   rT   rU   r,   GELU	LayerNormrV   rX   r   rW   r*   rY   rZ   rQ   r[   r:   r:   r8   r;   r\   v   sD    	
2r\   c                       sh   e Zd ZU dZejje ed< 				dde	de	ded	e
d
e
f
 fddZdejdejfddZ  ZS )	Attentionz!Multi-head self-attention module.
fused_attn   Fr   r   r   r!   r"   r#   c                    sj   t    || _|| }|d | _t | _tj||d |d| _t	|| _
t||| _t	|| _dS )a,  Initialize Attention module.

        Args:
            dim: Input feature dimension.
            num_heads: Number of attention heads.
            qkv_bias: Whether to use bias in QKV projection.
            attn_drop: Attention dropout rate.
            proj_drop: Projection dropout rate.
        r$   r   r%   N)r)   r*   r   r+   r   rs   r,   r-   qkvr0   r"   r1   r#   )r6   r   r   r!   r"   r#   r7   r8   r:   r;   r*      s   

zAttention.__init__r<   r=   c                 C   s   |j \}}}}| |||| d| j|| j ddddd}|d\}}}	| jr<tj|||	| j	r7| j
jndd}n|| j }||dd	 }
|
jd	d
}
| 
|
}
|
|	 }|dd||||}| |}| |}|S )r>   r   r?   r   r   r'   r   )	dropout_pr@   rA   )rC   ru   rG   r   rD   unbindrs   rI   scaled_dot_product_attentiontrainingr"   pr+   	transposerH   r1   r#   )r6   r<   rK   rL   rM   rN   ru   qkr.   r/   r:   r:   r;   rQ      s"   	2



zAttention.forward)rt   Fr   r   )rR   rS   rT   rU   rY   jitFinalrW   __annotations__rV   rX   r*   rZ   rQ   r[   r:   r:   r8   r;   rr      s&   
 rr   c                       sj   e Zd ZdZddddejejfdededede	d	ed
ede
de
f fddZdejdejfddZ  ZS )Transformerz9Transformer block with multi-head self-attention and MLP.      @Fr   r   r   r^   r!   r"   r_   r`   ra   c	           	         s   t    ||| _t||||d| _|dkrt|nt | _||| _	t
|t|| |d| _|dkr<t|| _dS t | _dS )a  Initialize Transformer block.

        Args:
            dim: Input feature dimension.
            num_heads: Number of attention heads.
            mlp_ratio: Ratio for MLP hidden dimension.
            qkv_bias: Whether to use bias in QKV projection.
            attn_drop: Attention dropout rate.
            drop_path: Stochastic depth drop rate.
            act_layer: Activation layer type.
            norm_layer: Normalization layer type.
        )r   r!   r"   r   rb   N)r)   r*   re   rr   r/   r   r,   rf   rg   rh   r   rV   ri   rj   )	r6   r   r   r^   r!   r"   r_   r`   ra   r8   r:   r;   r*      s   


$zTransformer.__init__r<   r=   c                 C   rk   rl   rm   rn   r:   r:   r;   rQ     ro   zTransformer.forward)rR   rS   rT   rU   r,   rp   rq   rV   rX   rW   r   r*   rY   rZ   rQ   r[   r:   r:   r8   r;   r      s4    	 r   c                       s`   e Zd ZdZ					ddededee d	ed
edef fddZde	j
de	j
fddZ  ZS )ClassAttentionz6Class attention mechanism for class token interaction.rt   NFr   r   r   r7   r!   r"   r#   c                    s   t    || _|dur|| _n|| }|| _|d | _tj|| j| j d |d| _tj|| j| j |d| _t	|| _
t| j| j || _t	|| _dS )a{  Initialize ClassAttention.

        Args:
            dim: Input feature dimension.
            num_heads: Number of attention heads.
            head_dim: Dimension per head. If None, computed as dim // num_heads.
            qkv_bias: Whether to use bias in QKV projection.
            attn_drop: Attention dropout rate.
            proj_drop: Projection dropout rate.
        Nr$   r?   r%   )r)   r*   r   r7   r+   r,   r-   kvr}   r0   r"   r1   r#   )r6   r   r   r7   r!   r"   r#   r8   r:   r;   r*   .  s   

zClassAttention.__init__r<   r=   c                 C   s   |j \}}}| |||d| j| jddddd}|d\}}| |ddddddf || jd| j| j }||	dd }	|	j
dd	}	| |	}	|	| 	dd|d| j| j }
| |
}
| |
}
|
S )
zForward pass.

        Args:
            x: Input tensor of shape (B, N, C) where first token is class token.

        Returns:
            Class token output of shape (B, 1, C).
        r?   r   r   r   r'   Nrw   r@   rA   )rC   r   rG   r   r7   rD   rx   r}   r+   r|   rH   r"   r1   r#   )r6   r<   rK   NrN   r   r~   r.   r}   r/   	cls_embedr:   r:   r;   rQ   P  s   	*6
"

zClassAttention.forward)rt   NFr   r   )rR   rS   rT   rU   rV   r   rW   rX   r*   rY   rZ   rQ   r[   r:   r:   r8   r;   r   +  s*    "r   c                       sz   e Zd ZdZddddddejejfdededee d	e	d
e
de	de	de	dedef fddZdejdejfddZ  ZS )
ClassBlockz3Class block that combines class attention with MLP.Nr   Fr   r   r   r7   r^   r!   dropr"   r_   r`   ra   c                    s   t    |
|| _t||||||d| _|dkrt|nt | _|
|| _	t
|t|| |	|d| _|dkr?t|| _dS t | _dS )a1  Initialize ClassBlock.

        Args:
            dim: Input feature dimension.
            num_heads: Number of attention heads.
            head_dim: Dimension per head. If None, computed as dim // num_heads.
            mlp_ratio: Ratio for MLP hidden dimension.
            qkv_bias: Whether to use bias in QKV projection.
            drop: Dropout rate.
            attn_drop: Attention dropout rate.
            drop_path: Stochastic depth drop rate.
            act_layer: Activation layer type.
            norm_layer: Normalization layer type.
        )r   r7   r!   r"   r#   r   )rc   rd   r`   r   N)r)   r*   re   r   r/   r   r,   rf   rg   rh   r   rV   ri   rj   )r6   r   r   r7   r^   r!   r   r"   r_   r`   ra   r8   r:   r;   r*   l  s&   



$zClassBlock.__init__r<   r=   c                 C   sj   |ddddf }||  | | | }|| | | | }tj||ddddf gddS )zForward pass.

        Args:
            x: Input tensor of shape (B, N, C) where first token is class token.

        Returns:
            Output tensor with updated class token.
        Nr   rA   )rg   r/   re   rj   ri   rh   rY   cat)r6   r<   r   r:   r:   r;   rQ     s   	"zClassBlock.forward)rR   rS   rT   rU   r,   rp   rq   rV   r   rX   rW   r   r*   rY   rZ   rQ   r[   r:   r:   r8   r;   r   i  s@    	
0r   
block_typekargsr=   c                 K   s   | dkrt di |S dS )zGet block based on type.

    Args:
        block_type: Type of block ('ca' for ClassBlock).
        **kargs: Additional keyword arguments for block.

    Returns:
        The requested block module.
    caNr:   )r   )r   r   r:   r:   r;   	get_block  s   
r   size.lamr+   c                 C   s   | d | }| d | }t j|t jd}t j|t jd}t d| }||  }||  }	t d|d}
t d|d}t |
|d  d|}t ||	d  d|}t |
|d  d|}t ||	d  d|}| | | | fS )a1  Get random bounding box for token labeling.

    Reference: https://github.com/zihangJiang/TokenLabeling

    Args:
        size: Input tensor size tuple.
        lam: Lambda parameter for cutmix.
        scale: Scaling factor.

    Returns:
        Bounding box coordinates (bbx1, bby1, bbx2, bby2).
    r   r?   )dtype      ?r   r   )rY   tensorfloat32sqrtrV   randintclampitem)r   r   r+   rM   rL   W_tH_tcut_ratcut_wcut_hcxcybbx1bby1bbx2bby2r:   r:   r;   	rand_bbox  s   r   c                       sd   e Zd ZdZ							dd	ed
edededededef fddZdejdejfddZ	  Z
S )
PatchEmbedz6Image to patch embedding with multi-layer convolution.   Fr   rt   r   @     img_size	stem_convstem_stride
patch_sizein_chans
hidden_dim	embed_dimc                    s   t    |dv sJ |rLttj||d|dddt|tjddtj||dddddt|tjddtj||dddddt|tjdd	| _nd	| _tj|||| || d
| _|| ||  | _	d	S )a  Initialize PatchEmbed.

        Different from ViT which uses 1 conv layer, VOLO uses multiple conv layers for patch embedding.

        Args:
            img_size: Input image size.
            stem_conv: Whether to use stem convolution layers.
            stem_stride: Stride for stem convolution.
            patch_size: Patch size (must be 4, 8, or 16).
            in_chans: Number of input channels.
            hidden_dim: Hidden dimension for stem convolution.
            embed_dim: Output embedding dimension.
        )r'   rt         r   F)r   r    r   r&   T)inplacer   Nr   r    )
r)   r*   r,   
SequentialConv2dBatchNorm2dReLUconvr1   num_patches)r6   r   r   r   r   r   r   r   r8   r:   r;   r*     s&   



zPatchEmbed.__init__r<   r=   c                 C   s"   | j dur
|  |}| |}|S )zForward pass.

        Args:
            x: Input tensor of shape (B, C, H, W).

        Returns:
            Output tensor of shape (B, embed_dim, H', W').
        N)r   r1   rn   r:   r:   r;   rQ   	  s   
	

zPatchEmbed.forward)r   Fr   rt   r   r   r   )rR   rS   rT   rU   rV   rW   r*   rY   rZ   rQ   r[   r:   r:   r8   r;   r     s2    ,r   c                       sF   e Zd ZdZddededef fddZdejd	ejfd
dZ  Z	S )
Downsamplez#Downsampling module between stages.r?   in_embed_dimout_embed_dimr   c                    s"   t    tj||||d| _dS )zInitialize Downsample.

        Args:
            in_embed_dim: Input embedding dimension.
            out_embed_dim: Output embedding dimension.
            patch_size: Patch size for downsampling.
        r   N)r)   r*   r,   r   r1   )r6   r   r   r   r8   r:   r;   r*     s   
zDownsample.__init__r<   r=   c                 C   s.   | dddd}| |}| dddd}|S )zForward pass.

        Args:
            x: Input tensor of shape (B, H, W, C).

        Returns:
            Output tensor of shape (B, H', W', C').
        r   r   r   r?   )rD   r1   rn   r:   r:   r;   rQ   &  s   	
zDownsample.forward)r?   )
rR   rS   rT   rU   rV   r*   rY   rZ   rQ   r[   r:   r:   r8   r;   r     s    r   r   r?   r]   Fr   block_fnindexr   layersr   r   r   r    r^   r!   r"   drop_path_ratekwargsc                 K   sh   g }t || D ]$}||t|d|   t|d  }|| |||||||	|
|d	 qtj| }|S )a  Generate outlooker layers for stage 1.

    Args:
        block_fn: Block function to use (typically Outlooker).
        index: Index of current stage.
        dim: Feature dimension.
        layers: List of layer counts for each stage.
        num_heads: Number of attention heads.
        kernel_size: Kernel size for outlook attention.
        padding: Padding for outlook attention.
        stride: Stride for outlook attention.
        mlp_ratio: Ratio for MLP hidden dimension.
        qkv_bias: Whether to use bias in QKV projection.
        attn_drop: Attention dropout rate.
        drop_path_rate: Stochastic depth drop rate.
        **kwargs: Additional keyword arguments.

    Returns:
        Sequential module containing outlooker blocks.
    Nr   )r   r   r    r   r^   r!   r"   r_   rangesumappendr,   r   )r   r   r   r   r   r   r   r    r^   r!   r"   r   r   blocks	block_idx	block_dprr:   r:   r;   outlooker_blocks5  s    #$

r   c	                 K   sb   g }
t || D ]!}||t|d|   t|d  }|
| ||||||d qtj|
 }
|
S )ae  Generate transformer layers for stage 2.

    Args:
        block_fn: Block function to use (typically Transformer).
        index: Index of current stage.
        dim: Feature dimension.
        layers: List of layer counts for each stage.
        num_heads: Number of attention heads.
        mlp_ratio: Ratio for MLP hidden dimension.
        qkv_bias: Whether to use bias in QKV projection.
        attn_drop: Attention dropout rate.
        drop_path_rate: Stochastic depth drop rate.
        **kwargs: Additional keyword arguments.

    Returns:
        Sequential module containing transformer blocks.
    Nr   )r^   r!   r"   r_   r   )r   r   r   r   r   r^   r!   r"   r   r   r   r   r   r:   r:   r;   transformer_blocksj  s   $

r   c                -       s  e Zd ZdZddddddddd	d	d
dddddejddddfdee dedededededede	ee  de	ee  de
edf de
edf dedededed ed!ed"ed#e	e
edf  d$ed%ed&ef, fd'd(Zd)ejd*dfd+d,Zejjd*efd-d.ZejjdTd/ed*eeef fd0d1ZejjdUd2ed*dfd3d4Zejjd*ejfd5d6ZdVdede	e d*dfd7d8Zd9ejd*ejfd:d;Zd9ejd*ejfd<d=Zd9ejd*eeje
ejeje
eeeef f f fd>d?Z 				@	dWd9ejdAe	eeee f  dBedCedDedEed*eeej e
ejeej f f fdFdGZ!	H		dXdAeeee f dIedJed*ee fdKdLZ"d9ejd*ejfdMdNZ#dTd9ejdOed*ejfdPdQZ$d9ejd*ejfdRdSZ%  Z&S )Yr   zVision Outlooker (VOLO) model.r   r     tokenrt   r   N)TFFFr]   Fr   )r   r   Tr?   r   r   r   num_classesglobal_poolr   stem_hidden_dim
embed_dimsr   downsamples.outlook_attentionr^   r!   	drop_ratepos_drop_rateattn_drop_rater   ra   post_layersuse_aux_headuse_mix_tokenpooling_scalec                    s  t    t|}t|t|}|| _|| _|| _|| _d  | _	| _
|r4d| _|dks4J dd| _tdd|||d d	| _|}|d | | |d
 | | f}ttd
|d |d
 d | _tj|d| _g | _g | _g }d}tt|D ]h}|| rtt|| || |  d	}ntt|| || | | d
}|| | j| | jt| |d| d |d
7 }|
| r|t| |d
  d |d9 }|d
7 }qt || _!d| _"dur!t  fddttD | _"ttd
d
d | _#t$| j#dd |r6|dkr0t%| j	|nt& | _'nd| _'| j	| _(t|| _)|dkrQt%| j	|nt& | _*t$| jdd | +| j, dS )a/  Initialize VOLO model.

        Args:
            layers: Number of blocks in each stage.
            img_size: Input image size.
            in_chans: Number of input channels.
            num_classes: Number of classes for classification.
            global_pool: Global pooling type ('token', 'avg', or '').
            patch_size: Patch size for patch embedding.
            stem_hidden_dim: Hidden dimension for stem convolution.
            embed_dims: List of embedding dimensions for each stage.
            num_heads: List of number of attention heads for each stage.
            downsamples: Whether to downsample between stages.
            outlook_attention: Whether to use outlook attention in each stage.
            mlp_ratio: Ratio for MLP hidden dimension.
            qkv_bias: Whether to use bias in QKV projection.
            drop_rate: Dropout rate.
            pos_drop_rate: Position embedding dropout rate.
            attn_drop_rate: Attention dropout rate.
            drop_path_rate: Stochastic depth drop rate.
            norm_layer: Normalization layer type.
            post_layers: Post-processing layer types.
            use_aux_head: Whether to use auxiliary head.
            use_mix_token: Whether to use token mixing for training.
            pooling_scale: Pooling scale factor.
        r@   r   r   z)return all tokens if mix_token is enabledFTr?   r   )r   r   r   r   r   r   r   )r{   )r^   r!   r"   ra   )r^   r!   r   r"   ra   znetwork.)num_chs	reductionmoduleNc                    s4   g | ]}t | d  d  d   ddqS )r@   r   )r   r   r^   r!   r"   r_   ra   )r   .0ir   r   r^   ra   r   r   r!   r:   r;   
<listcomp>  s    
z!VOLO.__init__.<locals>.<listcomp>{Gz?std)-r)   r*   lenr   r   r   r   	mix_tokenr   num_featureshead_hidden_sizebetagrad_checkpointingr   patch_embedr,   	ParameterrY   zeros	pos_embedr0   pos_drop
stage_endsfeature_infor   r   r\   r   r   r   dictr   
ModuleListnetworkpost_network	cls_tokenr   r-   rf   aux_headnorm	head_dropheadapply_init_weights)r6   r   r   r   r   r   r   r   r   r   r   r   r^   r!   r   r   r   r   ra   r   r   r   r   
num_layersr
patch_gridr   r   r   stager8   r   r;   r*     s   
3 $
 



$"zVOLO.__init__mr=   c                 C   sP   t |tjr"t|jdd t |tjr$|jdur&tj|jd dS dS dS dS )z\Initialize weights for modules.

        Args:
            m: Module to initialize.
        r   r   Nr   )
isinstancer,   r-   r   weightr&   init	constant_)r6   r  r:   r:   r;   r   7  s   zVOLO._init_weightsc                 C   s   ddhS )zwGet set of parameters that should not have weight decay.

        Returns:
            Set of parameter names.
        r   r   r:   r6   r:   r:   r;   no_weight_decayB  s   zVOLO.no_weight_decaycoarsec                 C   s   t dddgg ddS )zGet parameter grouping for optimizer.

        Args:
            coarse: Whether to use coarse grouping.

        Returns:
            Parameter grouping dictionary.
        z ^cls_token|pos_embed|patch_embed)z^network\.(\d+)\.(\d+)N)z^network\.(\d+)r   ))z
^cls_tokenr
  )z^post_network\.(\d+)N)z^norm)i )stemr   blocks2)r   )r6   r	  r:   r:   r;   group_matcherK  s   
zVOLO.group_matcherenablec                 C   s
   || _ dS )zqSet gradient checkpointing.

        Args:
            enable: Whether to enable gradient checkpointing.
        N)r   )r6   r  r:   r:   r;   set_grad_checkpointingb  s   
zVOLO.set_grad_checkpointingc                 C   s   | j S )zYGet classifier module.

        Returns:
            The classifier head module.
        )r   r  r:   r:   r;   get_classifierk  s   zVOLO.get_classifierc                 C   sf   || _ |dur
|| _|dkrt| j|nt | _| jdur1|dkr*t| j|nt | _dS dS )zReset classifier head.

        Args:
            num_classes: Number of classes for new classifier.
            global_pool: Global pooling type.
        Nr   )r   r   r,   r-   r   rf   r   r   )r6   r   r   r:   r:   r;   reset_classifiert  s    
$zVOLO.reset_classifierr<   c                 C   st   t | jD ]$\}}|dkr|| j }| |}| jr%tj s%t||}q||}q|j	\}}}}|
|d|}|S )zForward pass through token processing stages.

        Args:
            x: Input tensor of shape (B, H, W, C).

        Returns:
            Token tensor of shape (B, N, C).
        r?   r@   )	enumerater   r   r   r   rY   r   is_scriptingr   rC   rG   )r6   r<   idxblockrK   rL   rM   rN   r:   r:   r;   forward_tokens  s   	


zVOLO.forward_tokensc                 C   sb   |j \}}}| j|dd}tj||gdd}| jD ]}| jr*tj s*t	||}q||}q|S )zForward pass through class attention blocks.

        Args:
            x: Input token tensor of shape (B, N, C).

        Returns:
            Output tensor with class token of shape (B, N+1, C).
        r@   r   rA   )
rC   r   expandrY   r   r   r   r   r  r   )r6   r<   rK   r   rN   
cls_tokensr  r:   r:   r;   forward_cls  s   	

zVOLO.forward_clsc                 C   sR  	 |  |}|dddd}| jr|| jr|tj| j| j }|j	d | j
 |j	d | j
 }}t| || j
d\}}}}| }	| j
| | j
| }
}| j
| | j
| }}|ddd|
|||ddf |	dd|
|||ddf< |	}nd\}}}}| |}| jdur| |}| |}| jdkr|jdd	}n| jd
kr|dddf }n|}| jdu r|S | |ddddf }| js|d|dd   S | jr | jr ||j	d |||j	d }| }	|ddd||||ddf |	dd||||ddf< |	}||j	d || |j	d }||||||ffS )a  Forward pass for training with mix token support.

        Args:
            x: Input tensor of shape (B, C, H, W).

        Returns:
            If training with mix_token: tuple of (class_token, aux_tokens, bbox).
            Otherwise: class_token tensor.
        r   r?   r   r   )r+   N)r   r   r   r   avgrA   r         ?r@   )r   rD   r   rz   rY   distributionsBetar   samplerC   r   r   r   cloneflipr  r   r  r   r   meanr   maxrG   )r6   r<   r   patch_hpatch_wr   r   r   r   temp_xsbbx1sbby1sbbx2sbby2x_clsx_auxr:   r:   r;   forward_train  sD   

"B






BzVOLO.forward_trainNCHWindicesr   
stop_early
output_fmtintermediates_onlyc              	      sf  |dv sJ dg }t t j|\}}	 fdd|D } j|	 }	|j\}
}}} |dddd}tj s;|s? j	}n	 j	d	|	d  }t
|D ]A\}}|dkr^| j } |} jrltj slt||}n||}||v r|r|dkr |}n|}||dddd qL|r|S |j\}
}}}||
d
|} jd	ur |} |}||fS )a   Forward features that returns intermediates.

        Args:
            x: Input image tensor
            indices: Take last n blocks if int, all if None, select matching indices if sequence
            norm: Apply norm layer to all intermediates
            stop_early: Stop iterating over blocks when last desired intermediate hit
            output_fmt: Shape of intermediate feature outputs
            intermediates_only: Only return intermediate features
        Returns:

        )r-  zOutput format must be NCHW.c                    s   g | ]} j | qS r:   )r   r   r  r:   r;   r     s    z.VOLO.forward_intermediates.<locals>.<listcomp>r   r?   r   r   Nr@   )r   r   r   rC   r   rD   rY   r   r  r   r  r   r   r   r   r   r   rG   r   r  )r6   r<   r.  r   r/  r0  r1  intermediatestake_indices	max_indexrK   _heightwidthr   r  r  x_interrL   rM   rN   r:   r  r;   forward_intermediates  s>   





zVOLO.forward_intermediatesr   
prune_norm
prune_headc                 C   s`   	 t t| j|\}}| j| }| jd|d  | _|r!t | _|r.t | _| 	dd |S )aH  Prune layers not required for specified intermediates.

        Args:
            indices: Indices of intermediate layers to keep.
            prune_norm: Whether to prune normalization layer.
            prune_head: Whether to prune classification head.

        Returns:
            List of kept intermediate indices.
        Nr   r    )
r   r   r   r   r,   rf   r   r   r   r  )r6   r.  r:  r;  r3  r4  r:   r:   r;   prune_intermediate_layers*  s   


zVOLO.prune_intermediate_layersc                 C   sB   |  |dddd}| |}| jdur| |}| |}|S )zForward pass through feature extraction.

        Args:
            x: Input tensor of shape (B, C, H, W).

        Returns:
            Feature tensor.
        r   r?   r   r   N)r   rD   r  r   r  r   rn   r:   r:   r;   forward_featuresF  s   	



zVOLO.forward_features
pre_logitsc                 C   s   | j dkr|jdd}n| j dkr|dddf }n|}| |}|r%|S | |}| jdurG| |ddddf }|d|dd   }|S )zForward pass through classification head.

        Args:
            x: Input feature tensor.
            pre_logits: Whether to return pre-logits features.

        Returns:
            Classification logits or pre-logits features.
        r  r   rA   r   Nr   r  )r   r!  r   r   r   r"  )r6   r<   r?  outauxr:   r:   r;   forward_headZ  s   





zVOLO.forward_headc                 C   s   	 |  |}| |}|S )zForward pass (simplified, without mix token training).

        Args:
            x: Input tensor of shape (B, C, H, W).

        Returns:
            Classification logits.
        )r>  rB  rn   r:   r:   r;   rQ   t  s   	

zVOLO.forwardF)T)N)NFFr-  F)r   FT)'rR   rS   rT   rU   r,   rq   r   rV   strr   r   rW   rX   r   r*   Moduler   rY   r   ignoresetr  r   r   r  r  r  r  rZ   r  r  r   r,  r9  r=  r>  rB  rQ   r[   r:   r:   r8   r;   r     s    
	



 8@ 
C
variant
pretrainedc                 K   s,   | dd}tt| |fdt|ddi|S )zCreate VOLO model.

    Args:
        variant: Model variant name.
        pretrained: Whether to load pretrained weights.
        **kwargs: Additional model arguments.

    Returns:
        VOLO model instance.
    out_indicesr   feature_cfggetter)rJ  feature_cls)popr   r   r   )rH  rI  r   rJ  r:   r:   r;   _create_volo  s   
rO  r<  urlc                 K   s    | ddddddt tddd	|S )
zCreate model configuration.

    Args:
        url: URL for pretrained weights.
        **kwargs: Additional configuration options.

    Returns:
        Model configuration dictionary.
    r   )r   r   r   NQ?bicubicTzpatch_embed.conv.0)r   r   )rP  r   
input_size	pool_sizecrop_pctinterpolationfixed_input_sizer!  r   
first_conv
classifierr	   )rP  r   r:   r:   r;   _cfg  s   rZ  ztimm/zLhttps://github.com/sail-sg/volo/releases/download/volo_1/d1_224_84.2.pth.tarrQ  )	hf_hub_idrP  rU  zLhttps://github.com/sail-sg/volo/releases/download/volo_1/d1_384_85.2.pth.tarr   )r   r   r   )r[  rP  rU  rS  zLhttps://github.com/sail-sg/volo/releases/download/volo_1/d2_224_85.2.pth.tarzLhttps://github.com/sail-sg/volo/releases/download/volo_1/d2_384_86.0.pth.tarzLhttps://github.com/sail-sg/volo/releases/download/volo_1/d3_224_85.4.pth.tarzLhttps://github.com/sail-sg/volo/releases/download/volo_1/d3_448_86.3.pth.tar)r     r\  zLhttps://github.com/sail-sg/volo/releases/download/volo_1/d4_224_85.7.pth.tarzMhttps://github.com/sail-sg/volo/releases/download/volo_1/d4_448_86.79.pth.targffffff?zMhttps://github.com/sail-sg/volo/releases/download/volo_1/d5_224_86.10.pth.tarzLhttps://github.com/sail-sg/volo/releases/download/volo_1/d5_448_87.0.pth.tarzMhttps://github.com/sail-sg/volo/releases/download/volo_1/d5_512_87.07.pth.tar)r      r]  )zvolo_d1_224.sail_in1kzvolo_d1_384.sail_in1kzvolo_d2_224.sail_in1kzvolo_d2_384.sail_in1kzvolo_d3_224.sail_in1kzvolo_d3_448.sail_in1kzvolo_d4_224.sail_in1kzvolo_d4_448.sail_in1kzvolo_d5_224.sail_in1kzvolo_d5_448.sail_in1kzvolo_d5_512.sail_in1kc                 K   ,   t ddddd|}td	d| i|}|S )
VOLO-D1 model, Params: 27M.r'   r'   rt   r?      r   r   r         re  re  r   r   r   volo_d1_224rI  Nr:   )rg  r   rO  rI  r   
model_argsmodelr:   r:   r;   rg       rg  c                 K   r^  )
r_  r`  ra  rc  rf  volo_d1_384rI  Nr:   )rm  rh  ri  r:   r:   r;   rm    rl  rm  c                 K   r^  )
VOLO-D2 model, Params: 59M.rd  r'   
   r'      r]  r]  r]  rt   r   r   r   rf  volo_d2_224rI  Nr:   )rt  rh  ri  r:   r:   r;   rt    rl  rt  c                 K   r^  )
rn  ro  rq  rs  rf  volo_d2_384rI  Nr:   )ru  rh  ri  r:   r:   r;   ru    rl  ru  c                 K   r^  )
VOLO-D3 model, Params: 86M.rt   rt   r   r'   rq  rs  rf  volo_d3_224rI  Nr:   )rx  rh  ri  r:   r:   r;   rx    rl  rx  c                 K   r^  )
rv  rw  rq  rs  rf  volo_d3_448rI  Nr:   )ry  rh  ri  r:   r:   r;   ry    rl  ry  c                 K   r^  )
VOLO-D4 model, Params: 193M.rw  r      r|  r|  re  r   r   r   rf  volo_d4_224rI  Nr:   )r~  rh  ri  r:   r:   r;   r~    rl  r~  c                 K   r^  )
rz  rw  r{  r}  rf  volo_d4_448rI  Nr:   )r  rh  ri  r:   r:   r;   r    rl  r  c                 K   0   t d
dddddd|}tdd| i|}|S )jVOLO-D5 model, Params: 296M.

    stem_hidden_dim=128, the dim in patch embedding is 128 for VOLO-D5.
    re  re     r'   r{  r}  r'      r   r   r   r^   r   volo_d5_224rI  Nr:   )r  rh  ri  r:   r:   r;   r       r  c                 K   r  )r  r  r{  r}  r'   r  r  volo_d5_448rI  Nr:   )r  rh  ri  r:   r:   r;   r  )  r  r  c                 K   r  )r  r  r{  r}  r'   r  r  volo_d5_512rI  Nr:   )r  rh  ri  r:   r:   r;   r  6  r  r  r   )r   r   r   r?   r]   Fr   r   )r]   Fr   r   rC  )r<  )ErU   rE   typingr   r   r   r   r   r   r   rY   torch.nnr,   torch.nn.functional
functionalrI   	timm.datar
   r   timm.layersr   r   r   r   r   r   _builderr   	_featuresr   _manipulater   	_registryr   r   __all__rE  r   r\   rr   r   r   r   rD  r   rV   rX   r   r   r   rW   r   r   r   r   rO  rZ  default_cfgsrg  rm  rt  ru  rx  ry  r~  r  r  r  r  r:   r:   r:   r;   <module>   sX   $ OCA1>B0!>"	

;	

,   p 0