o
    پi̿                     @   s  d Z ddlZddlmZmZmZmZmZmZm	Z	m
Z
mZ ddlZddlmZ ddlm  mZ ddlmZmZ ddlmZmZmZmZmZmZmZmZmZmZ ddl m!Z! ddl"m#Z# dd	l$m%Z% dd
l&m'Z' ddl(m)Z)m*Z*m+Z+ dgZ,ee-e	e-e-f f Z.dej/de	e-e-f dej/fddZ0e%dej/de	e-e-f de	e-e-f dej/fddZ1G dd dej2Z3G dd dej2Z4G dd dej2Z5G dd dej2Z6G dd dej2Z7dee8ej/f d ej2dee8ej/f fd!d"Z9dad$e8d%e:de7fd&d'Z;dbd)d*Z<e)e<d+d,d-e<d+d.d/d0d1d2e<d+d3d-e<d+d4d/d0d1d2e<d+d5d-e<d+d6d-e<d+d7d-e<d+d8d-e<d+d9d-e<d+d:d-e<d+d;d<d=d>d?e<d+d@d<d=d>d?dAZ=e*dad%e:de7fdBdCZ>e*dad%e:de7fdDdEZ?e*dad%e:de7fdFdGZ@e*dad%e:de7fdHdIZAe*dad%e:de7fdJdKZBe*dad%e:de7fdLdMZCe*dad%e:de7fdNdOZDe*dad%e:de7fdPdQZEe*dad%e:de7fdRdSZFe*dad%e:de7fdTdUZGe*dad%e:de7fdVdWZHe*dad%e:de7fdXdYZIe+eJdZd[d\d]d^d_d` dS )caK   Swin Transformer V2
A PyTorch impl of : `Swin Transformer V2: Scaling Up Capacity and Resolution`
    - https://arxiv.org/abs/2111.09883

Code/weights from https://github.com/microsoft/Swin-Transformer, original copyright/license info below

Modifications and additions for timm hacked together by / Copyright 2022, Ross Wightman
    N)	AnyCallableDictListOptionalSetTupleTypeUnionIMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STD)

PatchEmbedMlpDropPath	to_2tupletrunc_normal_ClassifierHeadresample_patch_embedndgridget_act_layer	LayerType   )build_model_with_cfg)feature_take_indices)register_notrace_function)
checkpoint)generate_default_cfgsregister_modelregister_model_deprecationsSwinTransformerV2xwindow_sizereturnc                 C   sj   | j \}}}}| |||d  |d ||d  |d |} | dddddd d|d |d |}|S )zPartition into non-overlapping windows.

    Args:
        x: Input tensor of shape (B, H, W, C).
        window_size: Window size (height, width).

    Returns:
        Windows tensor of shape (num_windows*B, window_size[0], window_size[1], C).
    r   r               shapeviewpermute
contiguous)r!   r"   BHWCwindows r3   S/home/ubuntu/.local/lib/python3.10/site-packages/timm/models/swin_transformer_v2.pywindow_partition$   s   
,,r5   r2   img_sizec                 C   sf   |\}}| j d }| d||d  ||d  |d |d |}|dddddd d|||}|S )a1  Merge windows back to feature map.

    Args:
        windows: Windows tensor of shape (num_windows * B, window_size[0], window_size[1], C).
        window_size: Window size (height, width).
        img_size: Image size (height, width).

    Returns:
        Feature map tensor of shape (B, H, W, C).
    r(   r   r   r$   r%   r&   r'   r)   )r2   r"   r6   r/   r0   r1   r!   r3   r3   r4   window_reverse4   s
   
,$r7   c                       s   e Zd ZdZ					ddedeeef ded	ed
edededeeef ddf fddZdddZ	deeef ddfddZ
ddejdeej dejfddZ  ZS )WindowAttentionzWindow based multi-head self attention (W-MSA) module with relative position bias.

    Supports both shifted and non-shifted window attention with continuous relative
    position bias and cosine attention.
    TF        r   r   dimr"   	num_headsqkv_biasqkv_bias_separate	attn_drop	proj_droppretrained_window_sizer#   Nc	           	   
      s&  t    || _|| _t|| _|| _|| _t	t
dt
|ddf | _ttjddddtjddtjd|dd| _tj||d	 dd| _|rjt	t
|| _| jd
t
|dd t	t
|| _n	d| _d| _d| _t|| _t||| _t|| _tjdd| _|   dS )a4  Initialize window attention module.

        Args:
            dim: Number of input channels.
            window_size: The height and width of the window.
            num_heads: Number of attention heads.
            qkv_bias: If True, add a learnable bias to query, key, value.
            qkv_bias_separate: If True, use separate bias for q, k, v projections.
            attn_drop: Dropout ratio of attention weight.
            proj_drop: Dropout ratio of output.
            pretrained_window_size: The height and width of the window in pre-training.
        
   r   r%   i   Tbias)inplaceFr$   k_bias
persistentNr(   r;   )super__init__r;   r"   r   rA   r<   r>   nn	Parametertorchlogoneslogit_scale
SequentialLinearReLUcpb_mlpqkvzerosq_biasregister_bufferv_biasrF   Dropoutr?   projr@   Softmaxsoftmax"_make_pair_wise_relative_positions)	selfr;   r"   r<   r=   r>   r?   r@   rA   	__class__r3   r4   rK   N   s2   

"
zWindowAttention.__init__c           
      C   s  t | jd d  | jd t j}t | jd d  | jd t j}t t||}|ddd 	d}| j
d dkrp|dddddddf  | j
d d   < |dddddddf  | j
d d   < n.|dddddddf  | jd d   < |dddddddf  | jd d   < |d9 }t |t t |d  td }| jd|dd	 t | jd }t | jd }t t||}t |d}|dddddf |dddddf  }|ddd }|dddddf  | jd d 7  < |dddddf  | jd d 7  < |dddddf  d| jd  d 9  < |d
}	| jd|	dd	 dS )z?Create pair-wise relative position index and coordinates table.r   r   r%   N         ?relative_coords_tableFrG   r(   relative_position_index)rN   aranger"   tofloat32stackr   r,   r-   	unsqueezerA   signlog2absmathrY   flattensum)
r`   relative_coords_hrelative_coords_wre   coords_hcoords_wcoordscoords_flattenrelative_coordsrf   r3   r3   r4   r_      s6   &&.0..,((,
z2WindowAttention._make_pair_wise_relative_positionsc                 C   s(   t |}|| jkr|| _|   dS dS )zUpdate window size and regenerate relative position tables.

        Args:
            window_size: New window size (height, width).
        N)r   r"   r_   )r`   r"   r3   r3   r4   set_window_size   s
   
zWindowAttention.set_window_sizer!   maskc                 C   s  |j \}}}| jdu r| |}n"t| j| j| jf}| jr)| |}||7 }n
tj	|| jj
|d}|||d| jdddddd}|d\}}	}
tj|dd	tj|	dd	d
d }tj| jtdd }|| }| | jd| j}|| jd | jd | jd  | jd | jd  d}|ddd }dt| }||d }|dur|j d }|d|| j|||dd }|d| j||}| |}n| |}| |}||
 dd|||}|  |}| !|}|S )a#  Forward pass of window attention.

        Args:
            x: Input features with shape of (num_windows*B, N, C).
            mask: Attention mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None.

        Returns:
            Output features with shape of (num_windows*B, N, C).
        N)weightrD   r$   r(   r%   r   r   r&   rI   g      Y@)max   )"r*   rX   rV   rN   catrF   rZ   r>   Flinearr{   reshaper<   r,   unbind	normalize	transposeclamprQ   ro   rO   exprU   re   r+   rf   r"   r-   sigmoidrk   r^   r?   r\   r@   )r`   r!   rz   B_Nr1   rV   r=   qkvattnrQ   relative_position_bias_tablerelative_position_biasnum_winr3   r3   r4   forward   s>   



"$&
$



zWindowAttention.forward)TFr9   r9   r:   r#   NN)__name__
__module____qualname____doc__intr   boolfloatrK   r_   ry   rN   Tensorr   r   __classcell__r3   r3   ra   r4   r8   G   s:    

	

7*r8   c                       s4  e Zd ZdZddddddddddejdfd	ed
edededededede	dede	de	de	de
deej def fddZd*deej deej fddZ	d*ded ee deeeef eeef f fd!d"Z	d*d#eeef deeef dee ddfd$d%Zdejdejfd&d'Zdejdejfd(d)Z  ZS )+SwinTransformerV2BlockzSwin Transformer V2 Block.

    A standard transformer block with window attention and shifted window attention
    for modeling long-range dependencies efficiently.
       r   F      @Tr9   gelur;   input_resolutionr<   r"   
shift_sizealways_partitiondynamic_mask	mlp_ratior=   r@   r?   	drop_path	act_layer
norm_layerrA   c              	      s  t    || _t|| _|| _t|| _|| _|| _| 	||\| _
| _| j
d | j
d  | _|| _t|}t|t| j
||	||
t|d| _||| _|dkrUt|nt | _t|t|| ||
d| _||| _|dkrtt|nt | _| jd| jrdn|  dd	 dS )
a  
        Args:
            dim: Number of input channels.
            input_resolution: Input resolution.
            num_heads: Number of attention heads.
            window_size: Window size.
            shift_size: Shift size for SW-MSA.
            always_partition: Always partition into full windows and shift
            mlp_ratio: Ratio of mlp hidden dim to embedding dim.
            qkv_bias: If True, add a learnable bias to query, key, value.
            proj_drop: Dropout rate.
            attn_drop: Attention dropout rate.
            drop_path: Stochastic depth rate.
            act_layer: Activation layer.
            norm_layer: Normalization layer.
            pretrained_window_size: Window size in pretraining.
        r   r   )r"   r<   r=   r?   r@   rA   r9   )in_featureshidden_featuresr   drop	attn_maskNFrG   )rJ   rK   r;   r   r   r<   target_shift_sizer   r   _calc_window_shiftr"   r   window_arear   r   r8   r   norm1r   rL   Identity
drop_path1r   r   mlpnorm2
drop_path2rY   get_attn_mask)r`   r;   r   r<   r"   r   r   r   r   r=   r@   r?   r   r   r   rA   ra   r3   r4   rK      sF   
#


	


zSwinTransformerV2Block.__init__Nr!   r#   c              	   C   sf  t | jr|du rtdg| jdR }ntjd|jd |jd df|j|jd}d}d| jd  f| jd  | jd  f| jd  dffD ]<}d| jd  f| jd  | jd  f| jd  dffD ]}||dd|d |d |d |d ddf< |d7 }qdqGt	|| j}|
d| j}|d|d }||dktd|dktd}|S d}|S )	zGenerate attention mask for shifted window attention.

        Args:
            x: Input tensor for dynamic shape calculation.

        Returns:
            Attention mask or None if no shift.
        Nr   r%   )dtypedevicer   r(   g      Yr9   )anyr   rN   rW   r   r*   r   r   r"   r5   r+   r   rk   masked_fillr   )r`   r!   img_maskcnthwmask_windowsr   r3   r3   r4   r   4  s.   
	(0
$z$SwinTransformerV2Block.get_attn_masktarget_window_sizer   c                 C   s   t |}|du r| j}t|r|d d |d d f}nt |}| jr'||fS t |}t |}dd t| j|D }dd t| j||D }t|t|fS )a  Calculate window size and shift size based on input resolution.

        Args:
            target_window_size: Target window size.
            target_shift_size: Target shift size.

        Returns:
            Tuple of (adjusted_window_size, adjusted_shift_size).
        Nr   r%   r   c                 S   s    g | ]\}}||kr|n|qS r3   r3   ).0rr   r3   r3   r4   
<listcomp>u  s     z=SwinTransformerV2Block._calc_window_shift.<locals>.<listcomp>c                 S   s"   g | ]\}}}||krd n|qS r   r3   )r   r   r   sr3   r3   r4   r   v  s   " )r   r   r   r   zipr   tuple)r`   r   r   r"   r   r3   r3   r4   r   X  s   z)SwinTransformerV2Block._calc_window_shift	feat_sizec                 C   sp   || _ |dur
|| _| t|\| _| _| jd | jd  | _| j| j | j	d| j
r/dn|  dd dS )zSet input size and update window configuration.

        Args:
            feat_size: New feature map size.
            window_size: New window size.
            always_partition: Override always_partition setting.
        Nr   r   r   FrG   )r   r   r   r   r"   r   r   r   ry   rY   r   r   )r`   r   r"   r   r3   r3   r4   set_input_sizey  s   
z%SwinTransformerV2Block.set_input_sizec              	   C   sp  |j \}}}}t| j}|r!tj|| jd  | jd  fdd}n|}| jd || jd   | jd  }| jd || jd   | jd  }	tjj|ddd|	d|f}|j \}
}}}
t	|| j}|
d| j|}t| ddrt| |}n| j}| j||d}|
d| jd | jd |}t|| j||f}|d	d	d	|d	|d	d	f  }|rtj|| jdd}|S |}|S )
zApply windowed attention with optional shift.

        Args:
            x: Input tensor of shape (B, H, W, C).

        Returns:
            Output tensor of shape (B, H, W, C).
        r   r   )r   r%   )shiftsdimsr(   r   F)rz   N)r*   r   r   rN   rollr"   rL   
functionalpadr5   r+   r   getattrr   r   r   r7   r-   )r`   r!   r.   r/   r0   r1   	has_shift	shifted_xpad_hpad_w_HpWp	x_windowsr   attn_windowsr3   r3   r4   _attn  s.   	
&""$zSwinTransformerV2Block._attnc                 C   sd   |j \}}}}|| | | | }||d|}|| | | | }|||||}|S )Nr(   )r*   r   r   r   r   r   r   r   )r`   r!   r.   r/   r0   r1   r3   r3   r4   r     s   zSwinTransformerV2Block.forwardr   )r   r   r   r   rL   	LayerNormr   _int_or_tuple_2_tr   r   r   r	   ModulerK   r   rN   r   r   r   r   r   r   r   r   r3   r3   ra   r4   r      s    	
 J'
%


.r   c                       sV   e Zd ZdZdejfdedee deej	 f fddZ
dejd	ejfd
dZ  ZS )PatchMergingzPatch Merging Layer.

    Merges 2x2 neighboring patches and projects to higher dimension,
    effectively downsampling the feature maps.
    Nr;   out_dimr   c                    sF   t    || _|pd| | _tjd| | jdd| _|| j| _dS )z
        Args:
            dim (int): Number of input channels.
            out_dim (int): Number of output channels (or 2 * dim if None)
            norm_layer (nn.Module, optional): Normalization layer.  Default: nn.LayerNorm
        r%   r&   FrC   N)rJ   rK   r;   r   rL   rS   	reductionnorm)r`   r;   r   r   ra   r3   r4   rK     s
   
zPatchMerging.__init__r!   r#   c                 C   s   |j \}}}}ddd|d d|d f}tj||}|j \}}}}|||d d|d d|ddddddd}| |}| |}|S )Nr   r%   r   r$   r&   r'   )	r*   rL   r   r   r   r,   rp   r   r   )r`   r!   r.   r/   r0   r1   
pad_valuesr   r3   r3   r4   r     s   2

zPatchMerging.forward)r   r   r   r   rL   r   r   r   r	   r   rK   rN   r   r   r   r3   r3   ra   r4   r     s    	r   c                '       s   e Zd ZdZdddddddddejddfded	ed
ededededededede	dede	de	de	de
eef deej dededdf& fddZ	d&deeef dedee ddfdd Zd!ejdejfd"d#Zd'd$d%Z  ZS )(SwinTransformerV2StagezA Swin Transformer V2 Stage.

    A single stage consisting of multiple Swin Transformer blocks with
    optional downsampling at the beginning.
    Fr   Tr9   r   r   r;   r   r   depthr<   r"   r   r   
downsampler   r=   r@   r?   r   r   r   rA   output_nchwr#   Nc                    s   t    |_|_|	rtdd |D n|_|_|_d_t	tdd D |	r:t
|d_n|ks@J t _t 	
fddt|D _dS )	a  
        Args:
            dim: Number of input channels.
            out_dim: Number of output channels.
            input_resolution: Input resolution.
            depth: Number of blocks.
            num_heads: Number of attention heads.
            window_size: Local window size.
            always_partition: Always partition into full windows and shift
            dynamic_mask: Create attention mask in forward based on current input size
            downsample: Use downsample layer at start of the block.
            mlp_ratio: Ratio of mlp hidden dim to embedding dim.
            qkv_bias: If True, add a learnable bias to query, key, value.
            proj_drop: Projection dropout rate
            attn_drop: Attention dropout rate.
            drop_path: Stochastic depth rate.
            act_layer: Activation layer type.
            norm_layer: Normalization layer.
            pretrained_window_size: Local window size in pretraining.
            output_nchw: Output tensors on NCHW format instead of NHWC.
        c                 s       | ]}|d  V  qdS r%   Nr3   r   ir3   r3   r4   	<genexpr>$      z2SwinTransformerV2Stage.__init__.<locals>.<genexpr>Fc                 S   s   g | ]}|d  qS r%   r3   )r   r   r3   r3   r4   r   )      z3SwinTransformerV2Stage.__init__.<locals>.<listcomp>)r;   r   r   c                    sV   g | ]'}t j|d  dkrdn
ttr"| n 	dqS )r%   r   )r;   r   r<   r"   r   r   r   r   r=   r@   r?   r   r   r   rA   )r   output_resolution
isinstancelistr   r   r   r?   r   r   r   r   r<   r   rA   r@   r=   r`   r   r"   r3   r4   r   3  s(    N)rJ   rK   r;   r   r   r   r   r   grad_checkpointingr   r   r   rL   r   
ModuleListrangeblocks)r`   r;   r   r   r   r<   r"   r   r   r   r   r=   r@   r?   r   r   r   rA   r   ra   r   r4   rK     s    
*
*zSwinTransformerV2Stage.__init__r   c                 C   sb   || _ t| jtjr|| _nt| jtsJ tdd |D | _| jD ]}|j	| j||d q#dS )zUpdate resolution, window size and relative positions.

        Args:
            feat_size: New input (feature) resolution.
            window_size: New window size.
            always_partition: Always partition / shift the window.
        c                 s   r   r   r3   r   r3   r3   r4   r   Y  r   z8SwinTransformerV2Stage.set_input_size.<locals>.<genexpr>r   r"   r   N)
r   r   r   rL   r   r   r   r   r   r   )r`   r   r"   r   blockr3   r3   r4   r   G  s   
z%SwinTransformerV2Stage.set_input_sizer!   c                 C   s>   |  |}| jD ]}| jrtj st||}q||}q|S )zForward pass through the stage.

        Args:
            x: Input tensor of shape (B, H, W, C).

        Returns:
            Output tensor of shape (B, H', W', C').
        )r   r   r   rN   jitis_scriptingr   )r`   r!   blkr3   r3   r4   r   a  s   
	

zSwinTransformerV2Stage.forwardc                 C   sX   | j D ]&}tj|jjd tj|jjd tj|jjd tj|jjd qdS )z/Initialize residual post-normalization weights.r   N)r   rL   init	constant_r   rD   r{   r   )r`   r   r3   r3   r4   _init_respostnorms  s   
z(SwinTransformerV2Stage._init_respostnormr   r   )r   r   r   r   rL   r   r   r   r   r   r
   strr   r	   r   rK   r   r   r   rN   r   r   r   r   r3   r3   ra   r4   r     s    	

T

r   c                +       s  e Zd ZdZdddddddd	d
dddddddddejdfdedededededede	edf de	edf dede
de
dede
d ed!ed"ed#ed$eeef d%ed&e	edf f( fd'd(Zd)ejd*d+fd,d-Z	+	+	+	.	+dSdee	eef  dee	eef  dee	eef  d/ee dee
 f
d0d1Zejjd*ee fd2d3ZejjdTd4e
d*eeef fd5d6ZejjdUd7e
d*d+fd8d9Zejjd*ejfd:d;ZdVdedee d*d+fd<d=Z	+			>	dWd?ejd@eeeee f  dAe
dBe
dCedDe
d*eeej e	ejeej f f fdEdFZ 	G		dXd@eeee f dHe
dIe
fdJdKZ!d?ejd*ejfdLdMZ"dTd?ejdNe
d*ejfdOdPZ#d?ejd*ejfdQdRZ$  Z%S )Yr    a   Swin Transformer V2.

    A hierarchical vision transformer using shifted windows for efficient
    self-attention computation with continuous position bias.

    A PyTorch impl of : `Swin Transformer V2: Scaling Up Capacity and Resolution`
        - https://arxiv.org/abs/2111.09883
       r&   r$     avg`   r%   r%      r%   r$   r        r   FTr   r9   g?r   )r   r   r   r   r6   
patch_sizein_chansnum_classesglobal_pool	embed_dimdepths.r<   r"   r   strict_img_sizer   r=   	drop_rateproj_drop_rateattn_drop_ratedrop_path_rater   r   pretrained_window_sizesc              	      s  t    || _|dv sJ || _d| _t|| _ | _t d| jd    | _	| _
g | _t ttfsA fddt| jD  t||| d ||dd| _| jj}d	d td|t||D }g } d }d}t| jD ]r} | }|td i d
|d|d|d | |d | fd|| d|dkd|| d|	d|
d| d|d|d|d|d|| d|d|d|| g7 }|}|dkr|d9 }|  jt|d| d| dg7  _qqtj| | _|| j	| _t| j	|||| jd| _| | j  | jD ]}|!  qdS )!a]  
        Args:
            img_size: Input image size.
            patch_size: Patch size.
            in_chans: Number of input image channels.
            num_classes: Number of classes for classification head.
            embed_dim: Patch embedding dimension.
            depths: Depth of each Swin Transformer stage (layer).
            num_heads: Number of attention heads in different layers.
            window_size: Window size.
            mlp_ratio: Ratio of mlp hidden dim to embedding dim.
            qkv_bias: If True, add a learnable bias to query, key, value.
            drop_rate: Head dropout rate.
            proj_drop_rate: Projection dropout rate.
            attn_drop_rate: Attention dropout rate.
            drop_path_rate: Stochastic depth rate.
            norm_layer: Normalization layer.
            act_layer: Activation layer type.
            patch_norm: If True, add normalization after patch embedding.
            pretrained_window_sizes: Pretrained window sizes of each layer.
            output_fmt: Output tensor format if not None, otherwise output 'NHWC' by default.
        ) r  NHWCr%   r   c                    s   g | ]
}t  d |  qS r   )r   r   r  r3   r4   r     s    z.SwinTransformerV2.__init__.<locals>.<listcomp>r   )r6   r	  r
  r  r   r  
output_fmtc                 S   s   g | ]}|  qS r3   )tolist)r   r!   r3   r3   r4   r     r   r;   r   r   r   r   r<   r"   r   r   r   r=   r@   r?   r   r   r   rA   r&   layers.)num_chsr   module)	pool_typer  	input_fmtNr3   )"rJ   rK   r  r  r  len
num_layersr  r   num_featureshead_hidden_sizefeature_infor   r   r   r   r   patch_embed	grid_sizerN   linspacerq   splitr   dictrL   rR   layersr   r   headapply_init_weightsr   )r`   r6   r	  r
  r  r  r  r  r<   r"   r   r  r   r=   r  r  r  r  r   r   r  kwargsr%  dprr)  in_dimscaler   r   blyra   r  r4   rK     s   
.
	"
	
&
zSwinTransformerV2.__init__mr#   Nc                 C   sP   t |tjr"t|jdd t |tjr$|jdur&tj|jd dS dS dS dS )zVInitialize model weights.

        Args:
            m: Module to initialize.
        g{Gz?)stdNr   )r   rL   rS   r   r{   rD   r   r   )r`   r2  r3   r3   r4   r,    s   zSwinTransformerV2._init_weightsrc   window_ratioc           
         s   |dus|dur| j j||d | j j}|du r' dur't fdd|D }t| jD ]\}}dt|d d }	|j|d |	 |d |	 f||d q,dS )	aT  Updates the image resolution, window size, and so the pair-wise relative positions.

        Args:
            img_size (Optional[Tuple[int, int]]): New input resolution, if None current resolution is used
            patch_size (Optional[Tuple[int, int]): New patch size, if None use current patch size
            window_size (Optional[int]): New window size, if None based on new_img_size // window_div
            window_ratio (int): divisor for calculating window size from patch grid size
            always_partition: always partition / shift windows even if feat size is < window
        N)r6   r	  c                    s   g | ]}|  qS r3   r3   )r   r   r4  r3   r4   r     r   z4SwinTransformerV2.set_input_size.<locals>.<listcomp>r%   r   r   r   )r$  r   r%  r   	enumerater)  r}   )
r`   r6   r	  r"   r4  r   r%  indexstagestage_scaler3   r5  r4   r     s   z SwinTransformerV2.set_input_sizec                    s<   t  }|  D ]\ }t fdddD r|  q|S )zGet parameter names that should not use weight decay.

        Returns:
            Set of parameter names to exclude from weight decay.
        c                       g | ]}| v qS r3   r3   )r   kwnr3   r4   r   .  r   z5SwinTransformerV2.no_weight_decay.<locals>.<listcomp>)rU   rQ   )setnamed_modulesr   add)r`   nodr2  r3   r<  r4   no_weight_decay%  s   
z!SwinTransformerV2.no_weight_decaycoarsec                 C   s   t d|rddS g ddS )zCreate parameter group matcher for optimizer parameter groups.

        Args:
            coarse: If True, use coarse grouping.

        Returns:
            Dictionary mapping group names to regex patterns.
        z^absolute_pos_embed|patch_embedz^layers\.(\d+)))z^layers\.(\d+).downsampler   )z^layers\.(\d+)\.\w+\.(\d+)N)z^norm)i )stemr   )r(  )r`   rC  r3   r3   r4   group_matcher2  s   
zSwinTransformerV2.group_matcherenablec                 C   s   | j D ]}||_qdS )z}Enable or disable gradient checkpointing.

        Args:
            enable: If True, enable gradient checkpointing.
        N)r)  r   )r`   rF  lr3   r3   r4   set_grad_checkpointingE  s   
z(SwinTransformerV2.set_grad_checkpointingc                 C   s   | j jS )z_Get the classifier head.

        Returns:
            The classification head module.
        )r*  fc)r`   r3   r3   r4   get_classifierO  s   z SwinTransformerV2.get_classifierc                 C   s   || _ | j|| dS )zReset the classification head.

        Args:
            num_classes: Number of classes for new head.
            global_pool: Global pooling type.
        N)r  r*  reset)r`   r  r  r3   r3   r4   reset_classifierX  s   z"SwinTransformerV2.reset_classifierNCHWr!   indicesr   
stop_earlyr  intermediates_onlyc                 C   s   |dv sJ dg }t t| j|\}}	| |}t| j}
tj s%|s)| j}n	| jd|	d  }t|D ]+\}}||}||v ra|rP||
d krP| |}n|}|	dddd
 }|| q6|rf|S | |}||fS )a   Forward features that returns intermediates.

        Args:
            x: Input image tensor
            indices: Take last n blocks if int, all if None, select matching indices if sequence
            norm: Apply norm layer to compatible intermediates
            stop_early: Stop iterating over blocks when last desired intermediate hit
            output_fmt: Shape of intermediate feature outputs
            intermediates_only: Only return intermediate features
        Returns:

        )rM  zOutput shape must be NCHW.Nr   r   r$   r%   )r   r  r)  r$  rN   r   r   r6  r   r,   r-   append)r`   r!   rN  r   rO  r  rP  intermediatestake_indices	max_index
num_stagesstagesr   r8  x_interr3   r3   r4   forward_intermediatesb  s*   



z'SwinTransformerV2.forward_intermediatesr   
prune_norm
prune_headc                 C   sJ   t t| j|\}}| jd|d  | _|rt | _|r#| dd |S )z@ Prune layers not required for specified intermediates.
        Nr   r   r  )r   r  r)  rL   r   r   rL  )r`   rN  rY  rZ  rS  rT  r3   r3   r4   prune_intermediate_layers  s   
z+SwinTransformerV2.prune_intermediate_layersc                 C   s"   |  |}| |}| |}|S )zForward pass through feature extraction layers.

        Args:
            x: Input tensor of shape (B, C, H, W).

        Returns:
            Feature tensor of shape (B, H', W', C).
        )r$  r)  r   r`   r!   r3   r3   r4   forward_features  s   
	

z"SwinTransformerV2.forward_features
pre_logitsc                 C   s   |r	| j |ddS |  |S )a  Forward pass through classification head.

        Args:
            x: Feature tensor of shape (B, H, W, C).
            pre_logits: If True, return features before final linear layer.

        Returns:
            Logits tensor of shape (B, num_classes) or pre-logits.
        T)r^  )r*  )r`   r!   r^  r3   r3   r4   forward_head  s   
zSwinTransformerV2.forward_headc                 C   s   |  |}| |}|S )zForward pass through the model.

        Args:
            x: Input tensor of shape (B, C, H, W).

        Returns:
            Logits tensor of shape (B, num_classes).
        )r]  r_  r\  r3   r3   r4   r     s   
	
zSwinTransformerV2.forward)NNNrc   NF)Tr   )NFFrM  F)r   FT)&r   r   r   r   rL   r   r   r   r   r   r   r   r
   r   rK   r   r,  r   r   rN   r   ignorer   rB  r   r   rE  rH  rJ  rL  r   r   rX  r[  r]  r_  r   r   r3   r3   ra   r4   r    |  s    

	


t
 	 
4

state_dictmodelc           	         s   |  d| } |  d| } d| v }i }ddl}|  D ]L\ }t fdddD r*qd	 v rQ|jjjj\}}}}|jd
 |ksF|jd |krQt|||fdddd}|sb|	ddd    
dd || < q|S )aM  Filter and process checkpoint state dict for loading.

    Handles resizing of patch embeddings and relative position tables
    when model size differs from checkpoint.

    Args:
        state_dict: Checkpoint state dictionary.
        model: Target model to load weights into.

    Returns:
        Filtered state dictionary.
    rc  rb  zhead.fc.weightr   Nc                    r:  r3   r3   )r   r=  r   r3   r4   r     r   z(checkpoint_filter_fn.<locals>.<listcomp>)rf   re   r   zpatch_embed.proj.weightr|   r(   bicubicT)interpolation	antialiasverbosezlayers.(\d+).downsamplec                 S   s   dt | dd  dS )Nr  r   z.downsample)r   group)r!   r3   r3   r4   <lambda>  s    z&checkpoint_filter_fn.<locals>.<lambda>zhead.zhead.fc.)getreitemsr   r$  r\   r{   r*   r   subreplace)	rb  rc  native_checkpointout_dictrl  r   r   r/   r0   r3   rd  r4   checkpoint_filter_fn  s.   
rr  Fvariant
pretrainedc                 K   sP   t dd t|ddD }|d|}tt| |fttd|dd|}|S )	zCreate a Swin Transformer V2 model.

    Args:
        variant: Model variant name.
        pretrained: If True, load pretrained weights.
        **kwargs: Additional model arguments.

    Returns:
        SwinTransformerV2 model instance.
    c                 s   s    | ]\}}|V  qd S r   r3   )r   r   r   r3   r3   r4   r     r   z._create_swin_transformer_v2.<locals>.<genexpr>r  )r   r   r   r   out_indicesT)flatten_sequentialru  )pretrained_filter_fnfeature_cfg)r   r6  rk  popr   r    rr  r(  )rs  rt  r-  default_out_indicesru  rc  r3   r3   r4   _create_swin_transformer_v2  s   
r{  r  c                 K   s"   | ddddddt tddd	d
|S )Nr  )r$      r|  )rc   rc   g?re  Tzpatch_embed.projzhead.fcmit)urlr  
input_size	pool_sizecrop_pctrf  fixed_input_sizemeanr3  
first_conv
classifierlicenser   )r~  r-  r3   r3   r4   _cfg  s   r  ztimm/z{https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_base_patch4_window12to16_192to256_22kto1k_ft.pth)	hf_hub_idr~  z{https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_base_patch4_window12to24_192to384_22kto1k_ft.pth)r$     r  )r  r  rd   )r  r~  r  r  r  z|https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_large_patch4_window12to16_192to256_22kto1k_ft.pthz|https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_large_patch4_window12to24_192to384_22kto1k_ft.pthzfhttps://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_tiny_patch4_window8_256.pthzghttps://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_tiny_patch4_window16_256.pthzghttps://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_small_patch4_window8_256.pthzhhttps://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_small_patch4_window16_256.pthzfhttps://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_base_patch4_window8_256.pthzghttps://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_base_patch4_window16_256.pthzkhttps://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_base_patch4_window12_192_22k.pthiQU  )r$      r  )r  r  )r  r~  r  r  r  zlhttps://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_large_patch4_window12_192_22k.pth)2swinv2_base_window12to16_192to256.ms_in22k_ft_in1k2swinv2_base_window12to24_192to384.ms_in22k_ft_in1k3swinv2_large_window12to16_192to256.ms_in22k_ft_in1k3swinv2_large_window12to24_192to384.ms_in22k_ft_in1kzswinv2_tiny_window8_256.ms_in1kz swinv2_tiny_window16_256.ms_in1kz swinv2_small_window8_256.ms_in1kz!swinv2_small_window16_256.ms_in1kzswinv2_base_window8_256.ms_in1kz swinv2_base_window16_256.ms_in1k!swinv2_base_window12_192.ms_in22k"swinv2_large_window12_192.ms_in22kc                 K   0   t ddddd}t	d	d| it |fi |S )
z"Swin-T V2 @ 256x256, window 16x16.r~   r  r  r  r"   r  r  r<   swinv2_tiny_window16_256rt  N)r  r(  r{  rt  r-  
model_argsr3   r3   r4   r  Q     r  c                 K   r  )
z Swin-T V2 @ 256x256, window 8x8.rc   r  r  r  r  swinv2_tiny_window8_256rt  N)r  r  r  r3   r3   r4   r  Y  r  r  c                 K   r  )
z"Swin-S V2 @ 256x256, window 16x16.r~   r  r%   r%      r%   r  r  swinv2_small_window16_256rt  N)r  r  r  r3   r3   r4   r  a  r  r  c                 K   r  )
z Swin-S V2 @ 256x256, window 8x8.rc   r  r  r  r  swinv2_small_window8_256rt  N)r  r  r  r3   r3   r4   r  i  r  r  c                 K   r  )
z"Swin-B V2 @ 256x256, window 16x16.r~      r  r&   rc   r~       r  swinv2_base_window16_256rt  N)r  r  r  r3   r3   r4   r  q  r  r  c                 K   r  )
z Swin-B V2 @ 256x256, window 8x8.rc   r  r  r  r  swinv2_base_window8_256rt  N)r  r  r  r3   r3   r4   r  y  r  r  c                 K   r  )
z"Swin-B V2 @ 192x192, window 12x12.r  r  r  r  r  swinv2_base_window12_192rt  N)r  r  r  r3   r3   r4   r    r  r  c                 K   2   t dddddd}t	d
d| it |fi |S )zQSwin-B V2 @ 192x192, trained at window 12x12, fine-tuned to 256x256 window 16x16.r~   r  r  r  r  r  r  r  r"   r  r  r<   r  !swinv2_base_window12to16_192to256rt  N)r  r  r  r3   r3   r4   r       r  c                 K   r  )zQSwin-B V2 @ 192x192, trained at window 12x12, fine-tuned to 384x384 window 24x24.r  r  r  r  r  r  !swinv2_base_window12to24_192to384rt  N)r  r  r  r3   r3   r4   r    r  r  c                 K   r  )
z"Swin-L V2 @ 192x192, window 12x12.r  r  r  r  r  r  0   r  swinv2_large_window12_192rt  N)r  r  r  r3   r3   r4   r    r  r  c                 K   r  )zQSwin-L V2 @ 192x192, trained at window 12x12, fine-tuned to 256x256 window 16x16.r~   r  r  r  r  r  "swinv2_large_window12to16_192to256rt  N)r  r  r  r3   r3   r4   r    r  r  c                 K   r  )zQSwin-L V2 @ 192x192, trained at window 12x12, fine-tuned to 384x384 window 24x24.r  r  r  r  r  r  "swinv2_large_window12to24_192to384rt  N)r  r  r  r3   r3   r4   r    r  r  r  r  r  r  r  r  )swinv2_base_window12_192_22k)swinv2_base_window12to16_192to256_22kft1k)swinv2_base_window12to24_192to384_22kft1kswinv2_large_window12_192_22k*swinv2_large_window12to16_192to256_22kft1k*swinv2_large_window12to24_192to384_22kft1kr`  )r  )Kr   ro   typingr   r   r   r   r   r   r   r	   r
   rN   torch.nnrL   torch.nn.functionalr   r   	timm.datar   r   timm.layersr   r   r   r   r   r   r   r   r   r   _builderr   	_featuresr   _features_fxr   _manipulater   	_registryr   r   r   __all__r   r   r   r5   r7   r   r8   r   r   r   r    r   rr  r   r{  r  default_cfgsr  r  r  r  r  r  r  r  r  r  r  r  r   r3   r3   r3   r4   <module>   s    ,0"0  h&   ,R*
:				