o
    پi                     @   s*  d Z ddlZddlZddlmZmZmZmZmZm	Z	m
Z
mZ ddlZddlmZ ddlmZmZ ddlmZmZmZmZmZmZmZmZmZmZmZ ddlmZ ddl m!Z! dd	l"m#Z# dd
l$m%Z%m&Z& ddl'm(Z(m)Z)m*Z* ddl+m,Z, dgZ-e.e/Z0ee1e
e1e1f f Z2dej3de
e1e1f dej3fddZ4e#dej3de
e1e1f de1de1dej3f
ddZ5de1de1dej3fddZ6G dd dej7Z8G dd dej7Z9G d d! d!ej7Z:G d"d# d#ej7Z;G d$d dej7Z<d%e=d&ej7dee>ej3f fd'd(Z?dsd*e>d+e@de<fd,d-ZAdtd/e>dee>ef fd0d1ZBe(i d2eBd3d4d5d6eBd3d7d5d8eBd3d9d:d;d<d=d>eBd3d?d5d@eBd3dAd:d;d<d=dBeBd3dCd5dDeBd3dEd5dFeBd3dGd5dHeBd3dId:d;d<d=dJeBd3dKd5dLeBd3dMdNdOdPeBd3dQdNdOdReBd3dSdNdOdTeBd3dUd:d;d<dNdVdWeBd3dXdNdOdYeBd3dZd:d;d<dNdVd[eBd3d\d5eBd3d]d5eBd3d^d5d_ZCe)dsde<fd`daZDe)dsde<fdbdcZEe)dsde<fdddeZFe)dsde<fdfdgZGe)dsde<fdhdiZHe)dsde<fdjdkZIe)dsde<fdldmZJe)dsde<fdndoZKe)dsde<fdpdqZLe*e/dRdTdWdYdr dS )ua   Swin Transformer
A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows`
    - https://arxiv.org/pdf/2103.14030

Code/weights from https://github.com/microsoft/Swin-Transformer, original copyright/license info below

S3 (AutoFormerV2, https://arxiv.org/abs/2111.14725) Swin weights from
    - https://github.com/microsoft/Cream/tree/main/AutoFormerV2

Modifications and additions for timm hacked together by / Copyright 2021, Ross Wightman
    N)AnyDictCallableListOptionalSetTupleUnionIMAGENET_DEFAULT_MEANIMAGENET_DEFAULT_STD)
PatchEmbedMlpDropPathClassifierHead	to_2tuple	to_ntupletrunc_normal_use_fused_attnresize_rel_pos_bias_tableresample_patch_embedndgrid   )build_model_with_cfg)feature_take_indices)register_notrace_function)checkpoint_seqnamed_apply)generate_default_cfgsregister_modelregister_model_deprecations)get_init_weights_vitSwinTransformerxwindow_sizereturnc                 C   sj   | j \}}}}| |||d  |d ||d  |d |} | dddddd d|d |d |}|S )zPartition into non-overlapping windows.

    Args:
        x: Input tokens with shape [B, H, W, C].
        window_size: Window size.

    Returns:
        Windows after partition with shape [B * num_windows, window_size, window_size, C].
    r   r               shapeviewpermute
contiguous)r#   r$   BHWCwindows r5   P/home/ubuntu/.local/lib/python3.10/site-packages/timm/models/swin_transformer.pywindow_partition*   s   ,,r7   r4   r1   r2   c                 C   s^   | j d }| d||d  ||d  |d |d |}|dddddd d|||}|S )a
  Reverse window partition.

    Args:
        windows: Windows with shape (num_windows*B, window_size, window_size, C).
        window_size: Window size.
        H: Height of image.
        W: Width of image.

    Returns:
        Tensor with shape (B, H, W, C).
    r*   r   r   r&   r'   r(   r)   r+   )r4   r$   r1   r2   r3   r#   r5   r5   r6   window_reverse=   s   
,$r8   win_hwin_wc                 C   s   t tt | t |}t |d}|dddddf |dddddf  }|ddd }|dddddf  | d 7  < |dddddf  |d 7  < |dddddf  d| d 9  < |dS )zGet pair-wise relative position index for each token inside the window.

    Args:
        win_h: Window height.
        win_w: Window width.

    Returns:
        Relative position index tensor.
    r   Nr'   r   r*   )torchstackr   arangeflattenr.   r/   sum)r9   r:   coordscoords_flattenrelative_coordsr5   r5   r6   get_relative_position_indexP   s   ,""&
rC   c                       s   e Zd ZU dZejje ed< 					dde	de	d	e
e	 d
edededef fddZd
ee	e	f ddfddZdejfddZddejde
ej dejfddZ  ZS )WindowAttentionzWindow based multi-head self attention (W-MSA) module with relative position bias.

    Supports both shifted and non-shifted windows.
    
fused_attnN   T        dim	num_headshead_dimr$   qkv_bias	attn_drop	proj_dropc                    s   t    || _t|| _| j\}}	||	 | _|| _|p|| }|| }
|d | _tdd| _	t
td| d d|	 d  || _| jdt||	dd t
j||
d	 |d
| _t
|| _t
|
|| _t
|| _t| jdd t
jdd| _dS )a  
        Args:
            dim: Number of input channels.
            num_heads: Number of attention heads.
            head_dim: Number of channels per head (dim // num_heads if not set)
            window_size: The height and width of the window.
            qkv_bias:  If True, add a learnable bias to query, key, value.
            attn_drop: Dropout ratio of attention weight.
            proj_drop: Dropout ratio of output.
        g      T)experimentalr'   r   relative_position_indexF
persistentr&   biasg{Gz?)stdr*   )rH   N)super__init__rH   r   r$   window_arearI   scaler   rE   nn	Parameterr;   zerosrelative_position_bias_tableregister_bufferrC   LinearqkvDropoutrL   projrM   r   Softmaxsoftmax)selfrH   rI   rJ   r$   rK   rL   rM   r9   r:   attn_dim	__class__r5   r6   rV   l   s$   




(zWindowAttention.__init__r%   c                 C   s   t |}|| jkrdS || _| j\}}|| | _t 0 d| d d| d  | jf}tt| j	| j|d| _	| j
dt||dd W d   dS 1 sOw   Y  dS )zzUpdate window size & interpolate position embeddings
        Args:
            window_size (int): New window size
        Nr'   r   new_window_sizenew_bias_shaperO   FrP   )r   r$   rW   r;   no_gradrI   rY   rZ   r   r\   r]   rC   )rd   r$   r9   r:   rj   r5   r5   r6   set_window_size   s"   



"zWindowAttention.set_window_sizec                 C   s<   | j | jd | j| jd}|ddd }|dS )Nr*   r'   r   r   )r\   rO   r-   rW   r.   r/   	unsqueeze)rd   relative_position_biasr5   r5   r6   _get_rel_pos_bias   s   

z!WindowAttention._get_rel_pos_biasr#   maskc                 C   s  |j \}}}| |||d| jdddddd}|d\}}}	| jrd|  }
|durP|j d }|d|d||	|| d| jdd}|
|d| j|| }
t
jjj|||	|
| jr_| jjndd	}nE|| j }||d
d }||   }|dur|j d }|d|| j|||dd }|d| j||}| |}| |}||	 }|dd||d}| |}| |}|S )a  Forward pass.

        Args:
            x: Input features with shape of (num_windows*B, N, C).
            mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None.

        Returns:
            Output features with shape of (num_windows*B, N, C).
        r&   r*   r'   r   r   r(   NrG   )	attn_mask	dropout_p)r,   r_   reshaperI   r.   unbindrE   ro   r-   expandr;   rY   
functionalscaled_dot_product_attentiontrainingrL   prX   	transposerm   rc   ra   rM   )rd   r#   rp   B_Nr3   r_   qkvrq   num_winattnr5   r5   r6   forward   s8   
(
&

$



zWindowAttention.forward)NrF   TrG   rG   N)__name__
__module____qualname____doc__r;   jitFinalbool__annotations__intr   _int_or_tuple_2_tfloatrV   r   rl   Tensorro   r   __classcell__r5   r5   rf   r6   rD   e   s4   
 -*rD   c                       sD  e Zd ZdZddddddddd	d	d	ejejfd
ededede	e dedede
de
dede
dededededef fddZd*de	ej de	ej fddZ	d*deeeeef f d e	eeeeef f  deeeef eeef f fd!d"Z	d*d#eeef deeef de	e
 fd$d%Zd&d' Zdejdejfd(d)Z  ZS )+SwinTransformerBlockzkSwin Transformer Block.

    A transformer block with window-based self-attention and shifted windows.
    r(   NrF   r   F      @TrG   rH   input_resolutionrI   rJ   r$   
shift_sizealways_partitiondynamic_mask	mlp_ratiorK   rM   rL   	drop_path	act_layer
norm_layerc              	      s   t    || _|| _t|| _|| _|| _| ||\| _	| _
| j	d | j	d  | _|	| _||| _t|||| j	|
||d| _|dkrHt|nt | _||| _t|t||	 ||d| _|dkrgt|nt | _| jd| jrtdn|  dd	 dS )
a  
        Args:
            dim: Number of input channels.
            input_resolution: Input resolution.
            window_size: Window size.
            num_heads: Number of attention heads.
            head_dim: Enforce the number of channels per head
            shift_size: Shift size for SW-MSA.
            always_partition: Always partition into full windows and shift
            mlp_ratio: Ratio of mlp hidden dim to embedding dim.
            qkv_bias: If True, add a learnable bias to query, key, value.
            proj_drop: Dropout rate.
            attn_drop: Attention dropout rate.
            drop_path: Stochastic depth rate.
            act_layer: Activation layer.
            norm_layer: Normalization layer.
        r   r   )rI   rJ   r$   rK   rL   rM   rG   )in_featureshidden_featuresr   droprq   NFrP   )rU   rV   rH   r   r   target_shift_sizer   r   _calc_window_shiftr$   r   rW   r   norm1rD   r   r   rY   Identity
drop_path1norm2r   r   mlp
drop_path2r]   get_attn_mask)rd   rH   r   rI   rJ   r$   r   r   r   r   rK   rM   rL   r   r   r   rf   r5   r6   rV      sB   
#

	


zSwinTransformerBlock.__init__r#   r%   c              	   C   s  t | jr|d ur|jd |jd }}|j}|j}n	| j\}}d }d }t|| jd  | jd  }t|| jd  | jd  }t	j
d||df||d}d}d| jd  f| jd  | jd  f| jd  d ffD ]<}d| jd  f| jd  | jd  f| jd  d ffD ]}	||d d |d |d |	d |	d d d f< |d7 }qqkt|| j}
|
d| j}
|
d|
d }||dktd|dktd}|S d }|S )Nr   r'   r   )dtypedevicer*   g      YrG   )anyr   r,   r   r   r   mathceilr$   r;   r[   r7   r-   rW   rm   masked_fillr   )rd   r#   r1   r2   r   r   img_maskcnthwmask_windowsrq   r5   r5   r6   r   -  s<   

0
$z"SwinTransformerBlock.get_attn_masktarget_window_sizer   c                 C   s   t |}|d u r| j}t|r|d d |d d f}nt |}| jr'||fS dd t| j|D }dd t| j||D }t|t|fS )Nr   r'   r   c                 S   s    g | ]\}}||kr|n|qS r5   r5   ).0rr   r5   r5   r6   
<listcomp>a  s     z;SwinTransformerBlock._calc_window_shift.<locals>.<listcomp>c                 S   s"   g | ]\}}}||krd n|qS r   r5   )r   r   r   sr5   r5   r6   r   b  s   " )r   r   r   r   zipr   tuple)rd   r   r   r$   r   r5   r5   r6   r   P  s   z'SwinTransformerBlock._calc_window_shift	feat_sizec                 C   sl   || _ |dur
|| _| |\| _| _| jd | jd  | _| j| j | jd| j	r-dn| 
 dd dS )z
        Args:
            feat_size: New input resolution
            window_size: New window size
            always_partition: Change always_partition attribute if not None
        Nr   r   rq   FrP   )r   r   r   r$   r   rW   r   rl   r]   r   r   )rd   r   r$   r   r5   r5   r6   set_input_sizee  s   
z#SwinTransformerBlock.set_input_sizec              	   C   sn  |j \}}}}t| j}|r!tj|| jd  | jd  fdd}n|}| jd || jd   | jd  }| jd || jd   | jd  }	tjj|ddd|	d|f}|j \}
}}}
t	|| j}|
d| j|}t| ddrt| |}n| j}| j||d}|
d| jd | jd |}t|| j||}|d d d |d |d d f  }|rtj|| jdd}|S |}|S )	Nr   r   )r   r'   )shiftsdimsr*   r   F)rp   )r,   r   r   r;   rollr$   rY   rw   padr7   r-   rW   getattrr   rq   r   r8   r/   )rd   r#   r0   r1   r2   r3   	has_shift	shifted_xpad_hpad_w_HpWp	x_windowsrq   attn_windowsr5   r5   r6   _attn}  s.   
&""$zSwinTransformerBlock._attnc                 C   sd   |j \}}}}|| | | | }||d|}|| | | | }|||||}|S )zForward pass.

        Args:
            x: Input features with shape (B, H, W, C).

        Returns:
            Output features with shape (B, H, W, C).
        r*   )r,   r   r   r   rt   r   r   r   )rd   r#   r0   r1   r2   r3   r5   r5   r6   r     s   	zSwinTransformerBlock.forwardr   )r   r   r   r   rY   GELU	LayerNormr   r   r   r   r   r   rV   r;   r   r   r	   r   r   r   r   r   r   r5   r5   rf   r6   r      s    		
 H&



'r   c                       sP   e Zd ZdZdejfdedee def fddZ	de
jd	e
jfd
dZ  ZS )PatchMergingzVPatch Merging Layer.

    Downsample features by merging 2x2 neighboring patches.
    NrH   out_dimr   c                    sH   t    || _|pd| | _|d| | _tjd| | jdd| _dS )z
        Args:
            dim: Number of input channels.
            out_dim: Number of output channels (or 2 * dim if None)
            norm_layer: Normalization layer.
        r'   r(   FrR   N)rU   rV   rH   r   normrY   r^   	reduction)rd   rH   r   r   rf   r5   r6   rV     s
   
zPatchMerging.__init__r#   r%   c                 C   s   |j \}}}}ddd|d d|d f}tj||}|j \}}}}|||d d|d d|ddddddd}| |}| |}|S )zForward pass.

        Args:
            x: Input features with shape (B, H, W, C).

        Returns:
            Output features with shape (B, H//2, W//2, out_dim).
        r   r'   r   r&   r(   r)   )	r,   rY   rw   r   rt   r.   r>   r   r   )rd   r#   r0   r1   r2   r3   
pad_valuesr   r5   r5   r6   r     s   	2

zPatchMerging.forward)r   r   r   r   rY   r   r   r   r   rV   r;   r   r   r   r5   r5   rf   r6   r     s    r   c                !       s   e Zd ZdZdddddddddddejfd	ed
edeeef dededede	e de
dededededededeee ef def  fddZ	d"deeef dede	e fddZdejdejfd d!Z  ZS )#SwinTransformerStagez|A basic Swin Transformer layer for one stage.

    Contains multiple Swin Transformer blocks and optional downsampling.
    Tr(   NrF   Fr   rG   rH   r   r   depth
downsamplerI   rJ   r$   r   r   r   rK   rM   rL   r   r   c                    s   t    |_|_|rtdd |D n|_|_d_ttdd D |r7t	|d_
n|ks=J t _
tj 	
fddt|D  _dS )	a  
        Args:
            dim: Number of input channels.
            out_dim: Number of output channels.
            input_resolution: Input resolution.
            depth: Number of blocks.
            downsample: Downsample layer at the end of the layer.
            num_heads: Number of attention heads.
            head_dim: Channels per head (dim // num_heads if not set)
            window_size: Local window size.
            mlp_ratio: Ratio of mlp hidden dim to embedding dim.
            qkv_bias: If True, add a learnable bias to query, key, value.
            proj_drop: Projection dropout rate.
            attn_drop: Attention dropout rate.
            drop_path: Stochastic depth rate.
            norm_layer: Normalization layer.
        c                 s       | ]}|d  V  qdS r'   Nr5   r   ir5   r5   r6   	<genexpr>      z0SwinTransformerStage.__init__.<locals>.<genexpr>Fc                 S   s   g | ]}|d  qS r'   r5   )r   r   r5   r5   r6   r         z1SwinTransformerStage.__init__.<locals>.<listcomp>)rH   r   r   c                    sT   g | ]&}t j|d  dkrdn 
	ttr#| ndqS )r'   r   )rH   r   rI   rJ   r$   r   r   r   r   rK   rM   rL   r   r   )r   output_resolution
isinstancelistr   r   rL   r   r   rJ   r   r   rI   r   rM   rK   rd   r   r$   r5   r6   r   !  s&    N)rU   rV   rH   r   r   r   r   grad_checkpointingr   r   r   rY   r   
Sequentialrangeblocks)rd   rH   r   r   r   r   rI   rJ   r$   r   r   r   rK   rM   rL   r   r   rf   r   r6   rV     s&   
$

(zSwinTransformerStage.__init__r   c                 C   sR   || _ t| jtjr|| _n
tdd |D | _| jD ]}|j| j||d qdS )a   Updates the resolution, window size and so the pair-wise relative positions.

        Args:
            feat_size: New input (feature) resolution
            window_size: New window size
            always_partition: Always partition / shift the window
        c                 s   r   r   r5   r   r5   r5   r6   r   E  r   z6SwinTransformerStage.set_input_size.<locals>.<genexpr>r   r$   r   N)	r   r   r   rY   r   r   r   r   r   )rd   r   r$   r   blockr5   r5   r6   r   4  s   
z#SwinTransformerStage.set_input_sizer#   r%   c                 C   s8   |  |}| jrtj st| j|}|S | |}|S )zsForward pass.

        Args:
            x: Input features.

        Returns:
            Output features.
        )r   r   r;   r   is_scriptingr   r   rd   r#   r5   r5   r6   r   M  s   
	
zSwinTransformerStage.forwardr   )r   r   r   r   rY   r   r   r   r   r   r   r   r	   r   r   rV   r   r;   r   r   r   r5   r5   rf   r6   r     sp    
	
P

r   c                +       s  e Zd ZdZdddddddd	d
dddddddddeejdfdedededede	dede
edf de
edf dee dedededed ed!ed"ed#ed$ed%ed&ee	ef d'e	f* fd(d)ZejjdSd*e	d+d
fd,d-Zejjd+ee	 fd.d/Z	
	
	
	0	
dTdee
eef  dee
eef  dee
eef  d1edee d+d
fd2d3ZejjdUd4ed+ee	ef fd5d6ZejjdVd7ed+d
fd8d9Zejjd+ejfd:d;ZdWdedee	 d+d
fd<d=Z	
			>	dXd?ejd@eeee e f  dAedBedCe	dDed+ee ej e
eje ej f f fdEdFZ!	G		dYd@eee e f dHedIed+e e fdJdKZ"d?ejd+ejfdLdMZ#dUd?ejdNed+ejfdOdPZ$d?ejd+ejfdQdRZ%  Z&S )Zr"   zSwin Transformer.

    A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows`  -
          https://arxiv.org/pdf/2103.14030
       r(   r&     avg`   r'   r'      r'   r&   r         NrF   FTr   rG   g? img_size
patch_sizein_chansnum_classesglobal_pool	embed_dimdepths.rI   rJ   r$   r   strict_img_sizer   rK   	drop_rateproj_drop_rateattn_drop_ratedrop_path_rateembed_layerr   weight_initc              	      s|  t    |dv sJ || _|| _d| _t|| _ | _t d| jd    | _	| _
g | _t ttfsA fddt| jD  |||| d ||dd| _| jj}t| j|	}	t|
ttfsit| j|
}
nt|
dkru|
f| j }
t|
| jks~J t| j|}d	d td|t||D }g } d }d}t| jD ]t} | }|tdi d
|d|d|d | |d | fd|| d|dkd|| d|	| d|
| d|d| d|| d|d|d|d|| d|g7 }|}|dkr|d9 }|  jt||| d| dg7  _qtj| | _|| j	| _t| j	|||| jd| _|dkr<| | dS dS ) a~  
        Args:
            img_size: Input image size.
            patch_size: Patch size.
            in_chans: Number of input image channels.
            num_classes: Number of classes for classification head.
            embed_dim: Patch embedding dimension.
            depths: Depth of each Swin Transformer layer.
            num_heads: Number of attention heads in different layers.
            head_dim: Dimension of self-attention heads.
            window_size: Window size.
            mlp_ratio: Ratio of mlp hidden dim to embedding dim.
            qkv_bias: If True, add a learnable bias to query, key, value.
            drop_rate: Dropout rate.
            attn_drop_rate (float): Attention dropout rate.
            drop_path_rate (float): Stochastic depth rate.
            embed_layer: Patch embedding layer.
            norm_layer (nn.Module): Normalization layer.
        )r   r   NHWCr'   r   c                    s   g | ]
}t  d |  qS r   )r   r   r   r5   r6   r     s    z,SwinTransformer.__init__.<locals>.<listcomp>r   )r   r   r   r   r   r   
output_fmtc                 S   s   g | ]}|  qS r5   )tolist)r   r#   r5   r5   r6   r     r   rH   r   r   r   r   rI   rJ   r$   r   r   r   rK   rM   rL   r   r   layers.)num_chsr   module)	pool_typer   	input_fmtskipNr5   ) rU   rV   r   r   r  len
num_layersr   r   num_featureshead_hidden_sizefeature_infor   r   r   r   patch_embed	grid_sizer   r;   linspacer?   splitr   dictrY   r   layersr   r   headinit_weights)rd   r   r   r   r   r   r   r   rI   rJ   r$   r   r   r   rK   r   r   r   r   r   r   r  kwargs
patch_griddprr  in_dimrX   r   r   rf   r  r6   rV   f  s   
,
	"


	

&
zSwinTransformer.__init__moder%   c                 C   s<   |dv sJ d|v rt | j nd}tt||d|  dS )zInitialize model weights.

        Args:
            mode: Weight initialization mode ('jax', 'jax_nlhb', 'moco', or '').
        )jaxjax_nlhbmocor   nlhbrG   )	head_biasN)r   logr   r   r!   )rd   r  r"  r5   r5   r6   r    s   zSwinTransformer.init_weightsc                 C   s.   t  }|  D ]\}}d|v r|| q|S )z,Parameters that should not use weight decay.r\   )setnamed_parametersadd)rd   nwdnr   r5   r5   r6   no_weight_decay  s   
zSwinTransformer.no_weight_decay   window_ratioc           
         s   |dus|dur| j j||d | j j}|du r#t fdd|D }t| jD ]\}}dt|d d }	|j|d |	 |d |	 f||d q(dS )	a  Update the image resolution and window size.

        Args:
            img_size: New input resolution, if None current resolution is used.
            patch_size: New patch size, if None use current patch size.
            window_size: New window size, if None based on new_img_size // window_div.
            window_ratio: Divisor for calculating window size from grid size.
            always_partition: Always partition into windows and shift (even if window size < feat size).
        N)r   r   c                    s   g | ]}|  qS r5   r5   )r   pgr+  r5   r6   r   
  r   z2SwinTransformer.set_input_size.<locals>.<listcomp>r'   r   r   r   )r  r   r  r   	enumerater  max)
rd   r   r   r$   r+  r   r  indexstagestage_scaler5   r-  r6   r     s   zSwinTransformer.set_input_sizecoarsec                 C   s   t d|rddS g ddS )z"Group parameters for optimization.z^patch_embedz^layers\.(\d+)))z^layers\.(\d+).downsampler   )z^layers\.(\d+)\.\w+\.(\d+)N)z^norm)i )stemr   )r  )rd   r3  r5   r5   r6   group_matcher  s   zSwinTransformer.group_matcherenablec                 C   s   | j D ]}||_qdS )z)Enable or disable gradient checkpointing.N)r  r   )rd   r6  lr5   r5   r6   set_grad_checkpointing   s   
z&SwinTransformer.set_grad_checkpointingc                 C   s   | j jS )zGet the classifier head.)r  fc)rd   r5   r5   r6   get_classifier&  s   zSwinTransformer.get_classifierc                 C   s   || _ | jj||d dS )zReset the classifier head.

        Args:
            num_classes: Number of classes for new classifier.
            global_pool: Global pooling type.
        )r	  N)r   r  reset)rd   r   r   r5   r5   r6   reset_classifier+  s   z SwinTransformer.reset_classifierNCHWr#   indicesr   
stop_earlyr  intermediates_onlyc                 C   s   |dv sJ dg }t t| j|\}}	| |}t| j}
tj s%|s)| j}n	| jd|	d  }t|D ]+\}}||}||v ra|rP||
d krP| |}n|}|	dddd
 }|| q6|rf|S | |}||fS )aK  Forward features that returns intermediates.

        Args:
            x: Input image tensor.
            indices: Take last n blocks if int, all if None, select matching indices if sequence.
            norm: Apply norm layer to compatible intermediates.
            stop_early: Stop iterating over blocks when last desired intermediate hit.
            output_fmt: Shape of intermediate feature outputs.
            intermediates_only: Only return intermediate features.

        Returns:
            List of intermediate features or tuple of (final features, intermediates).
        )r=  zOutput shape must be NCHW.Nr   r   r&   r'   )r   r  r  r  r;   r   r   r.  r   r.   r/   append)rd   r#   r>  r   r?  r  r@  intermediatestake_indices	max_index
num_stagesstagesr   r1  x_interr5   r5   r6   forward_intermediates5  s*   



z%SwinTransformer.forward_intermediatesr   
prune_norm
prune_headc                 C   sJ   t t| j|\}}| jd|d  | _|rt | _|r#| dd |S )aE  Prune layers not required for specified intermediates.

        Args:
            indices: Indices of intermediate layers to keep.
            prune_norm: Whether to prune normalization layer.
            prune_head: Whether to prune the classifier head.

        Returns:
            List of indices that were kept.
        Nr   r   r   )r   r  r  rY   r   r   r<  )rd   r>  rI  rJ  rC  rD  r5   r5   r6   prune_intermediate_layersh  s   
z)SwinTransformer.prune_intermediate_layersc                 C   s"   |  |}| |}| |}|S )z/Forward pass through feature extraction layers.)r  r  r   r   r5   r5   r6   forward_features  s   


z SwinTransformer.forward_features
pre_logitsc                 C   s   |r	| j |ddS |  |S )zForward pass through classifier head.

        Args:
            x: Feature tensor.
            pre_logits: Return features before final classifier.

        Returns:
            Output tensor.
        T)rM  )r  )rd   r#   rM  r5   r5   r6   forward_head  s   
zSwinTransformer.forward_headc                 C   s   |  |}| |}|S )zoForward pass.

        Args:
            x: Input tensor.

        Returns:
            Output logits.
        )rL  rN  r   r5   r5   r6   r     s   
	
zSwinTransformer.forwardr   )NNNr*  NF)Tr   )NFFr=  F)r   FT)'r   r   r   r   r   rY   r   r   r   strr   r   r   r   r   r	   rV   r;   r   ignorer  r   r)  r   r   r   r5  r8  Moduler:  r<  r   r   rH  rK  rL  rN  r   r   r5   r5   rf   r6   r"   _  s   

	

z


  
5

state_dictmodelc           
         s,  d}d| v rd}ddl }i }| d| } | d| } |  D ]u\ }t fdd	d
D r.qd v rU|jjjj\}}}}|jd |ksJ|jd |krUt|||fdddd} 	dr~|
 dd }	|j|	jjkst|	jd |	jd kr~t||	j|	jjd}|r|ddd    dd || < q|S )zConvert patch embedding weight from manual patchify + linear proj to conv.

    Args:
        state_dict: State dictionary from checkpoint.
        model: Model instance.

    Returns:
        Filtered state dictionary.
    Tzhead.fc.weightFr   NrU  rT  c                    s   g | ]}| v qS r5   r5   )r   r(  r   r5   r6   r     r   z(checkpoint_filter_fn.<locals>.<listcomp>)rO   rq   zpatch_embed.proj.weightrs   r*   bicubic)interpolation	antialiasverboser\   ir   rh   zlayers.(\d+).downsamplec                 S   s   dt | dd  dS )Nr  r   z.downsample)r   group)r#   r5   r5   r6   <lambda>  s    z&checkpoint_filter_fn.<locals>.<lambda>zhead.zhead.fc.)regetitemsr   r  ra   weightr,   r   endswithget_submoduler\   r$   r   subreplace)
rT  rU  old_weightsr]  out_dictr   r   r1   r2   mr5   rV  r6   checkpoint_filter_fn  sB   

"
rh  Fvariant
pretrainedc                 K   sP   t dd t|ddD }|d|}tt| |fttd|dd|}|S )	zCreate a Swin Transformer model.

    Args:
        variant: Model variant name.
        pretrained: Load pretrained weights.
        **kwargs: Additional model arguments.

    Returns:
        SwinTransformer model instance.
    c                 s   s    | ]\}}|V  qd S r   r5   )r   r   r   r5   r5   r6   r     r   z+_create_swin_transformer.<locals>.<genexpr>r   )r   r   r&   r   out_indicesT)flatten_sequentialrk  )pretrained_filter_fnfeature_cfg)r   r.  r^  popr   r"   rh  r  )ri  rj  r  default_out_indicesrk  rU  r5   r5   r6   _create_swin_transformer  s   
rq  r   urlc                 K   s"   | ddddddt tddd	d
|S )z9Create default configuration for Swin Transformer models.r   )r&   r   r   )rF   rF   g?rW  Tzpatch_embed.projzhead.fcmit)rr  r   
input_size	pool_sizecrop_pctrX  fixed_input_sizemeanrT   
first_conv
classifierlicenser
   )rr  r  r5   r5   r6   _cfg  s   r|  z.swin_small_patch4_window7_224.ms_in22k_ft_in1kztimm/zvhttps://github.com/SwinTransformer/storage/releases/download/v1.0.8/swin_small_patch4_window7_224_22kto1k_finetune.pth)	hf_hub_idrr  z-swin_base_patch4_window7_224.ms_in22k_ft_in1kzlhttps://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window7_224_22kto1k.pthz.swin_base_patch4_window12_384.ms_in22k_ft_in1kzmhttps://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384_22kto1k.pth)r&     r~  )r   r   g      ?)r}  rr  rt  ru  rv  z.swin_large_patch4_window7_224.ms_in22k_ft_in1kzmhttps://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window7_224_22kto1k.pthz/swin_large_patch4_window12_384.ms_in22k_ft_in1kznhttps://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22kto1k.pthz$swin_tiny_patch4_window7_224.ms_in1kzdhttps://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pthz%swin_small_patch4_window7_224.ms_in1kzehttps://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_small_patch4_window7_224.pthz$swin_base_patch4_window7_224.ms_in1kzdhttps://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window7_224.pthz%swin_base_patch4_window12_384.ms_in1kzehttps://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384.pthz-swin_tiny_patch4_window7_224.ms_in22k_ft_in1kzuhttps://github.com/SwinTransformer/storage/releases/download/v1.0.8/swin_tiny_patch4_window7_224_22kto1k_finetune.pthz%swin_tiny_patch4_window7_224.ms_in22kzhhttps://github.com/SwinTransformer/storage/releases/download/v1.0.8/swin_tiny_patch4_window7_224_22k.pthiQU  )r}  rr  r   z&swin_small_patch4_window7_224.ms_in22kzihttps://github.com/SwinTransformer/storage/releases/download/v1.0.8/swin_small_patch4_window7_224_22k.pthz%swin_base_patch4_window7_224.ms_in22kzhhttps://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window7_224_22k.pthz&swin_base_patch4_window12_384.ms_in22kzihttps://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384_22k.pth)r}  rr  rt  ru  rv  r   z&swin_large_patch4_window7_224.ms_in22kzihttps://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window7_224_22k.pthz'swin_large_patch4_window12_384.ms_in22kzjhttps://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pthzswin_s3_tiny_224.ms_in1kzbhttps://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/s3_t-1d53f6a8.pthzbhttps://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/s3_s-3bb4c69d.pthzbhttps://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/s3_b-a1e95db4.pth)zswin_s3_small_224.ms_in1kzswin_s3_base_224.ms_in1kc                 K   2   t dddddd}t	d
d| it |fi |S )z+ Swin-T @ 224x224, trained ImageNet-1k
    r(   rF   r   r   r   r   r$   r   r   rI   swin_tiny_patch4_window7_224rj  N)r  r  rq  rj  r  
model_argsr5   r5   r6   r  @     r  c                 K   r  )z Swin-S @ 224x224
    r(   rF   r   r'   r'      r'   r   r  swin_small_patch4_window7_224rj  N)r  r  r  r5   r5   r6   r  I  r  r  c                 K   r  )z Swin-B @ 224x224
    r(   rF      r  r(   r*         r  swin_base_patch4_window7_224rj  N)r  r  r  r5   r5   r6   r  R  r  r  c                 K   r  )z Swin-B @ 384x384
    r(   r   r  r  r  r  swin_base_patch4_window12_384rj  N)r  r  r  r5   r5   r6   r  [  r  r  c                 K   r  )z Swin-L @ 224x224
    r(   rF      r  r   r   r   0   r  swin_large_patch4_window7_224rj  N)r  r  r  r5   r5   r6   r  d  r  r  c                 K   r  )z Swin-L @ 384x384
    r(   r   r  r  r  r  swin_large_patch4_window12_384rj  N)r  r  r  r5   r5   r6   r  m  r  r  c                 K   0   t dddddd}td
d| it |fi |S )z; Swin-S3-T @ 224x224, https://arxiv.org/abs/2111.14725
    r(   rF   rF      rF   r   r   r   r  swin_s3_tiny_224rj  N)r  r  r  r5   r5   r6   r  v     
r  c                 K   r  )z; Swin-S3-S @ 224x224, https://arxiv.org/abs/2111.14725
    r(   )r  r  r  rF   r   r  r   r  swin_s3_small_224rj  N)r  r  r  r5   r5   r6   r    r  r  c                 K   r  )z; Swin-S3-B @ 224x224, https://arxiv.org/abs/2111.14725
    r(   r  r   )r'   r'      r'   r   r  swin_s3_base_224rj  N)r  r  r  r5   r5   r6   r    r  r  )"swin_base_patch4_window7_224_in22k#swin_base_patch4_window12_384_in22k#swin_large_patch4_window7_224_in22k$swin_large_patch4_window12_384_in22krP  rO  )Mr   loggingr   typingr   r   r   r   r   r   r   r	   r;   torch.nnrY   	timm.datar   r   timm.layersr   r   r   r   r   r   r   r   r   r   r   _builderr   	_featuresr   _features_fxr   _manipulater   r   	_registryr   r   r    vision_transformerr!   __all__	getLoggerr   _loggerr   r   r   r7   r8   rC   rS  rD   r   r   r   r"   r  rQ  rh  r   rq  r|  default_cfgsr  r  r  r  r  r  r  r  r  r5   r5   r5   r6   <module>   sL   (4


,z W-}  "D1"&*.26:?B
K