o
    	۷i                  	   @   s  d Z ddlZddlZddlZddlmZ ddlmZm	Z	 ddl
Z
ddl
mZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZmZmZmZ ddlmZ ddlm Z  e!e"Z#eeddG dd deZ$eeddG dd deZ%eeddG dd deZ&eeddG dd deZ'dd Z(dd  Z)G d!d" d"ej*Z+G d#d$ d$ej*Z,G d%d& d&ej*Z-dOd)e
j.d*e/d+e0d,e
j.fd-d.Z1G d/d0 d0ej*Z2G d1d2 d2ej*Z3G d3d4 d4ej*Z4G d5d6 d6ej*Z5G d7d8 d8ej*Z6G d9d: d:ej*Z7G d;d< d<ej*Z8G d=d> d>eZ9G d?d@ d@ej*Z:eG dAdB dBeZ;eG dCdD dDe;Z<edEdG dFdG dGe;Z=edHdG dIdJ dJe;Z>edKdG dLdM dMe;eZ?g dNZ@dS )PzPyTorch Swin Transformer model.    N)	dataclass)OptionalUnion)nn   )ACT2FN)GradientCheckpointingLayer)BackboneOutput)PreTrainedModel) find_pruneable_heads_and_indicesmeshgridprune_linear_layer)ModelOutputauto_docstringlogging	torch_int)BackboneMixin   )
SwinConfigzN
    Swin encoder's outputs, with potential hidden states and attentions.
    )custom_introc                   @   sr   e Zd ZU dZdZeej ed< dZ	ee
ejdf  ed< dZee
ejdf  ed< dZee
ejdf  ed< dS )SwinEncoderOutputa  
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    Nlast_hidden_state.hidden_states
attentionsreshaped_hidden_states)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   tupler   r    r#   r#   \/home/ubuntu/vllm_env/lib/python3.10/site-packages/transformers/models/swin/modeling_swin.pyr   *   s   
 	r   zV
    Swin model's outputs that also contains a pooling of the last hidden states.
    c                   @      e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eeejdf  ed< dZeeejdf  ed< dZeeejdf  ed< dS )	SwinModelOutputa  
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
        Average pooling of the last layer hidden-state.
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    Nr   pooler_output.r   r   r   )r   r   r   r   r   r   r   r    r!   r'   r   r"   r   r   r#   r#   r#   r$   r&   @   s   
 r&   z*
    Swin masked image model outputs.
    c                   @   s   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eeejdf  ed< dZeeejdf  ed< dZeeejdf  ed< ed	d
 ZdS )SwinMaskedImageModelingOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `bool_masked_pos` is provided):
        Masked image modeling (MLM) loss.
    reconstruction (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
        Reconstructed pixel values.
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    Nlossreconstruction.r   r   r   c                 C   s   t dt | jS )Nzlogits attribute is deprecated and will be removed in version 5 of Transformers. Please use the reconstruction attribute to retrieve the final output instead.)warningswarnFutureWarningr*   selfr#   r#   r$   logitss   s
   z$SwinMaskedImageModelingOutput.logits)r   r   r   r   r)   r   r   r    r!   r*   r   r"   r   r   propertyr0   r#   r#   r#   r$   r(   Y   s   
 r(   z0
    Swin outputs for image classification.
    c                   @   r%   )	SwinImageClassifierOutputa7  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Classification (or regression if config.num_labels==1) loss.
    logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Classification (or regression if config.num_labels==1) scores (before SoftMax).
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    Nr)   r0   .r   r   r   )r   r   r   r   r)   r   r   r    r!   r0   r   r"   r   r   r#   r#   r#   r$   r2   }   s   
 r2   c                 C   sR   | j \}}}}| ||| ||| ||} | dddddd d|||}|S )z2
    Partitions the given input into windows.
    r   r   r            shapeviewpermute
contiguous)input_featurewindow_size
batch_sizeheightwidthnum_channelswindowsr#   r#   r$   window_partition   s   $rC   c                 C   sN   | j d }| d|| || |||} | dddddd d|||} | S )z?
    Merges windows to produce higher resolution features.
    r6   r   r   r   r3   r4   r5   r7   )rB   r=   r?   r@   rA   r#   r#   r$   window_reverse   s   
$rD   c                
       sr   e Zd ZdZd fdd	Zdejdededejfd	d
Z		dde	ej
 de	ej dedeej fddZ  ZS )SwinEmbeddingszW
    Construct the patch and position embeddings. Optionally, also the mask token.
    Fc                    s   t    t|| _| jj}| jj| _|r tt	
dd|jnd | _|jr5tt	
d|d |j| _nd | _t|j| _t|j| _|j| _|| _d S )Nr   )super__init__SwinPatchEmbeddingspatch_embeddingsnum_patches	grid_size
patch_gridr   	Parameterr   zeros	embed_dim
mask_tokenuse_absolute_embeddingsposition_embeddings	LayerNormnormDropouthidden_dropout_probdropout
patch_sizeconfig)r/   rY   use_mask_tokenrJ   	__class__r#   r$   rG      s   


 
zSwinEmbeddings.__init__
embeddingsr?   r@   returnc                 C   s   |j d d }| jj d d }tj s||kr||kr| jS | jddddf }| jddddf }|j d }|| j }	|| j }
t|d }|d|||}|dddd}t	j
j||	|
fdd	d
}|dddddd|}tj||fddS )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   Nr6         ?r   r   r3   bicubicF)sizemodealign_cornersdim)r8   rR   r   jit
is_tracingrX   r   reshaper:   r   
functionalinterpolater9   cat)r/   r]   r?   r@   rJ   num_positionsclass_pos_embedpatch_pos_embedre   
new_height	new_widthsqrt_num_positionsr#   r#   r$   interpolate_pos_encoding   s(   



z'SwinEmbeddings.interpolate_pos_encodingNpixel_valuesbool_masked_posrr   c                 C   s   |j \}}}}| |\}}	| |}| \}
}}|d ur8| j|
|d}|d|}|d|  ||  }| jd urN|rI|| 	||| }n|| j }| 
|}||	fS )Nr6         ?)r8   rI   rT   ra   rP   expand	unsqueezetype_asrR   rr   rW   )r/   rs   rt   rr   _rA   r?   r@   r]   output_dimensionsr>   seq_lenmask_tokensmaskr#   r#   r$   forward   s   



zSwinEmbeddings.forward)F)NF)r   r   r   r   rG   r   Tensorintrr   r   r    
BoolTensorboolr"   r~   __classcell__r#   r#   r[   r$   rE      s    +rE   c                       sN   e Zd ZdZ fddZdd Zdeej de	ej
e	e f fdd	Z  ZS )
rH   z
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    c                    s   t    |j|j}}|j|j}}t|tjj	r|n||f}t|tjj	r)|n||f}|d |d  |d |d   }|| _|| _|| _|| _
|d |d  |d |d  f| _tj||||d| _d S )Nr   r   )kernel_sizestride)rF   rG   
image_sizerX   rA   rO   
isinstancecollectionsabcIterablerJ   rK   r   Conv2d
projection)r/   rY   r   rX   rA   hidden_sizerJ   r[   r#   r$   rG     s   
 "zSwinPatchEmbeddings.__init__c                 C   s   || j d  dkrd| j d || j d   f}tj||}|| j d  dkr>ddd| j d || j d   f}tj||}|S )Nr   r   )rX   r   ri   pad)r/   rs   r?   r@   
pad_valuesr#   r#   r$   	maybe_pad   s    zSwinPatchEmbeddings.maybe_padrs   r^   c                 C   sV   |j \}}}}| |||}| |}|j \}}}}||f}|ddd}||fS )Nr3   r   )r8   r   r   flatten	transpose)r/   rs   ry   rA   r?   r@   r]   rz   r#   r#   r$   r~   )  s   
zSwinPatchEmbeddings.forward)r   r   r   r   rG   r   r   r   r    r"   r   r   r~   r   r#   r#   r[   r$   rH   
  s
    .	rH   c                	       sh   e Zd ZdZejfdee dedejddf fddZ	d	d
 Z
dejdeeef dejfddZ  ZS )SwinPatchMerginga'  
    Patch Merging Layer.

    Args:
        input_resolution (`tuple[int]`):
            Resolution of input feature.
        dim (`int`):
            Number of input channels.
        norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
            Normalization layer class.
    input_resolutionre   
norm_layerr^   Nc                    sB   t    || _|| _tjd| d| dd| _|d| | _d S )Nr4   r3   Fbias)rF   rG   r   re   r   Linear	reductionrT   )r/   r   re   r   r[   r#   r$   rG   B  s
   
zSwinPatchMerging.__init__c                 C   sF   |d dkp|d dk}|r!ddd|d d|d f}t j||}|S )Nr3   r   r   )r   ri   r   )r/   r<   r?   r@   
should_padr   r#   r#   r$   r   I  s
   zSwinPatchMerging.maybe_padr<   input_dimensionsc                 C   s   |\}}|j \}}}|||||}| |||}|d d dd ddd dd d f }|d d dd ddd dd d f }	|d d dd ddd dd d f }
|d d dd ddd dd d f }t||	|
|gd}||dd| }| |}| |}|S )Nr   r3   r   r6   r4   )r8   r9   r   r   rk   rT   r   )r/   r<   r   r?   r@   r>   re   rA   input_feature_0input_feature_1input_feature_2input_feature_3r#   r#   r$   r~   Q  s   $$$$

zSwinPatchMerging.forward)r   r   r   r   r   rS   r"   r   ModulerG   r   r   r   r~   r   r#   r#   r[   r$   r   5  s
    **r           Finput	drop_probtrainingr^   c                 C   sd   |dks|s| S d| }| j d fd| jd   }|tj|| j| jd }|  | || }|S )aF  
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    r   r   r   )r   dtypedevice)r8   ndimr   randr   r   floor_div)r   r   r   	keep_probr8   random_tensoroutputr#   r#   r$   	drop_pathl  s   
r   c                       sT   e Zd ZdZddee ddf fddZdejdejfdd	Z	de
fd
dZ  ZS )SwinDropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr   r^   c                    s   t    || _d S N)rF   rG   r   )r/   r   r[   r#   r$   rG     s   

zSwinDropPath.__init__r   c                 C   s   t || j| jS r   )r   r   r   r/   r   r#   r#   r$   r~     s   zSwinDropPath.forwardc                 C   s   d| j  S )Nzp=)r   r.   r#   r#   r$   
extra_repr  s   zSwinDropPath.extra_reprr   )r   r   r   r   r   floatrG   r   r   r~   strr   r   r#   r#   r[   r$   r     s
    r   c                       sZ   e Zd Z fddZ			ddejdeej deej dee d	e	ej f
d
dZ
  ZS )SwinSelfAttentionc                    s
  t    || dkrtd| d| d|| _t|| | _| j| j | _t|tj	j
r0|n||f| _ttd| jd  d d| jd  d  || _t| jd }t| jd }tt||gdd}t|d}|d d d d d f |d d d d d f  }	|	ddd }	|	d d d d df  | jd d 7  < |	d d d d df  | jd d 7  < |	d d d d df  d| jd  d 9  < |	d	}
| d
|
 tj| j| j|jd| _tj| j| j|jd| _tj| j| j|jd| _t|j| _ d S )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()r3   r   ij)indexingr6   relative_position_indexr   )!rF   rG   
ValueErrornum_attention_headsr   attention_head_sizeall_head_sizer   r   r   r   r=   r   rM   r   rN   relative_position_bias_tablearangestackr   r   r:   r;   sumregister_bufferr   qkv_biasquerykeyvaluerU   attention_probs_dropout_probrW   )r/   rY   re   	num_headsr=   coords_hcoords_wcoordscoords_flattenrelative_coordsr   r[   r#   r$   rG     s8   
*,((,
zSwinSelfAttention.__init__NFr   attention_mask	head_maskoutput_attentionsr^   c                 C   s  |j \}}}||d| jf}| ||dd}	| ||dd}
| ||dd}t|	|
dd}|t	
| j }| j| jd }|| jd | jd  | jd | jd  d}|ddd }||d }|d ur|j d }||| || j||}||dd }|d| j||}tjj|dd}| |}|d ur|| }t||}|dddd }| d d | jf }||}|r||f}|S |f}|S )Nr6   r   r3   r   rd   r   )r8   r   r   r9   r   r   r   r   matmulmathsqrtr   r   r=   r:   r;   rw   r   r   ri   softmaxrW   ra   r   )r/   r   r   r   r   r>   re   rA   hidden_shapequery_layer	key_layervalue_layerattention_scoresrelative_position_bias
mask_shapeattention_probscontext_layernew_context_layer_shapeoutputsr#   r#   r$   r~     s@   &


zSwinSelfAttention.forwardNNF)r   r   r   rG   r   r   r   r    r   r"   r~   r   r#   r#   r[   r$   r     s     (r   c                       s8   e Zd Z fddZdejdejdejfddZ  ZS )SwinSelfOutputc                    s*   t    t||| _t|j| _d S r   )rF   rG   r   r   denserU   r   rW   r/   rY   re   r[   r#   r$   rG     s   
zSwinSelfOutput.__init__r   input_tensorr^   c                 C      |  |}| |}|S r   r   rW   )r/   r   r   r#   r#   r$   r~     s   

zSwinSelfOutput.forwardr   r   r   rG   r   r   r~   r   r#   r#   r[   r$   r     s    $r   c                       sb   e Zd Z fddZdd Z			ddejdeej d	eej d
ee	 de
ej f
ddZ  ZS )SwinAttentionc                    s2   t    t||||| _t||| _t | _d S r   )rF   rG   r   r/   r   r   setpruned_heads)r/   rY   re   r   r=   r[   r#   r$   rG     s   
zSwinAttention.__init__c                 C   s   t |dkrd S t|| jj| jj| j\}}t| jj|| j_t| jj|| j_t| jj	|| j_	t| j
j|dd| j
_| jjt | | j_| jj| jj | j_| j|| _d S )Nr   r   rd   )lenr   r/   r   r   r   r   r   r   r   r   r   r   union)r/   headsindexr#   r#   r$   prune_heads  s   zSwinAttention.prune_headsNFr   r   r   r   r^   c                 C   s6   |  ||||}| |d |}|f|dd   }|S )Nr   r   )r/   r   )r/   r   r   r   r   self_outputsattention_outputr   r#   r#   r$   r~     s   zSwinAttention.forwardr   )r   r   r   rG   r   r   r   r   r    r   r"   r~   r   r#   r#   r[   r$   r     s"    r   c                       2   e Zd Z fddZdejdejfddZ  ZS )SwinIntermediatec                    sJ   t    t|t|j| | _t|jt	rt
|j | _d S |j| _d S r   )rF   rG   r   r   r   	mlp_ratior   r   
hidden_actr   r   intermediate_act_fnr   r[   r#   r$   rG   "  s
   
zSwinIntermediate.__init__r   r^   c                 C   r   r   )r   r   r   r#   r#   r$   r~   *     

zSwinIntermediate.forwardr   r#   r#   r[   r$   r   !  s    r   c                       r   )
SwinOutputc                    s4   t    tt|j| || _t|j| _	d S r   )
rF   rG   r   r   r   r   r   rU   rV   rW   r   r[   r#   r$   rG   1  s   
zSwinOutput.__init__r   r^   c                 C   r   r   r   r   r#   r#   r$   r~   6  r   zSwinOutput.forwardr   r#   r#   r[   r$   r   0  s    r   c                       s   e Zd Zd fdd	Zdd Zdd Zd	d
 Z			ddejde	e
e
f deej dee dee de	ejejf fddZ  ZS )	SwinLayerr   r   c                    s   t    |j| _|| _|j| _|| _tj||jd| _	t
|||| jd| _|dkr.t|nt | _tj||jd| _t||| _t||| _d S )Neps)r=   r   )rF   rG   chunk_size_feed_forward
shift_sizer=   r   r   rS   layer_norm_epslayernorm_beforer   	attentionr   Identityr   layernorm_afterr   intermediater   r   )r/   rY   re   r   r   drop_path_rater   r[   r#   r$   rG   =  s   
zSwinLayer.__init__c                 C   sD   t || jkr td| _tj rt t|nt || _d S d S Nr   )minr=   r   r   r   rf   rg   tensor)r/   r   r#   r#   r$   set_shift_and_window_sizeJ  s
   
 z#SwinLayer.set_shift_and_window_sizec              	   C   s  | j dkrtjd||df||d}td| j t| j | j  t| j  d f}td| j t| j | j  t| j  d f}d}|D ]}	|D ]}
||d d |	|
d d f< |d7 }qEqAt|| j}|d| j| j }|d|d }||dkd|dkd}|S d }|S )Nr   r   r   r6   r3   g      Yr   )	r   r   rN   slicer=   rC   r9   rw   masked_fill)r/   r?   r@   r   r   img_maskheight_sliceswidth_slicescountheight_slicewidth_slicemask_windows	attn_maskr#   r#   r$   get_attn_maskR  s.   

zSwinLayer.get_attn_maskc                 C   sR   | j || j   | j  }| j || j   | j  }ddd|d|f}tj||}||fS r  )r=   r   ri   r   )r/   r   r?   r@   	pad_right
pad_bottomr   r#   r#   r$   r   n  s
   zSwinLayer.maybe_padNFr   r   r   r   always_partitionr^   c                 C   s  |s|  | n	 |\}}| \}}	}
|}| |}|||||
}| |||\}}|j\}	}}}	| jdkrGtj|| j | j fdd}n|}t	|| j
}|d| j
| j
 |
}| j|||j|jd}| j||||d}|d }|d| j
| j
|
}t|| j
||}| jdkrtj|| j| jfdd}n|}|d dkp|d dk}|r|d d d |d |d d f  }|||| |
}|| | }| |}| |}|| | }|r||d	 f}|S |f}|S )
Nr   )r   r3   )shiftsdimsr6   r   )r   r   r5   r   )r
  ra   r  r9   r   r8   r   r   rollrC   r=   r  r   r   r  rD   r;   r   r  r  r   )r/   r   r   r   r   r  r?   r@   r>   ry   channelsshortcutr   
height_pad	width_padshifted_hidden_stateshidden_states_windowsr  attention_outputsr   attention_windowsshifted_windows
was_paddedlayer_outputlayer_outputsr#   r#   r$   r~   u  sN   


$

zSwinLayer.forward)r   r   NFF)r   r   r   rG   r
  r  r   r   r   r"   r   r   r    r   r~   r   r#   r#   r[   r$   r   <  s*    
r   c                       sd   e Zd Z fddZ			ddejdeeef deej	 dee
 d	ee
 d
eej fddZ  ZS )	SwinStagec                    sh   t     | _| _t fddt|D | _|d ur,|tjd| _	nd | _	d| _
d S )Nc              
      s:   g | ]}t  | |d  dkrdn jd  dqS )r3   r   )rY   re   r   r   r  r   )r   r=   .0irY   re   r   r   r   r#   r$   
<listcomp>  s    	z&SwinStage.__init__.<locals>.<listcomp>)re   r   F)rF   rG   rY   re   r   
ModuleListrangeblocksrS   
downsamplepointing)r/   rY   re   r   depthr   r   r2  r[   r-  r$   rG     s   
	
zSwinStage.__init__NFr   r   r   r   r  r^   c                 C   s   |\}}t | jD ]\}}	|d ur|| nd }
|	|||
||}|d }q	|}| jd urE|d d |d d }}||||f}| ||}n||||f}|||f}|rZ||dd  7 }|S )Nr   r   r3   )	enumerater1  r2  )r/   r   r   r   r   r  r?   r@   r,  layer_modulelayer_head_maskr'  !hidden_states_before_downsamplingheight_downsampledwidth_downsampledrz   stage_outputsr#   r#   r$   r~     s"   



zSwinStage.forwardr(  )r   r   r   rG   r   r   r"   r   r   r    r   r~   r   r#   r#   r[   r$   r)    s$    
r)  c                       s   e Zd Z fddZ						ddejdeeef deej	 d	ee
 d
ee
 dee
 dee
 dee
 deeef fddZ  ZS )SwinEncoderc                    sp   t    t j_ _dd tjd jt	 jddD t
 fddtjD _d_d S )Nc                 S   s   g | ]}|  qS r#   )item)r+  xr#   r#   r$   r.    s    z(SwinEncoder.__init__.<locals>.<listcomp>r   cpu)r   c                    s   g | ]E}t  t jd |  d d |  d d |  f j|  j| t jd| t jd|d   |jd k rCtnddqS )r3   r   r   N)rY   re   r   r4  r   r   r2  )r)  r   rO   depthsr   r   
num_layersr   )r+  i_layerrY   dprrK   r/   r#   r$   r.    s    
*F)rF   rG   r   r@  rA  rY   r   linspacer  r   r   r/  r0  layersgradient_checkpointing)r/   rY   rK   r[   rC  r$   rG     s   
$

zSwinEncoder.__init__NFTr   r   r   r   output_hidden_states(output_hidden_states_before_downsamplingr  return_dictr^   c	                 C   s  |rdnd }	|r
dnd }
|rdnd }|r7|j \}}}|j|g||R  }|dddd}|	|f7 }	|
|f7 }
t| jD ]\}}|d urH|| nd }||||||}|d }|d }|d }|d |d f}|r|r|j \}}}|j|g|d |d f|R  }|dddd}|	|f7 }	|
|f7 }
n'|r|s|j \}}}|j|g||R  }|dddd}|	|f7 }	|
|f7 }
|r||dd  7 }q<|stdd	 ||	|fD S t||	||
d
S )Nr#   r   r   r   r3   r   r6   c                 s   s    | ]	}|d ur|V  qd S r   r#   )r+  vr#   r#   r$   	<genexpr>F  s    z&SwinEncoder.forward.<locals>.<genexpr>)r   r   r   r   )r8   r9   r:   r5  rF  r"   r   )r/   r   r   r   r   rH  rI  r  rJ  all_hidden_statesall_reshaped_hidden_statesall_self_attentionsr>   ry   r   reshaped_hidden_stater,  r6  r7  r'  r8  rz   r#   r#   r$   r~     s^   





zSwinEncoder.forward)NFFFFT)r   r   r   rG   r   r   r"   r   r   r    r   r   r   r~   r   r#   r#   r[   r$   r<    s6    
	

r<  c                   @   s0   e Zd ZU eed< dZdZdZdgZdd Z	dS )	SwinPreTrainedModelrY   swinrs   Tr)  c                 C   s   t |tjtjfr#|jjjd| jjd |j	dur!|j	j
  dS dS t |tjr8|j	j
  |jjd dS t |trW|jdurH|jj
  |jdurU|jj
  dS dS t |trd|jj
  dS dS )zInitialize the weightsr   )meanstdNru   )r   r   r   r   weightdatanormal_rY   initializer_ranger   zero_rS   fill_rE   rP   rR   r   r   )r/   moduler#   r#   r$   _init_weightsX  s"   




z!SwinPreTrainedModel._init_weightsN)
r   r   r   r   r!   base_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modulesr\  r#   r#   r#   r$   rQ  P  s   
 rQ  c                       s   e Zd Zd fdd	Zdd Zdd Ze													dd
eej	 deej
 deej	 dee dee dedee deeef fddZ  ZS )	SwinModelTFc                    s   t  | || _t|j| _t|jd| jd   | _t	||d| _
t|| j
j| _tj| j|jd| _|r<tdnd| _|   dS )a  
        add_pooling_layer (`bool`, *optional*, defaults to `True`):
            Whether or not to apply pooling layer.
        use_mask_token (`bool`, *optional*, defaults to `False`):
            Whether or not to create and apply mask tokens in the embedding layer.
        r3   r   )rZ   r   N)rF   rG   rY   r   r@  rA  r   rO   num_featuresrE   r]   r<  rL   encoderr   rS   r   	layernormAdaptiveAvgPool1dpooler	post_init)r/   rY   add_pooling_layerrZ   r[   r#   r$   rG   n  s   zSwinModel.__init__c                 C      | j jS r   r]   rI   r.   r#   r#   r$   get_input_embeddings     zSwinModel.get_input_embeddingsc                 C   s*   |  D ]\}}| jj| j| qdS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsrc  layerr  r   )r/   heads_to_prunern  r   r#   r#   r$   _prune_heads  s   zSwinModel._prune_headsNrs   rt   r   r   rH  rr   rJ  r^   c                 C   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}|du r&td| |t| j j}| j|||d\}}	| j	||	||||d}
|
d }| 
|}d}| jdurd| |dd}t|d}|sr||f|
dd  }|S t|||
j|
j|
jdS )	z
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        Nz You have to specify pixel_values)rt   rr   )r   r   rH  rJ  r   r   r3   )r   r'   r   r   r   )rY   r   rH  use_return_dictr   get_head_maskr   r@  r]   rc  rd  rf  r   r   r   r&   r   r   r   )r/   rs   rt   r   r   rH  rr   rJ  embedding_outputr   encoder_outputssequence_outputpooled_outputr   r#   r#   r$   r~     sD   
	

zSwinModel.forward)TFNNNNNFN)r   r   r   rG   rk  rp  r   r   r   r    r   r   r   r"   r&   r~   r   r#   r#   r[   r$   ra  l  s:    
	ra  ad  
    Swin Model with a decoder on top for masked image modeling, as proposed in [SimMIM](https://huggingface.co/papers/2111.09886).

    <Tip>

    Note that we provide a script to pre-train this model on custom data in our [examples
    directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining).

    </Tip>
    c                       s   e Zd Z fddZe							ddeej deej deej dee	 d	ee	 d
e	dee	 de
eef fddZ  ZS )SwinForMaskedImageModelingc                    sn   t  | t|ddd| _t|jd|jd   }ttj	||j
d |j ddt|j
| _|   d S )NFT)rh  rZ   r3   r   )in_channelsout_channelsr   )rF   rG   ra  rR  r   rO   rA  r   
Sequentialr   encoder_striderA   PixelShuffledecoderrg  )r/   rY   rb  r[   r#   r$   rG     s   
z#SwinForMaskedImageModeling.__init__NFrs   rt   r   r   rH  rr   rJ  r^   c              	   C   s>  |dur|n| j j}| j|||||||d}|d }	|	dd}	|	j\}
}}t|d  }}|	|
|||}	| |	}d}|dur}| j j	| j j
 }|d||}|| j j
d| j j
dd }tjj||dd	}||  | d
  | j j }|s|f|dd  }|dur|f| S |S t|||j|j|jdS )a7  
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, SwinForMaskedImageModeling
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/swin-base-simmim-window6-192")
        >>> model = SwinForMaskedImageModeling.from_pretrained("microsoft/swin-base-simmim-window6-192")

        >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
        >>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
        >>> # create random boolean mask of shape (batch_size, num_patches)
        >>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()

        >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
        >>> loss, reconstructed_pixel_values = outputs.loss, outputs.reconstruction
        >>> list(reconstructed_pixel_values.shape)
        [1, 3, 192, 192]
        ```N)rt   r   r   rH  rr   rJ  r   r   r3   r_   r6   none)r   gh㈵>)r)   r*   r   r   r   )rY   rq  rR  r   r8   r   floorrh   r~  r   rX   repeat_interleaverw   r;   r   ri   l1_lossr   rA   r(   r   r   r   )r/   rs   rt   r   r   rH  rr   rJ  r   ru  r>   rA   sequence_lengthr?   r@   reconstructed_pixel_valuesmasked_im_lossra   r}   reconstruction_lossr   r#   r#   r$   r~     sJ   &

 z"SwinForMaskedImageModeling.forwardrw  )r   r   r   rG   r   r   r   r    r   r   r   r"   r(   r~   r   r#   r#   r[   r$   rx    s6    
	rx  a  
    Swin Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
    the [CLS] token) e.g. for ImageNet.

    <Tip>

        Note that it's possible to fine-tune Swin on higher resolution images than the ones it has been trained on, by
        setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained
        position embeddings to the higher resolution.

    </Tip>
    c                       s   e Zd Z fddZe							ddeej deej deej dee	 d	ee	 d
e	dee	 de
eef fddZ  ZS )SwinForImageClassificationc                    sP   t  | |j| _t|| _|jdkrt| jj|jnt | _	| 
  d S r  )rF   rG   
num_labelsra  rR  r   r   rb  r  
classifierrg  )r/   rY   r[   r#   r$   rG   R  s   
"z#SwinForImageClassification.__init__NFrs   r   labelsr   rH  rr   rJ  r^   c                 C   s   |dur|n| j j}| j||||||d}|d }	| |	}
d}|dur,| ||
| j }|sB|
f|dd  }|dur@|f| S |S t||
|j|j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N)r   r   rH  rr   rJ  r   r3   )r)   r0   r   r   r   )	rY   rq  rR  r  loss_functionr2   r   r   r   )r/   rs   r   r  r   rH  rr   rJ  r   rv  r0   r)   r   r#   r#   r$   r~   `  s0   	
z"SwinForImageClassification.forwardrw  )r   r   r   rG   r   r   r   r    
LongTensorr   r   r"   r2   r~   r   r#   r#   r[   r$   r  C  s6    
	r  zM
    Swin backbone, to be used with frameworks like DETR and MaskFormer.
    c                       s^   e Zd Zdef fddZdd Z			ddejdee	 d	ee	 d
ee	 de
f
ddZ  ZS )SwinBackbonerY   c                    s   t    t     jg fddtt jD  | _t | _	t
 | j	j| _i }t| j| jD ]\}}t|||< q5t|| _|   d S )Nc                    s   g | ]}t  jd |  qS )r3   )r   rO   r*  rY   r#   r$   r.    s    z)SwinBackbone.__init__.<locals>.<listcomp>)rF   rG   _init_backbonerO   r0  r   r@  rb  rE   r]   r<  rL   rc  zip_out_featuresr  r   rS   
ModuleDicthidden_states_normsrg  )r/   rY   r  stagerA   r[   r  r$   rG     s   &
zSwinBackbone.__init__c                 C   ri  r   rj  r.   r#   r#   r$   rk    rl  z!SwinBackbone.get_input_embeddingsNrs   rH  r   rJ  r^   c              
   C   s6  |dur|n| j j}|dur|n| j j}|dur|n| j j}| |\}}| j||d|ddddd}|j}d}	t| j|D ]A\}
}|
| j	v r~|j
\}}}}|dddd }|||| |}| j|
 |}|||||}|dddd }|	|f7 }	q=|s|	f}|r||jf7 }|S t|	|r|jnd|jd	S )
aK  
        Returns:

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> processor = AutoImageProcessor.from_pretrained("shi-labs/nat-mini-in1k-224")
        >>> model = AutoBackbone.from_pretrained(
        ...     "microsoft/swin-tiny-patch4-window7-224", out_features=["stage1", "stage2", "stage3", "stage4"]
        ... )

        >>> inputs = processor(image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> feature_maps = outputs.feature_maps
        >>> list(feature_maps[-1].shape)
        [1, 768, 7, 7]
        ```NT)r   r   rH  rI  r  rJ  r#   r   r3   r   r   )feature_mapsr   r   )rY   rq  rH  r   r]   rc  r   r  stage_namesout_featuresr8   r:   r;   r9   r  r   r	   r   )r/   rs   rH  r   rJ  rs  r   r   r   r  r  hidden_stater>   rA   r?   r@   r   r#   r#   r$   r~     sJ    

zSwinBackbone.forward)NNN)r   r   r   r   rG   rk  r   r   r   r   r	   r~   r   r#   r#   r[   r$   r    s"    r  )r  rx  ra  rQ  r  )r   F)Ar   collections.abcr   r   r+   dataclassesr   typingr   r   r   r   activationsr   modeling_layersr   modeling_outputsr	   modeling_utilsr
   pytorch_utilsr   r   r   utilsr   r   r   r   utils.backbone_utilsr   configuration_swinr   
get_loggerr   loggerr   r&   r(   r2   rC   rD   r   rE   rH   r   r   r   r   r   r   r   r   r   r   r   r   r)  r<  rQ  ra  rx  r  r  __all__r#   r#   r#   r$   <module>   s   

\+ 7_&}<[cg@b