o
    ei.                  	   @   s  d Z ddlZddlZddlmZ ddlZddlmZmZ ddl	m
Z ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZmZ ddlmZ eeZeeddG dd deZ eeddG dd deZ!eeddG dd deZ"eeddG dd deZ#dd Z$dd Z%dNd"ejd#e&d$e'd%ejfd&d'Z(G d(d) d)ej)Z*G d*d+ d+ej)Z+G d,d- d-ej)Z,G d.d/ d/ej)Z-G d0d1 d1ej)Z.G d2d3 d3ej)Z/G d4d5 d5ej)Z0G d6d7 d7ej)Z1G d8d9 d9ej)Z2G d:d; d;ej)Z3G d<d= d=eZ4G d>d? d?ej)Z5eG d@dA dAeZ6eG dBdC dCe6Z7edDdG dEdF dFe6Z8edGdG dHdI dIe6Z9edJdG dKdL dLee6Z:g dMZ;dS )Oz!PyTorch Swinv2 Transformer model.    N)	dataclass)Tensornn   )initialization)ACT2FN)BackboneMixin)GradientCheckpointingLayer)BackboneOutput)PreTrainedModel)ModelOutputauto_docstringlogging	torch_int   )Swinv2ConfigzP
    Swinv2 encoder's outputs, with potential hidden states and attentions.
    )custom_introc                   @   sr   e Zd ZU dZdZejdB ed< dZe	ejdf dB ed< dZ
e	ejdf dB ed< dZe	ejdf dB ed< dS )Swinv2EncoderOutputa  
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    Nlast_hidden_state.hidden_states
attentionsreshaped_hidden_states)__name__
__module____qualname____doc__r   torchFloatTensor__annotations__r   tupler   r    r    r    h/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/swinv2/modeling_swinv2.pyr   '   s   
 	r   zX
    Swinv2 model's outputs that also contains a pooling of the last hidden states.
    c                   @      e Zd ZU dZdZejdB ed< dZejdB ed< dZ	e
ejdf dB ed< dZe
ejdf dB ed< dZe
ejdf dB ed< dS )	Swinv2ModelOutputa  
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
        Average pooling of the last layer hidden-state.
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    Nr   pooler_output.r   r   r   )r   r   r   r   r   r   r   r   r$   r   r   r   r   r    r    r    r!   r#   >   s   
 r#   z,
    Swinv2 masked image model outputs.
    c                   @   r"   )	Swinv2MaskedImageModelingOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `bool_masked_pos` is provided):
        Masked image modeling (MLM) loss.
    reconstruction (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
        Reconstructed pixel values.
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    Nlossreconstruction.r   r   r   )r   r   r   r   r&   r   r   r   r'   r   r   r   r   r    r    r    r!   r%   X      
 r%   z2
    Swinv2 outputs for image classification.
    c                   @   r"   )	Swinv2ImageClassifierOutputa7  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Classification (or regression if config.num_labels==1) loss.
    logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Classification (or regression if config.num_labels==1) scores (before SoftMax).
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    Nr&   logits.r   r   r   )r   r   r   r   r&   r   r   r   r*   r   r   r   r   r    r    r    r!   r)   t   r(   r)   c                 C   sR   | j \}}}}| ||| ||| ||} | dddddd d|||}|S )z2
    Partitions the given input into windows.
    r   r   r            shapeviewpermute
contiguous)input_featurewindow_size
batch_sizeheightwidthnum_channelswindowsr    r    r!   window_partition   s   $r;   c                 C   sN   | j d }| d|| || |||} | dddddd d|||} | S )z?
    Merges windows to produce higher resolution features.
    r.   r   r   r   r+   r,   r-   r/   )r:   r5   r7   r8   r9   r    r    r!   window_reverse   s   
$r<           Finput	drop_probtrainingreturnc                 C   sd   |dks|s| S d| }| j d fd| jd   }|tj|| j| jd }|  | || }|S )zc
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    r=   r   r   )r   )dtypedevice)r0   ndimr   randrB   rC   floor_div)r>   r?   r@   	keep_probr0   random_tensoroutputr    r    r!   	drop_path   s   rK   c                       sT   e Zd ZdZddedB ddf fddZdejdejfdd	Zde	fd
dZ
  ZS )Swinv2DropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr?   rA   c                    s   t    || _d S N)super__init__r?   )selfr?   	__class__r    r!   rO      s   

zSwinv2DropPath.__init__r   c                 C   s   t || j| jS rM   )rK   r?   r@   rP   r   r    r    r!   forward   s   zSwinv2DropPath.forwardc                 C   s   d| j  S )Nzp=)r?   rP   r    r    r!   
extra_repr   s   zSwinv2DropPath.extra_reprrM   )r   r   r   r   floatrO   r   r   rT   strrV   __classcell__r    r    rQ   r!   rL      s
    rL   c                
       sr   e Zd ZdZd fdd	Zdejdededejfd	d
Z		ddej	dB dej
dB dedeej fddZ  ZS )Swinv2EmbeddingszW
    Construct the patch and position embeddings. Optionally, also the mask token.
    Fc                    s   t    t|| _| jj}| jj| _|r tt	
dd|jnd | _|jr5tt	
d|d |j| _nd | _t|j| _t|j| _|j| _|| _d S )Nr   )rN   rO   Swinv2PatchEmbeddingspatch_embeddingsnum_patches	grid_size
patch_gridr   	Parameterr   zeros	embed_dim
mask_tokenuse_absolute_embeddingsposition_embeddings	LayerNormnormDropouthidden_dropout_probdropout
patch_sizeconfig)rP   rl   use_mask_tokenr]   rQ   r    r!   rO      s   


 
zSwinv2Embeddings.__init__
embeddingsr7   r8   rA   c                 C   s   |j d d }| jj d d }tj s||kr||kr| jS | jddddf }| jddddf }|j d }|| j }	|| j }
t|d }|d|||}|dddd}t	j
j||	|
fdd	d
}|dddddd|}tj||fddS )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   Nr.         ?r   r   r+   bicubicF)sizemodealign_cornersdim)r0   re   r   jit
is_tracingrk   r   reshaper2   r   
functionalinterpolater1   cat)rP   rn   r7   r8   r]   num_positionsclass_pos_embedpatch_pos_embedru   
new_height	new_widthsqrt_num_positionsr    r    r!   interpolate_pos_encoding   s(   



z)Swinv2Embeddings.interpolate_pos_encodingNpixel_valuesbool_masked_posr   c                 C   s   |j \}}}}| |\}}	| |}| \}
}}|d ur8| j|
|d}|d|}|d|  ||  }| jd urN|rI|| 	||| }n|| j }| 
|}||	fS )Nr.         ?)r0   r\   rg   rq   rc   expand	unsqueezetype_asre   r   rj   )rP   r   r   r   _r9   r7   r8   rn   output_dimensionsr6   seq_lenmask_tokensmaskr    r    r!   rT     s   



zSwinv2Embeddings.forwardFNF)r   r   r   r   rO   r   r   intr   r   
BoolTensorboolr   rT   rY   r    r    rQ   r!   rZ      s    +rZ   c                       sN   e Zd ZdZ fddZdd ZdejdB deej	ee
 f fd	d
Z  ZS )r[   z
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    c                    s   t    |j|j}}|j|j}}t|tjj	r|n||f}t|tjj	r)|n||f}|d |d  |d |d   }|| _|| _|| _|| _
|d |d  |d |d  f| _tj||||d| _d S )Nr   r   )kernel_sizestride)rN   rO   
image_sizerk   r9   rb   
isinstancecollectionsabcIterabler]   r^   r   Conv2d
projection)rP   rl   r   rk   r9   hidden_sizer]   rQ   r    r!   rO   ,  s   
 "zSwinv2PatchEmbeddings.__init__c                 C   s   || j d  dkrd| j d || j d   f}tj||}|| j d  dkr>ddd| j d || j d   f}tj||}|S )Nr   r   )rk   r   ry   pad)rP   r   r7   r8   
pad_valuesr    r    r!   	maybe_pad;  s    zSwinv2PatchEmbeddings.maybe_padr   NrA   c                 C   sV   |j \}}}}| |||}| |}|j \}}}}||f}|ddd}||fS )Nr+   r   )r0   r   r   flatten	transpose)rP   r   r   r9   r7   r8   rn   r   r    r    r!   rT   D  s   
zSwinv2PatchEmbeddings.forward)r   r   r   r   rO   r   r   r   r   r   r   rT   rY   r    r    rQ   r!   r[   %  s
    .	r[   c                	       sh   e Zd ZdZejfdee dedejddf fddZ	d	d
 Z
dejdeeef dejfddZ  ZS )Swinv2PatchMerginga'  
    Patch Merging Layer.

    Args:
        input_resolution (`tuple[int]`):
            Resolution of input feature.
        dim (`int`):
            Number of input channels.
        norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
            Normalization layer class.
    input_resolutionru   
norm_layerrA   Nc                    sB   t    || _|| _tjd| d| dd| _|d| | _d S )Nr,   r+   Fbias)rN   rO   r   ru   r   Linear	reductionrg   )rP   r   ru   r   rQ   r    r!   rO   ]  s
   
zSwinv2PatchMerging.__init__c                 C   sF   |d dkp|d dk}|r!ddd|d d|d f}t j||}|S )Nr+   r   r   )r   ry   r   )rP   r4   r7   r8   
should_padr   r    r    r!   r   d  s
   zSwinv2PatchMerging.maybe_padr4   input_dimensionsc                 C   s   |\}}|j \}}}|||||}| |||}|d d dd ddd dd d f }|d d dd ddd dd d f }	|d d dd ddd dd d f }
|d d dd ddd dd d f }t||	|
|gd}||dd| }| |}| |}|S )Nr   r+   r   r.   r,   )r0   r1   r   r   r{   r   rg   )rP   r4   r   r7   r8   r6   ru   r9   input_feature_0input_feature_1input_feature_2input_feature_3r    r    r!   rT   l  s   $$$$

zSwinv2PatchMerging.forward)r   r   r   r   r   rf   r   r   ModulerO   r   r   r   rT   rY   r    r    rQ   r!   r   P  s
    **r   c                
       s^   e Zd Zddgf fdd	Z		ddejdejdB dedB d	eej fd
dZ	dd Z
  ZS )Swinv2SelfAttentionr   c              
      sF  t    || dkrtd| d| d|| _t|| | _| j| j | _t|tj	j
r0|n||f| _|| _ttdt|ddf | _ttjddd	d
tjd	dtjd|dd
| _|  \}}| jd|dd | jd|dd tj| j| j|jd
| _tj| j| jdd
| _tj| j| j|jd
| _t|j| _d S )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()
   r   r+   i   Tr   )inplaceFrelative_coords_table)
persistentrelative_position_index) rN   rO   
ValueErrornum_attention_headsr   attention_head_sizeall_head_sizer   r   r   r   r5   pretrained_window_sizer   r`   r   logoneslogit_scale
Sequentialr   ReLUcontinuous_position_bias_mlpcreate_coords_table_and_indexregister_bufferqkv_biasquerykeyvaluerh   attention_probs_dropout_probrj   )rP   rl   ru   	num_headsr5   r   r   r   rQ   r    r!   rO     s,   
"&zSwinv2SelfAttention.__init__NFr   attention_maskoutput_attentionsrA   c                 C   s  |j \}}}| ||d| j| jdd}| ||d| j| jdd}| ||d| j| jdd}	tj	j
|ddtj	j
|dddd }
tj| jtdd }|
| }
| | jd| j}|| jd | jd | jd  | jd | jd  d}|ddd }d	t| }|
|d }
|d ur|j d }|
|| || j|||dd }
|
|dd }
|
d| j||}
tj	j|
dd}| |}t||	}|dddd
 }| d d | jf }||}|r||f}|S |f}|S )Nr.   r   r+   rt   g      Y@)maxr      r   )r0   r   r1   r   r   r   r   r   r   ry   	normalizer   clampr   mathr   expr   r   r   r5   r2   r3   sigmoidr   softmaxrj   matmulrq   r   )rP   r   r   r   r6   ru   r9   query_layer	key_layervalue_layerattention_scoresr   relative_position_bias_tablerelative_position_bias
mask_shapeattention_probscontext_layernew_context_layer_shapeoutputsr    r    r!   rT     s`   &


zSwinv2SelfAttention.forwardc           
      C   s  t j| jd d  | jd t jd }t j| jd d  | jd t jd }t t j||gddddd 	d}| j
d dkrt|d d d d d d df  | j
d d   < |d d d d d d df  | j
d d   < n5| jd dkr|d d d d d d df  | jd d   < |d d d d d d df  | jd d   < |d9 }t |t t |d  td }|t| j j}t | jd }t | jd }t t j||gdd}t |d}|d d d d d f |d d d d d f  }|ddd }|d d d d df  | jd d 7  < |d d d d df  | jd d 7  < |d d d d df  d| jd  d 9  < |d	}	||	fS )
Nr   r   rB   ij)indexingr+      r   r.   )r   aranger5   int64rW   stackmeshgridr2   r3   r   r   signlog2absr   tonextr   
parametersrB   r   sum)
rP   relative_coords_hrelative_coords_wr   coords_hcoords_wcoordscoords_flattenrelative_coordsr   r    r    r!   r     s8   ((
.0..&,((,
z1Swinv2SelfAttention.create_coords_table_and_indexr   )r   r   r   rO   r   r   r   r   r   rT   r   rY   r    r    rQ   r!   r     s     
Dr   c                       s8   e Zd Z fddZdejdejdejfddZ  ZS )Swinv2SelfOutputc                    s*   t    t||| _t|j| _d S rM   )rN   rO   r   r   denserh   r   rj   rP   rl   ru   rQ   r    r!   rO     s   
zSwinv2SelfOutput.__init__r   input_tensorrA   c                 C      |  |}| |}|S rM   r   rj   )rP   r   r   r    r    r!   rT     s   

zSwinv2SelfOutput.forwardr   r   r   rO   r   r   rT   rY   r    r    rQ   r!   r     s    $r   c                
       sP   e Zd Zd fdd	Z		ddejdejdB dedB d	eej fd
dZ	  Z
S )Swinv2Attentionr   c                    sD   t    t||||t|tjjr|n||fd| _t||| _	d S )Nrl   ru   r   r5   r   )
rN   rO   r   r   r   r   r   rP   r   rJ   )rP   rl   ru   r   r5   r   rQ   r    r!   rO     s   
	zSwinv2Attention.__init__NFr   r   r   rA   c                 C   s4   |  |||}| |d |}|f|dd   }|S )Nr   r   )rP   rJ   )rP   r   r   r   self_outputsattention_outputr   r    r    r!   rT   *  s   zSwinv2Attention.forwardr   r   )r   r   r   rO   r   r   r   r   r   rT   rY   r    r    rQ   r!   r     s    r   c                       2   e Zd Z fddZdejdejfddZ  ZS )Swinv2Intermediatec                    sJ   t    t|t|j| | _t|jt	rt
|j | _d S |j| _d S rM   )rN   rO   r   r   r   	mlp_ratior   r   
hidden_actrX   r   intermediate_act_fnr   rQ   r    r!   rO   8  s
   
zSwinv2Intermediate.__init__r   rA   c                 C   r   rM   )r   r  rS   r    r    r!   rT   @     

zSwinv2Intermediate.forwardr   r    r    rQ   r!   r  7  s    r  c                       r  )Swinv2Outputc                    s4   t    tt|j| || _t|j| _	d S rM   )
rN   rO   r   r   r   r  r   rh   ri   rj   r   rQ   r    r!   rO   H  s   
zSwinv2Output.__init__r   rA   c                 C   r   rM   r   rS   r    r    r!   rT   M  r  zSwinv2Output.forwardr   r    r    rQ   r!   r	  G  s    r	  c                       s   e Zd Z	d fdd	Zdeeeef eeef f fddZdd	 Zd
d Z	dde	j
deeef dedB dee	j
e	j
f fddZ  ZS )Swinv2Layerr=   r   c           	         s   t    || _| |j|jf||f\}}|d | _|d | _t|||| jt|tj	j
r/|n||fd| _tj||jd| _|dkrGt|nt | _t||| _t||| _tj||jd| _d S )Nr   r   epsr=   )rN   rO   r   _compute_window_shiftr5   
shift_sizer   r   r   r   r   	attentionr   rf   layer_norm_epslayernorm_beforerL   IdentityrK   r  intermediater	  rJ   layernorm_after)	rP   rl   ru   r   r   drop_path_rater  r   r5   rQ   r    r!   rO   T  s*   


	zSwinv2Layer.__init__rA   c                 C   s6   dd t | j|D }dd t | j||D }||fS )Nc                 S   s   g | ]	\}}t ||qS r    )min).0rwr    r    r!   
<listcomp>n  s    z5Swinv2Layer._compute_window_shift.<locals>.<listcomp>c                 S   s"   g | ]\}}}||krd n|qS r  r    )r  r  r  sr    r    r!   r  o  s   " )zipr   )rP   target_window_sizetarget_shift_sizer5   r  r    r    r!   r  m  s   z!Swinv2Layer._compute_window_shiftc              	   C   s  | j dkrtjd||df|d}td| j t| j | j  t| j  d f}td| j t| j | j  t| j  d f}d}|D ]}|D ]}	||d d ||	d d f< |d7 }qDq@t|| j}
|
d| j| j }
|
d|
d }||dkd|dkd}|S d }|S )Nr   r   r   r.   r+   g      Yr=   )	r  r   ra   slicer5   r;   r1   r   masked_fill)rP   r7   r8   rB   img_maskheight_sliceswidth_slicescountheight_slicewidth_slicemask_windows	attn_maskr    r    r!   get_attn_maskr  s.   

zSwinv2Layer.get_attn_maskc                 C   sR   | j || j   | j  }| j || j   | j  }ddd|d|f}tj||}||fS Nr   )r5   r   ry   r   )rP   r   r7   r8   	pad_right
pad_bottomr   r    r    r!   r     s
   zSwinv2Layer.maybe_padFr   r   r   Nc                 C   s  |\}}|  \}}}|}	|||||}| |||\}}
|j\}}}}| jdkr9tj|| j | j fdd}n|}t|| j}|d| j| j |}| j	|||j
d}|d ur_||j}| j|||d}|d }|d| j| j|}t|| j||}| jdkrtj|| j| jfdd}n|}|
d dkp|
d dk}|r|d d d |d |d d f  }|||| |}| |}|	| | }| |}| |}|| | | }|r||d	 f}|S |f}|S )
Nr   )r   r+   )shiftsdimsr.   r   )r   r   r-   r   )rq   r1   r   r0   r  r   rollr;   r5   r)  rB   r   rC   r  r<   r3   r  rK   r  rJ   r  )rP   r   r   r   r7   r8   r6   r   channelsshortcutr   
height_pad	width_padshifted_hidden_stateshidden_states_windowsr(  attention_outputsr  attention_windowsshifted_windows
was_paddedlayer_outputlayer_outputsr    r    r!   rT     sD   

$


zSwinv2Layer.forward)r=   r   r   r   )r   r   r   rO   r   r   r  r)  r   r   r   r   rT   rY   r    r    rQ   r!   r
  S  s     &
r
  c                
       sR   e Zd Z	d fdd	Z	ddejdeeef dedB d	eej fd
dZ	  Z
S )Swinv2Stager   c	              
      s   t    || _|| _g }	t|D ]}
t||||||
 |
d dkr#dn|jd |d}|	| qt	|	| _
|d urE|||tjd| _nd | _d| _d S )Nr+   r   )rl   ru   r   r   r  r  r   )ru   r   F)rN   rO   rl   ru   ranger
  r5   appendr   
ModuleListblocksrf   
downsamplepointing)rP   rl   ru   r   depthr   rK   rA  r   r@  iblockrQ   r    r!   rO     s(   
	
zSwinv2Stage.__init__Fr   r   r   NrA   c                 C   s   |\}}t | jD ]\}}||||}|d }q	|}	| jd ur9|d d |d d }
}|||
|f}| |	|}n||||f}||	|f}|rN||dd  7 }|S )Nr   r   r+   )	enumerater@  rA  )rP   r   r   r   r7   r8   rD  layer_moduler;  !hidden_states_before_downsamplingheight_downsampledwidth_downsampledr   stage_outputsr    r    r!   rT     s$   


zSwinv2Stage.forwardr  r   )r   r   r   rO   r   r   r   r   r   rT   rY   r    r    rQ   r!   r<    s     
r<  c                       sl   e Zd Zd fdd	Z				ddejdeeef ded	B d
ed	B ded	B ded	B dee	B fddZ
  ZS )Swinv2Encoderr   r   r   r   c                    s  t    t|j| _|| _| jjd ur|j}dd tjd|j	t
|jddD }g }t| jD ]M}t|t|jd|  |d d|  |d d|  f|j| |j| |t
|jd | t
|jd |d   || jd k rqtnd || d}|| q0t|| _d	| _d S )
Nc                 S   s   g | ]}|  qS r    )item)r  xr    r    r!   r    s    z*Swinv2Encoder.__init__.<locals>.<listcomp>r   cpu)rC   r+   r   )rl   ru   r   rC  r   rK   rA  r   F)rN   rO   lendepths
num_layersrl   pretrained_window_sizesr   linspacer  r   r=  r<  r   rb   r   r   r>  r   r?  layersgradient_checkpointing)rP   rl   r^   rT  dprrV  i_layerstagerQ   r    r!   rO   
  s*   
$*

zSwinv2Encoder.__init__FTr   r   r   Noutput_hidden_states(output_hidden_states_before_downsamplingreturn_dictrA   c                 C   s  |rdnd }|r
dnd }|rdnd }	|r7|j \}
}}|j|
g||R  }|dddd}||f7 }||f7 }t| jD ]}\}}||||}|d }|d }|d }|d |d f}|r|r|j \}
}}|j|
g|d |d f|R  }|dddd}||f7 }||f7 }n'|r|s|j \}
}}|j|
g||R  }|dddd}||f7 }||f7 }|r|	|dd  7 }	q<|stdd	 |||	|fD S t|||	|d
S )Nr    r   r   r   r+   r   r.   c                 s   s    | ]	}|d ur|V  qd S rM   r    )r  vr    r    r!   	<genexpr>[  s    z(Swinv2Encoder.forward.<locals>.<genexpr>)r   r   r   r   )r0   r1   r2   rF  rV  r   r   )rP   r   r   r   r[  r\  r]  all_hidden_statesall_reshaped_hidden_statesall_self_attentionsr6   r   r   reshaped_hidden_staterD  rG  r;  rH  r   r    r    r!   rT   #  sd   	





zSwinv2Encoder.forward)rM  )FFFT)r   r   r   rO   r   r   r   r   r   r   rT   rY   r    r    rQ   r!   rL  	  s*    
rL  c                   @   s<   e Zd ZU eed< dZdZdZdZdgZ	e
 dd Zd	S )
Swinv2PreTrainedModelrl   swinv2r   )imageTr<  c                 C   s   t |tjtjfr#tj|jd| jjd |j	dur!t
|j	 dS dS t |tjr7t
|j	 t|j dS t |trV|jdurGt
|j |jdurTt
|j dS dS t |tr{t|jtd | \}}t|j| t|j| dS dS )zInitialize the weightsr=   )meanstdNr   )r   r   r   r   initnormal_weightrl   initializer_ranger   zeros_rf   ones_rZ   rc   re   r   	constant_r   r   r   r   copy_r   r   )rP   moduler   r   r    r    r!   _init_weightsr  s(   




z#Swinv2PreTrainedModel._init_weightsN)r   r   r   r   r   base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modulesr   no_gradrr  r    r    r    r!   rd  i  s   
 rd  c                       sz   e Zd Zd fdd	Zdd Ze						ddejdB d	ejdB d
e	dB de	dB de	de	dB de
eB fddZ  ZS )Swinv2ModelTFc                    s   t  | || _t|j| _t|jd| jd   | _t	||d| _
t|| j
j| _tj| j|jd| _|r<tdnd| _|   dS )a  
        add_pooling_layer (`bool`, *optional*, defaults to `True`):
            Whether or not to apply pooling layer.
        use_mask_token (`bool`, *optional*, defaults to `False`):
            Whether or not to create and apply mask tokens in the embedding layer.
        r+   r   )rm   r  N)rN   rO   rl   rQ  rR  rS  r   rb   num_featuresrZ   rn   rL  r_   encoderr   rf   r  	layernormAdaptiveAvgPool1dpooler	post_init)rP   rl   add_pooling_layerrm   rQ   r    r!   rO     s   zSwinv2Model.__init__c                 C      | j jS rM   rn   r\   rU   r    r    r!   get_input_embeddings     z Swinv2Model.get_input_embeddingsNr   r   r   r[  r   r]  rA   c                 K   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}|du r&td| j|||d\}}	| j||	|||d}
|
d }| |}d}| jdurY| |	dd}t
|d}|sg||f|
dd  }|S t|||
j|
j|
jdS )	z
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        Nz You have to specify pixel_values)r   r   )r   r[  r]  r   r   r+   )r   r$   r   r   r   )rl   r   r[  use_return_dictr   rn   r{  r|  r~  r   r   r   r#   r   r   r   )rP   r   r   r   r[  r   r]  kwargsembedding_outputr   encoder_outputssequence_outputpooled_outputrJ   r    r    r!   rT     s@   


zSwinv2Model.forward)TFNNNNFN)r   r   r   rO   r  r   r   r   r   r   r   r#   rT   rY   r    r    rQ   r!   ry    s2    	ry  a~  
        Swinv2 Model with a decoder on top for masked image modeling, as proposed in
    [SimMIM](https://huggingface.co/papers/2111.09886).

        <Tip>

        Note that we provide a script to pre-train this model on custom data in our [examples
        directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining).

        </Tip>
    c                       p   e Zd Z fddZe						ddejdB dejdB dedB dedB d	ed
edB de	e
B fddZ  ZS )Swinv2ForMaskedImageModelingc                    sn   t  | t|ddd| _t|jd|jd   }ttj	||j
d |j ddt|j
| _|   d S )NFT)r  rm   r+   r   )in_channelsout_channelsr   )rN   rO   ry  re  r   rb   rS  r   r   r   encoder_strider9   PixelShuffledecoderr  )rP   rl   rz  rQ   r    r!   rO     s   
z%Swinv2ForMaskedImageModeling.__init__NFr   r   r   r[  r   r]  rA   c                 K   s<  |dur|n| j j}| j||||||d}|d }	|	dd}	|	j\}
}}t|d  }}|	|
|||}	| |	}d}|dur|| j j	| j j
 }|d||}|| j j
d| j j
dd }tjj||dd	}||  | d
  | j j }|s|f|dd  }|dur|f| S |S t|||j|j|jdS )a  
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, Swinv2ForMaskedImageModeling
        >>> import torch
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/swinv2-tiny-patch4-window8-256")
        >>> model = Swinv2ForMaskedImageModeling.from_pretrained("microsoft/swinv2-tiny-patch4-window8-256")

        >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
        >>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
        >>> # create random boolean mask of shape (batch_size, num_patches)
        >>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()

        >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
        >>> loss, reconstructed_pixel_values = outputs.loss, outputs.reconstruction
        >>> list(reconstructed_pixel_values.shape)
        [1, 3, 256, 256]
        ```N)r   r   r[  r   r]  r   r   r+   ro   r.   none)r   gh㈵>)r&   r'   r   r   r   )rl   r  re  r   r0   r   floorrx   r  r   rk   repeat_interleaver   r3   r   ry   l1_lossr   r9   r%   r   r   r   )rP   r   r   r   r[  r   r]  r  r   r  r6   r9   sequence_lengthr7   r8   reconstructed_pixel_valuesmasked_im_lossrq   r   reconstruction_lossrJ   r    r    r!   rT     sH   (	
 z$Swinv2ForMaskedImageModeling.forwardr  )r   r   r   rO   r   r   r   r   r   r   r%   rT   rY   r    r    rQ   r!   r    s0    	r  a  
    Swinv2 Model transformer with an image classification head on top (a linear layer on top of the final hidden state
    of the [CLS] token) e.g. for ImageNet.

    <Tip>

        Note that it's possible to fine-tune SwinV2 on higher resolution images than the ones it has been trained on, by
        setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained
        position embeddings to the higher resolution.

    </Tip>
    c                       r  )Swinv2ForImageClassificationc                    sP   t  | |j| _t|| _|jdkrt| jj|jnt | _	| 
  d S r*  )rN   rO   
num_labelsry  re  r   r   rz  r  
classifierr  rP   rl   rQ   r    r!   rO   c  s   
"z%Swinv2ForImageClassification.__init__NFr   labelsr   r[  r   r]  rA   c                 K   s   |dur|n| j j}| j|||||d}|d }	| |	}
d}|dur+| ||
| j }|sA|
f|dd  }|dur?|f| S |S t||
|j|j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N)r   r[  r   r]  r   r+   )r&   r*   r   r   r   )	rl   r  re  r  loss_functionr)   r   r   r   )rP   r   r  r   r[  r   r]  r  r   r  r*   r&   rJ   r    r    r!   rT   q  s.   
z$Swinv2ForImageClassification.forwardr  )r   r   r   rO   r   r   r   
LongTensorr   r   r)   rT   rY   r    r    rQ   r!   r  S  s0    	r  zO
    Swinv2 backbone, to be used with frameworks like DETR and MaskFormer.
    c                       sZ   e Zd Z fddZdd Ze			ddededB dedB d	edB d
ef
ddZ	  Z
S )Swinv2Backbonec                    sX   t     jg fddtt jD  | _t | _t	 | jj
| _|   d S )Nc                    s   g | ]}t  jd |  qS )r+   )r   rb   )r  rD  rl   r    r!   r    s    z+Swinv2Backbone.__init__.<locals>.<listcomp>)rN   rO   rb   r=  rQ  rR  rz  rZ   rn   rL  r_   r{  r  r  rQ   r  r!   rO     s
   &
zSwinv2Backbone.__init__c                 C   r  rM   r  rU   r    r    r!   r    r  z#Swinv2Backbone.get_input_embeddingsNr   r   r[  r]  rA   c                 K   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}| |\}}| j|||dd|d}|r5|jn|d }	d}
t| j|	D ]\}}|| j	v rO|
|f7 }
qA|si|
f}|r^||d f7 }|rg||d f7 }|S t
|
|rp|jnd|jdS )	a  
        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> processor = AutoImageProcessor.from_pretrained("microsoft/swinv2-tiny-patch4-window8-256")
        >>> model = AutoBackbone.from_pretrained(
        ...     "microsoft/swinv2-tiny-patch4-window8-256", out_features=["stage1", "stage2", "stage3", "stage4"]
        ... )

        >>> inputs = processor(image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> feature_maps = outputs.feature_maps
        >>> list(feature_maps[-1].shape)
        [1, 2048, 7, 7]
        ```NT)r   r[  r\  r]  r.   r    r   r+   )feature_mapsr   r   )rl   r  r[  r   rn   r{  r   r  stage_namesout_featuresr
   r   r   )rP   r   r   r[  r]  r  r  r   r   r   r  rZ  hidden_staterJ   r    r    r!   rT     s>   #	

zSwinv2Backbone.forward)NNN)r   r   r   rO   r  r   r   r   r
   rT   rY   r    r    rQ   r!   r    s$    
r  )r  r  ry  rd  r  )r=   F)<r   collections.abcr   r   dataclassesr   r   r   r    r   ri  activationsr   backbone_utilsr   modeling_layersr	   modeling_outputsr
   modeling_utilsr   utilsr   r   r   r   configuration_swinv2r   
get_loggerr   loggerr   r#   r%   r)   r;   r<   rW   r   rK   r   rL   rZ   r[   r   r   r   r   r  r	  r
  r<  rL  rd  ry  r  r  r  __all__r    r    r    r!   <module>   s   
 ]+6 
z<`Sh?X