o
    iL                  	   @   s  d Z ddlZddlZddlZddlmZ ddlmZm	Z	 ddl
Z
ddl
mZmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZmZmZmZ ddlmZ ddl m!Z! e"e#Z$eeddG dd deZ%eeddG dd deZ&eeddG dd deZ'eeddG dd deZ(dd Z)dd  Z*dOd#e
jd$e+d%e,d&e
jfd'd(Z-G d)d* d*ej.Z/G d+d, d,ej.Z0G d-d. d.ej.Z1G d/d0 d0ej.Z2G d1d2 d2ej.Z3G d3d4 d4ej.Z4G d5d6 d6ej.Z5G d7d8 d8ej.Z6G d9d: d:ej.Z7G d;d< d<ej.Z8G d=d> d>eZ9G d?d@ d@ej.Z:eG dAdB dBeZ;eG dCdD dDe;Z<edEdG dFdG dGe;Z=edHdG dIdJ dJe;Z>edKdG dLdM dMe;eZ?g dNZ@dS )Pz!PyTorch Swinv2 Transformer model.    N)	dataclass)OptionalUnion)Tensornn   )ACT2FN)GradientCheckpointingLayer)BackboneOutput)PreTrainedModel) find_pruneable_heads_and_indicesmeshgridprune_linear_layer)ModelOutputauto_docstringlogging	torch_int)BackboneMixin   )Swinv2ConfigzP
    Swinv2 encoder's outputs, with potential hidden states and attentions.
    )custom_introc                   @   sr   e Zd ZU dZdZeej ed< dZ	ee
ejdf  ed< dZee
ejdf  ed< dZee
ejdf  ed< dS )Swinv2EncoderOutputa  
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    Nlast_hidden_state.hidden_states
attentionsreshaped_hidden_states)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   tupler   r    r$   r$   g/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/transformers/models/swinv2/modeling_swinv2.pyr   *   s   
 	r   zX
    Swinv2 model's outputs that also contains a pooling of the last hidden states.
    c                   @      e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eeejdf  ed< dZeeejdf  ed< dZeeejdf  ed< dS )	Swinv2ModelOutputa  
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
        Average pooling of the last layer hidden-state.
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    Nr   pooler_output.r   r   r   )r   r   r   r   r   r   r    r!   r"   r(   r   r#   r   r   r$   r$   r$   r%   r'   A   s   
 r'   z,
    Swinv2 masked image model outputs.
    c                   @   s   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eeejdf  ed< dZeeejdf  ed< dZeeejdf  ed< ed	d
 ZdS )Swinv2MaskedImageModelingOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `bool_masked_pos` is provided):
        Masked image modeling (MLM) loss.
    reconstruction (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
        Reconstructed pixel values.
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    Nlossreconstruction.r   r   r   c                 C   s   t dt | jS )Nzlogits attribute is deprecated and will be removed in version 5 of Transformers. Please use the reconstruction attribute to retrieve the final output instead.)warningswarnFutureWarningr+   selfr$   r$   r%   logitsv   s
   z&Swinv2MaskedImageModelingOutput.logits)r   r   r   r   r*   r   r    r!   r"   r+   r   r#   r   r   propertyr1   r$   r$   r$   r%   r)   [   s   
 r)   z2
    Swinv2 outputs for image classification.
    c                   @   r&   )	Swinv2ImageClassifierOutputa7  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Classification (or regression if config.num_labels==1) loss.
    logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Classification (or regression if config.num_labels==1) scores (before SoftMax).
    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
        shape `(batch_size, hidden_size, height, width)`.

        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
        include the spatial dimensions.
    Nr*   r1   .r   r   r   )r   r   r   r   r*   r   r    r!   r"   r1   r   r#   r   r   r$   r$   r$   r%   r3      s   
 r3   c                 C   sR   | j \}}}}| ||| ||| ||} | dddddd d|||}|S )z2
    Partitions the given input into windows.
    r   r   r            shapeviewpermute
contiguous)input_featurewindow_size
batch_sizeheightwidthnum_channelswindowsr$   r$   r%   window_partition   s   $rD   c                 C   sN   | j d }| d|| || |||} | dddddd d|||} | S )z?
    Merges windows to produce higher resolution features.
    r7   r   r   r   r4   r5   r6   r8   )rC   r>   r@   rA   rB   r$   r$   r%   window_reverse   s   
$rE           Finput	drop_probtrainingreturnc                 C   sd   |dks|s| S d| }| j d fd| jd   }|tj|| j| jd }|  | || }|S )aF  
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    rF   r   r   )r   )dtypedevice)r9   ndimr    randrK   rL   floor_div)rG   rH   rI   	keep_probr9   random_tensoroutputr$   r$   r%   	drop_path   s   
rT   c                       sT   e Zd ZdZddee ddf fddZdejdejfdd	Z	de
fd
dZ  ZS )Swinv2DropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).NrH   rJ   c                    s   t    || _d S N)super__init__rH   )r0   rH   	__class__r$   r%   rX      s   

zSwinv2DropPath.__init__r   c                 C   s   t || j| jS rV   )rT   rH   rI   r0   r   r$   r$   r%   forward   s   zSwinv2DropPath.forwardc                 C   s   d| j  S )Nzp=)rH   r/   r$   r$   r%   
extra_repr   s   zSwinv2DropPath.extra_reprrV   )r   r   r   r   r   floatrX   r    r   r\   strr]   __classcell__r$   r$   rY   r%   rU      s
    rU   c                
       sr   e Zd ZdZd fdd	Zdejdededejfd	d
Z		dde	ej
 de	ej dedeej fddZ  ZS )Swinv2EmbeddingszW
    Construct the patch and position embeddings. Optionally, also the mask token.
    Fc                    s   t    t|| _| jj}| jj| _|r tt	
dd|jnd | _|jr5tt	
d|d |j| _nd | _t|j| _t|j| _|j| _|| _d S )Nr   )rW   rX   Swinv2PatchEmbeddingspatch_embeddingsnum_patches	grid_size
patch_gridr   	Parameterr    zeros	embed_dim
mask_tokenuse_absolute_embeddingsposition_embeddings	LayerNormnormDropouthidden_dropout_probdropout
patch_sizeconfig)r0   rs   use_mask_tokenrd   rY   r$   r%   rX      s   


 
zSwinv2Embeddings.__init__
embeddingsr@   rA   rJ   c                 C   s   |j d d }| jj d d }tj s||kr||kr| jS | jddddf }| jddddf }|j d }|| j }	|| j }
t|d }|d|||}|dddd}t	j
j||	|
fdd	d
}|dddddd|}tj||fddS )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   Nr7         ?r   r   r4   bicubicF)sizemodealign_cornersdim)r9   rl   r    jit
is_tracingrr   r   reshaper;   r   
functionalinterpolater:   cat)r0   ru   r@   rA   rd   num_positionsclass_pos_embedpatch_pos_embedr|   
new_height	new_widthsqrt_num_positionsr$   r$   r%   interpolate_pos_encoding   s(   



z)Swinv2Embeddings.interpolate_pos_encodingNpixel_valuesbool_masked_posr   c                 C   s   |j \}}}}| |\}}	| |}| \}
}}|d ur8| j|
|d}|d|}|d|  ||  }| jd urN|rI|| 	||| }n|| j }| 
|}||	fS )Nr7         ?)r9   rc   rn   rx   rj   expand	unsqueezetype_asrl   r   rq   )r0   r   r   r   _rB   r@   rA   ru   output_dimensionsr?   seq_lenmask_tokensmaskr$   r$   r%   r\     s   



zSwinv2Embeddings.forward)FNF)r   r   r   r   rX   r    r   intr   r   r!   
BoolTensorboolr#   r\   r`   r$   r$   rY   r%   ra      s    +ra   c                       sN   e Zd ZdZ fddZdd Zdeej de	ej
e	e f fdd	Z  ZS )
rb   z
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    c                    s   t    |j|j}}|j|j}}t|tjj	r|n||f}t|tjj	r)|n||f}|d |d  |d |d   }|| _|| _|| _|| _
|d |d  |d |d  f| _tj||||d| _d S )Nr   r   )kernel_sizestride)rW   rX   
image_sizerr   rB   ri   
isinstancecollectionsabcIterablerd   re   r   Conv2d
projection)r0   rs   r   rr   rB   hidden_sizerd   rY   r$   r%   rX   =  s   
 "zSwinv2PatchEmbeddings.__init__c                 C   s   || j d  dkrd| j d || j d   f}tj||}|| j d  dkr>ddd| j d || j d   f}tj||}|S )Nr   r   )rr   r   r   pad)r0   r   r@   rA   
pad_valuesr$   r$   r%   	maybe_padL  s    zSwinv2PatchEmbeddings.maybe_padr   rJ   c                 C   sV   |j \}}}}| |||}| |}|j \}}}}||f}|ddd}||fS )Nr4   r   )r9   r   r   flatten	transpose)r0   r   r   rB   r@   rA   ru   r   r$   r$   r%   r\   U  s   
zSwinv2PatchEmbeddings.forward)r   r   r   r   rX   r   r   r    r!   r#   r   r   r\   r`   r$   r$   rY   r%   rb   6  s
    .	rb   c                	       sh   e Zd ZdZejfdee dedejddf fddZ	d	d
 Z
dejdeeef dejfddZ  ZS )Swinv2PatchMerginga'  
    Patch Merging Layer.

    Args:
        input_resolution (`tuple[int]`):
            Resolution of input feature.
        dim (`int`):
            Number of input channels.
        norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
            Normalization layer class.
    input_resolutionr|   
norm_layerrJ   Nc                    sB   t    || _|| _tjd| d| dd| _|d| | _d S )Nr5   r4   Fbias)rW   rX   r   r|   r   Linear	reductionrn   )r0   r   r|   r   rY   r$   r%   rX   n  s
   
zSwinv2PatchMerging.__init__c                 C   sF   |d dkp|d dk}|r!ddd|d d|d f}t j||}|S )Nr4   r   r   )r   r   r   )r0   r=   r@   rA   
should_padr   r$   r$   r%   r   u  s
   zSwinv2PatchMerging.maybe_padr=   input_dimensionsc                 C   s   |\}}|j \}}}|||||}| |||}|d d dd ddd dd d f }|d d dd ddd dd d f }	|d d dd ddd dd d f }
|d d dd ddd dd d f }t||	|
|gd}||dd| }| |}| |}|S )Nr   r4   r   r7   r5   )r9   r:   r   r    r   r   rn   )r0   r=   r   r@   rA   r?   r|   rB   input_feature_0input_feature_1input_feature_2input_feature_3r$   r$   r%   r\   }  s   $$$$

zSwinv2PatchMerging.forward)r   r   r   r   r   rm   r#   r   ModulerX   r   r    r   r\   r`   r$   r$   rY   r%   r   a  s
    **r   c                       sb   e Zd Zddgf fdd	Z			ddejdeej deej d	ee d
e	ej f
ddZ
  ZS )Swinv2SelfAttentionr   c              
      s  t    || dkrtd| d| d|| _t|| | _| j| j | _t|tj	j
r0|n||f| _|| _ttdt|ddf | _ttjddd	d
tjd	dtjd|dd
| _tj| jd d  | jd tjd }tj| jd d  | jd tjd }tt||gddddd d}|d dkr|d d d d d d df  |d d   < |d d d d d d df  |d d   < n3|dkr|d d d d d d df  | jd d   < |d d d d d d df  | jd d   < |d9 }t|t t!|d  t" d }|#t$| j% j&}| j'd|dd t| jd }	t| jd }
tt|	|
gdd}t(|d}|d d d d d f |d d d d d f  }|ddd }|d d d d df  | jd d 7  < |d d d d df  | jd d 7  < |d d d d df  d| jd  d 9  < |)d}| j'd|dd tj| j| j|j*d
| _+tj| j| jdd
| _,tj| j| j|j*d
| _-t.|j/| _0d S )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()
   r   r4   i   Tr   )inplaceFrK   ij)indexing   r   relative_coords_table)
persistentr7   relative_position_index)1rW   rX   
ValueErrornum_attention_headsr   attention_head_sizeall_head_sizer   r   r   r   r>   pretrained_window_sizer   rg   r    logoneslogit_scale
Sequentialr   ReLUcontinuous_position_bias_mlparangeint64r^   stackr   r;   r<   r   signlog2absmathtonext
parametersrK   register_bufferr   sumqkv_biasquerykeyvaluero   attention_probs_dropout_probrq   )r0   rs   r|   	num_headsr>   r   relative_coords_hrelative_coords_wr   coords_hcoords_wcoordscoords_flattenrelative_coordsr   rY   r$   r%   rX     s`   
"&((
,.
..&,((,
zSwinv2SelfAttention.__init__NFr   attention_mask	head_maskoutput_attentionsrJ   c                 C   s"  |j \}}}| ||d| j| jdd}| ||d| j| jdd}	| ||d| j| jdd}
tj	j
|ddtj	j
|	dddd }tj| jtdd }|| }| | jd| j}|| jd | jd | jd  | jd | jd  d}|ddd }d	t| }||d }|d ur|j d }||| || j|||dd }||dd }|d| j||}tj	j|dd}| |}|d ur|| }t||
}|dddd
 }| d d | jf }||}|r||f}|S |f}|S )Nr7   r   r4   r{   g      Y@)maxr      r   )r9   r   r:   r   r   r   r   r   r   r   	normalizer    clampr   r   r   expr   r   r   r>   r;   r<   sigmoidr   softmaxrq   matmulrx   r   )r0   r   r   r   r   r?   r|   rB   query_layer	key_layervalue_layerattention_scoresr   relative_position_bias_tablerelative_position_bias
mask_shapeattention_probscontext_layernew_context_layer_shapeoutputsr$   r$   r%   r\     sd   &


zSwinv2SelfAttention.forwardNNF)r   r   r   rX   r    r   r   r!   r   r#   r\   r`   r$   r$   rY   r%   r     s     @r   c                       s8   e Zd Z fddZdejdejdejfddZ  ZS )Swinv2SelfOutputc                    s*   t    t||| _t|j| _d S rV   )rW   rX   r   r   densero   r   rq   r0   rs   r|   rY   r$   r%   rX     s   
zSwinv2SelfOutput.__init__r   input_tensorrJ   c                 C      |  |}| |}|S rV   r   rq   )r0   r   r   r$   r$   r%   r\   $  s   

zSwinv2SelfOutput.forwardr   r   r   rX   r    r   r\   r`   r$   r$   rY   r%   r     s    $r   c                       sd   e Zd Zd fdd	Zdd Z			ddejd	eej d
eej dee	 de
ej f
ddZ  ZS )Swinv2Attentionr   c                    sL   t    t||||t|tjjr|n||fd| _t||| _	t
 | _d S )Nrs   r|   r   r>   r   )rW   rX   r   r   r   r   r   r0   r   rS   setpruned_heads)r0   rs   r|   r   r>   r   rY   r$   r%   rX   ,  s   
	zSwinv2Attention.__init__c                 C   s   t |dkrd S t|| jj| jj| j\}}t| jj|| j_t| jj|| j_t| jj	|| j_	t| j
j|dd| j
_| jjt | | j_| jj| jj | j_| j|| _d S )Nr   r   r{   )lenr   r0   r   r   r  r   r   r   r   rS   r   r   union)r0   headsindexr$   r$   r%   prune_heads:  s   zSwinv2Attention.prune_headsNFr   r   r   r   rJ   c                 C   s6   |  ||||}| |d |}|f|dd   }|S )Nr   r   )r0   rS   )r0   r   r   r   r   self_outputsattention_outputr   r$   r$   r%   r\   L  s   zSwinv2Attention.forwardr   r   )r   r   r   rX   r  r    r   r   r!   r   r#   r\   r`   r$   r$   rY   r%   r  +  s"    r  c                       2   e Zd Z fddZdejdejfddZ  ZS )Swinv2Intermediatec                    sJ   t    t|t|j| | _t|jt	rt
|j | _d S |j| _d S rV   )rW   rX   r   r   r   	mlp_ratior   r   
hidden_actr_   r   intermediate_act_fnr   rY   r$   r%   rX   [  s
   
zSwinv2Intermediate.__init__r   rJ   c                 C   r  rV   )r   r  r[   r$   r$   r%   r\   c     

zSwinv2Intermediate.forwardr  r$   r$   rY   r%   r  Z  s    r  c                       r  )Swinv2Outputc                    s4   t    tt|j| || _t|j| _	d S rV   )
rW   rX   r   r   r   r  r   ro   rp   rq   r   rY   r$   r%   rX   k  s   
zSwinv2Output.__init__r   rJ   c                 C   r  rV   r  r[   r$   r$   r%   r\   p  r  zSwinv2Output.forwardr  r$   r$   rY   r%   r  j  s    r  c                       s   e Zd Z	d fdd	Zdeeeef eeef f fddZdd	 Zd
d Z		dde	j
deeef dee	j dee dee	j
e	j
f f
ddZ  ZS )Swinv2LayerrF   r   c           	         s   t    || _| |j|jf||f\}}|d | _|d | _t|||| jt|tj	j
r/|n||fd| _tj||jd| _|dkrGt|nt | _t||| _t||| _tj||jd| _d S )Nr   r  epsrF   )rW   rX   r   _compute_window_shiftr>   
shift_sizer  r   r   r   r   	attentionr   rm   layer_norm_epslayernorm_beforerU   IdentityrT   r  intermediater  rS   layernorm_after)	r0   rs   r|   r   r   drop_path_rater  r   r>   rY   r$   r%   rX   w  s*   


	zSwinv2Layer.__init__rJ   c                 C   s6   dd t | j|D }dd t | j||D }||fS )Nc                 S   s    g | ]\}}||kr|n|qS r$   r$   ).0rwr$   r$   r%   
<listcomp>  s     z5Swinv2Layer._compute_window_shift.<locals>.<listcomp>c                 S   s"   g | ]\}}}||krd n|qS r  r$   )r#  r$  r%  sr$   r$   r%   r&    s   " )zipr   )r0   target_window_sizetarget_shift_sizer>   r  r$   r$   r%   r    s   z!Swinv2Layer._compute_window_shiftc              	   C   s  | j dkrtjd||df|d}td| j t| j | j  t| j  d f}td| j t| j | j  t| j  d f}d}|D ]}|D ]}	||d d ||	d d f< |d7 }qDq@t|| j}
|
d| j| j }
|
d|
d }||dkd|dkd}|S d }|S )Nr   r   r   r7   r4   g      YrF   )	r  r    rh   slicer>   rD   r:   r   masked_fill)r0   r@   rA   rK   img_maskheight_sliceswidth_slicescountheight_slicewidth_slicemask_windows	attn_maskr$   r$   r%   get_attn_mask  s.   

zSwinv2Layer.get_attn_maskc                 C   sR   | j || j   | j  }| j || j   | j  }ddd|d|f}tj||}||fS Nr   )r>   r   r   r   )r0   r   r@   rA   	pad_right
pad_bottomr   r$   r$   r%   r     s
   zSwinv2Layer.maybe_padNFr   r   r   r   c                 C   s  |\}}|  \}}}	|}
|||||	}| |||\}}|j\}}}}| jdkr9tj|| j | j fdd}n|}t|| j}|d| j| j |	}| j	|||j
d}|d ur_||j}| j||||d}|d }|d| j| j|	}t|| j||}| jdkrtj|| j| jfdd}n|}|d dkp|d dk}|r|d d d |d |d d f  }|||| |	}| |}|
| | }| |}| |}|| | | }|r||d	 f}|S |f}|S )
Nr   )r   r4   )shiftsdimsr7   r   )r   r   r6   r   )rx   r:   r   r9   r  r    rollrD   r>   r5  rK   r   rL   r  rE   r<   r  rT   r   rS   r!  )r0   r   r   r   r   r@   rA   r?   r   channelsshortcutr   
height_pad	width_padshifted_hidden_stateshidden_states_windowsr4  attention_outputsr  attention_windowsshifted_windows
was_paddedlayer_outputlayer_outputsr$   r$   r%   r\     sH   

$


zSwinv2Layer.forward)rF   r   r   r   )r   r   r   rX   r#   r   r  r5  r   r    r   r   r!   r   r\   r`   r$   r$   rY   r%   r  v  s&    &
r  c                       s^   e Zd Z	d fdd	Z		ddejdeeef deej	 d	ee
 d
eej f
ddZ  ZS )Swinv2Stager   c	              
      s   t    || _|| _g }	t|D ]}
t||||||
 |
d dkr#dn|jd |d}|	| qt	|	| _
|d urE|||tjd| _nd | _d| _d S )Nr4   r   )rs   r|   r   r   r"  r  r   )r|   r   F)rW   rX   rs   r|   ranger  r>   appendr   
ModuleListblocksrm   
downsamplepointing)r0   rs   r|   r   depthr   rT   rM  r   rL  iblockrY   r$   r%   rX     s(   
	
zSwinv2Stage.__init__NFr   r   r   r   rJ   c                 C   s   |\}}t | jD ]\}}|d ur|| nd }	||||	|}
|
d }q	|}| jd urD|d d |d d }}||||f}| ||}n||||f}|||f}|rY||
dd  7 }|S )Nr   r   r4   )	enumeraterL  rM  )r0   r   r   r   r   r@   rA   rP  layer_modulelayer_head_maskrG  !hidden_states_before_downsamplingheight_downsampledwidth_downsampledr   stage_outputsr$   r$   r%   r\     s(   


zSwinv2Stage.forwardr  r   )r   r   r   rX   r    r   r#   r   r   r!   r   r\   r`   r$   r$   rY   r%   rH    s      
rH  c                       s|   e Zd Zd fdd	Z					ddejdeeef d	eej	 d
ee
 dee
 dee
 dee
 deeef fddZ  ZS )Swinv2Encoderr   r   r   r   c                    s  t    t|j| _|| _| jjd ur|j}dd tjd|j	t
|jddD }g }t| jD ]M}t|t|jd|  |d d|  |d d|  f|j| |j| |t
|jd | t
|jd |d   || jd k rqtnd || d}|| q0t|| _d	| _d S )
Nc                 S   s   g | ]}|  qS r$   )item)r#  xr$   r$   r%   r&  :  s    z*Swinv2Encoder.__init__.<locals>.<listcomp>r   cpu)rL   r4   r   )rs   r|   r   rO  r   rT   rM  r   F)rW   rX   r  depths
num_layersrs   pretrained_window_sizesr    linspacer"  r   rI  rH  r   ri   r   r   rJ  r   rK  layersgradient_checkpointing)r0   rs   re   r`  dprrb  i_layerstagerY   r$   r%   rX   4  s*   
$*

zSwinv2Encoder.__init__NFTr   r   r   r   output_hidden_states(output_hidden_states_before_downsamplingreturn_dictrJ   c                 C   s  |rdnd }|r
dnd }	|rdnd }
|r7|j \}}}|j|g||R  }|dddd}||f7 }|	|f7 }	t| jD ]\}}|d urH|| nd }|||||}|d }|d }|d }|d |d f}|r|r|j \}}}|j|g|d |d f|R  }|dddd}||f7 }|	|f7 }	n'|r|s|j \}}}|j|g||R  }|dddd}||f7 }|	|f7 }	|r|
|dd  7 }
q<|stdd	 |||
|	fD S t|||
|	d
S )Nr$   r   r   r   r4   r   r7   c                 s   s    | ]	}|d ur|V  qd S rV   r$   )r#  vr$   r$   r%   	<genexpr>  s    z(Swinv2Encoder.forward.<locals>.<genexpr>)r   r   r   r   )r9   r:   r;   rR  rb  r#   r   )r0   r   r   r   r   rg  rh  ri  all_hidden_statesall_reshaped_hidden_statesall_self_attentionsr?   r   r   reshaped_hidden_staterP  rS  rT  rG  rU  r   r$   r$   r%   r\   M  sh   






zSwinv2Encoder.forward)rZ  )NFFFT)r   r   r   rX   r    r   r#   r   r   r!   r   r   r   r\   r`   r$   r$   rY   r%   rY  3  s0    

	rY  c                   @   s0   e Zd ZU eed< dZdZdZdgZdd Z	dS )	Swinv2PreTrainedModelrs   swinv2r   TrH  c                 C   s   t |tjtjfr#|jjjd| jjd |j	dur!|j	j
  dS dS t |tjr8|j	j
  |jjd dS t |trW|jdurH|jj
  |jdurU|jj
  dS dS t |trh|jjtd dS dS )zInitialize the weightsrF   )meanstdNr   r   )r   r   r   r   weightdatanormal_rs   initializer_ranger   zero_rm   fill_ra   rj   rl   r   r   r   r   )r0   moduler$   r$   r%   _init_weights  s"   




z#Swinv2PreTrainedModel._init_weightsN)
r   r   r   r   r"   base_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modulesr{  r$   r$   r$   r%   rp    s   
 rp  c                       s   e Zd Zd fdd	Zdd Zdd Ze													dd
eej	 deej
 deej	 dee dee dedee deeef fddZ  ZS )Swinv2ModelTFc                    s   t  | || _t|j| _t|jd| jd   | _t	||d| _
t|| j
j| _tj| j|jd| _|r<tdnd| _|   dS )a  
        add_pooling_layer (`bool`, *optional*, defaults to `True`):
            Whether or not to apply pooling layer.
        use_mask_token (`bool`, *optional*, defaults to `False`):
            Whether or not to create and apply mask tokens in the embedding layer.
        r4   r   )rt   r  N)rW   rX   rs   r  r^  r_  r   ri   num_featuresra   ru   rY  rf   encoderr   rm   r  	layernormAdaptiveAvgPool1dpooler	post_init)r0   rs   add_pooling_layerrt   rY   r$   r%   rX     s   zSwinv2Model.__init__c                 C      | j jS rV   ru   rc   r/   r$   r$   r%   get_input_embeddings     z Swinv2Model.get_input_embeddingsc                 C   s*   |  D ]\}}| jj| j| qdS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr  layerr  r  )r0   heads_to_pruner  r
  r$   r$   r%   _prune_heads  s   zSwinv2Model._prune_headsNr   r   r   r   rg  r   ri  rJ   c                 C   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}|du r&td| |t| j j}| j|||d\}}	| j	||	||||d}
|
d }| 
|}d}| jdurd| |dd}t|d}|sr||f|
dd  }|S t|||
j|
j|
jdS )	z
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        Nz You have to specify pixel_values)r   r   )r   r   rg  ri  r   r   r4   )r   r(   r   r   r   )rs   r   rg  use_return_dictr   get_head_maskr  r^  ru   r  r  r  r   r    r   r'   r   r   r   )r0   r   r   r   r   rg  r   ri  embedding_outputr   encoder_outputssequence_outputpooled_outputrS   r$   r$   r%   r\     sD   
	

zSwinv2Model.forward)TFNNNNNFN)r   r   r   rX   r  r  r   r   r    r!   r   r   r   r#   r'   r\   r`   r$   r$   rY   r%   r    s:    
	r  a~  
        Swinv2 Model with a decoder on top for masked image modeling, as proposed in
    [SimMIM](https://huggingface.co/papers/2111.09886).

        <Tip>

        Note that we provide a script to pre-train this model on custom data in our [examples
        directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining).

        </Tip>
    c                       s   e Zd Z fddZe							ddeej deej deej dee	 d	ee	 d
e	dee	 de
eef fddZ  ZS )Swinv2ForMaskedImageModelingc                    sn   t  | t|ddd| _t|jd|jd   }ttj	||j
d |j ddt|j
| _|   d S )NFT)r  rt   r4   r   )in_channelsout_channelsr   )rW   rX   r  rq  r   ri   r_  r   r   r   encoder_striderB   PixelShuffledecoderr  )r0   rs   r  rY   r$   r%   rX   '  s   
z%Swinv2ForMaskedImageModeling.__init__NFr   r   r   r   rg  r   ri  rJ   c              	   C   s>  |dur|n| j j}| j|||||||d}|d }	|	dd}	|	j\}
}}t|d  }}|	|
|||}	| |	}d}|dur}| j j	| j j
 }|d||}|| j j
d| j j
dd }tjj||dd	}||  | d
  | j j }|s|f|dd  }|dur|f| S |S t|||j|j|jdS )a?  
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, Swinv2ForMaskedImageModeling
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/swinv2-tiny-patch4-window8-256")
        >>> model = Swinv2ForMaskedImageModeling.from_pretrained("microsoft/swinv2-tiny-patch4-window8-256")

        >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
        >>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
        >>> # create random boolean mask of shape (batch_size, num_patches)
        >>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()

        >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
        >>> loss, reconstructed_pixel_values = outputs.loss, outputs.reconstruction
        >>> list(reconstructed_pixel_values.shape)
        [1, 3, 256, 256]
        ```N)r   r   r   rg  r   ri  r   r   r4   rv   r7   none)r   gh㈵>)r*   r+   r   r   r   )rs   r  rq  r   r9   r   floorr   r  r   rr   repeat_interleaver   r<   r   r   l1_lossr   rB   r)   r   r   r   )r0   r   r   r   r   rg  r   ri  r   r  r?   rB   sequence_lengthr@   rA   reconstructed_pixel_valuesmasked_im_lossrx   r   reconstruction_lossrS   r$   r$   r%   r\   7  sJ   &

 z$Swinv2ForMaskedImageModeling.forwardr  )r   r   r   rX   r   r   r    r!   r   r   r   r#   r)   r\   r`   r$   r$   rY   r%   r    s6    
	r  a  
    Swinv2 Model transformer with an image classification head on top (a linear layer on top of the final hidden state
    of the [CLS] token) e.g. for ImageNet.

    <Tip>

        Note that it's possible to fine-tune SwinV2 on higher resolution images than the ones it has been trained on, by
        setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained
        position embeddings to the higher resolution.

    </Tip>
    c                       s   e Zd Z fddZe							ddeej deej deej dee	 d	ee	 d
e	dee	 de
eef fddZ  ZS )Swinv2ForImageClassificationc                    sP   t  | |j| _t|| _|jdkrt| jj|jnt | _	| 
  d S r6  )rW   rX   
num_labelsr  rq  r   r   r  r  
classifierr  r0   rs   rY   r$   r%   rX     s   
"z%Swinv2ForImageClassification.__init__NFr   r   labelsr   rg  r   ri  rJ   c                 C   s   |dur|n| j j}| j||||||d}|d }	| |	}
d}|dur,| ||
| j }|sB|
f|dd  }|dur@|f| S |S t||
|j|j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N)r   r   rg  r   ri  r   r4   )r*   r1   r   r   r   )	rs   r  rq  r  loss_functionr3   r   r   r   )r0   r   r   r  r   rg  r   ri  r   r  r1   r*   rS   r$   r$   r%   r\     s0   	
z$Swinv2ForImageClassification.forwardr  )r   r   r   rX   r   r   r    r!   
LongTensorr   r   r#   r3   r\   r`   r$   r$   rY   r%   r    s6    
	r  zO
    Swinv2 backbone, to be used with frameworks like DETR and MaskFormer.
    c                       sZ   e Zd Z fddZdd Ze			ddedee dee d	ee d
e	f
ddZ
  ZS )Swinv2Backbonec                    sd   t    t     jg fddtt jD  | _t | _	t
 | j	j| _|   d S )Nc                    s   g | ]}t  jd |  qS )r4   )r   ri   )r#  rP  rs   r$   r%   r&    s    z+Swinv2Backbone.__init__.<locals>.<listcomp>)rW   rX   _init_backboneri   rI  r  r^  r  ra   ru   rY  rf   r  r  r  rY   r  r%   rX     s   &
zSwinv2Backbone.__init__c                 C   r  rV   r  r/   r$   r$   r%   r    r  z#Swinv2Backbone.get_input_embeddingsNr   r   rg  ri  rJ   c              	   C   s   |dur|n| j j}|dur|n| j j}|dur|n| j j}| |\}}| j||d|dd|d}|r6|jn|d }d}	t| j|D ]\}
}|
| j	v rP|	|f7 }	qB|sj|	f}|r_||d f7 }|rh||d f7 }|S t
|	|rq|jnd|jdS )	aK  
        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> processor = AutoImageProcessor.from_pretrained("microsoft/swinv2-tiny-patch4-window8-256")
        >>> model = AutoBackbone.from_pretrained(
        ...     "microsoft/swinv2-tiny-patch4-window8-256", out_features=["stage1", "stage2", "stage3", "stage4"]
        ... )

        >>> inputs = processor(image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> feature_maps = outputs.feature_maps
        >>> list(feature_maps[-1].shape)
        [1, 2048, 7, 7]
        ```NT)r   r   rg  rh  ri  r7   r$   r   r4   )feature_mapsr   r   )rs   r  rg  r   ru   r  r   r(  stage_namesout_featuresr
   r   r   )r0   r   r   rg  ri  r  r   r   r   r  rf  hidden_staterS   r$   r$   r%   r\     s@    


zSwinv2Backbone.forward)NNN)r   r   r   rX   r  r   r   r   r   r
   r\   r`   r$   r$   rY   r%   r    s$    r  )r  r  r  rp  r  )rF   F)Ar   collections.abcr   r   r,   dataclassesr   typingr   r   r    r   r   activationsr   modeling_layersr	   modeling_outputsr
   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   r   r   utils.backbone_utilsr   configuration_swinv2r   
get_loggerr   loggerr   r'   r)   r3   rD   rE   r^   r   rT   r   rU   ra   rb   r   r   r   r  r  r  r  rH  rY  rp  r  r  r  r  __all__r$   r$   r$   r%   <module>   s   
 ]+6 /}@dcg@W