o
    i                    @  s  d Z ddlmZ ddlZddlZddlZddlmZ ddlm	Z	 ddl
mZ ddlmZmZ ddlZdd	lmZ dd
lmZmZmZmZmZmZ ddlmZ ddlmZmZmZm Z m!Z!m"Z" ddl#m$Z$ e!%e&Z'dZ(dZ)g dZ*dZ+dZ,e	G dd deZ-e	G dd deZ.e	G dd deZ/e	G dd deZ0ddd d!Z1ded%d&Z2	)dfdgd0d1Z3G d2d3 d3ej4j5Z6G d4d5 d5ej4j5Z7G d6d7 d7ej4j5Z8G d8d9 d9ej4j5Z9G d:d; d;ej4j5Z:G d<d= d=ej4j5Z;G d>d? d?ej4j5Z<G d@dA dAej4j5Z=G dBdC dCej4j5Z>G dDdE dEej4j5Z?G dFdG dGej4j5Z@G dHdI dIej4j5ZAG dJdK dKeZBdLZCdMZDdhdPdQZEG dRdS dSej4j5ZFeG dTdU dUej4j5ZGedVeCG dWdX dXeBZHG dYdZ dZej4j5ZIG d[d\ d\ej4j5ZJed]eCG d^d_ d_eBZKed`eCG dadb dbeBeZLg dcZMdS )izTF 2.0 Swin Transformer model.    )annotationsN)Iterable)	dataclass)partial)AnyCallable   )ACT2FN)TFPreTrainedModelTFSequenceClassificationLossget_initializerkeraskeras_serializableunpack_inputs)
shape_list)ModelOutputadd_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardloggingreplace_return_docstrings   )
SwinConfigr   z&microsoft/swin-tiny-patch4-window7-224)r   1   i   ztabby, tabby catc                   @  sB   e Zd ZU dZdZded< dZded< dZded< dZded< dS )	TFSwinEncoderOutputaH  
    Swin encoder's outputs, with potential hidden states and attentions.

    Args:
        last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each stage) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        reshaped_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each stage) of shape
            `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
    Ntf.Tensor | Nonelast_hidden_statetuple[tf.Tensor, ...] | Nonehidden_states
attentionsreshaped_hidden_states)	__name__
__module____qualname____doc__r   __annotations__r   r   r     r&   r&   ]/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/swin/modeling_tf_swin.pyr   D   s   
 r   c                   @  N   e Zd ZU dZdZded< dZded< dZded< dZded< dZ	ded	< dS )
TFSwinModelOutputa  
    Swin model's outputs that also contains a pooling of the last hidden states.

    Args:
        last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (`tf.Tensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
            Average pooling of the last layer hidden-state.
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each stage) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        reshaped_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each stage) of shape
            `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
    Nr   r   pooler_outputr   r   r   r    )
r!   r"   r#   r$   r   r%   r*   r   r   r    r&   r&   r&   r'   r)   e      
 r)   c                   @  sZ   e Zd ZU dZdZded< dZded< dZded< dZded< dZ	ded	< e
d
d ZdS )TFSwinMaskedImageModelingOutputa  
    Swin masked image model outputs.

    Args:
        loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `bool_masked_pos` is provided):
            Masked image modeling (MLM) loss.
        reconstruction (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`):
            Reconstructed pixel values.
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each stage) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        reshaped_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each stage) of shape
            `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
    Nr   lossreconstructionr   r   r   r    c                 C  s   t dt | jS )Nzlogits attribute is deprecated and will be removed in version 5 of Transformers. Please use the reconstruction attribute to retrieve the final output instead.)warningswarnFutureWarningr.   selfr&   r&   r'   logits   s
   z&TFSwinMaskedImageModelingOutput.logits)r!   r"   r#   r$   r-   r%   r.   r   r   r    propertyr4   r&   r&   r&   r'   r,      s   
 r,   c                   @  r(   )
TFSwinImageClassifierOutputa  
    Swin outputs for image classification.

    Args:
        loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each stage) of shape
            `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        reshaped_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each stage) of shape
            `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
    Nr   r-   r4   r   r   r   r    )
r!   r"   r#   r$   r-   r%   r4   r   r   r    r&   r&   r&   r'   r6      r+   r6   input_feature	tf.Tensorwindow_sizeintreturnc              	   C  sT   t | \}}}}t| ||| ||| ||f} t| d}t|d|||f}|S )z2
    Partitions the given input into windows.
    r   r   r            )r   tfreshape	transpose)r7   r9   
batch_sizeheightwidthnum_channelswindowsr&   r&   r'   window_partition   s   rI   rH   rE   rF   c              	   C  sz   t | d }t || ||  t j}t j||}t | ||| || ||df} t | d} t | |||df} | S )z?
    Merges windows to produce higher resolution features.
    r   r@   r<   )rA   shapecastint32mathfloordivrB   rC   )rH   r9   rE   rF   xyrD   r&   r&   r'   window_reverse   s   rQ           FTinput	drop_probfloattrainingboolscale_by_keepc           	      C  sz   |dks|s| S d| }t | }t|}|d gdg|d   }tj|}t||kdd}|dkr9|r9|| }| | S )zb
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
    rR   r   r         ?)r   lenrA   randomuniformwhere)	rS   rT   rV   rX   	keep_probinput_shapendimrJ   random_tensorr&   r&   r'   	drop_path   s   rb   c                      s<   e Zd ZdZdd fd	d
ZdddZ	ddddZ  ZS )TFSwinEmbeddingszW
    Construct the patch and position embeddings. Optionally, also the mask token.
    Fconfigr   use_mask_tokenrW   r;   Nonec                   sz   t  jdi | t|dd| _| jj| _| jj| _|j| _|| _|j	| _	t
jjddd| _t
jj|jdd| _|| _d S )Npatch_embeddingsnamenormh㈵>)ri   epsilondropoutr&   )super__init__TFSwinPatchEmbeddingsrg   num_patches	grid_size
patch_grid	embed_dimre   use_absolute_embeddingsr   layersLayerNormalizationrj   Dropouthidden_dropout_probrm   rd   )r3   rd   re   kwargs	__class__r&   r'   ro     s   


zTFSwinEmbeddings.__init__r_   tf.TensorShapec                 C  sX  | j r| jdd| jfddd| _nd | _| jr(| jd| jd | jfddd| _nd | _| jr0d S d| _t| dd d urXt	
| jj | jd  W d    n1 sSw   Y  t| d	d d urt	
| jj | jd d | jjg W d    n1 s}w   Y  t| d
d d urt	
| jj | jd  W d    d S 1 sw   Y  d S d S )Nr   zeros
mask_tokenrJ   initializerri   positional_embeddings)r   ri   Trg   rj   rm   )re   
add_weightrt   r   ru   rq   position_embeddingsbuiltgetattrrA   
name_scoperg   ri   buildrj   rd   rm   r3   r_   r&   r&   r'   r     s0   
"zTFSwinEmbeddings.buildNpixel_valuesr8   bool_masked_posbool | NonerV   !tuple[tf.Tensor, tuple[int, int]]c                 C  s   | j ||d\}}| j||d}t|\}}}|d urAt| j|d}	t|	|d}	t|d}
t|
|	j}
|d|
  |	|
  }| j	d urK|| j	 }| j
||d}||fS )NrV   r   r   r@   rY   )rg   rj   r   rA   repeatr   expand_dimsrK   dtyper   rm   )r3   r   r   rV   
embeddingsoutput_dimensionsrD   seq_len_mask_tokensmaskr&   r&   r'   call6  s   

zTFSwinEmbeddings.callF)rd   r   re   rW   r;   rf   r_   r}   r;   rf   )NF)r   r8   r   r   rV   rW   r;   r   )r!   r"   r#   r$   ro   r   r   __classcell__r&   r&   r{   r'   rc   
  s    
rc   c                      s@   e Zd ZdZ fddZdd
dZddddZdddZ  ZS )rp   z#
    Image to Patch Embedding.
    c                   s   t  jdi | |j|j}}|j|j}}t|tjj	r |n||f}t|tjj	r-|n||f}|d |d  |d |d   }|| _|| _|| _|| _
|d |d  |d |d  f| _tjj|| j| jddd| _d S )Nr   r   valid
projection)filterskernel_sizestridespaddingri   r&   )rn   ro   
image_size
patch_sizerG   rt   
isinstancecollectionsabcr   rq   rr   r   rv   Conv2Dr   )r3   rd   rz   r   r   rG   hidden_sizerq   r{   r&   r'   ro   S  s$    "zTFSwinPatchEmbeddings.__init__r   r8   rE   r:   rF   r;   c                 C  s   || j d  dkr!dddd| j d || j d   ff}t||}|| j d  dkrBddd| j d || j d   fdf}t||}|S )Nr   r   r   r   )r   rA   pad)r3   r   rE   rF   
pad_valuesr&   r&   r'   	maybe_padh  s   $$zTFSwinPatchEmbeddings.maybe_padFrV   rW   r   c                 C  s   t |\}}}}t r|| jkrtd| |||}t|d}| j||d}t|d}t |\}}	}}||f}
t|||	df}t|d}||
fS )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r   r=   r   r   r   r   r   r   r=   r@   r   r=   r   )	r   rA   executing_eagerlyrG   
ValueErrorr   rC   r   rB   )r3   r   rV   r   rG   rE   rF   r   rD   channelsr   r&   r&   r'   r   q  s   zTFSwinPatchEmbeddings.callNc                 C  sn   | j rd S d| _ t| dd d ur5t| jj | jd d d | jg W d    d S 1 s.w   Y  d S d S )NTr   )r   r   rA   r   r   ri   r   rG   r   r&   r&   r'   r     s   "zTFSwinPatchEmbeddings.build)r   r8   rE   r:   rF   r:   r;   r8   r   )r   r8   rV   rW   r;   r   N	r!   r"   r#   r$   ro   r   r   r   r   r&   r&   r{   r'   rp   N  s    
	rp   c                      sF   e Zd ZdZ	dd fddZdddZddddZdddZ  ZS ) TFSwinPatchMergingaB  
    Patch Merging Layer.

    Args:
        input_resolution (`tuple[int]`):
            Resolution of input feature.
        dim (`int`):
            Number of input channels.
        norm_layer (`keras.layer.Layer`, *optional*, defaults to `keras.layers.LayerNormalization`):
            Normalization layer class.
    Ninput_resolutiontuple[int, int]dimr:   
norm_layerCallable | Noner;   rf   c                   sd   t  jd	i | || _|| _tjjd| ddd| _|d u r*tjjddd| _	d S |dd| _	d S )
Nr=   F	reduction)use_biasri   rk   rj   rl   ri   rh   r&   )
rn   ro   r   r   r   rv   Denser   rw   rj   )r3   r   r   r   rz   r{   r&   r'   ro     s   zTFSwinPatchMerging.__init__r7   r8   rE   rF   c                 C  sH   |d dkp|d dk}|r"dd|d fd|d fdf}t ||}|S )Nr=   r   r   r   )rA   r   )r3   r7   rE   rF   
should_padr   r&   r&   r'   r     s
   zTFSwinPatchMerging.maybe_padFinput_dimensionsrV   rW   c                 C  s  |\}}t |\}}}t|||||f}| |||}|d d dd ddd dd d f }	|d d dd ddd dd d f }
|d d dd ddd dd d f }|d d dd ddd dd d f }t|	|
||gd}t||dd| f}| j||d}| j||d}|S )Nr   r=   r   r@   r>   r   )r   rA   rB   r   concatrj   r   )r3   r7   r   rV   rE   rF   rD   r   rG   input_feature_0input_feature_1input_feature_2input_feature_3r&   r&   r'   r     s   $$$$zTFSwinPatchMerging.callc                 C  s   | j rd S d| _ t| dd d ur3t| jj | jd d d| j g W d    n1 s.w   Y  t| dd d urat| jj | jd d d| j g W d    d S 1 sZw   Y  d S d S )NTr   r>   rj   )	r   r   rA   r   r   ri   r   r   rj   r   r&   r&   r'   r     s   "zTFSwinPatchMerging.buildr   )r   r   r   r:   r   r   r;   rf   )r7   r8   rE   r:   rF   r:   r;   r8   r   )r7   r8   r   r   rV   rW   r;   r8   r   r&   r&   r{   r'   r     s    
r   c                      s0   e Zd ZdZdd fd
dZddddZ  ZS )TFSwinDropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).NTrT   float | NonerX   rW   r;   rf   c                   s"   t  jdi | || _|| _d S Nr&   )rn   ro   rT   rX   )r3   rT   rX   rz   r{   r&   r'   ro     s   
zTFSwinDropPath.__init__FrS   r8   rV   c                 C  s   t || j|| jS r   )rb   rT   rX   )r3   rS   rV   r&   r&   r'   r     s   zTFSwinDropPath.call)NT)rT   r   rX   rW   r;   rf   r   )rS   r8   rV   rW   r;   r8   r!   r"   r#   r$   ro   r   r   r&   r&   r{   r'   r     s    r   c                      sF   e Zd Zd fdd	ZdddZd ddZ				d!d"ddZ  ZS )#TFSwinSelfAttentionrd   r   r   r:   	num_headsr;   rf   c                   s   t  jd	i | || dkrtd| d| d|| _t|| | _| j| j | _|j}t|t	j
jr7|n||f| _tjj| jt|j|jdd| _tjj| jt|j|jdd| _tjj| jt|j|jdd| _tj|j| _d S )
Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()query)kernel_initializerr   ri   keyvaluer&   )rn   ro   r   num_attention_headsr:   attention_head_sizeall_head_sizer9   r   r   r   r   r   rv   r   r   initializer_rangeqkv_biasr   r   r   rx   attention_probs_dropout_probrm   )r3   rd   r   r   rz   r9   r{   r&   r'   ro     s<   zTFSwinSelfAttention.__init__r_   r}   c           	      C  s  | j d| jd  d d| jd  d  | jfddd| _| j | jd d | jd d fdtjdd	| _t| jd }t| jd }ttj	||d
d}t
|t|d df}|d d d d d f |d d d d d f  }t|d}tj|dd\}}|| jd d 7 }|d| jd  d 9 }|| jd d 7 }tj||gdd}| jttj|ddtj | jrd S d| _t| dd d urt| jj | jd d | jg W d    n1 sw   Y  t| dd d urt| jj | jd d | jg W d    n	1 sw   Y  t| dd d ur?t| jj | jd d | jg W d    d S 1 s8w   Y  d S d S )Nr=   r   r   r~   relative_position_bias_tabler   Frelative_position_index)rJ   	trainabler   ri   ij)indexingr@   )r   r=   r   axisTr   r   r   )r   r9   r   r   rA   rL   r   rangestackmeshgridrB   r   rC   unstackassignrK   
reduce_sumr   r   r   r   ri   r   r   r   r   )	r3   r_   coords_hcoords_wcoordscoords_flattenrelative_coordsstack_0stack_1r&   r&   r'   r     sN   (, $zTFSwinSelfAttention.buildrO   r8   c                 C  s4   t |d d | j| jg }t||}t|dS )Nr@   r   r=   r   r   )r   r   r   rA   rB   rC   )r3   rO   new_x_shaper&   r&   r'   transpose_for_scores6  s   z(TFSwinSelfAttention.transpose_for_scoresNFr   attention_maskr   	head_maskoutput_attentionsrW   rV   tuple[tf.Tensor, ...]c                 C  s  t |\}}}| |}	| | |}
| | |}| |	}t|t|
d}|t	| j
 }t| jt| jd}t|| jd | jd  | jd | jd  df}t|d}|t|d }|d urt |d }t||| || j||f}t|d}t|d}|| }t|d| j||f}tjj|dd}| j||d}|d ur|| }t||}t|d	}t |d d
 | jg }t||}|r||f}|S |f}|S )N)r   r   r   r=   r@   r   r   r@   )r=   r   r   r   r   r   )r   r   r   r   r   rA   matmulrC   rM   sqrtr   gatherr   rB   r   r9   r   r   nnsoftmaxrm   r   )r3   r   r   r   r   rV   rD   r   r   mixed_query_layer	key_layervalue_layerquery_layerattention_scoresrelative_position_bias
mask_shapeattention_probscontext_layernew_context_layer_shapeoutputsr&   r&   r'   r   ;  sN   

(zTFSwinSelfAttention.callrd   r   r   r:   r   r:   r;   rf   r   rO   r8   r;   r8   NNFF)r   r8   r   r   r   r   r   rW   rV   rW   r;   r   )r!   r"   r#   ro   r   r   r   r   r&   r&   r{   r'   r     s    
$
*r   c                      s4   e Zd Zd fddZddddZdddZ  ZS )TFSwinSelfOutputrd   r   r   r:   r;   rf   c                   sB   t  jdi | tjj|dd| _tjj|jdd| _|| _	d S Ndenserh   rm   r&   )
rn   ro   r   rv   r   r	  rx   r   rm   r   r3   rd   r   rz   r{   r&   r'   ro   |  s   
zTFSwinSelfOutput.__init__Fr   r8   input_tensorrV   rW   c                 C  s   |  |}| j||d}|S Nr   r	  rm   )r3   r   r  rV   r&   r&   r'   r        
zTFSwinSelfOutput.callNc                 C  s   | j rd S d| _ t| dd d ur1t| jj | jd d | jg W d    n1 s,w   Y  t| dd d urYt| jj | jd  W d    d S 1 sRw   Y  d S d S )NTr	  rm   )	r   r   rA   r   r	  ri   r   r   rm   r   r&   r&   r'   r     s   "zTFSwinSelfOutput.buildrd   r   r   r:   r;   rf   r   )r   r8   r  r8   rV   rW   r;   r8   r   r!   r"   r#   ro   r   r   r   r&   r&   r{   r'   r  {  s    r  c                      sD   e Zd Zd fdd	Zd
d Z				ddddZdddZ  ZS )TFSwinAttentionrd   r   r   r:   r   r;   rf   c                   s@   t  jdi | t|||dd| _t||dd| _t | _d S )Nr3   rh   outputr&   )rn   ro   r   r3   r  self_outputsetpruned_heads)r3   rd   r   r   rz   r{   r&   r'   ro     s   zTFSwinAttention.__init__c                 C  s   t )z
        Prunes heads of the model. See base class PreTrainedModel heads: dict of {layer_num: list of heads to prune in
        this layer}
        )NotImplementedError)r3   headsr&   r&   r'   prune_heads  s   zTFSwinAttention.prune_headsNFr   r8   r   r   r   r   rW   rV   c           	      C  s>   | j |||||d}| j|d ||d}|f|dd   }|S )Nr   r   r   )r3   r  )	r3   r   r   r   r   rV   self_outputsattention_outputr  r&   r&   r'   r     s   zTFSwinAttention.callc                 C     | j rd S d| _ t| dd d ur-t| jj | jd  W d    n1 s(w   Y  t| dd d urUt| jj | jd  W d    d S 1 sNw   Y  d S d S )NTr3   r  )r   r   rA   r   r3   ri   r   r  r   r&   r&   r'   r        "zTFSwinAttention.buildr  r  )r   r8   r   r   r   r   r   rW   rV   rW   r;   r8   r   )r!   r"   r#   ro   r  r   r   r   r&   r&   r{   r'   r    s    
r  c                      s2   e Zd Zd fddZdddZdddZ  ZS )TFSwinIntermediaterd   r   r   r:   r;   rf   c                   sZ   t  jdi | tjjt|j| dd| _t|j	t
r$t|j	 | _n|j	| _|| _d S )Nr	  rh   r&   )rn   ro   r   rv   r   r:   	mlp_ratior	  r   
hidden_actstrr	   intermediate_act_fnr   r
  r{   r&   r'   ro     s   
zTFSwinIntermediate.__init__r   r8   c                 C  s   |  |}| |}|S r   )r	  r!  )r3   r   r&   r&   r'   r     s   

zTFSwinIntermediate.callNc                 C  sl   | j rd S d| _ t| dd d ur4t| jj | jd d | jg W d    d S 1 s-w   Y  d S d S NTr	  )r   r   rA   r   r	  ri   r   r   r   r&   r&   r'   r     s   "zTFSwinIntermediate.buildr  )r   r8   r;   r8   r   r  r&   r&   r{   r'   r    s    
	r  c                      s4   e Zd Zd fddZddddZdddZ  ZS )TFSwinOutputrd   r   r   r:   r;   rf   c                   sF   t  jdi | tjj|dd| _tj|jd| _|| _	|| _
d S r  )rn   ro   r   rv   r   r	  rx   ry   rm   rd   r   r
  r{   r&   r'   ro     s
   
zTFSwinOutput.__init__Fr   r8   rV   rW   c                 C  s   |  |}| j||d}|S r  r  )r3   r   rV   r&   r&   r'   r     r  zTFSwinOutput.callNc                 C  sx   | j rd S d| _ t| dd d ur:t| jj | jd d t| jj	| j
 g W d    d S 1 s3w   Y  d S d S r"  )r   r   rA   r   r	  ri   r   r:   rd   r  r   r   r&   r&   r'   r     s   ""zTFSwinOutput.buildr  r   )r   r8   rV   rW   r;   r8   r   r  r&   r&   r{   r'   r#    s    r#  c                      sT   e Zd Z		d$d% fddZd&ddZd'ddZ			d(d)d d!Zd*d"d#Z  ZS )+TFSwinLayerrR   r   r   r   r   r:   drop_path_raterU   
shift_sizer;   rf   c           	        s   t  jdi | |j| _t|}||jkr|n|j| _|| jkr$dn|| _|| _tj	j
|jdd| _t|||dd| _|dkrGt|ddntj	jddd| _tj	j
|jd	d| _t||d
d| _t||dd| _|| _d S )Nr   layernorm_beforer   	attentionrh   rR   rb   linearlayernorm_afterintermediater  r&   )rn   ro   chunk_size_feed_forwardrA   
reduce_minr9   r&  r   r   rv   rw   layer_norm_epsr'  r  r(  r   
Activationrb   r*  r  r+  r#  swin_outputr   )	r3   rd   r   r   r   r%  r&  rz   min_resr{   r&   r'   ro     s    


zTFSwinLayer.__init__rE   rF   r9   r   c              	   C  sz  t ||f}d| f| | f| dff}d| f| | f| dff}|dkrd}|D ]T}	|D ]O}
t |	d | |	d | d }t |
d | |
d | d }t t jt ||ddd}t|dkr|t jt|f|jd| }t 	|||}|d7 }q1q-t 
|d}t 
|d}t||}t |d|| f}t 
|dt 
|d }t |dkd|}t |dkd	|}|S )
Nr   r@   r   r   )r@   r=   )r   r=   g      YrR   )rA   r~   r   rB   r   r   rZ   onesr   tensor_scatter_nd_updater   rI   r]   )r3   rE   rF   r9   r&  img_maskheight_sliceswidth_slicescountheight_slicewidth_sliceheight_inds
width_indsindicesupdatesmask_windows	attn_maskr&   r&   r'   get_attn_mask  s.     
	
zTFSwinLayer.get_attn_maskr   r8   tuple[tf.Tensor, tf.Tensor]c                 C  s\   |||  | }|||  | }ddgd|gd|gddgg}t ||}t |d}||fS )Nr   r   )rA   r   rB   )r3   r   r9   rE   rF   	pad_right
pad_bottomr   r&   r&   r'   r   "  s   zTFSwinLayer.maybe_padNFr   r   r   rW   rV   c                 C  s  t |}|| jkrdn| j}|| jkr|n| j}|\}	}
t|\}}}|}| j||d}t |||	|
|f}| |||	|
\}}t|\}}}}|dkrZt j|| | fdd}n|}t	||}t |d|| |f}| j
||||d}| j|||||d}|d }t |d|||f}t||||}|dkrt j|||fdd}n|}|d dkp|d	 dk}|r|d d d |	d |
d d f }t |||	|
 |f}|| j||d }| j||d}| |}|| j||d }|r||d
 f}|S |f}|S )Nr   r   )r   r=   )shiftr   r@   )rE   rF   r9   r&  )r   rV   r   r?   r   )rA   r-  r9   r&  r   r'  rB   r   rollrI   r@  r(  rQ   rb   r*  r+  r0  )r3   r   r   r   r   rV   r1  r&  r9   rE   rF   rD   r   r   shortcutr   
height_pad	width_padshifted_hidden_stateshidden_states_windowsr?  attention_outputsr  attention_windowsshifted_windows
was_paddedlayer_outputlayer_outputsr&   r&   r'   r   ,  sN   
	

 
zTFSwinLayer.callc                 C  s  | j rd S d| _ t| dd d ur1t| jj | jd d | jg W d    n1 s,w   Y  t| dd d urVt| jj | jd  W d    n1 sQw   Y  t| dd d ur{t| j	j | j	d  W d    n1 svw   Y  t| dd d urt| j
j | j
d d | jg W d    n1 sw   Y  t| dd d urt| jj | jd  W d    n1 sw   Y  t| dd d urt| jj | jd  W d    d S 1 sw   Y  d S d S )NTr'  r(  rb   r*  r+  r0  )r   r   rA   r   r'  ri   r   r   r(  rb   r*  r+  r0  r   r&   r&   r'   r   n  s8   "zTFSwinLayer.build)rR   r   )
r   r   r   r:   r%  rU   r&  r:   r;   rf   )
rE   r:   rF   r:   r9   r:   r&  r:   r;   r   )
r   r8   r9   r:   rE   r:   rF   r:   r;   rA  NFF)r   r8   r   r   r   r   r   rW   rV   rW   r;   r8   r   )	r!   r"   r#   ro   r@  r   r   r   r   r&   r&   r{   r'   r$    s    

Br$  c                      s:   e Zd Zd! fddZ			d"d#ddZd$dd Z  ZS )%TFSwinStagerd   r   r   r:   r   r   depthr   rb   list[float]
downsampler   r;   rf   c           	        sv   t  jdi |  | _| _ fddt|D | _|d ur3|ttjj	dddd| _
nd | _
d| _d S )	Nc                   sB   g | ]}t  |d  dkrdn jd  | d| dqS )r=   r   zblocks.)rd   r   r   r   r&  r%  ri   )r$  r9   ).0ird   r   rb   r   r   r&   r'   
<listcomp>  s    
z(TFSwinStage.__init__.<locals>.<listcomp>rk   )rl   rU  )r   r   ri   Fr&   )rn   ro   rd   r   r   blocksr   r   rv   rw   rU  pointing)	r3   rd   r   r   rS  r   rb   rU  rz   r{   rX  r'   ro     s   


zTFSwinStage.__init__NFr   r8   r   r   r   r   r   rV   rW   r   c                 C  s   |\}}t | jD ]\}}	|d ur|| nd }
|	|||
||d}|d }q	| jd urH|d d |d d }}||||f}| j|d ||d}n||||f}||f}|r\||dd  7 }|S )Nr   r   r   r=   )	enumeraterZ  rU  )r3   r   r   r   r   rV   rE   rF   rW  layer_modulelayer_head_maskrP  height_downsampledwidth_downsampledr   stage_outputsr&   r&   r'   r     s    


zTFSwinStage.callc              	   C  s   | j rd S d| _ t| dd d ur-t| jj | jd  W d    n1 s(w   Y  t| dd d urV| jD ]}t|j |d  W d    n1 sPw   Y  q8d S d S )NTrU  rZ  )r   r   rA   r   rU  ri   r   rZ  r3   r_   layerr&   r&   r'   r     s   
zTFSwinStage.build)rd   r   r   r:   r   r   rS  r:   r   r:   rb   rT  rU  r   r;   rf   rQ  )r   r8   r   r   r   r   r   r   rV   rW   r;   r   r   r  r&   r&   r{   r'   rR    s    ,rR  c                      s>   e Zd Zd fddZ						ddddZdddZ  ZS )TFSwinEncoderrd   r   rr   r   c                   sp   t  jdi | t j_ _ttddt	 j j
   fddtjD _d_d S )Nr   r   c                   s   g | ]I}t  t jd |  d d |  d d |  f j|  j| t jd| t jd|d   |jd k rCtndd| dqS )r=   r   r   Nzlayers.)rd   r   r   rS  r   rb   rU  ri   )rR  r:   rt   depthsr   sum
num_layersr   )rV  i_layerrd   dprrr   r3   r&   r'   rY    s    *z*TFSwinEncoder.__init__.<locals>.<listcomp>Fr&   )rn   ro   rZ   re  rg  rd   listrA   linspacerf  r%  numpyr   rv   gradient_checkpointing)r3   rd   rr   rz   r{   ri  r'   ro     s   "
zTFSwinEncoder.__init__NFTr   r8   r   r   r   r   rW   output_hidden_statesreturn_dictrV   r;   +tuple[tf.Tensor, ...] | TFSwinEncoderOutputc                 C  s`  d}|rdnd }	|rdnd }
|rdnd }|r9t |\}}}t||g||R }t|d}|	|f7 }	|
|f7 }
t| jD ][\}}|d urJ|| nd }||||||d}|d }|d }|d |d f}||f7 }|rt |\}}}t||g||R }t|d}|	|f7 }	|
|f7 }
|r||dd  7 }q>|std	d
 ||	|fD S t||	||
dS )Nr&   r   r   r   r   r   r@   r=   c                 s  s    | ]	}|d ur|V  qd S r   r&   )rV  vr&   r&   r'   	<genexpr>!  s    z%TFSwinEncoder.call.<locals>.<genexpr>)r   r   r   r    )r   rA   rB   rC   r\  rv   tupler   )r3   r   r   r   r   ro  rp  rV   all_input_dimensionsall_hidden_statesall_reshaped_hidden_statesall_self_attentionsrD   r   r   reshaped_hidden_staterW  r]  r^  rP  r   r&   r&   r'   r     sH   






zTFSwinEncoder.callc              	   C  sj   | j rd S d| _ t| dd d ur1| jD ]}t|j |d  W d    n1 s+w   Y  qd S d S )NTrv   )r   r   rv   rA   r   ri   r   rb  r&   r&   r'   r   *  s   
zTFSwinEncoder.build)rd   r   rr   r   )NFFTF)r   r8   r   r   r   r   r   rW   ro  rW   rp  rW   rV   rW   r;   rq  r   r  r&   r&   r{   r'   rd    s    9rd  c                   @  s   e Zd ZdZeZdZdZdS )TFSwinPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    swinr   N)r!   r"   r#   r$   r   config_classbase_model_prefixmain_input_namer&   r&   r&   r'   rz  4  s
    rz  a`  
    This model is a Tensorflow
    [keras.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Layer) sub-class. Use it as a
    regular Tensorflow Module and refer to the Tensorflow documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`SwinConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a:  
    Args:
        pixel_values (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`ViTImageProcessor.__call__`]
            for details.
        head_mask (`tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
r   r   c                 C  s6   | du r	t j } |  }|dvrtdt|  |S )z
    From tensorflow addons
    https://github.com/tensorflow/addons/blob/8cec33fcaaf1cf90aec7bdd55a0fcdbb251ce5c2/tensorflow_addons/utils/keras_utils.py#L71
    N>   channels_lastchannels_firstzWThe `data_format` argument must be one of "channels_first", "channels_last". Received: )r   backendimage_data_formatlowerr   r   )r   data_formatr&   r&   r'   normalize_data_formata  s   

r  c                      sL   e Zd ZdZejdfd fddZdddZdddZd fddZ	  Z
S )AdaptiveAveragePooling1Da|  
    Args:
    Average 1D Pooling with adaptive kernel size.
      output_size: An integer or tuple/list of a single integer, specifying pooled_features.
        The new size of output channels.
      data_format: A string,
        one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs.
        `channels_last` corresponds to inputs with shape `(batch, steps, channels)` while `channels_first` corresponds
        to inputs with shape `(batch, channels, steps)`.
    Input shape:
      - If `data_format='channels_last'`: 3D tensor with shape `(batch, steps, channels)`.
      - If `data_format='channels_first'`: 3D tensor with shape `(batch, channels, steps)`.
    Output shape:
      - If `data_format='channels_last'`: 3D tensor with shape `(batch_size, pooled_steps, channels)`.
      - If `data_format='channels_first'`: 3D tensor with shape `(batch_size, channels, pooled_steps)`.

    Adapted from [tensorflow-addon's adaptive pooling.py](
        https://github.com/tensorflow/addons/blob/8cec33fcaaf1cf90aec7bdd55a0fcdbb251ce5c2/tensorflow_addons/layers/adaptive_pooling.py#L90-L120
    )
    Noutput_sizeint | Iterable[int]reduce_functionr   r  
str | Noner;   rf   c                   s@   t || _|| _t|tr|fnt|| _t jdi | d S r   )	r  r  r  r   r:   rt  r  rn   ro   )r3   r  r  r  rz   r{   r&   r'   ro     s   
z!AdaptiveAveragePooling1D.__init__inputsr8   c                 G  st   | j d }| jdkr"tj||dd}tj|dd}| j|dd}|S tj||dd}tj|dd}| j|dd}|S )Nr   r  r   r   r=   r   )r  r  rA   splitr   r  )r3   r  argsbinssplitsout_vectr&   r&   r'   r     s   

zAdaptiveAveragePooling1D.callr_   Iterable[int]r}   c                 C  s\   t | }| jdkrt |d | jd |d g}|S t |d |d | jd g}|S )Nr  r   r=   r   )rA   TensorShapeas_listr  r  )r3   r_   rJ   r&   r&   r'   compute_output_shape  s   
z-AdaptiveAveragePooling1D.compute_output_shapedict[str, Any]c                   s$   | j | jd}t  }i ||S )N)r  r  )r  r  rn   
get_config)r3   rd   base_configr{   r&   r'   r    s
   
z#AdaptiveAveragePooling1D.get_config)r  r  r  r   r  r  r;   rf   )r  r8   r;   rf   )r_   r  r;   r}   )r;   r  )r!   r"   r#   r$   rA   reduce_meanro   r   r  r  r   r&   r&   r{   r'   r  p  s    

r  c                      sl   e Zd ZeZ	d&d' fd
dZd(ddZd)ddZd*ddZe								d+d,d"d#Z
d-d$d%Z  ZS ).TFSwinMainLayerTFrd   r   add_pooling_layerrW   re   r;   rf   c                   s   t  jdi | || _t|j| _t|jd| jd   | _t	||dd| _
t|| j
jdd| _tjj|jdd| _|rEtd	d
| _d S d | _d S )Nr=   r   r   )re   ri   encoderrh   	layernormr   r   )r  r&   )rn   ro   rd   rZ   re  rg  r:   rt   num_featuresrc   r   rd  rs   r  r   rv   rw   r.  r  r  poolerr3   rd   r  re   rz   r{   r&   r'   ro     s   zTFSwinMainLayer.__init__rp   c                 C  s   | j jS r   )r   rg   r2   r&   r&   r'   get_input_embeddings  s   z$TFSwinMainLayer.get_input_embeddingsheads_to_prunedict[int, list]c                 C  s*   |  D ]\}}| jj| j| qdS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr  rc  r(  r  )r3   r  rc  r  r&   r&   r'   _prune_heads  s   zTFSwinMainLayer._prune_headsr   
Any | Nonerk  c                 C  s   |d urt d gt| jj S r   )r  rZ   rd   re  )r3   r   r&   r&   r'   get_head_mask  s   zTFSwinMainLayer.get_head_maskNr   r   r   r   r   ro  rp  rV   )TFSwinModelOutput | tuple[tf.Tensor, ...]c              	   C  s  |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d u r&td| |}| j|||d\}}	| j||	|||||d}
|
d }| j||d}d }| j	d urgt
|\}}}| 	|}t|||f}|su||f|
dd   }|S t|||
j|
j|
jdS )N You have to specify pixel_values)r   rV   r   r   ro  rp  rV   r   r   r   )r   r*   r   r   r    )rd   r   ro  use_return_dictr   r  r   r  r  r  r   rA   rB   r)   r   r   r    )r3   r   r   r   r   ro  rp  rV   embedding_outputr   encoder_outputssequence_outputpooled_outputrD   r   r  r  r&   r&   r'   r     sH   




zTFSwinMainLayer.callc                 C  s   | j rd S d| _ t| dd d ur-t| jj | jd  W d    n1 s(w   Y  t| dd d urRt| jj | jd  W d    n1 sMw   Y  t| dd d ur~t| jj | jd d | j	g W d    d S 1 sww   Y  d S d S )NTr   r  r  )
r   r   rA   r   r   ri   r   r  r  r  r   r&   r&   r'   r     s    "zTFSwinMainLayer.buildTFrd   r   r  rW   re   rW   r;   rf   )r;   rp   )r  r  )r   r  r;   rk  NNNNNNFr   r   r   r   r   r   r   r   ro  r   rp  r   rV   rW   r;   r  r   )r!   r"   r#   r   r|  ro   r  r  r  r   r   r   r   r&   r&   r{   r'   r    s"    


<r  z^The bare Swin Model transformer outputting raw hidden-states without any specific head on top.c                      sd   e Zd Z	dd fd
dZeeeeee	de
de							dd ddZd!ddZ  ZS )"TFSwinModelTFrd   r   r  rW   re   r;   rf   c                   s,   t  j|fi | || _t|dd| _d S )Nr{  rh   )rn   ro   rd   r  r{  r  r{   r&   r'   ro   "  s   zTFSwinModel.__init__vision)
checkpointoutput_typer|  modalityexpected_outputNr   r   r   r   r   r   ro  rp  rV   r  c           	   	   C  sh   |dur|n| j j}|dur|n| j j}|dur|n| j j}|du r&td| j|||||||d}|S )z
        bool_masked_pos (`tf.Tensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        Nr  )r   r   r   r   ro  rp  rV   )rd   r   ro  r  r   r{  )	r3   r   r   r   r   ro  rp  rV   swin_outputsr&   r&   r'   r   )  s    
zTFSwinModel.callc                 C  sd   | j rd S d| _ t| dd d ur0t| jj | jd  W d    d S 1 s)w   Y  d S d S )NTr{  )r   r   rA   r   r{  ri   r   r   r&   r&   r'   r   U  s   "zTFSwinModel.buildr  r  r  r  r   )r!   r"   r#   ro   r   SWIN_INPUTS_DOCSTRINGr   _CHECKPOINT_FOR_DOCr)   _CONFIG_FOR_DOC_EXPECTED_OUTPUT_SHAPEr   r   r   r   r&   r&   r{   r'   r    s*    #r  c                      s,   e Zd ZdZd fddZdd
dZ  ZS )TFSwinPixelShufflez0TF layer implementation of torch.nn.PixelShuffleupscale_factorr:   r;   rf   c                   s<   t  jdi | t|tr|dk rtd| || _d S )Nr=   z1upscale_factor must be an integer value >= 2 got r&   )rn   ro   r   r:   r   r  )r3   r  rz   r{   r&   r'   ro   a  s   
zTFSwinPixelShuffle.__init__rO   r8   c                   s~   |}t |\}}}}| jd  t|  t fddt D g}tj|t||dgdd}tjj	|| jdd}|S )	Nr=   c                   s&   g | ]}t D ]}||   qqS r&   )r   )rV  rW  jblock_size_squaredoutput_depthr&   r'   rY  q  s   & z+TFSwinPixelShuffle.call.<locals>.<listcomp>r   r@   )paramsr<  
batch_dimsNHWC)
block_sizer  )
r   r  r:   rA   constantr   r   tiler   depth_to_space)r3   rO   r   rD   r   num_input_channelspermutationr&   r  r'   r   g  s   
zTFSwinPixelShuffle.call)r  r:   r;   rf   r  r   r&   r&   r{   r'   r  ^  s    r  c                      s2   e Zd Zd fddZddd	ZdddZ  ZS )TFSwinDecoderrd   r   c                   sN   t  jdi | tjj|jd |j dddd| _t|jdd| _	|| _
d S )Nr=   r   0)r   r   r   ri   1rh   r&   )rn   ro   r   rv   r   encoder_striderG   conv2dr  pixel_shufflerd   )r3   rd   rz   r{   r&   r'   ro   y  s   
zTFSwinDecoder.__init__rO   r8   r;   c                 C  s4   |}t |d}| |}| |}t |d}|S )Nr   r   )rA   rC   r  r  )r3   rO   r   r&   r&   r'   r     s   

zTFSwinDecoder.callNc                 C  s   | j rd S d| _ t| dd d ur3t| jj | jd d d | jjg W d    n1 s.w   Y  t| dd d ur[t| j	j | j	d  W d    d S 1 sTw   Y  d S d S )NTr  r  )
r   r   rA   r   r  ri   r   rd   r   r  r   r&   r&   r'   r     s   "zTFSwinDecoder.buildrd   r   r  r   r  r&   r&   r{   r'   r  x  s    

r  z~Swin Model with a decoder on top for masked image modeling, as proposed in [SimMIM](https://huggingface.co/papers/2111.09886).c                      sZ   e Zd Zd fddZeeeeede								ddddZ
dddZ  ZS )TFSwinForMaskedImageModelingrd   r   c                   s0   t  | t|dddd| _t|dd| _d S )NFTr{  )r  re   ri   decoderrh   )rn   ro   r  r{  r  r  r3   rd   r{   r&   r'   ro     s   z%TFSwinForMaskedImageModeling.__init__)r  r|  NFr   r   r   r   r   r   ro  rp  rV   rW   r;   'tuple | TFSwinMaskedImageModelingOutputc              	   C  s  |dur|n| j j}| j|||||||d}|d }	t|	d}	t|	\}
}}t|d  }}t|	|
|||f}	| |	}d}|dur| j j	| j j
 }t|d||f}t|| j j
d}t|| j j
d}t|d}t|tj}tjt|d	t|d	}t|d}t|| }t|d
 | j j }|| }t|d}|s|f|dd  }|dur|f| S |S t|||j|j|jdS )aA  
        bool_masked_pos (`tf.Tensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

        Returns:

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, TFSwinForMaskedImageModeling
        >>> import tensorflow as tf
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/swin-tiny-patch4-window7-224")
        >>> model = TFSwinForMaskedImageModeling.from_pretrained("microsoft/swin-tiny-patch4-window7-224")

        >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
        >>> pixel_values = image_processor(images=image, return_tensors="tf").pixel_values
        >>> # create random boolean mask of shape (batch_size, num_patches)
        >>> bool_masked_pos = tf.random.uniform((1, num_patches)) >= 0.5

        >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
        >>> loss, reconstructed_pixel_values = outputs.loss, outputs.reconstruction
        >>> list(reconstructed_pixel_values.shape)
        [1, 3, 224, 224]
        ```N)r   r   r   ro  rp  rV   r   r   g      ?r@   r   r=   )r   r=   r   r   rk   r  )r-   r.   r   r   r    )rd   r  r{  rA   rC   r   r:   rB   r  r   r   r   r   rK   float32r   lossesmean_absolute_errorr   rG   r,   r   r   r    )r3   r   r   r   r   ro  rp  rV   r  r  rD   rG   sequence_lengthrE   rF   reconstructed_pixel_valuesmasked_im_losssizer   reconstruction_loss
total_lossnum_masked_pixelsr  r&   r&   r'   r     sV   *



z!TFSwinForMaskedImageModeling.callc                 C  r  )NTr{  r  )r   r   rA   r   r{  ri   r   r  r   r&   r&   r'   r     r  z"TFSwinForMaskedImageModeling.buildr  r  )r   r   r   r   r   r   r   r   ro  r   rp  r   rV   rW   r;   r  r   )r!   r"   r#   ro   r   r  r   r,   r  r   r   r   r   r&   r&   r{   r'   r    s    
]r  z
    Swin Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
    the [CLS] token) e.g. for ImageNet.
    c                      s^   e Zd Zd fddZeeeeee	e
de							ddddZdddZ  ZS )TFSwinForImageClassificationrd   r   c                   sZ   t  | |j| _t|dd| _|jdkr"tjj|jdd| _	d S tjjddd| _	d S )Nr{  rh   r   
classifierr)  )
rn   ro   
num_labelsr  r{  r   rv   r   r/  r  r  r{   r&   r'   ro     s   
z%TFSwinForImageClassification.__init__)r  r  r|  r  NFr   r   r   labelsr   r   ro  rp  rV   rW   r;   3tuple[tf.Tensor, ...] | TFSwinImageClassifierOutputc                 C  s   |dur|n| j j}| j||||||d}|d }	| j|	|d}
|du r&dn| ||
}|sB|
f|dd  }|dur@|f| S |S t||
|j|j|jdS )a  
        labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr  r   r   r=   )r-   r4   r   r   r    )	rd   r  r{  r  hf_compute_lossr6   r   r   r    )r3   r   r   r  r   ro  rp  rV   r  r  r4   r-   r  r&   r&   r'   r   %  s,   	z!TFSwinForImageClassification.callc                 C  s   | j rd S d| _ t| dd d ur-t| jj | jd  W d    n1 s(w   Y  t| dd d ur`t| jdrbt| jj | jd d | jj	g W d    d S 1 sYw   Y  d S d S d S )NTr{  r  ri   )
r   r   rA   r   r{  ri   r   hasattrr  r  r   r&   r&   r'   r   Z  s   "z"TFSwinForImageClassification.buildr  r  )r   r   r   r   r  r   r   r   ro  r   rp  r   rV   rW   r;   r  r   )r!   r"   r#   ro   r   r  r   _IMAGE_CLASS_CHECKPOINTr6   r  _IMAGE_CLASS_EXPECTED_OUTPUTr   r   r   r   r&   r&   r{   r'   r    s&    -r  )r  r  r  rz  )r7   r8   r9   r:   r;   r8   )
rH   r8   r9   r:   rE   r:   rF   r:   r;   r8   )rR   FT)
rS   r8   rT   rU   rV   rW   rX   rW   r;   r8   )r   r   r;   r   )Nr$   
__future__r   collections.abcr   rM   r/   r   dataclassesr   	functoolsr   typingr   r   
tensorflowrA   activations_tfr	   modeling_tf_utilsr
   r   r   r   r   r   tf_utilsr   utilsr   r   r   r   r   r   configuration_swinr   
get_loggerr!   loggerr  r  r  r  r  r   r)   r,   r6   rI   rQ   rb   rv   Layerrc   rp   r   r   r   r  r  r  r#  r$  rR  rd  rz  SWIN_START_DOCSTRINGr  r  r  r  r  r  r  r  r  __all__r&   r&   r&   r'   <module>   s     
 #,
#
DDI ' UY
?m=tP