o
    i                     @  s  d Z ddlmZ ddlZddlZddlmZ ddlZ	ddl
mZ ddlmZmZmZmZ ddlmZmZmZmZmZmZ dd	lmZmZ dd
lmZmZmZmZm Z m!Z! ddl"m#Z# e $e%Z&dZ'dZ(g dZ)dZ*dZ+eG dd deZ,G dd dej-j.Z/G dd dej-j.Z0G dd dej-j.Z1G dd dej-j.Z2G dd dej-j.Z3G dd dej-j.Z4G dd  d ej-j.Z5G d!d" d"ej-j.Z6G d#d$ d$ej-j.Z7eG d%d& d&ej-j.Z8G d'd( d(eZ9d)Z:d*Z;ed+e:G d,d- d-e9Z<G d.d/ d/ej-j.Z=G d0d1 d1ej-j.Z>G d2d3 d3ej-j.Z?ed4e:G d5d6 d6e9Z@ed7e:G d8d9 d9e9eZAed:e:G d;d< d<e9ZBg d=ZCdS )>zTensorFlow DeiT model.    )annotationsN)	dataclass   )get_tf_activation)TFBaseModelOutputTFBaseModelOutputWithPoolingTFImageClassifierOutputTFMaskedImageModelingOutput)TFPreTrainedModelTFSequenceClassificationLossget_initializerkeraskeras_serializableunpack_inputs)
shape_liststable_softmax)ModelOutputadd_code_sample_docstringsadd_start_docstrings%add_start_docstrings_to_model_forwardloggingreplace_return_docstrings   )
DeiTConfigr   z(facebook/deit-base-distilled-patch16-224)r      i   ztabby, tabby catc                   @  sN   e Zd ZU dZdZded< dZded< dZded< dZded< dZ	ded	< dS )
-TFDeiTForImageClassificationWithTeacherOutputa  
    Output type of [`DeiTForImageClassificationWithTeacher`].

    Args:
        logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
            Prediction scores as the average of the cls_logits and distillation logits.
        cls_logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
            Prediction scores of the classification head (i.e. the linear layer on top of the final hidden state of the
            class token).
        distillation_logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
            Prediction scores of the distillation head (i.e. the linear layer on top of the final hidden state of the
            distillation token).
        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
            `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus
            the initial embedding outputs.
        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
            the self-attention heads.
    Ntf.Tensor | Nonelogits
cls_logitsdistillation_logitsztuple[tf.Tensor] | Nonehidden_states
attentions)
__name__
__module____qualname____doc__r   __annotations__r   r   r    r!    r'   r'   ]/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/deit/modeling_tf_deit.pyr   B   s   
 r   c                      sJ   e Zd ZdZdd fd	d
ZdddZdddZ			d d!ddZ  ZS )"TFDeiTEmbeddingszv
    Construct the CLS token, distillation token, position and patch embeddings. Optionally, also the mask token.
    Fconfigr   use_mask_tokenboolreturnNonec                   sD   t  jdi | || _|| _t|dd| _tjj|j	dd| _
d S )Npatch_embeddings)r*   namedropoutr0   r'   )super__init__r*   r+   TFDeiTPatchEmbeddingsr/   r   layersDropouthidden_dropout_probr1   )selfr*   r+   kwargs	__class__r'   r(   r4   f   s
   zTFDeiTEmbeddings.__init__Nc                 C  sV  | j dd| jjftj ddd| _| j dd| jjftj ddd| _d | _| j	r<| j dd| jjftj ddd| _| j
j}| j d|d | jjftj ddd| _| jrYd S d| _t| d	d d urt| j
j | j
d  W d    n1 s|w   Y  t| d
d d urt| jj | jd  W d    d S 1 sw   Y  d S d S )Nr   T	cls_token)shapeinitializer	trainabler0   distillation_token
mask_token   position_embeddingsr/   r1   )
add_weightr*   hidden_sizer   initializerszerosr=   rA   rB   r+   r/   num_patchesrD   builtgetattrtf
name_scoper0   buildr1   )r9   input_shaperI   r'   r'   r(   rN   m   sN   "zTFDeiTEmbeddings.build
embeddings	tf.Tensorheightintwidthc              	   C  sF  |j d d }| jj d d }||kr||kr| jS | jd d dd d f }| jd d dd d f }| jd d dd d d f }|j d }	|| jj }
|| jj }|
d |d }
}t|dtt|tt||	f}tj	j
|t|
t|fdd}tj|g dd	}t|dd|	f}tjtj|dd
tj|dd
|gdd
S )Nr   rC   r   g?bicubic)sizemethodr   rC   r   r   permaxis)r>   rD   r*   
patch_sizerL   reshaperS   mathsqrtimageresize	transposeconcatexpand_dims)r9   rP   rR   rT   rI   num_positionsclass_pos_embeddist_pos_embedpatch_pos_embeddimh0w0r'   r'   r(   interpolate_pos_encoding   s(   
 z)TFDeiTEmbeddings.interpolate_pos_encodingpixel_valuesbool_masked_posr   trainingrn   c                 C  s   |j \}}}}| |}t|\}	}
}|d ur:t| j|	|
dg}tj|dd}tj||jd}|d|  ||  }tj	| j
|	dd}tj	| j|	dd}tj|||fdd}| j}|rb| |||}|| }| j||d}|S )	Nr   rU   r\   dtypeg      ?r   )repeatsr]   rq   )r>   r/   r   rL   tilerB   rf   castrs   repeatr=   rA   re   rD   rn   r1   )r9   ro   rp   rq   rn   _rR   rT   rP   
batch_size
seq_lengthmask_tokensmask
cls_tokensdistillation_tokensposition_embeddingr'   r'   r(   call   s"   
zTFDeiTEmbeddings.callF)r*   r   r+   r,   r-   r.   N)rP   rQ   rR   rS   rT   rS   r-   rQ   )NFF)
ro   rQ   rp   r   rq   r,   rn   r,   r-   rQ   )	r"   r#   r$   r%   r4   rN   rn   r   __classcell__r'   r'   r;   r(   r)   a   s    

'r)   c                      s6   e Zd ZdZd fddZdd
dZdddZ  ZS )r5   z
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    r*   r   r-   r.   c                   s   t  jdi | |j|j}}|j|j}}t|tjj	r |n||f}t|tjj	r-|n||f}|d |d  |d |d   }|| _|| _|| _|| _
tjj|||dd| _d S )Nr   r   
projection)kernel_sizestridesr0   r'   )r3   r4   
image_sizer^   num_channelsrF   
isinstancecollectionsabcIterablerI   r   r6   Conv2Dr   )r9   r*   r:   r   r^   r   rF   rI   r;   r'   r(   r4      s    zTFDeiTPatchEmbeddings.__init__ro   rQ   c                 C  s^   t |\}}}}t r|| jkrtd| |}t |\}}}}t|||| |f}|S )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.)r   rL   executing_eagerlyr   
ValueErrorr   r_   )r9   ro   rz   rR   rT   r   xr'   r'   r(   r      s   
zTFDeiTPatchEmbeddings.callNc                 C  sn   | j rd S d| _ t| dd d ur5t| jj | jd d d | jg W d    d S 1 s.w   Y  d S d S )NTr   )rJ   rK   rL   rM   r   r0   rN   r   r9   rO   r'   r'   r(   rN         "zTFDeiTPatchEmbeddings.buildr*   r   r-   r.   )ro   rQ   r-   rQ   r   r"   r#   r$   r%   r4   r   rN   r   r'   r'   r;   r(   r5      s
    
r5   c                      s@   e Zd Zd fddZdd
dZ	ddddZdddZ  ZS )TFDeiTSelfAttentionr*   r   c                   s   t  jd
i | |j|j dkrtd|j d|j d|j| _t|j|j | _| j| j | _t	| j| _
tjj| jt|jdd| _tjj| jt|jdd| _tjj| jt|jdd| _tjj|jd	| _|| _d S )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()queryunitskernel_initializerr0   keyvaluerater'   )r3   r4   rF   num_attention_headsr   rS   attention_head_sizeall_head_sizer`   ra   sqrt_att_head_sizer   r6   Denser   initializer_ranger   r   r   r7   attention_probs_dropout_probr1   r*   r9   r*   r:   r;   r'   r(   r4      s,   

zTFDeiTSelfAttention.__init__tensorrQ   rz   rS   r-   c                 C  s,   t j||d| j| jfd}t j|g ddS )NrU   r   r>   r   rC   r   r   rZ   )rL   r_   r   r   rd   )r9   r   rz   r'   r'   r(   transpose_for_scores  s   z(TFDeiTSelfAttention.transpose_for_scoresFr    	head_maskoutput_attentionsr,   rq   tuple[tf.Tensor]c                 C  s   t |d }| j|d}| j|d}| j|d}| ||}	| ||}
| ||}tj|	|
dd}tj| j|j	d}t
||}t|dd}| j||d}|d urXt||}t||}tj|g d	d
}tj||d| jfd}|rz||f}|S |f}|S )Nr   inputsT)transpose_brr   rU   )r   r]   r   rq   r   rZ   r   )r   r   r   r   r   rL   matmulrw   r   rs   divider   r1   multiplyrd   r_   r   )r9   r    r   r   rq   rz   mixed_query_layermixed_key_layermixed_value_layerquery_layer	key_layervalue_layerattention_scoresdkattention_probsattention_outputoutputsr'   r'   r(   r     s*   zTFDeiTSelfAttention.callNc                 C  s  | j rd S d| _ t| dd d ur2t| jj | jd d | jjg W d    n1 s-w   Y  t| dd d ur\t| j	j | j	d d | jjg W d    n1 sWw   Y  t| dd d urt| j
j | j
d d | jjg W d    d S 1 sw   Y  d S d S )NTr   r   r   )rJ   rK   rL   rM   r   r0   rN   r*   rF   r   r   r   r'   r'   r(   rN   G  s    "zTFDeiTSelfAttention.buildr*   r   )r   rQ   rz   rS   r-   rQ   r   
r    rQ   r   rQ   r   r,   rq   r,   r-   r   r   )r"   r#   r$   r4   r   r   rN   r   r'   r'   r;   r(   r      s    
)r   c                      s8   e Zd ZdZd fddZddddZdddZ  ZS )TFDeiTSelfOutputz
    The residual connection is defined in TFDeiTLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    r*   r   c                   J   t  jdi | tjj|jt|jdd| _tjj	|j
d| _|| _d S Ndenser   r   r'   r3   r4   r   r6   r   rF   r   r   r   r7   r8   r1   r*   r   r;   r'   r(   r4   ]     
zTFDeiTSelfOutput.__init__Fr    rQ   input_tensorrq   r,   r-   c                 C  s   | j |d}| j||d}|S Nr   r   r   r1   r9   r    r   rq   r'   r'   r(   r   f  s   zTFDeiTSelfOutput.callNc                 C  n   | j rd S d| _ t| dd d ur5t| jj | jd d | jjg W d    d S 1 s.w   Y  d S d S NTr   	rJ   rK   rL   rM   r   r0   rN   r*   rF   r   r'   r'   r(   rN   l  r   zTFDeiTSelfOutput.buildr   r   r    rQ   r   rQ   rq   r,   r-   rQ   r   r   r'   r'   r;   r(   r   W  s
    	r   c                      s>   e Zd Zd fddZdd Z	ddddZdddZ  ZS )TFDeiTAttentionr*   r   c                   s2   t  jdi | t|dd| _t|dd| _d S )N	attentionr2   outputr'   )r3   r4   r   self_attentionr   dense_outputr   r;   r'   r(   r4   w  s   zTFDeiTAttention.__init__c                 C     t r   NotImplementedError)r9   headsr'   r'   r(   prune_heads}  s   zTFDeiTAttention.prune_headsFr   rQ   r   r   r,   rq   r-   r   c                 C  s<   | j ||||d}| j|d ||d}|f|dd   }|S )Nr    r   r   rq   r   r    r   rq   r   )r   r   )r9   r   r   r   rq   self_outputsr   r   r'   r'   r(   r     s   
zTFDeiTAttention.callNc                 C     | j rd S d| _ t| dd d ur-t| jj | jd  W d    n1 s(w   Y  t| dd d urUt| jj | jd  W d    d S 1 sNw   Y  d S d S )NTr   r   )rJ   rK   rL   rM   r   r0   rN   r   r   r'   r'   r(   rN        "zTFDeiTAttention.buildr   r   )
r   rQ   r   rQ   r   r,   rq   r,   r-   r   r   )r"   r#   r$   r4   r   r   rN   r   r'   r'   r;   r(   r   v  s    r   c                      2   e Zd Zd fddZddd	ZdddZ  ZS )TFDeiTIntermediater*   r   c                   sZ   t  jdi | tjj|jt|jdd| _t	|j
tr$t|j
| _n|j
| _|| _d S )Nr   r   r'   )r3   r4   r   r6   r   intermediate_sizer   r   r   r   
hidden_actstrr   intermediate_act_fnr*   r   r;   r'   r(   r4     s   
zTFDeiTIntermediate.__init__r    rQ   r-   c                 C  s   | j |d}| |}|S )Nr   )r   r   )r9   r    r'   r'   r(   r     s   
zTFDeiTIntermediate.callNc                 C  r   r   r   r   r'   r'   r(   rN     r   zTFDeiTIntermediate.buildr   r    rQ   r-   rQ   r   r"   r#   r$   r4   r   rN   r   r'   r'   r;   r(   r     s    
r   c                      s4   e Zd Zd fddZddddZdddZ  ZS )TFDeiTOutputr*   r   c                   r   r   r   r   r;   r'   r(   r4     r   zTFDeiTOutput.__init__Fr    rQ   r   rq   r,   r-   c                 C  s&   | j |d}| j||d}|| }|S r   r   r   r'   r'   r(   r     s   zTFDeiTOutput.callNc                 C  r   r   )	rJ   rK   rL   rM   r   r0   rN   r*   r   r   r'   r'   r(   rN     r   zTFDeiTOutput.buildr   r   r   r   r   r'   r'   r;   r(   r     s    	r   c                      s:   e Zd ZdZd fddZ	ddddZdddZ  ZS )TFDeiTLayerz?This corresponds to the Block class in the timm implementation.r*   r   c                   sn   t  jdi | t|dd| _t|dd| _t|dd| _tj	j
|jdd| _tj	j
|jdd| _|| _d S )	Nr   r2   intermediater   layernorm_beforeepsilonr0   layernorm_afterr'   )r3   r4   r   r   r   r   r   deit_outputr   r6   LayerNormalizationlayer_norm_epsr   r   r*   r   r;   r'   r(   r4     s   
zTFDeiTLayer.__init__Fr    rQ   r   r   r,   rq   r-   r   c           
      C  sn   | j | j||d|||d}|d }|| }| j||d}| j||d}| j|||d}|f|dd   }	|	S )Nr   )r   r   r   rq   r   )r    rq   r   r   )r   r   r   r   r   )
r9   r    r   r   rq   attention_outputsr   layer_outputintermediate_outputr   r'   r'   r(   r     s   zTFDeiTLayer.callNc                 C  s  | j rd S d| _ t| dd d ur-t| jj | jd  W d    n1 s(w   Y  t| dd d urRt| jj | jd  W d    n1 sMw   Y  t| dd d urwt| jj | jd  W d    n1 srw   Y  t| dd d urt| j	j | j	d d | j
jg W d    n1 sw   Y  t| dd d urt| jj | jd d | j
jg W d    d S 1 sw   Y  d S d S )NTr   r   r   r   r   )rJ   rK   rL   rM   r   r0   rN   r   r   r   r*   rF   r   r   r'   r'   r(   rN     s0   "zTFDeiTLayer.buildr   r   r   r   r   r'   r'   r;   r(   r     s     r   c                      s6   e Zd Zd fddZ	ddddZdddZ  ZS )TFDeiTEncoderr*   r   c                   s0   t  jdi |  fddt jD | _d S )Nc                   s   g | ]}t  d | dqS )zlayer_._r2   )r   ).0ir*   r'   r(   
<listcomp>  s    z*TFDeiTEncoder.__init__.<locals>.<listcomp>r'   )r3   r4   rangenum_hidden_layerslayerr   r;   r   r(   r4     s   zTFDeiTEncoder.__init__Fr    rQ   r   r   r,   output_hidden_statesreturn_dictrq   r-   $TFBaseModelOutput | tuple[tf.Tensor]c                 C  s   |rdnd }|r
dnd }t | jD ]"\}	}
|r||f }|
|||	 ||d}|d }|r3||d f }q|r;||f }|sItdd |||fD S t|||dS )Nr'   r   r   r   c                 s  s    | ]	}|d ur|V  qd S r   r'   )r   vr'   r'   r(   	<genexpr>@  s    z%TFDeiTEncoder.call.<locals>.<genexpr>)last_hidden_stater    r!   )	enumerater   tupler   )r9   r    r   r   r   r   rq   all_hidden_statesall_attentionsr   layer_modulelayer_outputsr'   r'   r(   r      s,   	

zTFDeiTEncoder.callNc              	   C  sj   | j rd S d| _ t| dd d ur1| jD ]}t|j |d  W d    n1 s+w   Y  qd S d S )NTr   )rJ   rK   r   rL   rM   r0   rN   )r9   rO   r   r'   r'   r(   rN   F  s   
zTFDeiTEncoder.buildr   r   )r    rQ   r   rQ   r   r,   r   r,   r   r,   rq   r,   r-   r   r   r   r'   r'   r;   r(   r     s
    &r   c                      sj   e Zd ZeZ	d#d$ fd
dZd%ddZdd Zdd Ze									d&d'dd Z
d(d!d"Z  ZS ))TFDeiTMainLayerTFr*   r   add_pooling_layerr,   r+   r-   r.   c                   sj   t  jdi | || _t||dd| _t|dd| _tjj	|j
dd| _|r0t|dd| _d S d | _d S )	NrP   )r+   r0   encoderr2   	layernormr   poolerr'   )r3   r4   r*   r)   rP   r   r  r   r6   r   r   r  TFDeiTPoolerr  r9   r*   r
  r+   r:   r;   r'   r(   r4   T  s    zTFDeiTMainLayer.__init__r5   c                 C  s   | j jS r   )rP   r/   )r9   r'   r'   r(   get_input_embeddings`  s   z$TFDeiTMainLayer.get_input_embeddingsc                 C  r   )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        r   )r9   heads_to_pruner'   r'   r(   _prune_headsc  s   zTFDeiTMainLayer._prune_headsc                 C  s   |d urt d g| jj }|S r   )r   r*   r   )r9   r   r'   r'   r(   get_head_maskj  s   zTFDeiTMainLayer.get_head_maskNro   r   rp   r   r   bool | Noner   r   rn   rq   4TFBaseModelOutputWithPooling | tuple[tf.Tensor, ...]c	                 C  s   |d ur|n| j j}|d ur|n| j j}|d ur|n| j j}|d u r&tdt|d}| |}| j||||d}	| j	|	|||||d}
|
d }| j
||d}| jd ur\| j||dnd }|ss|d urh||fn|f}||
dd   S t|||
j|
jdS )	Nz You have to specify pixel_valuesrY   )rp   rq   rn   )r   r   r   r   rq   r   ru   r   )r  pooler_outputr    r!   )r*   r   r   use_return_dictr   rL   rd   r  rP   r  r  r  r   r    r!   )r9   ro   rp   r   r   r   r   rn   rq   embedding_outputencoder_outputssequence_outputpooled_outputhead_outputsr'   r'   r(   r   r  sD   
zTFDeiTMainLayer.callc                 C  sL  | j rd S d| _ t| dd d ur-t| jj | jd  W d    n1 s(w   Y  t| dd d urRt| jj | jd  W d    n1 sMw   Y  t| dd d ur|t| jj | jd d | j	j
g W d    n1 sww   Y  t| dd d urt| jj | jd  W d    d S 1 sw   Y  d S d S )NTrP   r  r  r  )rJ   rK   rL   rM   rP   r0   rN   r  r  r*   rF   r  r   r'   r'   r(   rN     s(   "zTFDeiTMainLayer.buildTFr*   r   r
  r,   r+   r,   r-   r.   )r-   r5   NNNNNNFF)ro   r   rp   r   r   r   r   r  r   r  r   r  rn   r,   rq   r,   r-   r  r   )r"   r#   r$   r   config_classr4   r  r  r  r   r   rN   r   r'   r'   r;   r(   r	  P  s$    
=r	  c                   @  s   e Zd ZdZeZdZdZdS )TFDeiTPreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    deitro   N)r"   r#   r$   r%   r   r   base_model_prefixmain_input_namer'   r'   r'   r(   r!    s
    r!  aR  
    This model is a TensorFlow
    [keras.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Layer). Use it as a regular
    TensorFlow Module and refer to the TensorFlow documentation for all matter related to general usage and behavior.

    Parameters:
        config ([`DeiTConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
a  
    Args:
        pixel_values (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`DeiTImageProcessor.__call__`] for details.

        head_mask (`tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
            Whether to interpolate the pre-trained position encodings.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
z^The bare DeiT Model transformer outputting raw hidden-states without any specific head on top.c                	      sf   e Zd Z	dd fd
dZeeeeee	e
ded								d d!ddZd"ddZ  ZS )#TFDeiTModelTFr*   r   r
  r,   r+   r-   r.   c                   s*   t  j|fi | t|||dd| _d S )Nr"  r
  r+   r0   )r3   r4   r	  r"  r  r;   r'   r(   r4     s   zTFDeiTModel.__init__vision)
checkpointoutput_typer   modalityexpected_outputNro   r   rp   r   r   r  r   r   rn   rq   $tuple | TFBaseModelOutputWithPoolingc	           
   
   C  s   | j ||||||||d}	|	S )N)ro   rp   r   r   r   r   rn   rq   )r"  )
r9   ro   rp   r   r   r   r   rn   rq   r   r'   r'   r(   r      s   
zTFDeiTModel.callc                 C  sd   | j rd S d| _ t| dd d ur0t| jj | jd  W d    d S 1 s)w   Y  d S d S )NTr"  )rJ   rK   rL   rM   r"  r0   rN   r   r'   r'   r(   rN      s   "zTFDeiTModel.buildr  r  r  )ro   r   rp   r   r   r   r   r  r   r  r   r  rn   r,   rq   r,   r-   r,  r   )r"   r#   r$   r4   r   r   DEIT_INPUTS_DOCSTRINGr   _CHECKPOINT_FOR_DOCr   _CONFIG_FOR_DOC_EXPECTED_OUTPUT_SHAPEr   rN   r   r'   r'   r;   r(   r%    s,    		r%  c                      r   )r  r*   r   c                   s<   t  jdi | tjj|jt|j|jdd| _	|| _
d S )Nr   )r   r   
activationr0   r'   )r3   r4   r   r6   r   pooler_output_sizer   r   
pooler_actr   r*   r   r;   r'   r(   r4   +  s   
zTFDeiTPooler.__init__r    rQ   r-   c                 C  s    |d d df }| j |d}|S )Nr   r   )r   )r9   r    first_token_tensorr  r'   r'   r(   r   6  s   zTFDeiTPooler.callNc                 C  r   r   r   r   r'   r'   r(   rN   >  r   zTFDeiTPooler.buildr   r   r   r   r'   r'   r;   r(   r  *  s    
r  c                      s,   e Zd ZdZd fddZdd
dZ  ZS )TFDeitPixelShufflez0TF layer implementation of torch.nn.PixelShuffleupscale_factorrS   r-   r.   c                   s<   t  jdi | t|tr|dk rtd| || _d S )NrC   z1upscale_factor must be an integer value >= 2 got r'   )r3   r4   r   rS   r   r6  )r9   r6  r:   r;   r'   r(   r4   J  s   
zTFDeitPixelShuffle.__init__r   rQ   c                   s~   |}t |\}}}}| jd  t|  t fddt D g}tj|t||dgdd}tjj	|| jdd}|S )	NrC   c                   s&   g | ]}t D ]}||   qqS r'   )r   )r   r   jblock_size_squaredoutput_depthr'   r(   r   Z  s   & z+TFDeitPixelShuffle.call.<locals>.<listcomp>r   rU   )paramsindices
batch_dimsNHWC)
block_sizedata_format)
r   r6  rS   rL   constantr   gatherrv   nndepth_to_space)r9   r   r    rz   ry   num_input_channelspermutationr'   r8  r(   r   P  s   
zTFDeitPixelShuffle.call)r6  rS   r-   r.   )r   rQ   r-   rQ   )r"   r#   r$   r%   r4   r   r   r'   r'   r;   r(   r5  G  s    r5  c                      s4   e Zd Zd fddZddddZdddZ  ZS )TFDeitDecoderr*   r   r-   r.   c                   sL   t  jdi | tjj|jd |j ddd| _t|jdd| _	|| _
d S )NrC   r   0)filtersr   r0   1r2   r'   )r3   r4   r   r6   r   encoder_strider   conv2dr5  pixel_shuffler*   r   r;   r'   r(   r4   b  s   
zTFDeitDecoder.__init__Fr   rQ   rq   r,   c                 C  s   |}|  |}| |}|S r   )rL  rM  )r9   r   rq   r    r'   r'   r(   r   j  s   

zTFDeitDecoder.callNc                 C  s   | j rd S d| _ t| dd d ur3t| jj | jd d d | jjg W d    n1 s.w   Y  t| dd d ur[t| j	j | j	d  W d    d S 1 sTw   Y  d S d S )NTrL  rM  )
rJ   rK   rL   rM   rL  r0   rN   r*   rF   rM  r   r'   r'   r(   rN   p  s   "zTFDeitDecoder.buildr   r   )r   rQ   rq   r,   r-   rQ   r   r   r'   r'   r;   r(   rG  a  s    rG  z~DeiT Model with a decoder on top for masked image modeling, as proposed in [SimMIM](https://huggingface.co/papers/2111.09886).c                      s\   e Zd Zd fddZeeeeee	d										ddddZ
dddZ  ZS )TFDeiTForMaskedImageModelingr*   r   r-   r.   c                   s0   t  | t|dddd| _t|dd| _d S )NFTr"  r&  decoderr2   )r3   r4   r	  r"  rG  rO  r9   r*   r;   r'   r(   r4     s   z%TFDeiTForMaskedImageModeling.__init__r)  r   NFro   r   rp   r   r   r  r   r   rn   r,   rq   #tuple | TFMaskedImageModelingOutputc	              
   C  s  |dur|n| j j}| j||||||||d}	|	d }
|
ddddf }
t|
\}}}t|d  }}t|
||||f}
| j|
|d}t|d}d}|dur| j j	| j j
 }t|d||f}t|| j j
d}t|| j j
d	}t|d}t|tj}tjt|d
t|d
}t|d}t|| }t|d | j j }|| }t|d}|s|f|	dd  }|dur|f| S |S t|||	j|	jdS )a  
        bool_masked_pos (`tf.Tensor` of type bool and shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

        Returns:

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, TFDeiTForMaskedImageModeling
        >>> import tensorflow as tf
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("facebook/deit-base-distilled-patch16-224")
        >>> model = TFDeiTForMaskedImageModeling.from_pretrained("facebook/deit-base-distilled-patch16-224")

        >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
        >>> pixel_values = image_processor(images=image, return_tensors="tf").pixel_values
        >>> # create random boolean mask of shape (batch_size, num_patches)
        >>> bool_masked_pos = tf.cast(tf.random.uniform((1, num_patches), minval=0, maxval=2, dtype=tf.int32), tf.bool)

        >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
        >>> loss, reconstructed_pixel_values = outputs.loss, outputs.reconstruction
        >>> list(reconstructed_pixel_values.shape)
        [1, 3, 224, 224]
        ```N)rp   r   r   r   r   rn   rq   r   r   rU   g      ?ru   )r   r   r   rC   rC   )r   rC   r   r   gh㈵>)r   )lossreconstructionr    r!   )r*   r  r"  r   rS   rL   r_   rO  rd   r   r^   rx   rf   rw   float32r   lossesmean_absolute_error
reduce_sumr   r	   r    r!   )r9   ro   rp   r   r   r   r   rn   rq   r   r  rz   sequence_lengthr   rR   rT   reconstructed_pixel_valuesmasked_im_lossrW   r}   reconstruction_loss
total_lossnum_masked_pixelsr   r'   r'   r(   r     sX   +

z!TFDeiTForMaskedImageModeling.callc                 C  r   )NTr"  rO  )rJ   rK   rL   rM   r"  r0   rN   rO  r   r'   r'   r(   rN     r   z"TFDeiTForMaskedImageModeling.buildr   r  )ro   r   rp   r   r   r   r   r  r   r  r   r  rn   r,   rq   r,   r-   rR  r   )r"   r#   r$   r4   r   r   r-  r   r	   r/  r   rN   r   r'   r'   r;   r(   rN  |  s    
crN  z
    DeiT Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
    the [CLS] token) e.g. for ImageNet.
    c                      s\   e Zd Zd fddZeeeeee	d								ddddZ
dddZ  ZS )TFDeiTForImageClassificationr*   r   c                   s\   t  | |j| _t|ddd| _|jdkr tjj|jddntjjddd| _	|| _
d S )NFr"  r
  r0   r   
classifierr2   linear)r3   r4   
num_labelsr	  r"  r   r6   r   
Activationra  r*   rP  r;   r'   r(   r4     s   

z%TFDeiTForImageClassification.__init__rQ  NFro   r   r   labelsr   r  r   r   rn   r,   rq   r-   #tf.Tensor | TFImageClassifierOutputc	              	   C  s   |dur|n| j j}| j|||||||d}	|	d }
| |
dddddf }|du r.dn| ||}|sJ|f|	dd  }|durH|f| S |S t|||	j|	jdS )a  
        labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Returns:

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, TFDeiTForImageClassification
        >>> import tensorflow as tf
        >>> from PIL import Image
        >>> import requests

        >>> keras.utils.set_random_seed(3)  # doctest: +IGNORE_RESULT
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> # note: we are loading a TFDeiTForImageClassificationWithTeacher from the hub here,
        >>> # so the head will be randomly initialized, hence the predictions will be random
        >>> image_processor = AutoImageProcessor.from_pretrained("facebook/deit-base-distilled-patch16-224")
        >>> model = TFDeiTForImageClassification.from_pretrained("facebook/deit-base-distilled-patch16-224")

        >>> inputs = image_processor(images=image, return_tensors="tf")
        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        >>> # model predicts one of the 1000 ImageNet classes
        >>> predicted_class_idx = tf.math.argmax(logits, axis=-1)[0]
        >>> print("Predicted class:", model.config.id2label[int(predicted_class_idx)])
        Predicted class: little blue heron, Egretta caerulea
        ```Nr   r   r   r   rn   rq   r   r   )rS  r   r    r!   )r*   r  r"  ra  hf_compute_lossr   r    r!   )r9   ro   r   re  r   r   r   rn   rq   r   r  r   rS  r   r'   r'   r(   r     s,   /
z!TFDeiTForImageClassification.callc                 C  s   | j rd S d| _ t| dd d ur-t| jj | jd  W d    n1 s(w   Y  t| dd d urZt| jj | jd d | jj	g W d    d S 1 sSw   Y  d S d S )NTr"  ra  )
rJ   rK   rL   rM   r"  r0   rN   ra  r*   rF   r   r'   r'   r(   rN   ]  s   "z"TFDeiTForImageClassification.buildr   r  )ro   r   r   r   re  r   r   r  r   r  r   r  rn   r,   rq   r,   r-   rf  r   )r"   r#   r$   r4   r   r   r-  r   r   r/  r   rN   r   r'   r'   r;   r(   r_    s    
Jr_  a  
    DeiT Model transformer with image classification heads on top (a linear layer on top of the final hidden state of
    the [CLS] token and a linear layer on top of the final hidden state of the distillation token) e.g. for ImageNet.

    .. warning::

            This model supports inference-only. Fine-tuning with distillation (i.e. with a teacher) is not yet
            supported.
    c                      s^   e Zd Zd fddZeeeeee	e
ed									ddddZdddZ  ZS )'TFDeiTForImageClassificationWithTeacherr*   r   r-   r.   c                   s   t  | |j| _t|ddd| _|jdkr tjj|jddntjjddd| _	|jdkr7tjj|jddntjjddd| _
|| _d S )	NFr"  r`  r   cls_classifierr2   rb  distillation_classifier)r3   r4   rc  r	  r"  r   r6   r   rd  rj  rk  r*   rP  r;   r'   r(   r4   v  s   


z0TFDeiTForImageClassificationWithTeacher.__init__)r(  r)  r   r+  NFro   r   r   r   r  r   r   rn   r,   rq   5tuple | TFDeiTForImageClassificationWithTeacherOutputc              	   C  s   |d ur|n| j j}| j|||||||d}|d }	| |	d d dd d f }
| |	d d dd d f }|
| d }|sK||
|f|dd   }|S t||
||j|jdS )Nrg  r   r   rC   )r   r   r   r    r!   )r*   r  r"  rj  rk  r   r    r!   )r9   ro   r   r   r   r   rn   rq   r   r  r   r   r   r   r'   r'   r(   r     s0   
z,TFDeiTForImageClassificationWithTeacher.callc                 C  s  | j rd S d| _ t| dd d ur-t| jj | jd  W d    n1 s(w   Y  t| dd d urWt| jj | jd d | jj	g W d    n1 sRw   Y  t| dd d urt| j
j | j
d d | jj	g W d    d S 1 s}w   Y  d S d S )NTr"  rj  rk  )rJ   rK   rL   rM   r"  r0   rN   rj  r*   rF   rk  r   r'   r'   r(   rN     s    "z-TFDeiTForImageClassificationWithTeacher.buildr   )NNNNNFF)ro   r   r   r   r   r  r   r  r   r  rn   r,   rq   r,   r-   rl  r   )r"   r#   r$   r4   r   r   r-  r   _IMAGE_CLASS_CHECKPOINTr   r/  _IMAGE_CLASS_EXPECTED_OUTPUTr   rN   r   r'   r'   r;   r(   ri  i  s&    *ri  )r_  ri  rN  r%  r!  )Dr%   
__future__r   collections.abcr   r`   dataclassesr   
tensorflowrL   activations_tfr   modeling_tf_outputsr   r   r   r	   modeling_tf_utilsr
   r   r   r   r   r   tf_utilsr   r   utilsr   r   r   r   r   r   configuration_deitr   
get_loggerr"   loggerr/  r.  r0  rm  rn  r   r6   Layerr)   r5   r   r   r   r   r   r   r   r	  r!  DEIT_START_DOCSTRINGr-  r%  r  r5  rG  rN  r_  ri  __all__r'   r'   r'   r(   <module>   st     
m.[(D6r4yh	U