o
    wi?                     @   sd  d Z ddlZddlmZ ddlmZmZmZ ddl	Z	ddl
Z	ddl	mZ ddlmZmZmZ ddlmZ dd	lmZ dd
lmZmZmZmZ ddlmZmZ ddlmZmZ ddlm Z m!Z!m"Z"m#Z# ddl$m%Z% e"&e'Z(G dd dej)Z*G dd dej)Z+	d@dej)de	j,de	j,de	j,dee	j, de-de-fddZ.G dd dej)Z/G d d! d!ej)Z0G d"d# d#ej)Z1G d$d% d%ej)Z2G d&d' d'ej)Z3G d(d) d)eZ4G d*d+ d+ej)Z5e!G d,d- d-eZ6e!G d.d/ d/e6Z7G d0d1 d1ej)Z8e!d2d3G d4d5 d5e6Z9e!d6d3G d7d8 d8e6Z:ee!d9d3G d:d; d;e Z;e!d<d3G d=d> d>e6Z<g d?Z=dS )AzPyTorch DeiT model.    N)	dataclass)CallableOptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPoolingImageClassifierOutputMaskedImageModelingOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel) find_pruneable_heads_and_indicesprune_linear_layer)ModelOutputauto_docstringlogging	torch_int   )
DeiTConfigc                	       sx   e Zd ZdZddededdf fddZd	ejd
e	de	dejfddZ
		ddejdeej dedejfddZ  ZS )DeiTEmbeddingszv
    Construct the CLS token, distillation token, position and patch embeddings. Optionally, also the mask token.
    Fconfiguse_mask_tokenreturnNc                    s   t    ttdd|j| _ttdd|j| _|r*ttdd|jnd | _	t
|| _| jj}ttd|d |j| _t|j| _|j| _d S )Nr      )super__init__r   	Parametertorchzeroshidden_size	cls_tokendistillation_token
mask_tokenDeiTPatchEmbeddingspatch_embeddingsnum_patchesposition_embeddingsDropouthidden_dropout_probdropout
patch_size)selfr   r   r+   	__class__ c/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/deit/modeling_deit.pyr!   0   s   
 
zDeiTEmbeddings.__init__
embeddingsheightwidthc                 C   s   |j d d }| jj d d }tj s||kr||kr| jS | jddddf }| jddddf }|j d }|| j }	|| j }
t|d }|d|||}|dddd}t	j
j||	|
fdd	d
}|dddddd|}tj||fddS )a  
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing and 2 class embeddings.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r   N      ?r   r
   bicubicF)sizemodealign_cornersdim)shaper,   r#   jit
is_tracingr0   r   reshapepermuter   
functionalinterpolateviewcat)r1   r6   r7   r8   r+   num_positionsclass_and_dist_pos_embedpatch_pos_embedr@   
new_height	new_widthsqrt_num_positionsr4   r4   r5   interpolate_pos_encoding<   s(   



z'DeiTEmbeddings.interpolate_pos_encodingpixel_valuesbool_masked_posrP   c                 C   s   |j \}}}}| |}| \}}	}|d ur1| j||	d}
|d|
}|d|  |
|  }| j|dd}| j|dd}t	j
|||fdd}| j}|rW| |||}|| }| |}|S )Nr9         ?r   r?   )rA   r*   r<   r(   expand	unsqueezetype_asr&   r'   r#   rI   r,   rP   r/   )r1   rQ   rR   rP   _r7   r8   r6   
batch_size
seq_lengthmask_tokensmask
cls_tokensdistillation_tokensposition_embeddingr4   r4   r5   forwardd   s    

zDeiTEmbeddings.forward)FNF)__name__
__module____qualname____doc__r   boolr!   r#   TensorintrP   r   
BoolTensorr_   __classcell__r4   r4   r2   r5   r   +   s    +r   c                       s6   e Zd ZdZ fddZdejdejfddZ  ZS )r)   z
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    c                    s   t    |j|j}}|j|j}}t|tjj	r|n||f}t|tjj	r)|n||f}|d |d  |d |d   }|| _|| _|| _|| _
tj||||d| _d S )Nr   r   )kernel_sizestride)r    r!   
image_sizer0   num_channelsr%   
isinstancecollectionsabcIterabler+   r   Conv2d
projection)r1   r   rl   r0   rm   r%   r+   r2   r4   r5   r!      s   
 zDeiTPatchEmbeddings.__init__rQ   r   c                 C   s<   |j \}}}}|| jkrtd| |ddd}|S )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r   r   )rA   rm   
ValueErrorrs   flatten	transpose)r1   rQ   rX   rm   r7   r8   xr4   r4   r5   r_      s   
zDeiTPatchEmbeddings.forward)	ra   rb   rc   rd   r!   r#   rf   r_   ri   r4   r4   r2   r5   r)      s    r)           modulequerykeyvalueattention_maskscalingr/   c           
      K   s|   t ||dd| }tjj|dt jd|j}tjj	||| j
d}|d ur,|| }t ||}	|	dd }	|	|fS )Nr9   )r@   dtype)ptrainingr   r   )r#   matmulrv   r   rF   softmaxfloat32tor   r/   r   
contiguous)
ry   rz   r{   r|   r}   r~   r/   kwargsattn_weightsattn_outputr4   r4   r5   eager_attention_forward   s   r   c                
       sv   e Zd Zdeddf fddZdejdejfddZ		dd
eej de	de
eejejf eej f fddZ  ZS )DeiTSelfAttentionr   r   Nc                    s   t    |j|j dkrt|dstd|j d|j d|| _|j| _t|j|j | _| j| j | _	|j
| _| jd | _d| _tj|j| j	|jd| _tj|j| j	|jd| _tj|j| j	|jd| _d S )	Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads .g      F)bias)r    r!   r%   num_attention_headshasattrrt   r   rg   attention_head_sizeall_head_sizeattention_probs_dropout_probdropout_probr~   	is_causalr   Linearqkv_biasrz   r{   r|   r1   r   r2   r4   r5   r!      s"   

zDeiTSelfAttention.__init__rw   c                 C   s6   |  d d | j| jf }||}|ddddS )Nr9   r   r   r   r
   )r<   r   r   rH   rE   )r1   rw   new_x_shaper4   r4   r5   transpose_for_scores   s   
z&DeiTSelfAttention.transpose_for_scoresF	head_maskoutput_attentionsc              
   C   s   |  | |}|  | |}|  | |}t}| jjdkr4| jjdkr.|r.td nt	| jj }|| ||||| j
| j| jsCdn| jd\}}	| d d | jf }
||
}|rc||	f}|S |f}|S )Neagersdpaz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.rx   )r   r~   r/   r   )r   r{   r|   rz   r   r   _attn_implementationloggerwarning_oncer   r   r~   r   r   r<   r   rD   )r1   hidden_statesr   r   	key_layervalue_layerquery_layerattention_interfacecontext_layerattention_probsnew_context_layer_shapeoutputsr4   r4   r5   r_      s4   

zDeiTSelfAttention.forwardr`   )ra   rb   rc   r   r!   r#   rf   r   r   re   r   tupler_   ri   r4   r4   r2   r5   r      s    r   c                       sF   e Zd ZdZdeddf fddZdejdejdejfd	d
Z  Z	S )DeiTSelfOutputz
    The residual connection is defined in DeiTLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    r   r   Nc                    s.   t    t|j|j| _t|j| _d S N)	r    r!   r   r   r%   denser-   r.   r/   r   r2   r4   r5   r!   	     
zDeiTSelfOutput.__init__r   input_tensorc                 C      |  |}| |}|S r   r   r/   r1   r   r   r4   r4   r5   r_        

zDeiTSelfOutput.forward)
ra   rb   rc   rd   r   r!   r#   rf   r_   ri   r4   r4   r2   r5   r     s    $r   c                       s~   e Zd Zdeddf fddZdee ddfddZ			dd
ej	de
ej	 dedeeej	ej	f eej	 f fddZ  ZS )DeiTAttentionr   r   Nc                    s*   t    t|| _t|| _t | _d S r   )r    r!   r   	attentionr   outputsetpruned_headsr   r2   r4   r5   r!     s   


zDeiTAttention.__init__headsc                 C   s   t |dkrd S t|| jj| jj| j\}}t| jj|| j_t| jj|| j_t| jj	|| j_	t| j
j|dd| j
_| jjt | | j_| jj| jj | j_| j|| _d S )Nr   r   r?   )lenr   r   r   r   r   r   rz   r{   r|   r   r   r   union)r1   r   indexr4   r4   r5   prune_heads  s   zDeiTAttention.prune_headsFr   r   r   c                 C   s4   |  |||}| |d |}|f|dd   }|S )Nr   r   )r   r   )r1   r   r   r   self_outputsattention_outputr   r4   r4   r5   r_   /  s   zDeiTAttention.forwardr`   )ra   rb   rc   r   r!   r   rg   r   r#   rf   r   re   r   r   r_   ri   r4   r4   r2   r5   r     s    r   c                       s<   e Zd Zdeddf fddZdejdejfddZ  ZS )	DeiTIntermediater   r   Nc                    sD   t    t|j|j| _t|jt	rt
|j | _d S |j| _d S r   )r    r!   r   r   r%   intermediate_sizer   rn   
hidden_actstrr   intermediate_act_fnr   r2   r4   r5   r!   ?  s
   
zDeiTIntermediate.__init__r   c                 C   r   r   )r   r   )r1   r   r4   r4   r5   r_   G  r   zDeiTIntermediate.forward	ra   rb   rc   r   r!   r#   rf   r_   ri   r4   r4   r2   r5   r   >  s    r   c                       sB   e Zd Zdeddf fddZdejdejdejfdd	Z  ZS )

DeiTOutputr   r   Nc                    s.   t    t|j|j| _t|j| _	d S r   )
r    r!   r   r   r   r%   r   r-   r.   r/   r   r2   r4   r5   r!   P  r   zDeiTOutput.__init__r   r   c                 C   s    |  |}| |}|| }|S r   r   r   r4   r4   r5   r_   U  s   

zDeiTOutput.forwardr   r4   r4   r2   r5   r   O  s    $r   c                       sl   e Zd ZdZdeddf fddZ		ddejd	eej d
e	de
eejejf eej f fddZ  ZS )	DeiTLayerz?This corresponds to the Block class in the timm implementation.r   r   Nc                    sb   t    |j| _d| _t|| _t|| _t|| _	t
j|j|jd| _t
j|j|jd| _d S )Nr   eps)r    r!   chunk_size_feed_forwardseq_len_dimr   r   r   intermediater   r   r   	LayerNormr%   layer_norm_epslayernorm_beforelayernorm_afterr   r2   r4   r5   r!   b  s   



zDeiTLayer.__init__Fr   r   r   c                 C   s`   | j | |||d}|d }|dd  }|| }| |}| |}| ||}|f| }|S )N)r   r   r   )r   r   r   r   r   )r1   r   r   r   self_attention_outputsr   r   layer_outputr4   r4   r5   r_   l  s   


zDeiTLayer.forwardr`   )ra   rb   rc   rd   r   r!   r#   rf   r   re   r   r   r_   ri   r4   r4   r2   r5   r   _  s    r   c                       sb   e Zd Zdeddf fddZ				ddejd	eej d
ededede	e
ef fddZ  ZS )DeiTEncoderr   r   Nc                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r4   )r   ).0rW   r   r4   r5   
<listcomp>  s    z(DeiTEncoder.__init__.<locals>.<listcomp>F)	r    r!   r   r   
ModuleListrangenum_hidden_layerslayergradient_checkpointingr   r2   r   r5   r!     s   
 
zDeiTEncoder.__init__FTr   r   r   output_hidden_statesreturn_dictc                 C   s   |rdnd }|r
dnd }t | jD ](\}}	|r||f }|d ur$|| nd }
|	||
|}|d }|r9||d f }q|rA||f }|sOtdd |||fD S t|||dS )Nr4   r   r   c                 s   s    | ]	}|d ur|V  qd S r   r4   )r   vr4   r4   r5   	<genexpr>  s    z&DeiTEncoder.forward.<locals>.<genexpr>)last_hidden_stater   
attentions)	enumerater   r   r   )r1   r   r   r   r   r   all_hidden_statesall_self_attentionsilayer_modulelayer_head_masklayer_outputsr4   r4   r5   r_     s(   

zDeiTEncoder.forward)NFFT)ra   rb   rc   r   r!   r#   rf   r   re   r   r   r   r_   ri   r4   r4   r2   r5   r     s&    	
r   c                   @   sT   e Zd ZeZdZdZdZdgZdZ	dZ
dZdZdeejejejf ddfdd	ZdS )
DeiTPreTrainedModeldeitrQ   Tr   ry   r   Nc                 C   s   t |tjtjfr0tjj|jjt	j
d| jjd|jj|j_|jdur.|jj  dS dS t |tjrE|jj  |jjd dS t |tri|jj  |jj  |jj  |jdurk|jj  dS dS dS )zInitialize the weightsrx   )meanstdNrS   )rn   r   r   rr   inittrunc_normal_weightdatar   r#   r   r   initializer_ranger   r   zero_r   fill_r   r&   r,   r'   r(   )r1   ry   r4   r4   r5   _init_weights  s(   



z!DeiTPreTrainedModel._init_weights)ra   rb   rc   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_2_supports_flex_attn_supports_attention_backendr   r   r   rr   r   r   r4   r4   r4   r5   r     s    &r   c                       s   e Zd Zddedededdf fdd	Zdefd
dZdd Ze								dde
ej de
ej de
ej de
e de
e de
e dedeeef fddZ  ZS )	DeiTModelTFr   add_pooling_layerr   r   Nc                    s\   t  | || _t||d| _t|| _tj|j	|j
d| _|r%t|nd| _|   dS )z
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        use_mask_token (`bool`, *optional*, defaults to `False`):
            Whether to use a mask token for masked image modeling.
        )r   r   N)r    r!   r   r   r6   r   encoderr   r   r%   r   	layernorm
DeiTPoolerpooler	post_init)r1   r   r  r   r2   r4   r5   r!     s   
zDeiTModel.__init__c                 C   s   | j jS r   )r6   r*   )r1   r4   r4   r5   get_input_embeddings  s   zDeiTModel.get_input_embeddingsc                 C   s*   |  D ]\}}| jj| j| qdS )z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr  r   r   r   )r1   heads_to_pruner   r   r4   r4   r5   _prune_heads  s   zDeiTModel._prune_headsrQ   rR   r   r   r   r   rP   c                 C   s
  |dur|n| j j}|dur|n| j j}|dur|n| j j}|du r&td| || j j}| jjj	j
j}|j|kr?||}| j|||d}	| j|	||||d}
|
d }| |}| jdurd| |nd}|s{|durp||fn|f}||
dd  S t|||
j|
jdS )z
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        Nz You have to specify pixel_values)rR   rP   )r   r   r   r   r   r   )r   pooler_outputr   r   )r   r   r   use_return_dictrt   get_head_maskr   r6   r*   rs   r   r   r   r  r  r  r   r   r   )r1   rQ   rR   r   r   r   r   rP   expected_dtypeembedding_outputencoder_outputssequence_outputpooled_outputhead_outputsr4   r4   r5   r_     s@   


zDeiTModel.forward)TFNNNNNNF)ra   rb   rc   r   re   r!   r)   r  r
  r   r   r#   rf   rh   r   r   r   r_   ri   r4   r4   r2   r5   r     s:     
	r   c                       s*   e Zd Zdef fddZdd Z  ZS )r  r   c                    s,   t    t|j|j| _t|j | _	d S r   )
r    r!   r   r   r%   pooler_output_sizer   r   
pooler_act
activationr   r2   r4   r5   r!   7  s   
zDeiTPooler.__init__c                 C   s(   |d d df }|  |}| |}|S )Nr   )r   r  )r1   r   first_token_tensorr  r4   r4   r5   r_   <  s   

zDeiTPooler.forward)ra   rb   rc   r   r!   r_   ri   r4   r4   r2   r5   r  6  s    r  ad  
    DeiT Model with a decoder on top for masked image modeling, as proposed in [SimMIM](https://huggingface.co/papers/2111.09886).

    <Tip>

    Note that we provide a script to pre-train this model on custom data in our [examples
    directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining).

    </Tip>
    )custom_introc                       s   e Zd Zdeddf fddZe							ddeej deej	 d	eej d
ee
 dee
 dee
 de
deeef fddZ  ZS )DeiTForMaskedImageModelingr   r   Nc                    sX   t  | t|ddd| _ttj|j|jd |j	 ddt
|j| _|   d S )NFT)r  r   r   r   )in_channelsout_channelsrj   )r    r!   r   r   r   
Sequentialrr   r%   encoder_striderm   PixelShuffledecoderr  r   r2   r4   r5   r!   R  s   

z#DeiTForMaskedImageModeling.__init__FrQ   rR   r   r   r   r   rP   c              	   C   sJ  |dur|n| j j}| j|||||||d}|d }	|	ddddf }	|	j\}
}}t|d  }}|	ddd|
|||}	| |	}d}|dur| j j| j j	 }|d||}|
| j j	d
| j j	dd }tjj||dd	}||  | d
  | j j }|s|f|dd  }|dur|f| S |S t|||j|jdS )a;  
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, DeiTForMaskedImageModeling
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("facebook/deit-base-distilled-patch16-224")
        >>> model = DeiTForMaskedImageModeling.from_pretrained("facebook/deit-base-distilled-patch16-224")

        >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
        >>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
        >>> # create random boolean mask of shape (batch_size, num_patches)
        >>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()

        >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
        >>> loss, reconstructed_pixel_values = outputs.loss, outputs.reconstruction
        >>> list(reconstructed_pixel_values.shape)
        [1, 3, 224, 224]
        ```N)rR   r   r   r   r   rP   r   r   r9   r:   r   none)	reductiongh㈵>)lossreconstructionr   r   )r   r  r   rA   rg   rE   rD   r   rl   r0   repeat_interleaverU   r   r   rF   l1_losssumrm   r   r   r   )r1   rQ   rR   r   r   r   r   rP   r   r  rX   sequence_lengthrm   r7   r8   reconstructed_pixel_valuesmasked_im_lossr<   r[   reconstruction_lossr   r4   r4   r5   r_   c  sH   &

 z"DeiTForMaskedImageModeling.forwardr  )ra   rb   rc   r   r!   r   r   r#   rf   rh   re   r   r   r   r_   ri   r4   r4   r2   r5   r  E  s6    
	r  z
    DeiT Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
    the [CLS] token) e.g. for ImageNet.
    c                       s   e Zd Zdeddf fddZe							ddeej deej d	eej d
ee	 dee	 dee	 de	de
eef fddZ  ZS )DeiTForImageClassificationr   r   Nc                    sR   t  | |j| _t|dd| _|jdkrt|j|jnt | _	| 
  d S NF)r  r   )r    r!   
num_labelsr   r   r   r   r%   Identity
classifierr  r   r2   r4   r5   r!     s
   $z#DeiTForImageClassification.__init__FrQ   r   labelsr   r   r   rP   c                 C   s  |dur|n| j j}| j||||||d}|d }	| |	dddddf }
d}|dur||
j}| j jdu r]| jdkrCd| j _n| jdkrY|jt	j
ksT|jt	jkrYd| j _nd| j _| j jdkr{t }| jdkru||
 | }n+||
|}n%| j jdkrt }||
d| j|d}n| j jdkrt }||
|}|s|
f|dd  }|dur|f| S |S t||
|j|jd	S )
aZ  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, DeiTForImageClassification
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> torch.manual_seed(3)  # doctest: +IGNORE_RESULT
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> # note: we are loading a DeiTForImageClassificationWithTeacher from the hub here,
        >>> # so the head will be randomly initialized, hence the predictions will be random
        >>> image_processor = AutoImageProcessor.from_pretrained("facebook/deit-base-distilled-patch16-224")
        >>> model = DeiTForImageClassification.from_pretrained("facebook/deit-base-distilled-patch16-224")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        >>> # model predicts one of the 1000 ImageNet classes
        >>> predicted_class_idx = logits.argmax(-1).item()
        >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
        Predicted class: Polaroid camera, Polaroid Land camera
        ```Nr   r   r   r   rP   r   r   
regressionsingle_label_classificationmulti_label_classificationr9   )r#  logitsr   r   )r   r  r   r0  r   deviceproblem_typer.  r   r#   longrg   r	   squeezer   rH   r   r   r   r   )r1   rQ   r   r1  r   r   r   rP   r   r  r6  r#  loss_fctr   r4   r4   r5   r_     sP   *	

"


z"DeiTForImageClassification.forwardr  )ra   rb   rc   r   r!   r   r   r#   rf   re   r   r   r   r_   ri   r4   r4   r2   r5   r,    s6    
	r,  zC
    Output type of [`DeiTForImageClassificationWithTeacher`].
    c                   @   st   e Zd ZU dZdZeej ed< dZ	eej ed< dZ
eej ed< dZeeej  ed< dZeeej  ed< dS )+DeiTForImageClassificationWithTeacherOutputaj  
    logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Prediction scores as the average of the cls_logits and distillation logits.
    cls_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Prediction scores of the classification head (i.e. the linear layer on top of the final hidden state of the
        class token).
    distillation_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Prediction scores of the distillation head (i.e. the linear layer on top of the final hidden state of the
        distillation token).
    Nr6  
cls_logitsdistillation_logitsr   r   )ra   rb   rc   rd   r6  r   r#   FloatTensor__annotations__r=  r>  r   r   r   r4   r4   r4   r5   r<  )  s   
 r<  a  
    DeiT Model transformer with image classification heads on top (a linear layer on top of the final hidden state of
    the [CLS] token and a linear layer on top of the final hidden state of the distillation token) e.g. for ImageNet.

    .. warning::

           This model supports inference-only. Fine-tuning with distillation (i.e. with a teacher) is not yet
           supported.
    c                       s~   e Zd Zdeddf fddZe						ddeej deej d	ee	 d
ee	 dee	 de	de
eef fddZ  ZS )%DeiTForImageClassificationWithTeacherr   r   Nc                    sv   t  | |j| _t|dd| _|jdkrt|j|jnt | _	|jdkr0t|j|jnt | _
|   d S r-  )r    r!   r.  r   r   r   r   r%   r/  cls_classifierdistillation_classifierr  r   r2   r4   r5   r!   N  s     z.DeiTForImageClassificationWithTeacher.__init__FrQ   r   r   r   r   rP   c                 C   s   |d ur|n| j j}| j||||||d}|d }| |d d dd d f }	| |d d dd d f }
|	|
 d }|sJ||	|
f|dd   }|S t||	|
|j|jdS )Nr2  r   r   r   )r6  r=  r>  r   r   )r   r  r   rB  rC  r<  r   r   )r1   rQ   r   r   r   r   rP   r   r  r=  r>  r6  r   r4   r4   r5   r_   _  s.   
	z-DeiTForImageClassificationWithTeacher.forward)NNNNNF)ra   rb   rc   r   r!   r   r   r#   rf   re   r   r   r<  r_   ri   r4   r4   r2   r5   rA  B  s0    
rA  )r,  rA  r  r   r   )rx   )>rd   collections.abcro   dataclassesr   typingr   r   r   r#   torch.utils.checkpointr   torch.nnr   r   r	   activationsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   r   pytorch_utilsr   r   utilsr   r   r   r   configuration_deitr   
get_loggerra   r   Moduler   r)   rf   floatr   r   r   r   r   r   r   r   r   r   r  r  r,  r<  rA  __all__r4   r4   r4   r5   <module>   s   
Y(
?(++ _hj<