o
    eiHv                     @   sv  d Z ddlZddlmZ ddlmZ ddlZddlmZ ddlm	Z
 ddlmZ dd	lmZ dd
lmZmZmZmZ ddlmZmZ ddlmZ ddlmZmZmZmZmZ ddlm Z m!Z! ddl"m#Z# ddl$m%Z% e&e'Z(G dd dej)Z*G dd dej)Z+		dCdej)dej,dej,dej,dej,dB de-dB de-dee fdd Z.G d!d" d"ej)Z/G d#d$ d$ej)Z0G d%d& d&ej)Z1G d'd( d(ej)Z2G d)d* d*ej)Z3G d+d, d,eZ4G d-d. d.ej)Z5eG d/d0 d0eZ6eG d1d2 d2e6Z7G d3d4 d4ej)Z8ed5d6G d7d8 d8e6Z9ed9d6G d:d; d;e6Z:eed<d6G d=d> d>eZ;ed?d6G d@dA dAe6Z<g dBZ=dS )DzPyTorch DeiT model.    N)Callable)	dataclass)nn   )initialization)ACT2FN)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPoolingImageClassifierOutputMaskedImageModelingOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringlogging	torch_int)can_return_tuplemerge_with_config_defaults)capture_outputs   )
DeiTConfigc                	       sx   e Zd ZdZddededdf fddZd	ejd
e	de	dejfddZ
		ddejdejdB dedejfddZ  ZS )DeiTEmbeddingszv
    Construct the CLS token, distillation token, position and patch embeddings. Optionally, also the mask token.
    Fconfiguse_mask_tokenreturnNc                    s   t    ttdd|j| _ttdd|j| _|r*ttdd|jnd | _	t
|| _| jj}ttd|d |j| _t|j| _|j| _d S )Nr      )super__init__r   	Parametertorchzeroshidden_size	cls_tokendistillation_token
mask_tokenDeiTPatchEmbeddingspatch_embeddingsnum_patchesposition_embeddingsDropouthidden_dropout_probdropout
patch_size)selfr   r   r*   	__class__ d/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/deit/modeling_deit.pyr    0   s   
 
zDeiTEmbeddings.__init__
embeddingsheightwidthc                 C   s   |j d d }| jj d d }tj s||kr||kr| jS | jddddf }| jddddf }|j d }|| j }	|| j }
t|d }|d|||}|dddd}t	j
j||	|
fdd	d
}|dddddd|}tj||fddS )a  
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing and 2 class embeddings.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r   N      ?r   r   bicubicF)sizemodealign_cornersdim)shaper+   r"   jit
is_tracingr/   r   reshapepermuter   
functionalinterpolateviewcat)r0   r5   r6   r7   r*   num_positionsclass_and_dist_pos_embedpatch_pos_embedr?   
new_height	new_widthsqrt_num_positionsr3   r3   r4   interpolate_pos_encoding<   s(   



z'DeiTEmbeddings.interpolate_pos_encodingpixel_valuesbool_masked_posrO   c                 C   s   |j \}}}}| |}| \}}	}|d ur1| j||	d}
|d|
}|d|  |
|  }| j|dd}| j|dd}t	j
|||fdd}| j}|rW| |||}|| }| |}|S )Nr8   g      ?r   r>   )r@   r)   r;   r'   expand	unsqueezetype_asr%   r&   r"   rH   r+   rO   r.   )r0   rP   rQ   rO   _r6   r7   r5   
batch_size
seq_lengthmask_tokensmask
cls_tokensdistillation_tokensposition_embeddingr3   r3   r4   forwardd   s    

zDeiTEmbeddings.forward)FNF)__name__
__module____qualname____doc__r   boolr    r"   TensorintrO   
BoolTensorr]   __classcell__r3   r3   r1   r4   r   +   s    +r   c                       s6   e Zd ZdZ fddZdejdejfddZ  ZS )r(   z
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    c                    s   t    |j|j}}|j|j}}t|tjj	r|n||f}t|tjj	r)|n||f}|d |d  |d |d   }|| _|| _|| _|| _
tj||||d| _d S )Nr   r   )kernel_sizestride)r   r    
image_sizer/   num_channelsr$   
isinstancecollectionsabcIterabler*   r   Conv2d
projection)r0   r   rj   r/   rk   r$   r*   r1   r3   r4   r       s   
 zDeiTPatchEmbeddings.__init__rP   r   c                 C   s<   |j \}}}}|| jkrtd| |ddd}|S )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r   r   )r@   rk   
ValueErrorrq   flatten	transpose)r0   rP   rV   rk   r6   r7   xr3   r3   r4   r]      s   
zDeiTPatchEmbeddings.forward)	r_   r`   ra   rb   r    r"   rd   r]   rg   r3   r3   r1   r4   r(      s    r(           modulequerykeyvalueattention_maskscalingr.   kwargsc           
      K   s   |d u r| dd }t||dd| }|d ur|| }tjj|dd}tjj||| jd}t||}	|	dd	 }	|	|fS )Nr8         r   r   r>   )ptrainingr   )
r;   r"   matmulrt   r   rE   softmaxr.   r   
contiguous)
rw   rx   ry   rz   r{   r|   r.   r}   attn_weightsattn_outputr3   r3   r4   eager_attention_forward   s   
r   c                       sB   e Zd Zdef fddZdejdeejejf fddZ  Z	S )DeiTSelfAttentionr   c                    s   t    |j|j dkrt|dstd|j d|j d|| _|j| _t|j|j | _| j| j | _	|j
| _| jd | _d| _tj|j| j	|jd| _tj|j| j	|jd| _tj|j| j	|jd| _d S )	Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads .r~   F)bias)r   r    r$   num_attention_headshasattrrr   r   re   attention_head_sizeall_head_sizeattention_probs_dropout_probdropout_probr|   	is_causalr   Linearqkv_biasrx   ry   rz   r0   r   r1   r3   r4   r       s"   

zDeiTSelfAttention.__init__hidden_statesr   c              
   C   s   |j d }|d| j| jf}| |j| dd}| |j| dd}| |j| dd}t	| j
jt}|| |||d | j| j| jsHdn| jd\}}	| d d | jf }
||
}||	fS )Nr   r8   r   r   rv   )r   r|   r.   )r@   r   r   ry   rG   rt   rz   rx   r   get_interfacer   _attn_implementationr   r   r|   r   r   r;   r   rC   )r0   r   rV   	new_shape	key_layervalue_layerquery_layerattention_interfacecontext_layerattention_probsnew_context_layer_shaper3   r3   r4   r]      s*   


zDeiTSelfAttention.forward)
r_   r`   ra   r   r    r"   rd   tupler]   rg   r3   r3   r1   r4   r      s    (r   c                       sB   e Zd ZdZdef fddZdejdejdejfdd	Z  Z	S )
DeiTSelfOutputz
    The residual connection is defined in DeiTLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    r   c                    s.   t    t|j|j| _t|j| _d S N)	r   r    r   r   r$   denser,   r-   r.   r   r1   r3   r4   r          
zDeiTSelfOutput.__init__r   input_tensorr   c                 C      |  |}| |}|S r   r   r.   r0   r   r   r3   r3   r4   r]         

zDeiTSelfOutput.forward
r_   r`   ra   rb   r   r    r"   rd   r]   rg   r3   r3   r1   r4   r      s    $r   c                       8   e Zd Zdef fddZdejdejfddZ  ZS )DeiTAttentionr   c                    s"   t    t|| _t|| _d S r   )r   r    r   	attentionr   outputr   r1   r3   r4   r      s   

zDeiTAttention.__init__r   r   c                 C   s   |  |\}}| ||}|S r   )r   r   )r0   r   self_attn_outputrU   r   r3   r3   r4   r]     s   zDeiTAttention.forward	r_   r`   ra   r   r    r"   rd   r]   rg   r3   r3   r1   r4   r         r   c                       r   )DeiTIntermediater   c                    sD   t    t|j|j| _t|jt	rt
|j | _d S |j| _d S r   )r   r    r   r   r$   intermediate_sizer   rl   
hidden_actstrr   intermediate_act_fnr   r1   r3   r4   r      s
   
zDeiTIntermediate.__init__r   r   c                 C   r   r   )r   r   )r0   r   r3   r3   r4   r]     r   zDeiTIntermediate.forwardr   r3   r3   r1   r4   r     s    r   c                       s>   e Zd Zdef fddZdejdejdejfddZ  ZS )	
DeiTOutputr   c                    s.   t    t|j|j| _t|j| _	d S r   )
r   r    r   r   r   r$   r   r,   r-   r.   r   r1   r3   r4   r    %  r   zDeiTOutput.__init__r   r   r   c                 C   s    |  |}| |}|| }|S r   r   r   r3   r3   r4   r]   *  s   

zDeiTOutput.forwardr   r3   r3   r1   r4   r   $  s    $r   c                       s<   e Zd ZdZdef fddZdejdejfddZ  Z	S )		DeiTLayerz?This corresponds to the Block class in the timm implementation.r   c                    sb   t    |j| _d| _t|| _t|| _t|| _	t
j|j|jd| _t
j|j|jd| _d S )Nr   eps)r   r    chunk_size_feed_forwardseq_len_dimr   r   r   intermediater   r   r   	LayerNormr$   layer_norm_epslayernorm_beforelayernorm_afterr   r1   r3   r4   r    5  s   



zDeiTLayer.__init__r   r   c                 C   s@   |  |}| |}|| }| |}| |}| ||}|S r   )r   r   r   r   r   )r0   r   hidden_states_normattention_outputlayer_outputr3   r3   r4   r]   ?  s   



zDeiTLayer.forwardr   r3   r3   r1   r4   r   2  s    
r   c                       s6   e Zd Zdef fddZdejdefddZ  Z	S )DeiTEncoderr   c                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r3   )r   ).0rU   r   r3   r4   
<listcomp>U  s    z(DeiTEncoder.__init__.<locals>.<listcomp>F)	r   r    r   r   
ModuleListrangenum_hidden_layerslayergradient_checkpointingr   r1   r   r4   r    R  s   
 
zDeiTEncoder.__init__r   r   c                 C   s&   t | jD ]\}}||}qt|dS )N)last_hidden_state)	enumerater   r	   )r0   r   ilayer_moduler3   r3   r4   r]   X  s   

zDeiTEncoder.forward)
r_   r`   ra   r   r    r"   rd   r	   r]   rg   r3   r3   r1   r4   r   Q  s    r   c                   @   sn   e Zd ZU eed< dZdZdZdZdgZ	dZ
dZdZdZeedZe dejejB ejB d	d
fddZd
S )DeiTPreTrainedModelr   deitrP   )imageTr   )r   
attentionsrw   r   Nc                 C   s   t |tjtjfr#tj|jd| jjd |j	dur!t
|j	 dS dS t |tjr7t
|j	 t|j dS t |tr[t
|j t
|j t
|j |jdur]t
|j dS dS dS )zInitialize the weightsrv   )meanstdN)rl   r   r   rp   inittrunc_normal_weightr   initializer_ranger   zeros_r   ones_r   r%   r+   r&   r'   )r0   rw   r3   r3   r4   _init_weightsp  s    


z!DeiTPreTrainedModel._init_weights)r_   r`   ra   r   __annotations__base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backendr   r   _can_record_outputsr"   no_gradr   r   rp   r   r   r3   r3   r3   r4   r   _  s    
 &r   c                       s   e Zd Zddedededdf fdd	Zdefd
dZee	dde
			ddejdB dejdB dedee def
ddZ  ZS )	DeiTModelTFr   add_pooling_layerr   r   Nc                    s\   t  | || _t||d| _t|| _tj|j	|j
d| _|r%t|nd| _|   dS )z
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        use_mask_token (`bool`, *optional*, defaults to `False`):
            Whether to use a mask token for masked image modeling.
        )r   r   N)r   r    r   r   r5   r   encoderr   r   r$   r   	layernorm
DeiTPoolerpooler	post_init)r0   r   r   r   r1   r3   r4   r      s   
zDeiTModel.__init__c                 C   s   | j jS r   )r5   r)   )r0   r3   r3   r4   get_input_embeddings  s   zDeiTModel.get_input_embeddings)tie_last_hidden_statesrP   rQ   rO   r}   c           
      K   s   |du rt d| jjjjj}|j|kr||}| j|||d}| |}|j}| 	|}| j
dur8| 
|nd}	t||	dS )z
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        Nz You have to specify pixel_valuesrQ   rO   )r   pooler_output)rr   r5   r)   rq   r   dtypetor   r   r   r   r
   )
r0   rP   rQ   rO   r}   expected_dtypeembedding_outputencoder_outputssequence_outputpooled_outputr3   r3   r4   r]     s    



zDeiTModel.forward)TFNNF)r_   r`   ra   r   rc   r    r(   r   r   r   r   r"   rd   rf   r   r   r
   r]   rg   r3   r3   r1   r4   r     s(     r   c                       r   )r   r   c                    s,   t    t|j|j| _t|j | _	d S r   )
r   r    r   r   r$   pooler_output_sizer   r   
pooler_act
activationr   r1   r3   r4   r      s   
zDeiTPooler.__init__r   r   c                 C   s(   |d d df }|  |}| |}|S )Nr   )r   r  )r0   r   first_token_tensorr   r3   r3   r4   r]     s   

zDeiTPooler.forwardr   r3   r3   r1   r4   r     r   r   ad  
    DeiT Model with a decoder on top for masked image modeling, as proposed in [SimMIM](https://huggingface.co/papers/2111.09886).

    <Tip>

    Note that we provide a script to pre-train this model on custom data in our [examples
    directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining).

    </Tip>
    )custom_introc                       sd   e Zd Zdeddf fddZee			ddejdB dej	dB d	e
d
ee def
ddZ  ZS )DeiTForMaskedImageModelingr   r   Nc                    sX   t  | t|ddd| _ttj|j|jd |j	 ddt
|j| _|   d S )NFT)r   r   r   r   )in_channelsout_channelsrh   )r   r    r   r   r   
Sequentialrp   r$   encoder_striderk   PixelShuffledecoderr   r   r1   r3   r4   r      s   

z#DeiTForMaskedImageModeling.__init__FrP   rQ   rO   r}   c                 K   s  | j |f||d|}|j}|ddddf }|j\}}}	t|d  }
}|ddd||	|
|}| |}d}|durz| jj| jj	 }|d||}|
| jj	d
| jj	dd }tjj||dd	}||  | d
  | jj }t|||j|jdS )a  
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, DeiTForMaskedImageModeling
        >>> import torch
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> image_processor = AutoImageProcessor.from_pretrained("facebook/deit-base-distilled-patch16-224")
        >>> model = DeiTForMaskedImageModeling.from_pretrained("facebook/deit-base-distilled-patch16-224")

        >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
        >>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
        >>> # create random boolean mask of shape (batch_size, num_patches)
        >>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()

        >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
        >>> loss, reconstructed_pixel_values = outputs.loss, outputs.reconstruction
        >>> list(reconstructed_pixel_values.shape)
        [1, 3, 224, 224]
        ```r   Nr   r8   r9   r   r   none)	reductiongh㈵>)lossreconstructionr   r   )r   r   r@   re   rD   rC   r
  r   rj   r/   repeat_interleaverS   r   r   rE   l1_losssumrk   r   r   r   )r0   rP   rQ   rO   r}   outputsr   rV   sequence_lengthrk   r6   r7   reconstructed_pixel_valuesmasked_im_lossr;   rY   reconstruction_lossr3   r3   r4   r]     s>   '
 z"DeiTForMaskedImageModeling.forwardr   )r_   r`   ra   r   r    r   r   r"   rd   rf   rc   r   r   r   r]   rg   r3   r3   r1   r4   r    s$    r  z
    DeiT Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
    the [CLS] token) e.g. for ImageNet.
    c                       sd   e Zd Zdeddf fddZee			ddejdB dejdB d	e	d
e
e def
ddZ  ZS )DeiTForImageClassificationr   r   Nc                    sR   t  | |j| _t|dd| _|jdkrt|j|jnt | _	| 
  d S NF)r   r   )r   r    
num_labelsr   r   r   r   r$   Identity
classifierr   r   r1   r3   r4   r    C  s
   $z#DeiTForImageClassification.__init__FrP   labelsrO   r}   c           	      K   sp   | j |fd|i|}|j}| |dddddf }d}|dur.| j||| jfi |}t|||j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, DeiTForImageClassification
        >>> import torch
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> torch.manual_seed(3)  # doctest: +IGNORE_RESULT
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> # note: we are loading a DeiTForImageClassificationWithTeacher from the hub here,
        >>> # so the head will be randomly initialized, hence the predictions will be random
        >>> image_processor = AutoImageProcessor.from_pretrained("facebook/deit-base-distilled-patch16-224")
        >>> model = DeiTForImageClassification.from_pretrained("facebook/deit-base-distilled-patch16-224")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        >>> # model predicts one of the 1000 ImageNet classes
        >>> predicted_class_idx = logits.argmax(-1).item()
        >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
        Predicted class: Polaroid camera, Polaroid Land camera
        ```rO   Nr   )r  logitsr   r   )r   r   r  loss_functionr   r   r   r   )	r0   rP   r  rO   r}   r  r   r  r  r3   r3   r4   r]   O  s$   +z"DeiTForImageClassification.forwardr   )r_   r`   ra   r   r    r   r   r"   rd   rc   r   r   r   r]   rg   r3   r3   r1   r4   r  <  s$    r  zC
    Output type of [`DeiTForImageClassificationWithTeacher`].
    c                   @   st   e Zd ZU dZdZejdB ed< dZejdB ed< dZ	ejdB ed< dZ
eej dB ed< dZeej dB ed< dS )+DeiTForImageClassificationWithTeacherOutputaj  
    logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Prediction scores as the average of the cls_logits and distillation logits.
    cls_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Prediction scores of the classification head (i.e. the linear layer on top of the final hidden state of the
        class token).
    distillation_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Prediction scores of the distillation head (i.e. the linear layer on top of the final hidden state of the
        distillation token).
    Nr  
cls_logitsdistillation_logitsr   r   )r_   r`   ra   rb   r  r"   FloatTensorr   r   r!  r   r   r   r3   r3   r3   r4   r    s   
 r  a  
    DeiT Model transformer with image classification heads on top (a linear layer on top of the final hidden state of
    the [CLS] token and a linear layer on top of the final hidden state of the distillation token) e.g. for ImageNet.

    .. warning::

           This model supports inference-only. Fine-tuning with distillation (i.e. with a teacher) is not yet
           supported.
    c                       sX   e Zd Zdeddf fddZee		ddejdB de	d	e
e defd
dZ  ZS )%DeiTForImageClassificationWithTeacherr   r   Nc                    sv   t  | |j| _t|dd| _|jdkrt|j|jnt | _	|jdkr0t|j|jnt | _
|   d S r  )r   r    r  r   r   r   r   r$   r  cls_classifierdistillation_classifierr   r   r1   r3   r4   r      s     z.DeiTForImageClassificationWithTeacher.__init__FrP   rO   r}   c           	      K   sv   | j |fd|i|}|j}| |d d dd d f }| |d d dd d f }|| d }t||||j|jdS )NrO   r   r   r   )r  r   r!  r   r   )r   r   r$  r%  r  r   r   )	r0   rP   rO   r}   r  r   r   r!  r  r3   r3   r4   r]     s$   z-DeiTForImageClassificationWithTeacher.forwardr^   )r_   r`   ra   r   r    r   r   r"   rd   rc   r   r   r  r]   rg   r3   r3   r1   r4   r#    s    r#  )r  r#  r  r   r   )Nrv   )>rb   collections.abcrm   r   dataclassesr   r"   r    r   r   activationsr   modeling_layersr   modeling_outputsr	   r
   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.genericr   r   utils.output_capturingr   configuration_deitr   
get_loggerr_   loggerModuler   r(   rd   floatr   r   r   r   r   r   r   r   r   r   r   r  r  r  r#  __all__r3   r3   r3   r4   <module>   s   
Y'
3">`O1