o
    eie                     @   s6  d Z ddlZddlZddlmZ ddlZddlmZ ddlmZ	 ddl
mZ ddlmZ dd	lmZmZmZmZ dd
lmZmZ ddlmZ ddlmZmZmZmZ ddlmZmZ ddl m!Z! ddl"m#Z# e$e%Z&G dd dej'Z(G dd dej'Z)		d<dej'dej*dej*dej*dej*dB de+dB de+dee fddZ,G d d! d!ej'Z-G d"d# d#ej'Z.G d$d% d%ej'Z/G d&d' d'ej'Z0G d(d) d)ej'Z1G d*d+ d+eZ2G d,d- d-ej'Z3eG d.d/ d/eZ4eG d0d1 d1e4Z5G d2d3 d3ej'Z6ed4d5G d6d7 d7e4Z7ed8d5G d9d: d:e4Z8g d;Z9dS )=zPyTorch ViT model.    N)Callable)nn   )initialization)ACT2FN)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPoolingImageClassifierOutputMaskedImageModelingOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringlogging	torch_int)can_return_tuplemerge_with_config_defaults)capture_outputs   )	ViTConfigc                	       st   e Zd ZdZddedef fddZdejde	d	e	d
ejfddZ
		ddejdejdB ded
ejfddZ  ZS )ViTEmbeddingszb
    Construct the CLS token, position and patch embeddings. Optionally, also the mask token.
    Fconfiguse_mask_tokenc                    s   t    ttdd|j| _|rttdd|jnd | _	t
|| _| jj}ttd|d |j| _t|j| _|j| _|| _d S )Nr   )super__init__r   	Parametertorchrandnhidden_size	cls_tokenzeros
mask_tokenViTPatchEmbeddingspatch_embeddingsnum_patchesposition_embeddingsDropouthidden_dropout_probdropout
patch_sizer   )selfr   r   r&   	__class__ b/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/vit/modeling_vit.pyr   0   s   
 

zViTEmbeddings.__init__
embeddingsheightwidthreturnc                 C   s   |j d d }| jj d d }tj s||kr||kr| jS | jddddf }| jddddf }|j d }|| j }	|| j }
t|d }|d|||}|dddd}t	j
j||	|
fdd	d
}|dddddd|}tj||fddS )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   N      ?r   r      bicubicF)sizemodealign_cornersdim)shaper'   r   jit
is_tracingr+   r   reshapepermuter   
functionalinterpolateviewcat)r,   r1   r2   r3   r&   num_positionsclass_pos_embedpatch_pos_embedr=   
new_height	new_widthsqrt_num_positionsr/   r/   r0   interpolate_pos_encoding<   s(   



z&ViTEmbeddings.interpolate_pos_encodingNpixel_valuesbool_masked_posrM   c                 C   s   |j \}}}}| j||d}|d ur1|j d }	| j||	d}
|d|
}|d|  |
|  }| j|dd}tj||fdd}|rN|| 	||| }n|| j
 }| |}|S )N)rM   r   r5   g      ?r<   )r>   r%   r#   expand	unsqueezetype_asr!   r   rF   rM   r'   r*   )r,   rN   rO   rM   
batch_sizenum_channelsr2   r3   r1   
seq_lengthmask_tokensmask
cls_tokensr/   r/   r0   forwardd   s   


zViTEmbeddings.forwardF)NF)__name__
__module____qualname____doc__r   boolr   r   TensorintrM   
BoolTensorrY   __classcell__r/   r/   r-   r0   r   +   s    +r   c                       sB   e Zd ZdZdef fddZddejdedejfd	d
Z	  Z
S )r$   z
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    r   c                    s   t    |j|j}}|j|j}}t|tjj	r|n||f}t|tjj	r)|n||f}|d |d  |d |d   }|| _|| _|| _|| _
tj||||d| _d S )Nr   r   )kernel_sizestride)r   r   
image_sizer+   rT   r    
isinstancecollectionsabcIterabler&   r   Conv2d
projection)r,   r   rf   r+   rT   r    r&   r-   r/   r0   r      s   
 zViTPatchEmbeddings.__init__FrN   rM   r4   c              
   C   s   |j \}}}}|| jkrtd| j d| d|s?|| jd ks(|| jd kr?td| d| d| jd  d| jd  d		| |d
dd
}|S )NzoMake sure that the channel dimension of the pixel values match with the one set in the configuration. Expected z	 but got .r   r   zInput image size (*z) doesn't match model (z).r7   )r>   rT   
ValueErrorrf   rl   flatten	transpose)r,   rN   rM   rS   rT   r2   r3   r1   r/   r/   r0   rY      s(   
zViTPatchEmbeddings.forwardrZ   )r[   r\   r]   r^   r   r   r   r`   r_   rY   rc   r/   r/   r-   r0   r$      s    $r$           modulequerykeyvalueattention_maskscalingr*   kwargsc           
      K   s   |d u r| dd }t||dd| }|d ur|| }tjj|dd}tjj||| jd}t||}	|	dd	 }	|	|fS )Nr5         r7   r   r<   )ptrainingr   )
r9   r   matmulrq   r   rC   softmaxr*   r|   
contiguous)
rs   rt   ru   rv   rw   rx   r*   ry   attn_weightsattn_outputr/   r/   r0   eager_attention_forward   s   
r   c                       sB   e Zd Zdef fddZdejdeejejf fddZ  Z	S )ViTSelfAttentionr   c                    s   t    |j|j dkrt|dstd|j d|j d|| _|j| _t|j|j | _| j| j | _	|j
| _| jd | _d| _tj|j| j	|jd| _tj|j| j	|jd| _tj|j| j	|jd| _d S )	Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads rm   rz   F)bias)r   r   r    num_attention_headshasattrro   r   ra   attention_head_sizeall_head_sizeattention_probs_dropout_probdropout_probrx   	is_causalr   Linearqkv_biasrt   ru   rv   r,   r   r-   r/   r0   r      s"   

zViTSelfAttention.__init__hidden_statesr4   c              
   C   s   |j d }|d| j| jf}| |j| dd}| |j| dd}| |j| dd}t	| j
jt}|| |||d | j| j| jsHdn| jd\}}	| d d | jf }
||
}||	fS )Nr   r5   r   r7   rr   )r   rx   r*   )r>   r   r   ru   rE   rq   rv   rt   r   get_interfacer   _attn_implementationr   r   rx   r|   r   r9   r   rA   )r,   r   rS   	new_shape	key_layervalue_layerquery_layerattention_interfacecontext_layerattention_probsnew_context_layer_shaper/   r/   r0   rY      s*   


zViTSelfAttention.forward)
r[   r\   r]   r   r   r   r`   tuplerY   rc   r/   r/   r-   r0   r      s    (r   c                       sB   e Zd ZdZdef fddZdejdejdejfdd	Z  Z	S )
ViTSelfOutputz
    The residual connection is defined in ViTLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    r   c                    s.   t    t|j|j| _t|j| _d S N)	r   r   r   r   r    denser(   r)   r*   r   r-   r/   r0   r         
zViTSelfOutput.__init__r   input_tensorr4   c                 C      |  |}| |}|S r   r   r*   r,   r   r   r/   r/   r0   rY        

zViTSelfOutput.forward
r[   r\   r]   r^   r   r   r   r`   rY   rc   r/   r/   r-   r0   r      s    $r   c                       8   e Zd Zdef fddZdejdejfddZ  ZS )ViTAttentionr   c                    s"   t    t|| _t|| _d S r   )r   r   r   	attentionr   outputr   r-   r/   r0   r     s   

zViTAttention.__init__r   r4   c                 C   s   |  |\}}| ||}|S r   )r   r   )r,   r   self_attn_output_r   r/   r/   r0   rY     s   zViTAttention.forward	r[   r\   r]   r   r   r   r`   rY   rc   r/   r/   r-   r0   r   
      r   c                       r   )ViTIntermediater   c                    sD   t    t|j|j| _t|jt	rt
|j | _d S |j| _d S r   )r   r   r   r   r    intermediate_sizer   rg   
hidden_actstrr   intermediate_act_fnr   r-   r/   r0   r     s
   
zViTIntermediate.__init__r   r4   c                 C   r   r   )r   r   )r,   r   r/   r/   r0   rY     r   zViTIntermediate.forwardr   r/   r/   r-   r0   r     s    r   c                       s>   e Zd Zdef fddZdejdejdejfddZ  ZS )		ViTOutputr   c                    s.   t    t|j|j| _t|j| _	d S r   )
r   r   r   r   r   r    r   r(   r)   r*   r   r-   r/   r0   r   &  r   zViTOutput.__init__r   r   r4   c                 C   s    |  |}| |}|| }|S r   r   r   r/   r/   r0   rY   +  s   

zViTOutput.forwardr   r/   r/   r-   r0   r   %  s    $r   c                       s<   e Zd ZdZdef fddZdejdejfddZ  Z	S )	ViTLayerz?This corresponds to the Block class in the timm implementation.r   c                    sb   t    |j| _d| _t|| _t|| _t|| _	t
j|j|jd| _t
j|j|jd| _d S )Nr   eps)r   r   chunk_size_feed_forwardseq_len_dimr   r   r   intermediater   r   r   	LayerNormr    layer_norm_epslayernorm_beforelayernorm_afterr   r-   r/   r0   r   5  s   



zViTLayer.__init__r   r4   c                 C   s@   |  |}| |}|| }| |}| |}| ||}|S r   )r   r   r   r   r   )r,   r   hidden_states_normattention_outputlayer_outputr/   r/   r0   rY   ?  s   



zViTLayer.forwardr   r/   r/   r-   r0   r   2  s    
r   c                       s6   e Zd Zdef fddZdejdefddZ  Z	S )
ViTEncoderr   c                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r/   )r   ).0r   r   r/   r0   
<listcomp>T  s    z'ViTEncoder.__init__.<locals>.<listcomp>F)	r   r   r   r   
ModuleListrangenum_hidden_layerslayergradient_checkpointingr   r-   r   r0   r   Q  s   
 
zViTEncoder.__init__r   r4   c                 C   s&   t | jD ]\}}||}qt|dS )N)last_hidden_state)	enumerater   r   )r,   r   ilayer_moduler/   r/   r0   rY   W  s   

zViTEncoder.forward)
r[   r\   r]   r   r   r   r`   r   rY   rc   r/   r/   r-   r0   r   P  s    r   c                   @   sl   e Zd ZU eed< dZdZdZdZddgZ	dZ
dZdZdZeedZe d	ejejB ejB fd
dZdS )ViTPreTrainedModelr   vitrN   )imageTr   r   )r   
attentionsrs   c                 C   s   t |tjtjfr#tj|jd| jjd |j	dur!t
|j	 dS dS t |tjr7t
|j	 t|j dS t |tr_tj|jd| jjd tj|jd| jjd |jdurat
|j dS dS dS )zInitialize the weightsrr   )meanstdN)rg   r   r   rk   inittrunc_normal_weightr   initializer_ranger   zeros_r   ones_r   r'   r!   r#   )r,   rs   r/   r/   r0   _init_weightso  s   


z ViTPreTrainedModel._init_weightsN)r[   r\   r]   r   __annotations__base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backendr   r   _can_record_outputsr   no_gradr   r   rk   r   r   r/   r/   r/   r0   r   ^  s    
 "r   c                       s   e Zd Zddededef fddZdefd	d
Zee	dde
			ddejdB dejdB dedB dee def
ddZ  ZS )ViTModelTFr   add_pooling_layerr   c                    s\   t  | || _t||d| _t|| _tj|j	|j
d| _|r%t|nd| _|   dS )z
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        use_mask_token (`bool`, *optional*, defaults to `False`):
            Whether to use a mask token for masked image modeling.
        )r   r   N)r   r   r   r   r1   r   encoderr   r   r    r   	layernorm	ViTPoolerpooler	post_init)r,   r   r   r   r-   r/   r0   r     s   
zViTModel.__init__r4   c                 C   s   | j jS r   )r1   r%   )r,   r/   r/   r0   get_input_embeddings  s   zViTModel.get_input_embeddings)tie_last_hidden_statesNrN   rO   rM   ry   c           
      K   s   |du rt d| jjjjj}|j|kr||}| j|||d}| |}|j}| 	|}| j
dur8| 
|nd}	t||	dS )z
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        Nz You have to specify pixel_valuesrO   rM   )r   pooler_output)ro   r1   r%   rl   r   dtypetor   r   r   r   r	   )
r,   rN   rO   rM   ry   expected_dtypeembedding_outputencoder_outputssequence_outputpooled_outputr/   r/   r0   rY     s   



zViTModel.forward)TFNNN)r[   r\   r]   r   r_   r   r$   r   r   r   r   r   r`   rb   r   r   r	   rY   rc   r/   r/   r-   r0   r     s(    r   c                       r   )r   r   c                    s,   t    t|j|j| _t|j | _	d S r   )
r   r   r   r   r    pooler_output_sizer   r   
pooler_act
activationr   r-   r/   r0   r     s   
zViTPooler.__init__r   r4   c                 C   s(   |d d df }|  |}| |}|S )Nr   )r   r   )r,   r   first_token_tensorr   r/   r/   r0   rY     s   

zViTPooler.forwardr   r/   r/   r-   r0   r     r   r   ac  
    ViT Model with a decoder on top for masked image modeling, as proposed in [SimMIM](https://huggingface.co/papers/2111.09886).

    <Tip>

    Note that we provide a script to pre-train this model on custom data in our [examples
    directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining).

    </Tip>
    )custom_introc                       sd   e Zd Zdef fddZee			ddejdB dej	dB de
dB dee d	ef
d
dZ  ZS )ViTForMaskedImageModelingr   c                    sX   t  | t|ddd| _ttj|j|jd |j	 ddt
|j| _|   d S )NFT)r   r   r7   r   )in_channelsout_channelsrd   )r   r   r   r   r   
Sequentialrk   r    encoder_striderT   PixelShuffledecoderr   r   r-   r/   r0   r     s   

z"ViTForMaskedImageModeling.__init__NrN   rO   rM   ry   r4   c                 K   s@  |dur| j j| j jkrtd| j j d| j j d| j|f||d|}|j}|ddddf }|j\}}}	t|d  }
}|	dd	d
||	|
|}| |}d}|dur| j j| j j }|
d
||}|| j jd| j jd	d }tjj||dd}||  | d  | j j }t|||j|jdS )a|  
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, ViTForMaskedImageModeling
        >>> import torch
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
        >>> model = ViTForMaskedImageModeling.from_pretrained("google/vit-base-patch16-224-in21k")

        >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
        >>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
        >>> # create random boolean mask of shape (batch_size, num_patches)
        >>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()

        >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
        >>> loss, reconstructed_pixel_values = outputs.loss, outputs.reconstruction
        >>> list(reconstructed_pixel_values.shape)
        [1, 3, 224, 224]
        ```NzWhen `bool_masked_pos` is provided, `patch_size` must be equal to `encoder_stride` to ensure that the reconstructed image has the same dimensions as the input. Got `patch_size` = z and `encoder_stride` = rm   r   r   r6   r   r7   r5   none)	reductiongh㈵>)lossreconstructionr   r   )r   r+   r  ro   r   r   r>   mathfloorrB   rA   r  rf   repeat_interleaverQ   r   r   rC   l1_losssumrT   r   r   r   )r,   rN   rO   rM   ry   outputsr   rS   sequence_lengthrT   r2   r3   reconstructed_pixel_valuesmasked_im_lossr9   rW   reconstruction_lossr/   r/   r0   rY     sN   '
 z!ViTForMaskedImageModeling.forwardr   )r[   r\   r]   r   r   r   r   r   r`   rb   r_   r   r   r   rY   rc   r/   r/   r-   r0   r     s$    r   a  
    ViT Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
    the [CLS] token) e.g. for ImageNet.

    <Tip>

        Note that it's possible to fine-tune ViT on higher resolution images than the ones it has been trained on, by
        setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained
        position embeddings to the higher resolution.

    </Tip>
    c                       sd   e Zd Zdef fddZee			ddejdB dejdB de	dB de
e d	ef
d
dZ  ZS )ViTForImageClassificationr   c                    sR   t  | |j| _t|dd| _|jdkrt|j|jnt | _	| 
  d S )NF)r   r   )r   r   
num_labelsr   r   r   r   r    Identity
classifierr   r   r-   r/   r0   r   M  s
   $z"ViTForImageClassification.__init__NrN   labelsrM   ry   r4   c           
      K   st   | j |fd|i|}|j}|dddddf }| |}d}	|dur0| j||| jfi |}	t|	||j|jdS )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        rM   Nr   )r	  logitsr   r   )r   r   r  loss_functionr   r
   r   r   )
r,   rN   r  rM   ry   r  r   r   r  r	  r/   r/   r0   rY   Y  s&   
z!ViTForImageClassification.forwardr   )r[   r\   r]   r   r   r   r   r   r`   r_   r   r   r
   rY   rc   r/   r/   r-   r0   r  >  s$    r  )r  r   r   r   )Nrr   ):r^   collections.abcrh   r  r   r   r    r   r   activationsr   modeling_layersr   modeling_outputsr   r	   r
   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   r   utils.output_capturingr   configuration_vitr   
get_loggerr[   loggerModuler   r$   r`   floatr   r   r   r   r   r   r   r   r   r   r   r   r  __all__r/   r/   r/   r0   <module>   sv   
X.
2!;g3