o
    ei\                     @   sl  d Z ddlZddlZddlmZ ddlZddlmZ ddlm	Z	 ddl
mZ ddlmZ dd	lmZ dd
lmZmZmZmZmZmZ ddlmZ ddlmZmZ ddlmZ ee Z!eeddG dd deZ"G dd dej#Z$G dd dej#Z%G dd dej#Z&G dd dej#Z'G dd dej#Z(G dd dej#Z)G dd  d ej#Z*G d!d" d"ej#Z+G d#d$ d$eZ,G d%d& d&ej#Z-eG d'd( d(eZ.eG d)d* d*e.Z/G d+d, d,ej#Z0ed-dG d.d/ d/e.Z1G d0d1 d1ej#Z2G d2d3 d3ej#Z3ed4dG d5d6 d6e.Z4ed7dG d8d9 d9e.Z5ed:dG d;d< d<e.Z6eG d=d> d>e.Z7g d?Z8dS )@zPyTorch ViLT model.    N)	dataclass)nn)CrossEntropyLoss   )initialization)ACT2FN)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPoolingMaskedLMOutputModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)auto_docstringlogging   )
ViltConfigzF
    Class for outputs of [`ViltForImagesAndTextClassification`].
    )custom_introc                   @   sj   e Zd ZU dZdZejdB ed< dZejdB ed< dZ	e
eej  dB ed< dZe
eej  dB ed< dS )(ViltForImagesAndTextClassificationOutputa7  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Classification (or regression if config.num_labels==1) loss.
    logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Classification (or regression if config.num_labels==1) scores (before SoftMax).
    hidden_states (`list[tuple(torch.FloatTensor)]`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        List of tuples of `torch.FloatTensor` (one for each image-text pair, each tuple containing the output of
        the embeddings + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
        Hidden-states of the model at the output of each layer plus the initial embedding outputs.
    Nlosslogitshidden_states
attentions)__name__
__module____qualname____doc__r   torchFloatTensor__annotations__r   r   listtupler    r#   r#   d/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/vilt/modeling_vilt.pyr   +   s   
 r   c                       s6   e Zd ZdZ fddZd
ddZ	ddd	Z  ZS )ViltEmbeddingsz
    Construct the text and patch embeddings.

    Text embeddings are equivalent to BERT embeddings.

    Patch embeddings are equivalent to ViT embeddings.
    c                    s   t    t|| _ttdd|j| _	t
|| _| jj}ttd|d |j| _t|j|j| _t|j| _|| _d S Nr   )super__init__TextEmbeddingstext_embeddingsr   	Parameterr   zeroshidden_size	cls_tokenViltPatchEmbeddingspatch_embeddingsnum_patchesposition_embeddings	Embeddingmodality_type_vocab_sizetoken_type_embeddingsDropouthidden_dropout_probdropoutconfig)selfr9   r1   	__class__r#   r$   r(   L   s   



zViltEmbeddings.__init__   c                    s  | j jjj\}}}}|  |}|d d d d d d d f  }tjj||jd |jd fd }|d d df j	ddd d df }	|d d df j	ddd d df }
|j\}} | j
j| j
j }| jd d dd d d f ddd|||tj fddt|	|
D dd}|ddd}|ddd}tjtjt|jd	 t|jd
 ddd
dj|jd}|d d d d d d d d f }||jd |jd d
d
d
}|dd}|d}dk sd u stts|	|
 }| n|	|
 }t| |jddd| jddd d df  }fdd|D }fdd|D }dd |D }dd |D }fdd|D }g }t t|||D ]B\}\}}}|dkrvt!t"| }|#|| |  qTtj!t"| |dd}|#tj|| || | gdd qTtj|dd}||d d df |d d df f |d
|}||d d df |d d df f |d
}||d d df |d d df f |d
d}||d d df |d d df f |d
|}| j$|d
d
}tj||fdd}tj| jd d dd d f d d d d d f |d
d
|fdd}|| }| %|}tjt"|jd d||gdd}||| fffS )N   r   sizer   r   dimc              
      sB   g | ]\}}t jt jj||fd ddd| d | fqS )bilinearT)r@   modealign_cornersr   )r   
functionalpadinterpolate).0hw)heightspatial_poswidthr#   r$   
<listcomp>h   s    
z/ViltEmbeddings.visual_embed.<locals>.<listcomp>ij)indexingdeviceF)as_tuplec                    $   g | ]}  d d df |k qS Nr   r#   rI   u)	valid_idxr#   r$   rO         $ c                    rW   rX   r#   rY   )non_valid_idxr#   r$   rO      r\   c                 S      g | ]}| d qS r   r?   rI   vr#   r#   r$   rO          c                 S   r^   r_   r?   r`   r#   r#   r$   rO      rb   c                    s   g | ]} | qS r#   r#   r`   max_image_lengthr#   r$   rO          T)replacement)&r0   
projectionweightshapefloatr   rF   rH   longsumr9   
image_size
patch_sizer2   	transposeviewr   catzipflattenstackmeshgridarangetorU   expand
isinstanceintmaxminnonzerounique	enumeratemultinomialonesappendr.   r8   )r:   pixel_values
pixel_maskrd   _phpwxx_maskx_hx_w
batch_sizenum_channels	patch_dim	pos_embedpatch_indexeffective_resolutionunique_rowsvalid_row_idxnon_valid_row_idx
valid_numsnon_valid_numspad_numsselectira   nvpvalid_choice
pad_choice
cls_tokensr#   )rL   rd   r]   rM   r[   rN   r$   visual_embed[   sx   
 $$$0
(


(.,..8
&zViltEmbeddings.visual_embedr   c	              	   C   s   | j |||d}	|d u r| j||| jjd\}}
}n|d}
|d u r%d}|	| tj|tj|	j	d }	|| tj
|
|tj|	j	d }tj|	|gdd}tj||
gdd}||fS )N)	input_idstoken_type_idsinputs_embedsrc   r   dtyperU   rA   )r*   r   r9   rd   rs   r5   r   
zeros_likerk   rU   	full_likerq   )r:   r   attention_maskr   r   r   r   image_embedsimage_token_type_idxtext_embedsimage_masksr   
embeddingsmasksr#   r#   r$   forward   s&   

zViltEmbeddings.forward)r=   )r   )r   r   r   r   r(   r   r   __classcell__r#   r#   r;   r$   r%   C   s    
ar%   c                       s*   e Zd ZdZ fddZdddZ  ZS )r)   zGConstruct the embeddings from word, position and token_type embeddings.c                    s   t    tj|j|j|jd| _t|j|j| _	t|j
|j| _tj|j|jd| _t|j| _| jdt|jddd | jdtj| j tjddd d S )	N)padding_idxepsposition_idsr   rQ   F)
persistentr   r   )r'   r(   r   r3   
vocab_sizer-   pad_token_idword_embeddingsmax_position_embeddingsr2   type_vocab_sizer5   	LayerNormlayer_norm_epsr6   r7   r8   register_bufferr   rv   rx   r,   r   r@   rk   r:   r9   r;   r#   r$   r(      s   

zTextEmbeddings.__init__Nc                 C   s   |d ur	|  }n|  d d }|d }|d u r$| jd d d |f }|d u rNt| drC| jd d d |f }||d |}|}ntj|tj| jjd}|d u rW| 	|}| 
|}	||	 }
| |}|
|7 }
| |
}
| |
}
|
S )NrQ   r   r   r   r   )r@   r   hasattrr   rx   r   r,   rk   rU   r   r5   r2   r   r8   )r:   r   r   r   r   input_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr5   r   r2   r#   r#   r$   r      s*   






zTextEmbeddings.forward)NNNNr   r   r   r   r(   r   r   r#   r#   r;   r$   r)      s    r)   c                       s(   e Zd ZdZ fddZdd Z  ZS )r/   z#
    Image to Patch Embedding.
    c                    s   t    |j|j}}|j|j}}t|tjj	r|n||f}t|tjj	r)|n||f}|d |d  |d |d   }|| _|| _|| _|| _
tj||||d| _d S )Nr   r   )kernel_sizestride)r'   r(   rm   rn   r   r-   ry   collectionsabcIterabler1   r   Conv2drg   )r:   r9   rm   rn   r   r-   r1   r;   r#   r$   r(     s   
 zViltPatchEmbeddings.__init__c                 C   s@   |j \}}}}|| jkrtd| jjj}| |j|d}|S )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r   )ri   r   
ValueErrorrg   rh   r   rw   )r:   r   r   r   rL   rN   target_dtyper   r#   r#   r$   r   '  s   

zViltPatchEmbeddings.forwardr   r#   r#   r;   r$   r/     s    r/   c                       &   e Zd Z fddZdddZ  ZS )ViltSelfAttentionc                    s   t    |j|j dkrt|dstd|j d|j d|j| _t|j|j | _| j| j | _t	j
|j| j|jd| _t	j
|j| j|jd| _t	j
|j| j|jd| _t	|j| _d S )Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads .)bias)r'   r(   r-   num_attention_headsr   r   rz   attention_head_sizeall_head_sizer   Linearqkv_biasquerykeyvaluer6   attention_probs_dropout_probr8   r   r;   r#   r$   r(   3  s   

zViltSelfAttention.__init__NFc                 C   s  |j \}}}| ||d| j| jdd}| ||d| j| jdd}| ||d| j| jdd}	t	||dd}
|
t
| j }
|d urS|
| }
tjdd|
}| |}t	||	}|dddd }| d d | jf }|j| }|r||f}|S |f}|S )NrQ   r   r>   rP   rA   r   r   )ri   r   rp   r   r   ro   r   r   r   matmulmathsqrtr   Softmaxr8   permute
contiguousr@   r   )r:   r   r   output_attentionsr   r   r   query_layer	key_layervalue_layerattention_scoresattention_probscontext_layernew_context_layer_shapeoutputsr#   r#   r$   r   E  s6   

zViltSelfAttention.forwardNFr   r   r   r(   r   r   r#   r#   r;   r$   r   2  s    r   c                       sB   e Zd ZdZdef fddZdejdejdejfdd	Z  Z	S )
ViltSelfOutputz
    The residual connection is defined in ViltLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    r9   c                    s.   t    t|j|j| _t|j| _d S N)	r'   r(   r   r   r-   denser6   r7   r8   r   r;   r#   r$   r(   w     
zViltSelfOutput.__init__r   input_tensorreturnc                 C      |  |}| |}|S r   r   r8   r:   r   r   r#   r#   r$   r   |     

zViltSelfOutput.forward)
r   r   r   r   r   r(   r   Tensorr   r   r#   r#   r;   r$   r   q  s    $r   c                       r   )ViltAttentionc                    s"   t    t|| _t|| _d S r   )r'   r(   r   	attentionr   outputr   r;   r#   r$   r(     s   

zViltAttention.__init__NFc                 C   s4   |  |||}| |d |}|f|dd   }|S )Nr   r   )r   r   )r:   r   r   r   self_outputsattention_outputr   r#   r#   r$   r     s   zViltAttention.forwardr   r   r#   r#   r;   r$   r     s    r   c                       s8   e Zd Zdef fddZdejdejfddZ  ZS )ViltIntermediater9   c                    sD   t    t|j|j| _t|jt	rt
|j | _d S |j| _d S r   )r'   r(   r   r   r-   intermediate_sizer   ry   
hidden_actstrr   intermediate_act_fnr   r;   r#   r$   r(     s
   
zViltIntermediate.__init__r   r   c                 C   r   r   )r   r  r:   r   r#   r#   r$   r     r   zViltIntermediate.forward	r   r   r   r   r(   r   r   r   r   r#   r#   r;   r$   r     s    r   c                       s>   e Zd Zdef fddZdejdejdejfddZ  ZS )	
ViltOutputr9   c                    s.   t    t|j|j| _t|j| _	d S r   )
r'   r(   r   r   r   r-   r   r6   r7   r8   r   r;   r#   r$   r(     r   zViltOutput.__init__r   r   r   c                 C   s    |  |}| |}|| }|S r   r   r   r#   r#   r$   r     s   

zViltOutput.forwardr  r#   r#   r;   r$   r    s    $r  c                       s*   e Zd ZdZ fddZdddZ  ZS )		ViltLayerz?This corresponds to the Block class in the timm implementation.c                    sb   t    |j| _d| _t|| _t|| _t|| _	t
j|j|jd| _t
j|j|jd| _d S )Nr   r   )r'   r(   chunk_size_feed_forwardseq_len_dimr   r   r   intermediater  r   r   r   r-   r   layernorm_beforelayernorm_afterr   r;   r#   r$   r(     s   



zViltLayer.__init__NFc                 C   sh   | j | |||d}|d }|dd  }|||j }| |}| |}| ||}|f| }|S )N)r   r   r   )r   r	  rw   rU   r
  r  r   )r:   r   r   r   self_attention_outputsr   r   layer_outputr#   r#   r$   r     s   


zViltLayer.forwardr   r   r#   r#   r;   r$   r    s    
r  c                       s.   e Zd Z fddZ				dddZ  ZS )	ViltEncoderc                    s:   t     | _t fddt jD | _d| _d S )Nc                    s   g | ]}t  qS r#   )r  )rI   r   r9   r#   r$   rO     re   z(ViltEncoder.__init__.<locals>.<listcomp>F)	r'   r(   r9   r   
ModuleListrangenum_hidden_layerslayergradient_checkpointingr   r;   r  r$   r(     s   
 
zViltEncoder.__init__NFTc                 C   s   |rdnd }|r
dnd }t | jD ]\}}	|r||f }|	|||}
|
d }|r/||
d f }q|r7||f }|sEtdd |||fD S t|||dS )Nr#   r   r   c                 s   s    | ]	}|d ur|V  qd S r   r#   r`   r#   r#   r$   	<genexpr>  s    z&ViltEncoder.forward.<locals>.<genexpr>)last_hidden_stater   r   )r   r  r"   r	   )r:   r   r   r   output_hidden_statesreturn_dictall_hidden_statesall_self_attentionsr   layer_modulelayer_outputsr#   r#   r$   r     s&   

zViltEncoder.forward)NFFTr   r#   r#   r;   r$   r    s    	r  c                       s:   e Zd ZU eed< dZdZdZddgZ fddZ	  Z
S )	ViltPreTrainedModelr9   vilt)imagetextTr%   r   c                    sL   t  | t|tr$t|jt|jj	d 
d t|j d S d S )NrQ   r   )r'   _init_weightsry   r)   initcopy_r   r   rv   ri   rx   zeros_r   )r:   moduler;   r#   r$   r     s
   
"z!ViltPreTrainedModel._init_weights)r   r   r   r   r    base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_modulesr   r   r#   r#   r;   r$   r    s   
 r  c                       s   e Zd Zd fdd	Zdd Zdd Ze											dd	ejdB d
ej	dB dejdB dej	dB dejdB dej	dB dej	dB de
dB dedB dedB dedB deeej	 B fddZ  ZS )	ViltModelTc                    sX   t  | || _t|| _t|| _tj|j	|j
d| _|r#t|nd| _|   dS )zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        r   N)r'   r(   r9   r%   r   r  encoderr   r   r-   r   	layernorm
ViltPoolerpooler	post_init)r:   r9   add_pooling_layerr;   r#   r$   r(     s   

zViltModel.__init__c                 C   s
   | j jjS r   r   r*   r   r:   r#   r#   r$   get_input_embeddings  s   
zViltModel.get_input_embeddingsc                 C   s   || j j_d S r   r0  )r:   r   r#   r#   r$   set_input_embeddings"  s   zViltModel.set_input_embeddingsNr   r   r   r   r   r   r   r   r   r  r  r   c              
   K   s  |	dur|	n| j j}	|
dur|
n| j j}
|dur|n| j j}|dur*|dur*td|dur9| || | }n|durF| dd }ntd|\}}|durU|jn|j}|du retj	||f|d}|durq|durqtd|du r}|du r}td|dur|j
d n|j
d }||krtd	|du rtj	|| j j| j jf|d}| j||||||||d
\}}| ||}| j|||	|
|d}|d }| |}| jdur| |nd}|s||f|dd  S t|||j|jdS )a  
        image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
            Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
        image_token_type_idx (`int`, *optional*):
            - The token type ids for images.

        Examples:

        ```python
        >>> from transformers import ViltProcessor, ViltModel
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> # prepare image and text
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))
        >>> text = "hello world"

        >>> processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-mlm")
        >>> model = ViltModel.from_pretrained("dandelin/vilt-b32-mlm")

        >>> inputs = processor(image, text, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> last_hidden_states = outputs.last_hidden_state
        ```NzDYou cannot specify both input_ids and inputs_embeds at the same timerQ   z5You have to specify either input_ids or inputs_embedsrT   zFYou cannot specify both pixel_values and image_embeds at the same timez7You have to specify either pixel_values or image_embedsr   zAThe text inputs and image inputs need to have the same batch size)r   )r   r   r  r  r   )r  pooler_outputr   r   )r9   r   r  use_return_dictr   %warn_if_padding_and_no_attention_maskr@   rU   r   r   ri   rm   r   get_extended_attention_maskr*  r+  r-  r
   r   r   )r:   r   r   r   r   r   r   r   r   r   r  r  kwargsr   text_batch_sizer   rU   image_batch_sizeembedding_outputextended_attention_maskencoder_outputssequence_outputpooled_outputr#   r#   r$   r   %  sl   ,


zViltModel.forward)TNNNNNNNNNNN)r   r   r   r(   r2  r3  r   r   
LongTensorr   rz   boolr
   r"   r   r   r#   r#   r;   r$   r)    sR    	
r)  c                       $   e Zd Z fddZdd Z  ZS )r,  c                    s*   t    t|j|j| _t | _d S r   )r'   r(   r   r   r-   r   Tanh
activationr   r;   r#   r$   r(     s   
zViltPooler.__init__c                 C   s(   |d d df }|  |}| |}|S rX   )r   rE  )r:   r   first_token_tensorr?  r#   r#   r$   r     s   

zViltPooler.forwardr   r#   r#   r;   r$   r,    s    r,  zU
    ViLT Model with a language modeling head on top as done during pretraining.
    c                       s   e Zd ZddiZ fddZdd Zdd Ze																						dd
ej	d	B dej
d	B dej	d	B dej
d	B dej	d	B dej
d	B dej
d	B dej	d	B ded	B ded	B ded	B deeej
 B fddZ  ZS )ViltForMaskedLMzmlm_score.decoder.weightz6vilt.embeddings.text_embeddings.word_embeddings.weightc                    s,   t  | t|| _t|| _|   d S r   )r'   r(   r)  r  ViltMLMHead	mlm_scorer.  r   r;   r#   r$   r(     s   

zViltForMaskedLM.__init__c                 C   s   | j jS r   )rI  decoderr1  r#   r#   r$   get_output_embeddings  s   z%ViltForMaskedLM.get_output_embeddingsc                 C   s   || j _|j| j _d S r   )rI  rJ  r   )r:   new_embeddingsr#   r#   r$   set_output_embeddings  s   z%ViltForMaskedLM.set_output_embeddingsNr   r   r   r   r   r   r   labelsr   r  r  r   c                 K   s  |dur|n| j j}| j||||||||	|
|d
}|dd \}}|dur*|jd n|jd }|ddd|f |dd|df }}| |}d}|durft }||j}||d| j j	|d}|s||f|dd  }|durz|f| S |S t
|||j|jdS )a  
        image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
            Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
        labels (*torch.LongTensor* of shape *(batch_size, sequence_length)*, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in *[-100, 0, ...,
            config.vocab_size]* (see *input_ids* docstring) Tokens with indices set to *-100* are ignored (masked), the
            loss is only computed for the tokens with labels in *[0, ..., config.vocab_size]*

        Examples:

        ```python
        >>> from transformers import ViltProcessor, ViltForMaskedLM
        >>> import httpx
        >>> from io import BytesIO
        >>> from PIL import Image
        >>> import re
        >>> import torch

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))
        >>> text = "a bunch of [MASK] laying on a [MASK]."

        >>> processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-mlm")
        >>> model = ViltForMaskedLM.from_pretrained("dandelin/vilt-b32-mlm")

        >>> # prepare inputs
        >>> encoding = processor(image, text, return_tensors="pt")

        >>> # forward pass
        >>> outputs = model(**encoding)

        >>> tl = len(re.findall("\[MASK\]", text))
        >>> inferred_token = [text]

        >>> # gradually fill in the MASK tokens, one by one
        >>> with torch.no_grad():
        ...     for i in range(tl):
        ...         encoded = processor.tokenizer(inferred_token)
        ...         input_ids = torch.tensor(encoded.input_ids)
        ...         encoded = encoded["input_ids"][0][1:-1]
        ...         outputs = model(input_ids=input_ids, pixel_values=encoding.pixel_values)
        ...         mlm_logits = outputs.logits[0]  # shape (seq_len, vocab_size)
        ...         # only take into account text features (minus CLS and SEP token)
        ...         mlm_logits = mlm_logits[1 : input_ids.shape[1] - 1, :]
        ...         mlm_values, mlm_ids = mlm_logits.softmax(dim=-1).max(dim=-1)
        ...         # only take into account text
        ...         mlm_values[torch.tensor(encoded) != 103] = 0
        ...         select = mlm_values.argmax().item()
        ...         encoded[select] = mlm_ids[select].item()
        ...         inferred_token = [processor.decode(encoded)]

        >>> selected_token = ""
        >>> encoded = processor.tokenizer(inferred_token)
        >>> output = processor.decode(encoded.input_ids[0], skip_special_tokens=True)
        >>> print(output)
        a bunch of cats laying on a couch.
        ```N	r   r   r   r   r   r   r   r  r  r>   r   rQ   r   r   r   r   )r9   r5  r  ri   rI  r   rw   rU   rp   r   r   r   r   )r:   r   r   r   r   r   r   r   rN  r   r  r  r8  r   r>  r?  text_seq_lentext_featuresr   
mlm_logitsmasked_lm_lossloss_fctr   r#   r#   r$   r     s>   K*
zViltForMaskedLM.forwardr@  )r   r   r   _tied_weights_keysr(   rK  rM  r   r   rA  r   rB  r   r"   r   r   r#   r#   r;   r$   rG    sV    		
rG  c                       rC  )ViltPredictionHeadTransformc                    sV   t    t|j|j| _t|jtrt	|j | _
n|j| _
tj|j|jd| _d S )Nr   )r'   r(   r   r   r-   r   ry   r   r   r   transform_act_fnr   r   r   r;   r#   r$   r(   5  s   
z$ViltPredictionHeadTransform.__init__c                 C   s"   |  |}| |}| |}|S r   )r   rX  r   r  r#   r#   r$   r   >  s   


z#ViltPredictionHeadTransform.forwardr   r#   r#   r;   r$   rW  4  s    	rW  c                       rC  )rH  c                    s0   t    || _t|| _t|j|j| _	d S r   )
r'   r(   r9   rW  	transformr   r   r-   r   rJ  r   r;   r#   r$   r(   F  s   

zViltMLMHead.__init__c                 C   r   r   )rY  rJ  )r:   r   r#   r#   r$   r   L  r   zViltMLMHead.forwardr   r#   r#   r;   r$   rH  E  s    rH  z
    Vilt Model transformer with a classifier head on top (a linear layer on top of the final hidden state of the [CLS]
    token) for visual question answering, e.g. for VQAv2.
    c                          e Zd Z fddZe											ddejdB dejdB dejdB dejdB dejdB d	ejdB d
ejdB dejdB dedB dedB dedB de	e
ej B fddZ  ZS )ViltForQuestionAnsweringc              	      sl   t  | |j| _t|| _tt|j|jd t	|jd t
 t|jd |j| _|   d S )Nr>   )r'   r(   
num_labelsr)  r  r   
Sequentialr   r-   r   GELU
classifierr.  r   r;   r#   r$   r(   Y  s   
z!ViltForQuestionAnswering.__init__Nr   r   r   r   r   r   r   rN  r   r  r  r   c                 K   s   |dur|n| j j}| j||||||||	|
|d
}|r|jn|d }| |}d}|dur?||j}tj	|||j
d  }|sU|f|dd  }|durS|f| S |S t|||j|jdS )a  
        image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
            Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
        labels (`torch.FloatTensor` of shape `(batch_size, num_labels)`, *optional*):
            Labels for computing the visual question answering loss. This tensor must be either a one-hot encoding of
            all answers that are applicable for a given example in the batch, or a soft encoding indicating which
            answers are applicable, where 1.0 is the highest score.

        Examples:

        ```python
        >>> from transformers import ViltProcessor, ViltForQuestionAnswering
        >>> import httpx
        >>> from io import BytesIO
        >>> from PIL import Image

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))
        >>> text = "How many cats are there?"

        >>> processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
        >>> model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")

        >>> # prepare inputs
        >>> encoding = processor(image, text, return_tensors="pt")

        >>> # forward pass
        >>> outputs = model(**encoding)
        >>> logits = outputs.logits
        >>> idx = logits.argmax(-1).item()
        >>> print("Predicted answer:", model.config.id2label[idx])
        Predicted answer: 2
        ```NrO  r   r>   rP  )r9   r5  r  r4  r_  rw   rU   r   rF    binary_cross_entropy_with_logitsri   r   r   r   )r:   r   r   r   r   r   r   r   rN  r   r  r  r8  r   r4  r   r   r   r#   r#   r$   r   j  s8   3
z ViltForQuestionAnswering.forwardr@  r   r   r   r(   r   r   rA  r   rB  r   r"   r   r   r#   r#   r;   r$   r[  R  sN    	
r[  z
    Vilt Model transformer with a classifier head on top (a linear layer on top of the final hidden state of the [CLS]
    token) for image-to-text or text-to-image retrieval, e.g. MSCOCO and F30K.
    c                       rZ  )ViltForImageAndTextRetrievalc                    s2   t  | t|| _t|jd| _|   d S r&   )	r'   r(   r)  r  r   r   r-   rank_outputr.  r   r;   r#   r$   r(     s   
z%ViltForImageAndTextRetrieval.__init__Nr   r   r   r   r   r   r   rN  r   r  r  r   c                 K   s   |dur|n| j j}d}|durtd| j||||||||	|
|d
}|r(|jn|d }| |}|sG|f|dd  }|durE|f| S |S t|||j|jdS )a  
        image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
            Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels are currently not supported.

        Examples:

        ```python
        >>> from transformers import ViltProcessor, ViltForImageAndTextRetrieval
        >>> import httpx
        >>> from io import BytesIO
        >>> from PIL import Image

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))
        >>> texts = ["An image of two cats chilling on a couch", "A football player scoring a goal"]

        >>> processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-coco")
        >>> model = ViltForImageAndTextRetrieval.from_pretrained("dandelin/vilt-b32-finetuned-coco")

        >>> # forward pass
        >>> scores = dict()
        >>> for text in texts:
        ...     # prepare inputs
        ...     encoding = processor(image, text, return_tensors="pt")
        ...     outputs = model(**encoding)
        ...     scores[text] = outputs.logits[0, :].item()
        ```NzTraining is not yet supported.rO  r   r>   rP  )	r9   r5  NotImplementedErrorr  r4  rc  r   r   r   )r:   r   r   r   r   r   r   r   rN  r   r  r  r8  r   r   r4  r   r   r#   r#   r$   r     s6   /
z$ViltForImageAndTextRetrieval.forwardr@  ra  r#   r#   r;   r$   rb    sN    	
rb  zq
    Vilt Model transformer with a classifier head on top for natural language visual reasoning, e.g. NLVR2.
    c                       rZ  )"ViltForImagesAndTextClassificationc              	      sv   t  | |j| _t|| _|j}tt|j	| |j	| t
|j	| t t|j	| |j| _|   d S r   )r'   r(   r\  r)  r  
num_imagesr   r]  r   r-   r   r^  r_  r.  )r:   r9   rf  r;   r#   r$   r(   -  s   
z+ViltForImagesAndTextClassification.__init__Nr   r   r   r   r   r   r   rN  r   r  r  r   c                 K   sB  |	dur|	n| j j}	|
dur|
n| j j}
|dur|n| j j}|dur,|jdkr,|d}|dur:|jdkr:|d}|durC|jd nd}|du rT|durR|jd nd}|| j jkr^tdg }|
rdg nd}|	rjg nd}t	|D ]k}| j
||||dur|dd|ddddddf nd|dur|dd|ddddf nd||dur|dd|ddddf nd|d |	|
|d}|r|jn|d }|| |
r||j |	r||j qptj|dd}| |}d}|durt }||j}||d| j|d}|s|||f}|dur|f| S |S t||||d	S )
a  
        image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
            Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Binary classification labels.

        Examples:

        ```python
        >>> from transformers import ViltProcessor, ViltForImagesAndTextClassification
        >>> import httpx
        >>> from io import BytesIO
        >>> from PIL import Image

        >>> url_1 = "https://lil.nlp.cornell.edu/nlvr/exs/ex0_0.jpg"
        >>> with httpx.stream("GET", url_1) as response:
        ...     image_1 = Image.open(BytesIO(response.read()))

        >>> url_2 = "https://lil.nlp.cornell.edu/nlvr/exs/ex0_1.jpg"
        >>> with httpx.stream("GET", url_2) as response:
        ...     image_2 = Image.open(BytesIO(response.read()))

        >>> text = "The left image contains twice the number of dogs as the right image."

        >>> processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-nlvr2")
        >>> model = ViltForImagesAndTextClassification.from_pretrained("dandelin/vilt-b32-finetuned-nlvr2")

        >>> # prepare inputs
        >>> encoding = processor([image_1, image_2], text, return_tensors="pt")

        >>> # forward pass
        >>> outputs = model(input_ids=encoding.input_ids, pixel_values=encoding.pixel_values.unsqueeze(0))
        >>> logits = outputs.logits
        >>> idx = logits.argmax(-1).item()
        >>> print("Predicted answer:", model.config.id2label[idx])
        Predicted answer: True
        ```N   r   r   z\Make sure to match the number of images in the model with the number of images in the input.)
r   r   r   r   r   r   r   r   r  r  rQ   rA   rP  )r9   r   r  r5  ndim	unsqueezeri   rf  r   r  r  r4  r   r   r   r   rq   r_  r   rw   rU   rp   r\  r   )r:   r   r   r   r   r   r   r   rN  r   r  r  r8  rf  pooler_outputsr   r   r   r   r4  r?  r   r   rU  r   r#   r#   r$   r   ?  sn   6

,&&



z*ViltForImagesAndTextClassification.forwardr@  )r   r   r   r(   r   r   rA  r   rB  r   r"   r   r   r#   r#   r;   r$   re  '  sN    	
re  c                       rZ  )ViltForTokenClassificationc                    sN   t  | |j| _t|dd| _t|j| _t	|j
|j| _|   d S )NF)r/  )r'   r(   r\  r)  r  r   r6   r7   r8   r   r-   r_  r.  r   r;   r#   r$   r(     s   z#ViltForTokenClassification.__init__Nr   r   r   r   r   r   r   rN  r   r  r  r   c                 K   s   |dur|n| j j}| j||||||||	|
|d
}|d }|dur&|jd n|jd }| |}| |ddd|f }d}|durYt }||j}||	d| j
|	d}|so|f|dd  }|durm|f| S |S t|||j|jdS )a/  
        image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
            Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
        labels (`torch.LongTensor` of shape `(batch_size, text_sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        NrO  r   r   rQ   r>   rP  )r9   r5  r  ri   r8   r_  r   rw   rU   rp   r\  r   r   r   )r:   r   r   r   r   r   r   r   rN  r   r  r  r8  r   r>  text_input_sizer   r   rU  r   r#   r#   r$   r     s>   
z"ViltForTokenClassification.forwardr@  )r   r   r   r(   r   r   rA  r   rB  r   r"   r   r   r#   r#   r;   r$   rk    sN    	
rk  )rb  re  rk  rG  r[  r  r)  r  )9r   collections.abcr   r   dataclassesr   r   r   torch.nnr    r   r!  activationsr   modeling_layersr   modeling_outputsr	   r
   r   r   r   r   modeling_utilsr   utilsr   r   configuration_viltr   
get_loggerr   loggerr   Moduler%   r)   r/   r   r   r   r   r  r  r  r  r)  r,  rG  rW  rH  r[  rb  re  rk  __all__r#   r#   r#   r$   <module>   sx    
 6?%)  
k^ N