o
    eil@                     @   s   d dl mZmZ ddlmZ ddlmZmZ ddlm	Z	m
Z
mZmZ ddlmZmZmZ ddlmZmZ e r;d dlZG d	d
 d
e
ddZdZdd edD dd edD  Zdd ZeG dd deZdgZdS )    )OptionalUnion   )BatchFeature)
ImageInputmake_flat_list_of_images)MultiModalDataProcessingKwargsProcessorMixinUnpack)
AddedTokenPreTokenizedInput	TextInput)auto_docstringis_torch_availableNc                   @   s&   e Zd ZddidddddidZd	S )
ColPaliProcessorKwargspaddinglongestchannels_firstT)data_formatdo_convert_rgbreturn_tensorspt)text_kwargsimages_kwargscommon_kwargsN)__name__
__module____qualname__	_defaults r    r    l/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/colpali/processing_colpali.pyr   #   s    
r   F)totalz<image>c                 C      g | ]	}d |ddqS )z<locz0>4>r    .0ir    r    r!   
<listcomp>1       r(   i   c                 C   r#   )z<segz0>3r$   r    r%   r    r    r!   r(   1   r)      c                 C   s   || |  | |  dS )aZ  
    Builds a string from the input prompt and image tokens.
    For example, for the call:
    build_string_from_input(
        prompt="Prefix str"
        bos_token="<s>",
        image_seq_len=3,
        image_token="<im>",
    )
    The output will be:
    "<im><im><im><s>Initial str"
    Args:
        prompt (`list[Union[str, ImageInput]]`): The input prompt.
        bos_token (`str`): The beginning of sentence token.
        image_seq_len (`int`): The length of the image sequence.
        image_token (`str`): The image token.
        num_images (`int`): Number of images in the prompt.
    
r    prompt	bos_tokenimage_seq_lenimage_token
num_imagesr    r    r!   build_string_from_input4   s   r2   c                       s,  e Zd Z					d$dedef fddZe		d%dedB d	eeB e	e B e	e B d
e
e defddZd&ddZedd ZedefddZ	d&dedB d
e
e defddZd	ee	e B d
e
e defddZ			d'dede	d f dede	d f deded d ed!ef ddfd"d#Z  ZS )(ColPaliProcessorNDescribe the image.
Question: visual_prompt_prefixquery_prefixc                    s   || _ || _t|dstd|j| _t|ds3ttddd}d|gi}|| |t| _	t| _
n|j	| _	|j
| _
|t d|_d|_t j|||d d	S )
a!  
        visual_prompt_prefix (`str`, *optional*, defaults to `"Describe the image."`):
            A string that gets tokenized and prepended to the image tokens.
        query_prefix (`str`, *optional*, defaults to `"Question: "`):
            A prefix to be used for the query.
        image_seq_lengthz;Image processor is missing an `image_seq_length` attribute.r0   FT)
normalizedspecialadditional_special_tokens)chat_templateN)r6   r7   hasattr
ValueErrorr8   r   IMAGE_TOKENadd_special_tokensconvert_tokens_to_idsimage_token_idr0   
add_tokensEXTRA_TOKENSadd_bos_tokenadd_eos_tokensuper__init__)selfimage_processor	tokenizerr<   r6   r7   r0   tokens_to_add	__class__r    r!   rH   L   s"   




zColPaliProcessor.__init__imagestextkwargsreturnc                    s   j tfd jji|}|d dd}d}|du r#|du r#td|dur/|dur/td|dur j|}t|} j	gt
| }dd	 |D } fd
d	t||D } j|fi |d d }	|d dddurz|d d   j7  <  j|fd|i|d }
i |
d|	i}|r|
d |
d dkd}|d|i t|dS |durt|tr|g}nt|trt|d tstd|du rЈ jd }g }|D ]} jj j | | d }|| q|d dd|d d<  j|fd|i|d }|S dS )a  
        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
        tokenizer_init_kwargsr   suffixNTz&Either text or images must be providedz5Only one of text or images can be processed at a timec                 S   s   g | ]}| d qS )RGB)convert)r&   imager    r    r!   r(      s    z-ColPaliProcessor.__call__.<locals>.<listcomp>c              
      s:   g | ]\}}t | jj jtt|trt|nd dqS )   r,   )r2   rK   r.   r8   r?   
isinstancelistlen)r&   r-   
image_listrI   r    r!   r(      s    r   pixel_values
max_lengthreturn_token_type_ids	input_idstoken_type_idsr   ilabels)dataz*Text must be a string or a list of strings
   r+   2   )_merge_kwargsr   rK   init_kwargspopr>   rJ   fetch_imagesr   r6   r[   zipgetr8   masked_fillupdater   rY   strrZ   query_augmentation_tokenr.   r7   append)rI   rO   rP   rQ   output_kwargsrT   r`   	texts_docinput_stringsr^   inputsreturn_datarc   texts_queryquerybatch_queryr    r]   r!   __call__q   sr   





zColPaliProcessor.__call__c                 K   sH   i }|dur| j gt| }dgt| }|||d tdi |S )a  
        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.

        Args:
            image_sizes (list[list[str]], *optional*):
                The input sizes formatted as (height, width) per each image.
        Returns:
            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
            input modalities, along with other useful data.
        NrX   )num_image_tokensnum_image_patchesr    )r8   r[   rn   r   )rI   image_sizesrQ   vision_datar{   r|   r    r    r!   _get_num_multimodal_tokens   s   z+ColPaliProcessor._get_num_multimodal_tokensc                 C   s$   | j jddg }| jj}t|| S )Nrb   rc   )rK   model_input_namesrJ   rZ   )rI   tokenizer_input_namesimage_processor_input_namesr    r    r!   r      s   z"ColPaliProcessor.model_input_namesc                 C   s   | j jS )z
        Return the query augmentation token.

        Query augmentation buffers are used as reasoning buffers during inference.
        )rK   	pad_tokenr]   r    r    r!   rp      s   z)ColPaliProcessor.query_augmentation_tokenc                 K      | j dd|i|S )a  
        Prepare for the model one or several image(s). This method is a wrapper around the `__call__` method of the ColPaliProcessor's
        [`ColPaliProcessor.__call__`].

        This method forwards the `images` and `kwargs` arguments to the image processor.

        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
                number of channels, H and W are image height and width.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
        rO   Nr    rz   )rI   rO   rQ   r    r    r!   process_images   s   zColPaliProcessor.process_imagesc                 K   r   )ag  
        Prepare for the model one or several texts. This method is a wrapper around the `__call__` method of the ColPaliProcessor's
        [`ColPaliProcessor.__call__`].

        This method forwards the `text` and `kwargs` arguments to the tokenizer.

        Args:
            text (`str`, `list[str]`, `list[list[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
        rP   Nr    r   )rI   rP   rQ   r    r    r!   process_queries  s   z ColPaliProcessor.process_queriesr*   cpuquery_embeddingsztorch.Tensorpassage_embeddings
batch_sizeoutput_dtypeztorch.dtypeoutput_deviceztorch.devicec              	   C   s@  t |dkr
tdt |dkrtd|d j|d jkr"td|d j|d jkr0td|du r9|d j}g }tdt ||D ]U}g }tjjjj	||||  ddd}	tdt ||D ]'}
tjjjj	||
|
|  ddd}|
td	|	|jd
dd jdd q`|
tj|dd|| qCtj|ddS )aZ  
        Compute the late-interaction/MaxSim score (ColBERT-like) for the given multi-vector
        query embeddings (`qs`) and passage embeddings (`ps`). For ColPali, a passage is the
        image of a document page.

        Because the embedding tensors are multi-vector and can thus have different shapes, they
        should be fed as:
        (1) a list of tensors, where the i-th tensor is of shape (sequence_length_i, embedding_dim)
        (2) a single tensor of shape (n_passages, max_sequence_length, embedding_dim) -> usually
            obtained by padding the list of tensors.

        Args:
            query_embeddings (`Union[torch.Tensor, list[torch.Tensor]`): Query embeddings.
            passage_embeddings (`Union[torch.Tensor, list[torch.Tensor]`): Passage embeddings.
            batch_size (`int`, *optional*, defaults to 128): Batch size for computing scores.
            output_dtype (`torch.dtype`, *optional*, defaults to `torch.float32`): The dtype of the output tensor.
                If `None`, the dtype of the input embeddings is used.
            output_device (`torch.device` or `str`, *optional*, defaults to "cpu"): The device of the output tensor.

        Returns:
            `torch.Tensor`: A tensor of shape `(n_queries, n_passages)` containing the scores. The score
            tensor is saved on the "cpu" device.
        r   zNo queries providedzNo passages providedz/Queries and passages must be on the same devicez-Queries and passages must have the same dtypeNT)batch_firstpadding_valuezbnd,csd->bcnsr   )dim   rX   )r[   r>   devicedtyperangetorchnnutilsrnnpad_sequencerq   einsummaxsumcatto)rI   r   r   r   r   r   scoresr'   batch_scoresbatch_queriesjbatch_passagesr    r    r!   score_retrieval.  s2    


 "z ColPaliProcessor.score_retrieval)NNNr4   r5   )NN)N)r*   Nr   )r   r   r   ro   rH   r   r   r   r   rZ   r   r   r   rz   r   propertyr   rp   r   r   r   intr   r   __classcell__r    r    rM   r!   r3   J   sv    %
Z


!

$
r3   )typingr   r   feature_extraction_utilsr   image_utilsr   r   processing_utilsr   r	   r
   r   tokenization_utils_baser   r   r   r   r   r   r   r   r?   r   rD   r2   r3   __all__r    r    r    r!   <module>   s"   $  
&