o
    ei{A                     @   s   d dl mZmZ ddlmZ ddlmZmZ ddlm	Z	m
Z
mZmZ ddlmZmZ ddlmZmZ e r9d dlZG d	d
 d
e
ddZeG dd deZdgZdS )    )OptionalUnion   )BatchFeature)
ImageInputis_valid_image)MultiModalDataProcessingKwargsProcessorMixinUnpack)PreTokenizedInput	TextInput)auto_docstringis_torch_availableNc                   @   s&   e Zd ZddidddddidZd	S )
ColQwen2ProcessorKwargspaddinglongestchannels_firstT)data_formatdo_convert_rgbreturn_tensorspt)text_kwargsimages_kwargscommon_kwargsN)__name__
__module____qualname__	_defaults r   r   n/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/colqwen2/processing_colqwen2.pyr   "   s    
r   F)totalc                       s4  e Zd Z					d"dedB dedB f fddZe		d#dedB deeB e	e B e	e B de
e d	efd
dZd$ddZedd Zed	efddZ	d$dedB de
e d	efddZdee	e B de
e d	efddZ			d%dede	d f dede	d f deded dedef d	dfd d!Z  ZS )&ColQwen2ProcessorNvisual_prompt_prefixquery_prefixc                    sf   t  j|||d t|dsdn|j| _t|dsdn|j| _|du r%d}|| _|du r.d}|| _dS )	ar  
        visual_prompt_prefix (`str`, *optional*, defaults to `"<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe the image.<|im_end|><|endoftext|>"`):
            A string that gets tokenized and prepended to the image tokens.
        query_prefix (`str`, *optional*, defaults to `"Query: "`):
            A prefix to be used for the query.
        )chat_templateimage_tokenz<|image_pad|>video_tokenz<|video_pad|>Nzf<|im_start|>user
<|vision_start|><|image_pad|><|vision_end|>Describe the image.<|im_end|><|endoftext|>zQuery: )super__init__hasattrr&   r'   r#   r$   )selfimage_processor	tokenizerr%   r#   r$   kwargs	__class__r   r    r)   1   s   
zColQwen2Processor.__init__imagestextr.   returnc                 K   s  | j tfd| jji|}|d dd}|du}|du r%|du r%td|dur1|dur1td|durt|r>|g}n$t|trJt|d rJnt|tr^t|d tr^t|d d sbtd| j	gt
| }| jdd	|i|d
 }|d }	|	dur| jjd }
d}tt
|D ]3}| j|| v r|| | jd|	|  |
  d||< |d7 }| j|| v s|| d| j||< q| j|fddi|d }ti ||d}|d dddf |d dddf  }tt|d | }tjjjj|dd|d< |r|d |d dkd}|d|i |S |durht|tr)|g}nt|tr7t|d ts;td|du rE| jd }g }|D ]}| j| | }|| qI| j|fddi|d }|S dS )a  
        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
        tokenizer_init_kwargsr   suffixNz&Either text or images must be providedz5Only one of text or images can be processed at a timer   zAimages must be an image, list of images or list of list of imagesr1   r   image_grid_thw   z<|placeholder|>   return_token_type_idsF)datapixel_valuesT)batch_first	input_idstoken_type_idsilabelsz*Text must be a string or a list of strings
   r   )_merge_kwargsr   r-   init_kwargspop
ValueErrorr   
isinstancelistr#   lenr,   
merge_sizeranger&   replaceprodr   torchsplittolistnnutilsrnnpad_sequencemasked_fillupdatestrquery_augmentation_tokenr$   append)r+   r1   r2   r.   output_kwargsr5   r9   	texts_docimage_inputsr6   merge_lengthindexitext_inputsreturn_dataoffsetsr;   r?   texts_queryqueryaugmented_querybatch_queryr   r   r    __call__L   s   
((




zColQwen2Processor.__call__c                    s|   i }|dur7t jdi   |  ddpjj fdd|D }fdd|D }|||d tdi |S )	a  
        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.
        Args:
            image_sizes (`list[list[int]]`, *optional*):
                The input sizes formatted as (height, width) per each image.
        Returns:
            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
            input modalities, along with other useful data.
        Nr   rH   c                    s"   g | ]}j jg | R  qS r   )r,   get_number_of_image_patches).0
image_size)r   r+   r   r    
<listcomp>   s    z@ColQwen2Processor._get_num_multimodal_tokens.<locals>.<listcomp>c                    s   g | ]}| d   qS )r7   r   )rg   num_patches)rH   r   r    ri      s    )num_image_tokensnum_image_patchesr   )r   r   getrT   r,   rH   r   )r+   image_sizesr.   vision_datarl   rk   r   )r   rH   r+   r    _get_num_multimodal_tokens   s   
z,ColQwen2Processor._get_num_multimodal_tokensc                 C   s&   | j j}| jj}dd |D }|| S )Nc                 S   s   g | ]}|d vr|qS ))pixel_values_videosvideo_grid_thwr   )rg   namer   r   r    ri      s    z7ColQwen2Processor.model_input_names.<locals>.<listcomp>)r-   model_input_namesr,   )r+   tokenizer_input_namesimage_processor_input_namesr   r   r    rt      s   z#ColQwen2Processor.model_input_namesc                 C   s   | j jS )z
        Return the query augmentation token.

        Query augmentation buffers are used as reasoning buffers during inference.
        )r-   	pad_token)r+   r   r   r    rV      s   z*ColQwen2Processor.query_augmentation_tokenc                 K      | j dd|i|S )a  
        Prepare for the model one or several image(s). This method is a wrapper around the `__call__` method of the ColQwen2Processor's
        [`ColQwen2Processor.__call__`].

        This method forwards the `images` and `kwargs` arguments to the image processor.

        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
                number of channels, H and W are image height and width.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
        r1   Nr   re   )r+   r1   r.   r   r   r    process_images   s   z ColQwen2Processor.process_imagesc                 K   rx   )ai  
        Prepare for the model one or several texts. This method is a wrapper around the `__call__` method of the ColQwen2Processor's
        [`ColQwen2Processor.__call__`].

        This method forwards the `text` and `kwargs` arguments to the tokenizer.

        Args:
            text (`str`, `list[str]`, `list[list[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
        r2   Nr   ry   )r+   r2   r.   r   r   r    process_queries  s   z!ColQwen2Processor.process_queries   cpuquery_embeddingsztorch.Tensorpassage_embeddings
batch_sizeoutput_dtypeztorch.dtypeoutput_deviceztorch.devicec              	   C   s@  t |dkr
tdt |dkrtd|d j|d jkr"td|d j|d jkr0td|du r9|d j}g }tdt ||D ]U}g }tjjjj	||||  ddd}	tdt ||D ]'}
tjjjj	||
|
|  ddd}|
td	|	|jd
dd jdd q`|
tj|dd|| qCtj|ddS )a[  
        Compute the late-interaction/MaxSim score (ColBERT-like) for the given multi-vector
        query embeddings (`qs`) and passage embeddings (`ps`). For ColQwen2, a passage is the
        image of a document page.

        Because the embedding tensors are multi-vector and can thus have different shapes, they
        should be fed as:
        (1) a list of tensors, where the i-th tensor is of shape (sequence_length_i, embedding_dim)
        (2) a single tensor of shape (n_passages, max_sequence_length, embedding_dim) -> usually
            obtained by padding the list of tensors.

        Args:
            query_embeddings (`Union[torch.Tensor, list[torch.Tensor]`): Query embeddings.
            passage_embeddings (`Union[torch.Tensor, list[torch.Tensor]`): Passage embeddings.
            batch_size (`int`, *optional*, defaults to 128): Batch size for computing scores.
            output_dtype (`torch.dtype`, *optional*, defaults to `torch.float32`): The dtype of the output tensor.
                If `None`, the dtype of the input embeddings is used.
            output_device (`torch.device` or `str`, *optional*, defaults to "cpu"): The device of the output tensor.

        Returns:
            `torch.Tensor`: A tensor of shape `(n_queries, n_passages)` containing the scores. The score
            tensor is saved on the "cpu" device.
        r   zNo queries providedzNo passages providedz/Queries and passages must be on the same devicez-Queries and passages must have the same dtypeNT)r<   padding_valuezbnd,csd->bcnsr   )dimr7   r8   )rG   rD   devicedtyperI   rL   rO   rP   rQ   rR   rW   einsummaxsumcatto)r+   r~   r   r   r   r   scoresr]   batch_scoresbatch_queriesjbatch_passagesr   r   r    score_retrieval%  s2    


 "z!ColQwen2Processor.score_retrieval)NNNNN)NN)N)r|   Nr}   )r   r   r   rU   r)   r   r   r   r   rF   r   r   r   re   rp   propertyrt   rV   rz   r{   r   intr   r   __classcell__r   r   r/   r    r"   /   sv    
h


!

$
r"   )typingr   r   feature_extraction_utilsr   image_utilsr   r   processing_utilsr   r	   r
   r   tokenization_utils_baser   r   rP   r   r   rL   r   r"   __all__r   r   r   r    <module>   s     
8