o
    i.                     @   s   d Z ddlmZ ddlmZmZmZ ddlmZ ddl	m
Z
mZmZ ddlmZmZmZmZ ddlmZmZ dd	lmZ erDdd
lmZ eeZdefddZdd ZG dd deddZG dd deddZG dd deZ dgZ!dS )z
Processor class for IDEFICS2.
    )
accumulate)TYPE_CHECKINGOptionalUnion   )BatchFeature)
ImageInputis_valid_image
load_image)ImagesKwargsProcessingKwargsProcessorMixinUnpack)
AddedToken	TextInput)logging)PreTokenizedInputreturnc                 C   s   t | to	| dS )Nhttp)
isinstancestr
startswith)val r   d/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/idefics2/processing_idefics2.pyis_url)   s   r   c                 C   s   t | pt| S N)r   r	   )elemr   r   r   is_image_or_image_url-   s   r   c                   @   s   e Zd ZU ee ed< dS )Idefics2ImagesKwargsimage_seq_lenN)__name__
__module____qualname__r   int__annotations__r   r   r   r   r   1   s   
 r   F)totalc                   @   s(   e Zd ZU eed< ddddi dZdS )Idefics2ProcessorKwargsimages_kwargsTF)add_special_tokenspaddingis_split_into_words)text_kwargsr(   N)r!   r"   r#   r   r%   	_defaultsr   r   r   r   r'   5   s   
 
r'   c                
       s   e Zd ZdZddgZdZdZ	dded	ee	 f fd
dZ
dd Z				ddeeee eee  f deedee ed f dee defddZ  ZS )Idefics2Processora  
    Constructs a IDEFICS2 processor which wraps a LLama tokenizer and IDEFICS2 image processor into a single processor.

    [`IdeficsProcessor`] offers all the functionalities of [`Idefics2ImageProcessor`] and [`LlamaTokenizerFast`]. See
    the docstring of [`~IdeficsProcessor.__call__`] and [`~IdeficsProcessor.decode`] for more information.

    Args:
        image_processor (`Idefics2ImageProcessor`):
            An instance of [`Idefics2ImageProcessor`]. The image processor is a required input.
        tokenizer (`PreTrainedTokenizerBase`, *optional*):
            An instance of [`PreTrainedTokenizerBase`]. This should correspond with the model's text model. The tokenizer is a required input.
        image_seq_len (`int`, *optional*, defaults to 64):
            The length of the image sequence i.e. the number of <image> tokens per image in the input.
            This parameter is used to build the string from the input prompt and image tokens and should match the
            config.perceiver_config.resampler_n_latents value for the model used.
        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
            in a chat into a tokenizable string.
    image_processor	tokenizerIdefics2ImageProcessorAutoTokenizerN@   r    chat_templatec                    s   t |ds,tddddj| _tddddj| _d| j| jgi}|| || j| _n|j| _|j| _|j| _tdddd| _	|d| j	gi || _
t j|||d	 d S )
Nimage_tokenz<fake_token_around_image>FT)
normalizedspecialz<image>additional_special_tokensz<end_of_utterance>)r4   )hasattrr   contentfake_image_tokenr5   r)   convert_tokens_to_idsimage_token_idimage_boundary_tokenend_of_utterance_tokenr    super__init__)selfr/   r0   r    r4   kwargstokens_to_add	__class__r   r   rA   Z   s   

zIdefics2Processor.__init__c                 C   sT   g }|D ]#}g }|D ]}t |r|| q
t|r!|t| q
|| q|S r   )r	   appendr   r
   )rB   promptsprompt_imagespromptimagesr   r   r   r   _extract_images_from_promptsn   s   z.Idefics2Processor._extract_images_from_promptsrK   textr   rC   r   c              
      s  |du rdu rt d| jtfd| jji|}|d dd}|dur'|n| j}|d dd}g }	i }
|durt|trC|g}nt|t	sSt|d tsSt d	| j
}| j}| ||  | }| jjro|d
 }|d
9 }g }|D ] }|	|| |||}|| | | }|| qs| j|fi |d }| j||dgd |
| durZtrggnltt	tfrtd r|durt|	tkrt d| dt|	 d| dt d	dgt	t|	   fddtt|	D n#gntt	tfs'td t	tfs'td d s't ddd D }|durC||	ksCt d|	 d| ddd D | jfi |d }|
| t|
|dS )a
  
        Processes the input prompts and returns a BatchEncoding.

        Example:

        ```python
        >>> import requests
        >>> from transformers import Idefics2Processor
        >>> from transformers.image_utils import load_image

        >>> processor = Idefics2Processor.from_pretrained("HuggingFaceM4/idefics2-8b", image_seq_len=2)
        >>> processor.image_processor.do_image_splitting = False  # Force as False to simplify the example

        >>> url1 = "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
        >>> url2 = "https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg"

        >>> image1, image2 = load_image(url1), load_image(url2)
        >>> images = [[image1], [image2]]

        >>> text = [
        ...     "<image>In this image, we see",
        ...     "bla bla bla<image>",
        ... ]
        >>> outputs = processor(images=images, text=text, return_tensors="pt", padding=True)
        >>> input_ids = outputs.input_ids
        >>> input_tokens = processor.tokenizer.batch_decode(input_ids)
        >>> print(input_tokens)
        ['<s><fake_token_around_image><image><image><fake_token_around_image> In this image, we see', '<s> bla bla bla<fake_token_around_image><image><image><fake_token_around_image>']
        ```

        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`, *optional*):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. If is of type `list[ImageInput]`, it's assumed that this is for a single prompt i.e. of batch size 1.
            text (`Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]`, *optional*):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).

                Wherever an image token, `<image>` is encountered it is expanded to
                `<fake_token_around_image>` + `<image>` * `image_seq_len` * <fake_token_around_image>`.
            return_tensors (`Union[str, TensorType]`, *optional*):
                If set, will return tensors of a particular framework. See [`PreTrainedTokenizerFast.__call__`] for more
                information.

        Nz+You must provide either `text` or `images`.tokenizer_init_kwargsr(   r    r,   return_tensorsr   zAInvalid input text. Please provide a string, or a list of strings   image)
modalitieszThe total number of zP tokens in the prompts should be the same as the number of images passed. Found  z tokens and z images.c                    s$   g | ]} |  |d    qS )   r   ).0icumsum_images_in_textrK   r   r   
<listcomp>   s    z.Idefics2Processor.__call__.<locals>.<listcomp>zdInvalid input images. Please provide a single image or a list of images or a list of list of images.c                 S      g | ]}t |qS r   )lenrU   sampler   r   r   rY          z!The number of images in the text z and images  z should be the same.c                 S   s   g | ]	}d d |D qS )c                 S   rZ   r   )r
   )rU   imr   r   r   rY      r^   z9Idefics2Processor.__call__.<locals>.<listcomp>.<listcomp>r   r\   r   r   r   rY      s    )tensor_type)
ValueError_merge_kwargsr'   r0   init_kwargspopr    r   r   listr;   r5   r/   do_image_splittingrG   countreplace_check_special_mm_tokensupdater   tuplesumr[   r   ranger   )rB   rK   rM   audiovideosrC   output_kwargsr    rO   n_images_in_textinputsr;   r5   	image_strprompt_stringsr]   text_inputsn_images_in_imagesimage_inputsr   rW   r   __call__z   s   6






zIdefics2Processor.__call__)Nr3   N)NNNN)r!   r"   r#   __doc__
attributesimage_processor_classtokenizer_classr$   r   r   rA   rL   r   r   re   r   r   r'   r   rx   __classcell__r   r   rE   r   r.   B   s2    r.   N)"ry   	itertoolsr   typingr   r   r   feature_extraction_utilsr   image_utilsr   r	   r
   processing_utilsr   r   r   r   tokenization_utils_baser   r   utilsr   r   
get_loggerr!   loggerboolr   r   r   r'   r.   __all__r   r   r   r   <module>   s$   
 
D