o
    ei6                  
   @   s   d Z ddlZddlmZ ddlmZmZ ddlm	Z	m
Z
mZ ddlmZmZ ddlmZ G d	d
 d
e	ddZdee dedeee  fddZdeeee   deee  dededejf
ddZdedededefddZeG dd de
ZdgZdS )zProcessor class for Mllama.    N   )BatchFeature)
ImageInputmake_nested_list_of_images)ProcessingKwargsProcessorMixinUnpack)PreTokenizedInput	TextInput)auto_docstringc                   @   s   e Zd ZdddiiZdS )MllamaProcessorKwargsimage_kwargsmax_image_tiles   N)__name__
__module____qualname__	_defaults r   r   j/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/mllama/processing_mllama.pyr      s
    r   F)total	input_idsimage_token_idreturnc                    s    fddt | D }t|dkrg S t|dkr |d dggS dd t|dd |dd D }||d t| g |d d }|ddd D ]}|d |d d krZ||d< |d }qJ|S )a  
    Generate a cross-attention token mask for image tokens in the input sequence.

    This function identifies the positions of image tokens in the input sequence and creates
    a mask that defines which subsequent tokens each image token should attend to.

    Args:
        input_ids (list[int]): A list of token ids representing the input sequence.
        image_token_id (int): The id of the token used to represent images in the sequence.

    Returns:
        list[list[int]]: A list of [start, end] pairs, where each pair represents the range
        of tokens an image token should attend to.

    Notes:
        - If no image tokens are present, an empty list is returned.
        - For a single image token, it attends to all subsequent tokens until the end of the sequence.
        - For multiple image tokens, each attends to tokens up to the next image token or the end of the sequence.
        - Consecutive image tokens are treated as a group and attend to all subsequent tokens together.
    c                    s   g | ]
\}}| kr|qS r   r   ).0itokenr   r   r   
<listcomp>8       z2get_cross_attention_token_mask.<locals>.<listcomp>r      c                 S   s   g | ]\}}||gqS r   r   )r   loc1loc2r   r   r   r   A       N)	enumeratelenzipappend)r   r   image_token_locationsvision_maskslast_mask_endvision_maskr   r   r   get_cross_attention_token_mask"   s   $
r-   cross_attention_token_mask	num_tilesmax_num_tileslengthc              	   C   s   t | }tdd | D }tj||||ftjd}tt| |D ]5\}\}}	tt||	D ]'\}
\}}t |dkrT|\}}t||}|dkrH|}d|||||
d|f< q-q |S )a  
    Convert the cross attention mask indices to a cross attention mask 4D array.

    This function takes a sparse representation of cross attention masks and converts it to a dense 4D numpy array.
    The sparse representation is a nested list structure that defines attention ranges for each image in each batch item.

    Args:
        cross_attention_token_mask (list[list[list[int]]]): A nested list structure where:
            - The outer list represents the batch dimension.
            - The middle list represents different images within each batch item.
            - The inner list contains pairs of integers [start, end] representing token ranges for each image.
        num_tiles (list[list[int]]): A nested list structure specifying the number of tiles for each image in each batch item.
        max_num_tiles (int): The maximum possible number of tiles.
        length (int): The total sequence length of the input.

    Returns:
        np.ndarray: A 4D numpy array of shape (batch_size, length, max_num_images, max_num_tiles)
            The array contains `1` where attention is allowed and `0` where it is not.

    Note:
        - Special handling is done for cases where the end token is -1, which is interpreted as attending to the end of the sequence.
    c                 s       | ]}t |V  qd S Nr&   )r   masksr   r   r   	<genexpr>p       z?convert_sparse_cross_attention_mask_to_dense.<locals>.<genexpr>)shapedtype   r!   r    N)r&   maxnpzerosint64r%   r'   min)r.   r/   r0   r1   
batch_sizemax_num_imagescross_attention_mask
sample_idxsample_maskssample_num_tilesmask_idx	locationsmask_num_tilesstartendr   r   r   ,convert_sparse_cross_attention_mask_to_denseR   s"   

rK   prompt	bos_tokenimage_tokenc                 C   sP   || v r| S d}|  |r| t|d } |d7 }|  |s||  | |  S )a\  
    Builds a string from the input prompt by adding `bos_token` if not already present.

    Args:
        prompt (`str`):
            The input prompt string.
        bos_token (`str`):
            The beginning of sentence token to be added.
        image_token (`str`):
            The image token used to identify the start of an image sequence.

    Returns:
        str: The modified prompt string with the `bos_token` added if necessary.

    Examples:
        >>> build_string_from_input("Hello world", "<begin_of_text>", "<|image|>")
        '<begin_of_text>Hello world'

        >>> build_string_from_input("<|image|>Hello world", "<begin_of_text>", "<|image|>")
        '<|image|><begin_of_text>Hello world'

        >>> build_string_from_input("<begin_of_text>Hello world", "<begin_of_text>", "<|image|>")
        '<begin_of_text>Hello world'
    r   Nr    )
startswithr&   )rL   rM   rN   num_image_tokens_on_startr   r   r   build_string_from_input   s   

rQ   c                
       sz   e Zd Zd fdd	Ze		ddedB deeB ee B ee B dB de	e
 defdd	Z	dddZedd Z  ZS )MllamaProcessorNc                    sb   t |dsd| _|| j| _n|j| _|j| _d| _|| j| _|j| _t j|||d d S )NrN   z	<|image|>z<|python_tag|>)chat_template)	hasattrrN   convert_tokens_to_idsr   python_tokenpython_token_idrM   super__init__)selfimage_processor	tokenizerrS   	__class__r   r   rY      s   
zMllamaProcessor.__init__imagestextkwargsr   c                    sb  |du r|du rt d jtfd jji|}|d dd}i }|durzt|tr0|g}nt|tt	fr@t
dd |D sDt d fd	d
|D } fdd
|D } j|fi |d } j||dgd  fdd
|d D }	|| dg}
|dur j|}t|}dd
 |D }
|durtdd |D rt
dd |D st dt|dkr|
|ks|	|
kr|du rt dd}t|
t|kr|
|krd}n|	|
krd}t d| d|
 d| |dur j|fi |d }|d}|| |dur+|dur+ fdd
|d D }t|| jjtdd |d D d}||d < t||d!S )"a  
        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
            TODO: add aspect_ratio_ids and aspect_ratio_mask and cross_attention_mask
        Nz'You must specify either text or images.tokenizer_init_kwargstext_kwargsreturn_tensorsc                 s   s    | ]}t |tV  qd S r3   )
isinstancestrr   tr   r   r   r6      s    z+MllamaProcessor.__call__.<locals>.<genexpr>zAInvalid input text. Please provide a string, or a list of stringsc                       g | ]}|  jqS r   )countrN   rg   rZ   r   r   r      r$   z,MllamaProcessor.__call__.<locals>.<listcomp>c                    s   g | ]
}t | j jqS r   )rQ   rM   rN   )r   	text_itemrk   r   r   r      r   image)
modalitiesc                    ri   r   )rj   r   r   	token_idsrk   r   r   r      r$   r   r   c                 S   s   g | ]}t |qS r   r4   )r   sampler   r   r   r      s    c                 s   s    | ]}|d kV  qdS )r   Nr   )r   	batch_imgr   r   r   r6      r7   zaIf a batch of text is provided, there should be either no images or at least one image per samplez@No image were provided, but there are image tokens in the prompt zZMake sure to pass your images as a nested list, where each sub-list holds images per batchzhIf you activated truncation with `max_length`, increase the `max_length` so image tokens aren't cropped.z)The number of image tokens in each text (zA) should be the same as the number of provided images per batch (z). images_kwargsr/   c                    s   g | ]}t | jqS r   )r-   r   ro   rk   r   r   r     s    c                 s   r2   r3   r4   )r   r   r   r   r   r6     r7   )r/   r0   r1   rB   )datatensor_type)
ValueError_merge_kwargsr   r\   init_kwargspopre   rf   listtupleall_check_special_mm_tokensupdater[   fetch_imagesr   anysumrK   r   r;   r   )rZ   r_   r`   ra   output_kwargsrd   ru   n_images_in_textencodingn_images_in_idsn_images_in_imagesadd_messageimage_featuresr/   r.   rB   r   rk   r   __call__   s   
 



zMllamaProcessor.__call__TFc                 K   s   | j j|f||d|S )a  
        Post-process the output of the model to decode the text.

        Args:
            generated_outputs (`torch.Tensor` or `np.ndarray`):
                The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
                or `(sequence_length,)`.
            skip_special_tokens (`bool`, *optional*, defaults to `True`):
                Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method.
            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
                Whether or not to clean up the tokenization spaces. Argument passed to the tokenizer's `batch_decode` method.
            **kwargs:
                Additional arguments to be passed to the tokenizer's `batch_decode method`.

        Returns:
            `list[str]`: The decoded text.
        )skip_special_tokensclean_up_tokenization_spaces)r\   batch_decode)rZ   generated_outputsr   r   ra   r   r   r   post_process_image_text_to_text  s   z/MllamaProcessor.post_process_image_text_to_textc                 C   s0   | j j}| jj}dd |D }t|| dg S )Nc                 S   s   g | ]}|d kr|qS )r/   r   )r   namer   r   r   r   3  r$   z5MllamaProcessor.model_input_names.<locals>.<listcomp>rB   )r\   model_input_namesr[   r{   )rZ   tokenizer_input_namesimage_processor_input_namesr   r   r   r   ,  s   z!MllamaProcessor.model_input_namesr3   )NN)TF)r   r   r   rY   r   r   r
   r	   r{   r   r   r   r   r   propertyr   __classcell__r   r   r]   r   rR      s$    [
rR   )__doc__numpyr<   feature_extraction_utilsr   image_utilsr   r   processing_utilsr   r   r   tokenization_utils_baser	   r
   utilsr   r   r{   intr-   ndarrayrK   rf   rQ   rR   __all__r   r   r   r   <module>   s2   "0

0% 
