o
    ei,                     @   s   d dl ZddlmZ ddlmZ ddlmZmZm	Z	m
Z
mZ ddlmZmZ ddlmZmZ ddlmZ e r?d	d
lmZmZ G dd de
ddZG dd deddZeeddG dd de	ZdgZdS )    N   )BatchFeature)
ImageInput)MultiModalDataProcessingKwargsProcessorMixin
TextKwargsUnpack)PreTokenizedInput	TextInput)auto_docstringis_vision_available)requires   )Emu3ImageProcessorKwargssmart_resizec                   @   s   e Zd ZU dZeed< dS )Emu3TextKwargsa  
    return_for_image_generation (`bool`, *optional*, defaults to `False`):
        Whether the processed text is intended for image generation tasks. When `True`, the processor prepares
        inputs for image generation by appending image start tokens and size information to the prompt, and
        images should not be provided. When `False`, the processor prepares inputs for text generation from
        images and text, requiring both inputs to be provided.
    return_for_image_generationN)__name__
__module____qualname____doc__bool__annotations__ r   r   f/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/emu3/processing_emu3.pyr      s   
 r   F)totalc                   @   s4   e Zd ZU eed< eed< dddddddZd	S )
Emu3ProcessorKwargstext_kwargsimages_kwargsF)r   return_mm_token_type_idsz1:1i  )ratio
image_area)r   r   N)r   r   r   r   r   r   	_defaultsr   r   r   r   r   +   s   
 
r   )vision)backendsc                
       s   e Zd Z	d fdd	Ze		ddedB deeB ee B ee B dB de	e
 defdd	Zdd
dZdd ZdefddZ	dddZ  ZS )Emu3ProcessorNc                    sT   |j | _ |j| _|j| _|j| _|j| _|j| _|j	| _	d| _
t j|||d d S )N   )chat_template)image_tokenimage_token_id	boi_tokenimage_start_token	eoi_tokenimage_end_tokenimage_wrapper_tokenfake_token_around_image	eof_token	bos_tokendownsample_ratiosuper__init__)selfimage_processor	tokenizerr(   kwargs	__class__r   r   r5   =   s   zEmu3Processor.__init__imagestextr9   returnc                    s  t |tr	|g}nt |tst |d tstdjtfdjji|}|d dd}|d dd	}|d d
d	}|rH|d	urHt	d|sV|d	u rV|d	u rVt	di }j
 }	j j }
|s|d	urӈj|fi |d }t|j}g }|D ]J}j|v rt|}|\}}|j }|j }||d  }|	 | d| j d|  |
 }|j|d}j | }j|v s|| q~fdd|D }n.|r||j\}}|	 | d| j   fdd|D }||ggt| |d< |d dd	}|d dd}j|fi |d dd	i}j||dgd |rGt|d }t|d }d||jk< | |d< ti |||dS )aA  
        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
        r   zAInvalid input text. Please provide a string, or a list of stringstokenizer_init_kwargsr   r   Fr   r!   Nr"   zGYou should not provide `images` when `return_for_image_generation=True`zOYou must provide either text or images when `return_for_image_generation=False`r   *<placeholder>c                    s   g | ]	}| d  jqS )rA   )replacer)   .0sample)r6   r   r   
<listcomp>   s    z*Emu3Processor.__call__.<locals>.<listcomp>c                    s   g | ]}j  |   qS r   )r2   rC   image_promptr6   r   r   rF      s    image_sizesreturn_tensorsr    image)
modalities	input_idsmm_token_type_ids)datatensor_type) 
isinstancestrlist	TypeError_merge_kwargsr   r8   init_kwargspop
ValueErrorr,   r1   r.   r7   iterrI   r)   nextr3   r0   rB   r2   appendcalculate_generate_sizelen_check_special_mm_tokensnparray
zeros_liker*   tolistr   )r6   r<   r=   r9   output_kwargsr   r!   r"   image_featuresimage_start_tokensimage_end_tokensrI   prompt_stringsrE   
image_sizeheightwidthimage_seq_lengthimage_placeholderrJ   r    text_inputs	array_idsrN   r   rG   r   __call__N   sj   




$

zEmu3Processor.__call__c           	      K   s   i }|durCg }|D ])\}}t ||| jj| jj| jj\}}|| j }|| j }||d  }|| q
dgt| }|||d t	di |S )a  
        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.

        Args:
            image_sizes (`list[list[int]]`, *optional*):
                The input sizes formatted as (height, width) per each image.

        Returns:
            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
            input modalities, along with other useful data.
        Nr   )num_image_tokensnum_image_patchesr   )
r   r7   spatial_factor
min_pixels
max_pixelsr3   r[   r]   updater   )	r6   rI   r9   vision_datarp   ri   rj   rk   rq   r   r   r   _get_num_multimodal_tokens   s$   

z(Emu3Processor._get_num_multimodal_tokensc           
      C   sX   t t|d\}}|| }|| d }tt|| | }tt|| | }	||	fS )N:g      ?)mapintsplitround)
r6   r!   r"   rr   rj   ri   current_areatarget_ratiotoken_heighttoken_widthr   r   r   r\      s   z%Emu3Processor.calculate_generate_sizec                 K   s   | j j|fi |S N)r7   postprocess)r6   r<   r9   r   r   r   r      s   zEmu3Processor.postprocessTc                 K   s\   |du s|dkr| j |fd|i|S |dkr"| j|dd}|d S t| jj d| d	)
a  
        Post-process the output of a multimodal model to return the requested modality output.
        If the model cannot generated the requested modality, an error will be raised.

        Args:
            generated_outputs (`torch.Tensor` or `np.ndarray`):
                The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
                or `(sequence_length,)`.
            skip_special_tokens (`bool`, *optional*, defaults to `True`):
                Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method.
            generation_mode (`str`, *optional*):
                Generation mode indicated which modality to output and can be one of `["text", "image", "audio"]`.
            **kwargs:
                Additional arguments to be passed to the tokenizer's `batch_decode method`.

        Returns:
            `list[Union[str, PIL.Image.Image]]`: The decoded text or generated image.
        Nr=   skip_special_tokensrK   zPIL.Image.Image)rJ   pixel_valuesz# got an unexpected generation_mode=z.. Supported options are only `text` and `image)post_process_image_text_to_textr   rX   r;   r   )r6   generated_outputsr   generation_moder9   r<   r   r   r   post_process_multimodal_output   s   z,Emu3Processor.post_process_multimodal_outputr   )NN)TN)r   r   r   r5   r   r   r   r
   rS   r	   r   r   ro   rw   r\   r   r   __classcell__r   r   r:   r   r&   :   s(    
S"	r&   )numpyr_   image_processing_utilsr   image_utilsr   processing_utilsr   r   r   r   r	   tokenization_utils_baser
   r   utilsr   r   utils.import_utilsr   image_processing_emu3r   r   r   r   r&   __all__r   r   r   r   <module>   s     
9