o
    ei;                     @   s   d Z ddlmZ ddlmZ ddlmZmZmZm	Z	 ddl
mZmZ ddlmZmZ eeZdZG dd	 d	ed
dZG dd ded
dZeG dd deZdgZdS )z
Processor class for Janus.
   )BatchFeature)
ImageInput)ProcessingKwargsProcessorMixin
TextKwargsUnpack)PreTokenizedInput	TextInput)auto_docstringloggingzYou are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.

c                   @   s   e Zd ZU dZeed< dS )JanusTextKwargsas  
    generation_mode (`str`, *optional*, defaults to `"text"`):
        The generation mode indicating which modality to generate. Can be one of `"text"` or `"image"`. When set
        to `"text"`, the processor prepares inputs for text generation. When set to `"image"`, it prepares inputs
        for image generation by appending image start tokens to the prompt.
    generation_modeN)__name__
__module____qualname____doc__str__annotations__ r   r   h/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/janus/processing_janus.pyr   "   s   
 r   F)totalc                   @   s*   e Zd ZU eed< dddddidZdS )	JanusProcessorKwargstext_kwargsFtext)paddingr   return_tensorspt)r   common_kwargsN)r   r   r   r   r   	_defaultsr   r   r   r   r   -   s
   
 
r   c                
       sx   e Zd Zd fdd	Ze		ddeeB ee B ee B dedB de	e
 defd	d
ZdefddZ	dddZ  ZS )JanusProcessorNFc                    s:   d| _ |j| _|j| _|j| _|| _t j|||d dS )z
        use_default_system_prompt (`bool`, *optional*, defaults to `False`):
            Use default system prompt for Text Generation.
        i@  )chat_templateN)	num_image_tokensimage_token	boi_tokenimage_start_token	eoi_tokenimage_end_tokenuse_default_system_promptsuper__init__)selfimage_processor	tokenizerr    r'   kwargs	__class__r   r   r)   7   s   zJanusProcessor.__init__r   imagesr-   returnc           
      K   s8  | j tfd| jji|}|du r|du rtd|dur:t|tr&|g}nt|ttfr6t	dd |D s:td|d 
d}g }| j| j| j  | j }|D ]"}|| j|}| jre|d	kret| }|d
krn|| j7 }|| qQ| j|fi |d }	|dur|d
kr| jdd|i|d d |	d< t|	dS )aA  
        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
        tokenizer_init_kwargsNz'You must specify either text or images.c                 s   s    | ]}t |tV  qd S )N)
isinstancer   ).0tr   r   r   	<genexpr>`   s    z*JanusProcessor.__call__.<locals>.<genexpr>zAInvalid input text. Please provide a string, or a list of stringsr   r   r   imager0   images_kwargspixel_values)datar   )_merge_kwargsr   r,   init_kwargs
ValueErrorr3   r   listtupleallpopr$   r"   r!   r&   replacer'   DEFAULT_SYSTEM_PROMPTappendr+   r   )
r*   r   r0   r-   output_kwargsr   prompt_stringsone_img_tokenspromptr:   r   r   r   __call__D   s<   
 

zJanusProcessor.__call__c                 K   s   | j j|fi |S )z
        Forwards all arguments to the image processor's `postprocess` method.
        Refer to the original method's docstring for more details.
        )r+   postprocess)r*   r0   r-   r   r   r   rJ   z   s   zJanusProcessor.postprocessTc                 K   sh   |du s|dkr| j |fd|i|S |dkr(t| }| j|dd}|d S t| jj d| d	)
a  
        Post-process the output of a multimodal model to return the requested modality output.
        If the model cannot generated the requested modality, an error will be raised.

        Args:
            generated_outputs (`torch.Tensor` or `np.ndarray`):
                The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
                or `(sequence_length,)`.
            skip_special_tokens (`bool`, *optional*, defaults to `True`):
                Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method.
            generation_mode (`str`, *optional*):
                Generation mode indicated which modality to output and can be one of `["text", "image", "audio"]`.
            **kwargs:
                Additional arguments to be passed to the tokenizer's `batch_decode method`.

        Returns:
            `list[Union[str, PIL.Image.Image]]`: The decoded text or generated image.
        Nr   skip_special_tokensr7   zPIL.Image.Image)r   r9   z# got an unexpected generation_mode=z.. Supported options are only `text` and `image)post_process_image_text_to_textr>   floatrJ   r=   r/   r   )r*   generated_outputsrK   r   r-   r0   r   r   r   post_process_multimodal_output   s   z-JanusProcessor.post_process_multimodal_output)NF)NN)TN)r   r   r   r)   r
   r	   r   r>   r   r   r   r   rI   rJ   rO   __classcell__r   r   r.   r   r   5   s"    5r   N)r   feature_extraction_utilsr   image_utilsr   processing_utilsr   r   r   r   tokenization_utils_baser   r	   utilsr
   r   
get_loggerr   loggerrC   r   r   r   __all__r   r   r   r   <module>   s   

p