o
    wi                      @   s   d dl mZmZ d dlZddlmZ ddlmZm	Z	 ddl
mZmZmZmZmZ ddlmZmZ G dd	 d	eZG d
d deddZG dd deZdgZdS )    )OptionalUnionN   )BatchFeature)
ImageInputmake_nested_list_of_images)AudioKwargsImagesKwargsProcessingKwargsProcessorMixinUnpack)PreTokenizedInput	TextInputc                   @   sJ   e Zd ZU ee ed< ee ed< ee ed< ee ed< ee ed< dS )Gemma3nImagesKwargsdo_pan_and_scanpan_and_scan_min_crop_sizepan_and_scan_max_num_crops"pan_and_scan_min_ratio_to_activatedo_convert_rgbN)__name__
__module____qualname__r   bool__annotations__intfloat r   r   k/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/transformers/models/gemma3n/processing_gemma3n.pyr      s   
 r   c                   @   s*   e Zd ZU eed< eed< dddiiZdS )Gemma3nProcessorKwargsaudio_kwargsimages_kwargstext_kwargspaddingFN)r   r   r   r   r   r   	_defaultsr   r   r   r   r   "   s   
 r   F)totalc                       s   e Zd ZdZg dZdZdZdZ			dd	ed
ef fddZ					dde
deeeee ee f deeejee eej eee  f  dee def
ddZdd Zdd Zedd Z  ZS )Gemma3nProcessorat  
    A processor for Gemma 3n, wrapping the full capabilities of a feature extractor, image processor, and tokenizer
    into a single processor.

    Args:
        feature_extractor (`Gemma3nAudioFeatureExtractor`):
            Feature extractor that converts raw audio waveforms into MEL spectrograms for the audio encoder. This
            should return a `BatchFeature` with `input_features` and `input_features_mask` features.
        image_processor (`SiglipImageProcessorFast`):
            Image processor that prepares batches of images for the vision encoder. This should return a `BatchFeature`
            with a `pixel_values` feature.
        tokenizer (`GemmaTokenizerFast`):
            The text tokenizer for the model.
        chat_template (`string`, *optional*):
            A Jinja template for generating text prompts from a set of messages.
        audio_seq_length (int, *optional*, defaults to 188):
            The number of audio soft tokens that will be added to the text prompt
        image_seq_length (int, *optional*, defaults to 256):
            The number of image soft tokens that should be added to
    )feature_extractorimage_processor	tokenizerAutoFeatureExtractorAutoImageProcessorAutoTokenizerN      audio_seq_lengthimage_seq_lengthc           
         s   || _ |j| _|j| _|j| _d|jg| }d|j | |j d| _|| _|j| _|j	| _	|j
| _
d|j
g| }	d|j	 |	 |j d| _t jd||||d| d S )N z

)r&   r'   r(   chat_templater   )r.   audio_token_id	boa_tokenaudio_tokenjoin	eoa_tokenfull_audio_sequencer/   image_token_id	boi_tokenimage_token	eoi_tokenfull_image_sequencesuper__init__)
selfr&   r'   r(   r1   r.   r/   kwargsaudio_tokens_expandedimage_tokens_expanded	__class__r   r   r>   G   s(   

zGemma3nProcessor.__init__imagestextaudior@   returnc                    s  |d u r|d u r|d u rt d jtfd jji|}t|tr&|g}nt|ts6t|d ts6t d|d urZ j|fi |d }|sP fdd|D } fdd|D }ni }|d urt	|} j
|fi |d	 }	|sz fd
d|D }t|t|krt dt| dt| d fdd|D }ni }	|d dd }
 jdd|i|d ddi} j||dgd |d }t|}d|| jk< d|| jk< dd | D }| |d< ti ||	||
dS )Nz5Provide at least one of `text`, `images`, or `audio`.tokenizer_init_kwargsr   zAInvalid input text. Please provide a string, or a list of stringsr   c                    s   g | ]} j qS r   )r4   ).0_r?   r   r   
<listcomp>   s    z-Gemma3nProcessor.__call__.<locals>.<listcomp>c                       g | ]
}|  j jqS r   )replacer4   r7   rJ   promptrL   r   r   rM          r    c                    s"   g | ]}d   jgt| qS ) )r5   r:   len)rJ   rE   rL   r   r   rM      s   " z1Received inconsistently sized batches of images (z) and text (z).c                    rN   r   )rO   r:   r<   rP   rL   r   r   rM      rR   r!   return_tensorsrF   npimage)
modalities	input_ids   r   c                 S   s   i | ]	\}}||  qS r   )tolist)rJ   kvr   r   r   
<dictcomp>   s    z-Gemma3nProcessor.__call__.<locals>.<dictcomp>token_type_ids)datatensor_typer   )
ValueError_merge_kwargsr   r(   init_kwargs
isinstancestrlistr&   r   r'   rT   pop_check_special_mm_tokensrV   
zeros_liker8   r2   itemsr[   r   )r?   rE   rF   rG   videosr@   output_kwargsaudio_inputsbatched_imagesimage_inputsrU   text_inputs	array_idsr_   r   rL   r   __call__g   sP   
 
zGemma3nProcessor.__call__c                 O      | j j|i |S )z
        This method forwards all its arguments to GemmaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        )r(   batch_decoder?   argsr@   r   r   r   ru         zGemma3nProcessor.batch_decodec                 O   rt   )z
        This method forwards all its arguments to GemmaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        )r(   decoderv   r   r   r   ry      rx   zGemma3nProcessor.decodec                 C   s4   | j jdg }| jj}| jj}tt|| | S )Nr_   )r(   model_input_namesr'   r&   rg   dictfromkeys)r?   tokenizer_input_namesimage_processor_input_namesfeature_extactor_input_namesr   r   r   rz      s   z"Gemma3nProcessor.model_input_names)Nr,   r-   )NNNN)r   r   r   __doc__
attributesfeature_extractor_classimage_processor_classtokenizer_classr   r>   r   r   r   r   rg   r   rV   ndarrayr   r   r   r   rs   ru   ry   propertyrz   __classcell__r   r   rC   r   r%   ,   sB    "&
Ar%   )typingr   r   numpyrV   feature_extraction_utilsr   image_utilsr   r   processing_utilsr   r	   r
   r   r   tokenization_utils_baser   r   r   r   r%   __all__r   r   r   r   <module>   s   
 
