o
    i                     @   s   d dl mZmZ d dlZddlmZ ddlmZm	Z	 ddl
mZmZmZmZmZ ddlmZmZ G dd	 d	eZG d
d deddZG dd deZdgZdS )    )OptionalUnionN   )BatchFeature)
ImageInputmake_nested_list_of_images)AudioKwargsImagesKwargsProcessingKwargsProcessorMixinUnpack)PreTokenizedInput	TextInputc                   @   s   e Zd ZU ee ed< dS )Gemma3nImagesKwargsdo_convert_rgbN)__name__
__module____qualname__r   bool__annotations__ r   r   k/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/transformers/models/gemma3n/processing_gemma3n.pyr      s   
 r   c                   @   s*   e Zd ZU eed< eed< dddiiZdS )Gemma3nProcessorKwargsaudio_kwargsimages_kwargstext_kwargspaddingFN)r   r   r   r   r   r   	_defaultsr   r   r   r   r      s   
 r   F)totalc                       s   e Zd ZdZg dZdZdZdZ			dd	ed
ef fddZ					dde
e deeeee ee f de
eejee eej eee  f  dee def
ddZ  ZS )Gemma3nProcessorat  
    A processor for Gemma 3n, wrapping the full capabilities of a feature extractor, image processor, and tokenizer
    into a single processor.

    Args:
        feature_extractor (`Gemma3nAudioFeatureExtractor`):
            Feature extractor that converts raw audio waveforms into MEL spectrograms for the audio encoder. This
            should return a `BatchFeature` with `input_features` and `input_features_mask` features.
        image_processor (`SiglipImageProcessorFast`):
            Image processor that prepares batches of images for the vision encoder. This should return a `BatchFeature`
            with a `pixel_values` feature.
        tokenizer (`GemmaTokenizerFast`):
            The text tokenizer for the model.
        chat_template (`string`, *optional*):
            A Jinja template for generating text prompts from a set of messages.
        audio_seq_length (int, *optional*, defaults to 188):
            The number of audio soft tokens that will be added to the text prompt
        image_seq_length (int, *optional*, defaults to 256):
            The number of image soft tokens that should be added to
    )feature_extractorimage_processor	tokenizerAutoFeatureExtractorAutoImageProcessorAutoTokenizerN      audio_seq_lengthimage_seq_lengthc           
         s   || _ |j| _|j| _|j| _d|jg| }d|j | |j d| _|| _|j| _|j	| _	|j
| _
d|j
g| }	d|j	 |	 |j d| _t jd||||d| d S )N z

)r    r!   r"   chat_templater   )r(   audio_token_id	boa_tokenaudio_tokenjoin	eoa_tokenfull_audio_sequencer)   image_token_id	boi_tokenimage_token	eoi_tokenfull_image_sequencesuper__init__)
selfr    r!   r"   r+   r(   r)   kwargsaudio_tokens_expandedimage_tokens_expanded	__class__r   r   r8   C   s(   

zGemma3nProcessor.__init__imagestextaudior:   returnc                    s  |d u r|d u r|d u rt d jtfd jji|}t|tr&|g}nt|ts6t|d ts6t d|d urZ j|fi |d }|sP fdd|D } fdd|D }ni }|d ur j	
|}t|} j	|fi |d	 }	|s fd
d|D }t|t|krt dt| dt| d fdd|D }ni }	|d dd }
 jdd|i|d ddi} j||dgd |d }t|}d|| jk< d|| jk< dd | D }| |d< ti ||	||
dS )Nz5Provide at least one of `text`, `images`, or `audio`.tokenizer_init_kwargsr   zAInvalid input text. Please provide a string, or a list of stringsr   c                    s   g | ]} j qS r   )r.   ).0_r9   r   r   
<listcomp>}   s    z-Gemma3nProcessor.__call__.<locals>.<listcomp>c                       g | ]
}|  j jqS r   )replacer.   r1   rD   promptrF   r   r   rG          r   c                    s"   g | ]}d   jgt| qS ) )r/   r4   len)rD   r?   rF   r   r   rG      s   " z1Received inconsistently sized batches of images (z) and text (z).c                    rH   r   )rI   r4   r6   rJ   rF   r   r   rG      rL   r   return_tensorsr@   npimage)
modalities	input_ids   r   c                 S   s   i | ]	\}}||  qS r   )tolist)rD   kvr   r   r   
<dictcomp>   s    z-Gemma3nProcessor.__call__.<locals>.<dictcomp>token_type_ids)datatensor_typer   )
ValueError_merge_kwargsr   r"   init_kwargs
isinstancestrlistr    r!   fetch_imagesr   rN   pop_check_special_mm_tokensrP   
zeros_liker2   r,   itemsrU   r   )r9   r?   r@   rA   videosr:   output_kwargsaudio_inputsbatched_imagesimage_inputsrO   text_inputs	array_idsrY   r   rF   r   __call__c   sR   
 
zGemma3nProcessor.__call__)Nr&   r'   )NNNN)r   r   r   __doc__
attributesfeature_extractor_classimage_processor_classtokenizer_classintr8   r   r   r   r   r   ra   rP   ndarrayfloatr   r   r   rn   __classcell__r   r   r=   r   r   (   s:    "&r   )typingr   r   numpyrP   feature_extraction_utilsr   image_utilsr   r   processing_utilsr   r	   r
   r   r   tokenization_utils_baser   r   r   r   r   __all__r   r   r   r   <module>   s   

}