o
    i                     @   s   d dl Z d dlmZmZ d dlZddlmZ ddlm	Z	m
Z
 ddlmZmZmZmZmZ ddlmZmZ ddlmZ G d	d
 d
eZG dd deddZG dd deZdgZdS )    N)OptionalUnion   )BatchFeature)
ImageInputmake_nested_list_of_images)ImagesKwargsMultiModalDataProcessingKwargsProcessorMixinUnpack)PreTokenizedInput	TextInput)	to_py_objc                   @   sJ   e Zd ZU ee ed< ee ed< ee ed< ee ed< ee ed< dS )Gemma3ImagesKwargsdo_pan_and_scanpan_and_scan_min_crop_sizepan_and_scan_max_num_crops"pan_and_scan_min_ratio_to_activatedo_convert_rgbN)__name__
__module____qualname__r   bool__annotations__intfloat r   r   i/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/transformers/models/gemma3/processing_gemma3.pyr      s   
 r   c                   @   s2   e Zd ZU eed< dddddddddd	Zd
S )Gemma3ProcessorKwargsimages_kwargsFT)paddingreturn_mm_token_type_ids      g333333?)r   r   r   r   r   )text_kwargsr    N)r   r   r   r   r   	_defaultsr   r   r   r   r   $   s   
 
r   F)totalc                
       s   e Zd ZddgZdZdZ		ddef fdd	Z				dd
ee	 de
eeee ee f dee defddZdddZedd Z  ZS )Gemma3Processorimage_processor	tokenizerAutoImageProcessorAutoTokenizerNr#   image_seq_lengthc                    sh   || _ |j| _|j| _|j| _d|jg| }d|j | |j d| _t jd|||d| d S )N z

)r)   r*   chat_templater   )	r-   image_token_id	boi_tokenimage_tokenjoin	eoi_tokenfull_image_sequencesuper__init__)selfr)   r*   r/   r-   kwargsimage_tokens_expanded	__class__r   r   r7   :   s   
zGemma3Processor.__init__imagestextr9   returnc                    sz  |d u r|d u rt djtfdjji|}t|tr"|g}nt|ts2t|d ts2tdi }|d urj	
|}t|}j	|fi |d }|sXfdd|D }t|t|krot dt| d	t| d
t|d  fdd|D }	tt|||	D ]a\}
\}} dd tj|D }t|t|krt dt| dt| dttt |D ]-\}}|rdj ddjg|  }|d | | ||tj d   }|||
< qqfdd|D }|d dd }|d dd}jd d|i|d }j||dgd |r3t|d }t|}d||jk< | |d< ti |||dS )!Nz+Provide at least one of `text` or `images`.tokenizer_init_kwargsr   zAInvalid input text. Please provide a string, or a list of stringsr    c                    s"   g | ]}d   jgt| qS ) )r3   r1   len.0r=   r8   r   r   
<listcomp>n   s   " z,Gemma3Processor.__call__.<locals>.<listcomp>z1Received inconsistently sized batches of images (z) and text (z).	num_cropsc                    s&   g | ]} fd dt t|D qS )c                    s   g | ]}  d qS )r   )pop)rD   _rG   r   r   rF   w   s    z7Gemma3Processor.__call__.<locals>.<listcomp>.<listcomp>)rangerB   rC   rJ   r   r   rF   w   s   & c                 S   s   g | ]}|  qS r   )start)rD   mr   r   r   rF   y   s    zPrompt contained z image tokens but received z images.zHere is the original image z0 and here are some crops to help you see better rA   c                    s   g | ]
}|  j jqS r   )replacer1   r5   )rD   promptrE   r   r   rF      s    r%   return_tensorsr"   Fr>   image)
modalities	input_ids   token_type_ids)datatensor_typer   )
ValueError_merge_kwargsr   r*   init_kwargs
isinstancestrlist	TypeErrorr)   fetch_imagesr   rB   r   rH   	enumerateziprefinditerr1   reversedr3   _check_special_mm_tokensnparray
zeros_liker0   tolistr   )r8   r=   r>   videosaudior9   output_kwargsimage_inputsbatched_imagesbatch_num_crops	batch_idxrO   image_indexesnumidxformatted_image_textrP   r"   text_inputs	array_idsmm_token_type_idsr   )rG   r8   r   __call__P   sj   
&

zGemma3Processor.__call__c                 K   sH   i }|dur| j gt| }dgt| }|||d tdi |S )a  
        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.

        Args:
            image_sizes (`list[list[int]]`, *optional*):
                The input sizes formatted as (height, width) per each image.

        Returns:
            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
            input modalities, along with other useful data.
        NrT   )num_image_tokensnum_image_patchesr   )r-   rB   updater	   )r8   image_sizesr9   vision_datary   rz   r   r   r   _get_num_multimodal_tokens   s   z*Gemma3Processor._get_num_multimodal_tokensc                 C   s0   | j jdg }| jj}dd |D }t|| S )NrU   c                 S   s   g | ]}|d kr|qS rJ   r   )rD   namer   r   r   rF      s    z5Gemma3Processor.model_input_names.<locals>.<listcomp>)r*   model_input_namesr)   r]   )r8   tokenizer_input_namesimage_processor_input_namesr   r   r   r      s   z!Gemma3Processor.model_input_names)Nr#   )NNNN)N)r   r   r   
attributesimage_processor_classtokenizer_classr   r7   r   r   r   r   r   r]   r   r   r   rx   r~   propertyr   __classcell__r   r   r;   r   r(   5   s2    

Kr(   )rb   typingr   r   numpyrf   feature_extraction_utilsr   image_utilsr   r   processing_utilsr   r	   r
   r   r   tokenization_utils_baser   r   utilsr   r   r   r(   __all__r   r   r   r   <module>   s    
