o
    ei                     @   s   d dl Z d dlZddlmZ ddlmZmZ ddlm	Z	m
Z
mZmZ ddlmZmZ ddlmZmZ G dd	 d	e
d
dZeG dd deZdgZdS )    N   )BatchFeature)
ImageInputmake_nested_list_of_images)MultiModalDataProcessingKwargsProcessorMixinUnpack)PreTokenizedInput	TextInput)auto_docstring	to_py_objc                   @   s(   e Zd ZddddddddddZd	S )
Gemma3ProcessorKwargsFT)paddingreturn_mm_token_type_ids      g333333?)do_convert_rgbdo_pan_and_scanpan_and_scan_min_crop_sizepan_and_scan_max_num_crops"pan_and_scan_min_ratio_to_activate)text_kwargsimages_kwargsN)__name__
__module____qualname__	_defaults r   r   j/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/transformers/models/gemma3/processing_gemma3.pyr      s    
r   F)totalc                
       s~   e Zd Z		ddef fddZe		ddedB deeB e	e B e	e B de
e d	efd
dZdddZedd Z  ZS )Gemma3ProcessorNr   image_seq_lengthc                    sh   || _ |j| _|j| _|j| _d|jg| }d|j | |j d| _t jd|||d| d S )N z

)image_processor	tokenizerchat_templater   )	r"   image_token_id	boi_tokenimage_tokenjoin	eoi_tokenfull_image_sequencesuper__init__)selfr$   r%   r&   r"   kwargsimage_tokens_expanded	__class__r   r   r.   ,   s   
zGemma3Processor.__init__imagestextr0   returnc                    sz  |d u r|d u rt djtfdjji|}t|tr"|g}nt|ts2t|d ts2tdi }|d urj	
|}t|}j	|fi |d }|sXfdd|D }t|t|krot dt| d	t| d
t|d  fdd|D }tt|||D ]a\}\}	} dd tj|	D }
t|t|
krt dt|
 dt| dttt |
D ]-\}}|rdj ddjg|  }|	d | | |	|tj d   }	|	||< qqfdd|D }|d dd }|d dd}jd d|i|d }j||dgd |r3t|d }t|}d||jk< | |d< ti |||dS )!Nz+Provide at least one of `text` or `images`.tokenizer_init_kwargsr   zAInvalid input text. Please provide a string, or a list of stringsr   c                    s"   g | ]}d   jgt| qS ) )r*   r(   len.0r4   r/   r   r   
<listcomp>_   s   " z,Gemma3Processor.__call__.<locals>.<listcomp>z1Received inconsistently sized batches of images (z) and text (z).	num_cropsc                    s&   g | ]} fd dt t|D qS )c                    s   g | ]}  d qS )r   )pop)r;   _r>   r   r   r=   h   s    z7Gemma3Processor.__call__.<locals>.<listcomp>.<listcomp>)ranger9   r:   rA   r   r   r=   h   s   & c                 S   s   g | ]}|  qS r   )start)r;   mr   r   r   r=   j   s    zPrompt contained z image tokens but received z images.zHere is the original image z0 and here are some crops to help you see better r8   c                    s   g | ]
}|  j jqS r   )replacer(   r,   )r;   promptr<   r   r   r=   |   s    r   return_tensorsr   Fr5   image)
modalities	input_ids   token_type_ids)datatensor_typer   )
ValueError_merge_kwargsr   r%   init_kwargs
isinstancestrlist	TypeErrorr$   fetch_imagesr   r9   r   r?   	enumerateziprefinditerr(   reversedr*   _check_special_mm_tokensnparray
zeros_liker'   tolistr   )r/   r4   r5   r0   output_kwargsimage_inputsbatched_imagesbatch_num_crops	batch_idxrF   image_indexesnumidxformatted_image_textrG   r   text_inputs	array_idsmm_token_type_idsr   )r>   r/   r   __call__B   sj   
&

zGemma3Processor.__call__c                 K   sH   i }|dur| j gt| }dgt| }|||d tdi |S )a  
        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.

        Args:
            image_sizes (`list[list[int]]`, *optional*):
                The input sizes formatted as (height, width) per each image.

        Returns:
            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
            input modalities, along with other useful data.
        NrK   )num_image_tokensnum_image_patchesr   )r"   r9   updater   )r/   image_sizesr0   vision_datarn   ro   r   r   r   _get_num_multimodal_tokens   s   z*Gemma3Processor._get_num_multimodal_tokensc                 C   s0   | j jdg }| jj}dd |D }t|| S )NrL   c                 S   s   g | ]}|d kr|qS rA   r   )r;   namer   r   r   r=      s    z5Gemma3Processor.model_input_names.<locals>.<listcomp>)r%   model_input_namesr$   rT   )r/   tokenizer_input_namesimage_processor_input_namesr   r   r   ru      s   z!Gemma3Processor.model_input_names)Nr   )NN)N)r   r   r   intr.   r   r   r   r
   rT   r	   r   r   rm   rs   propertyru   __classcell__r   r   r2   r   r!   *   s*    
Ir!   )rY   numpyr]   feature_extraction_utilsr   image_utilsr   r   processing_utilsr   r   r   r	   tokenization_utils_baser
   r   utilsr   r   r   r!   __all__r   r   r   r   <module>   s    
