o
    
۾iL                     @   s   d dl mZ d dlZd dlZd dlmZmZ d dlmZ d dl	m
Z
mZmZ d dlmZmZ d dlmZ dgZd	ZG d
d de
ddZG dd deZede dS )    )cached_propertyN)AutoProcessorBatchFeature)
ImageInput)ProcessingKwargsProcessorMixinUnpack)PreTokenizedInput	TextInput)convert_image_modeOvisProcessoric                   @   s    e Zd ZddiddddZdS )OvisProcessorKwargspaddingFTpt)do_convert_rgbreturn_tensors)text_kwargsimages_kwargsN)__name__
__module____qualname__	_defaults r   r   [/home/ubuntu/.local/lib/python3.10/site-packages/vllm/transformers_utils/processors/ovis.pyr   (   s    
r   F)totalc                	       s   e Zd ZdZddgZg dZdZdZ					d) fd	d
	Ze	dd Z
		d*dedeeB ee B ee B dee defddZdee dejfddZdd Zdd Zdd Zdd Zdejjfdd Zd!d" Zd#d$ Zd%d& Z e!d'd( Z"  Z#S )+r   a  
    Constructs an Ovis processor which wraps an Ovis image processor and a Qwen2 tokenizer into a single processor.
    [`OvisProcessor`] offers all the functionalities of [`Qwen2VLImageProcessor`] and [`Qwen2TokenizerFast`]. See the
    [`~OvisProcessor.__call__`] and [`~OvisProcessor.decode`] for more information.
    Args:
        image_processor ([`Qwen2VLImageProcessor`], *optional*):
            The image processor is a required input.
        tokenizer ([`Qwen2TokenizerFast`], *optional*):
            The tokenizer is a required input.
        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
            in a chat into a tokenizable string.
    image_processor	tokenizer)chat_templateimage_pad_tokenimage_segment_lenAutoImageProcessorAutoTokenizerN   c                    s(   d| _ || _|| _t j|||d d S )Nz<image>)r   )image_tokenr   r   super__init__)selfr   r   r   r   r   kwargs	__class__r   r   r%   H   s   	zOvisProcessor.__init__c              	   C   s*   | j  | j }ddddddd|d}|S )	Ni8iiiiii)r#   
image_atomimage_startimage_prefiximage_col_sepimage_row_sep	image_end	image_pad)r   	get_vocabr   )r&   image_pad_token_idextra_special_tokensr   r   r   r3   V   s   
z"OvisProcessor.extra_special_tokensimagestextr'   returnc                 K   s  | dd}| dd}| jtfd| jji|}i }|durXg }g }	g }
t|tr,|n|gD ]"}| jd|||d|d \}}}|| |	| |
| q/|rX|	|d	< |durt|tsd|g}| 	|}| 
d
}g }d}|D ]L}||v rd	|v r|t|d	 k r| }g }t|D ]\}}||kr|d	 | }|| |d7 }q|| qtj|tjd}ntd|| qt|rt|}ntjg tjd}td|id}|r||d< |
|d< |S t|dS )a  
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwrags` arguments to
        Qwen2VLImageProcessor's [`~Qwen2VLImageProcessor.__call__`] if `vision_infos` is not `None`.
            Args:
                images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
                    The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                    tensor. Both channels-first and channels-last formats are supported.
                text (`str`, `list[str]`, `list[list[str]]`):
                    The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                    (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                    `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
                videos (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
                    The image or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
                    tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
                return_tensors (`str` or [`~utils.TensorType`], *optional*):
                    If set, will return tensors of a particular framework. Acceptable values are:
                    - `'tf'`: Return TensorFlow `tf.constant` objects.
                    - `'pt'`: Return PyTorch `torch.Tensor` objects.
                    - `'np'`: Return NumPy `np.ndarray` objects.
                    - `'jax'`: Return JAX `jnp.ndarray` objects.
            Returns:
                [`BatchFeature`]: A [`BatchFeature`] with the following fields:
                - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
                - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
                  `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
                  `None`).
                - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
                - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
                - **image_grid_thw** -- List of image 3D grid in LLM. Returned when `images` is not `None`.
                - **video_grid_thw** -- List of video 3D grid in LLM. Returned when `videos` is not `None`.
                - **second_per_grid_ts** -- List of video seconds per time grid. Returned when `videos` is not `None`.
        max_partition	   covering_thresholdg?tokenizer_init_kwargsN)imager7   r9   r   image_placeholdersr#   r      dtypezZMismatch between the images you provided and the number of placeholder present in the text	input_ids)datapixel_valuesgridsr   )pop_merge_kwargsr   r   init_kwargs
isinstancelistpreprocess_imageappend_tokenize_with_image_symbolget_token_valuelentolist	enumerateextendtorchtensorlongRuntimeErrorstackr   )r&   r4   r5   r'   r7   r9   output_kwargsimage_featuresprocessed_imagesimage_placeholders_listrC   r;   rB   r<   gridtokenized_batched_textimage_token_idreplaced_ids_listidx
ids_tensorids_listnew_idsitoken_idplaceholder_idsreplaced_and_tokenized_idsoutputr   r   r   __call__e   s   ,







zOvisProcessor.__call__	text_listc           	         s   g }|D ]6} fdd|  jD }g }t|}t|D ]\}}|| ||d k r4| d q|| qtj|tj	dS )Nc                    s   g | ]
} j |d djqS )F)add_special_tokens)r   r@   ).0chunkr&   r   r   
<listcomp>   s    z=OvisProcessor._tokenize_with_image_symbol.<locals>.<listcomp>r=   r#   r>   )
splitr#   rM   rO   rP   rJ   rL   rQ   rR   rS   )	r&   rh   batch_token_idsr5   text_chunks	token_ids	num_chuckrb   rk   r   rl   r   rK      s   


z)OvisProcessor._tokenize_with_image_symbolc                 C   sT   | j j}d|v r|d  }}||fS d|v r&d|v r&|d }|d }||fS td)Nshortest_edgeheightwidthz3Can't parse image size from image_processor config.)r   size
ValueError)r&   rv   ru   rt   r   r   r   get_image_size   s   zOvisProcessor.get_image_sizec                 C   s
   | j | S N)r3   )r&   tokr   r   r   rL   
  s   
zOvisProcessor.get_token_valuec                 C   s   |  d|  d|  dg}|d |d  dkrRt|d D ]3}t|d D ]}||  d ||d d k r@||  d q&||d d k rQ||  d q||  d |S )	Nr+   r*   r,   r   r=   r-   r.   r/   )rL   rangerJ   )r&   rZ   r<   rcr   r   r   construct_image_indicators  s    z(OvisProcessor.construct_image_indicatorsc                 C   sT   |  |}| d}| d}g }|D ]}|| ||kr'||g| j  q|S )Nr*   r0   )r~   rL   rJ   rP   r   )r&   rZ   r<   image_atom_token_idimage_padding_token_idpadded_placeholder_tokenstokenr   r   r   construct_image_placeholders  s   




z*OvisProcessor.construct_image_placeholdersr;   c                    s  dt jjffdddttttttf  fdddd   fd	d
}|r/td }|d |d kr?td|d |}|}	fdd|	D }
t|
dkrb|
	d t
jfdd|
D dd}|}t
||t
|fS )Nimgc           
         s:  | j \}}||kr| }}n||kr|}t|| | }n
|}t|| | }t||d}jj| | dd }tjdd||g|j|jd}|j	dd  \}}||kre||d d d d d d d d f< |S ||kr|| d }	||d d d d d d |	|	| f< |S || d }	||d d d d |	|	| d d f< |S )N)rt   ru   )rv   r   rB   r=      )r?   device   )
rv   intdictr   
preprocessrQ   zerosr?   r   shape)
r   sidewh	new_width
new_heightnew_sizerB   square_values
from_index)r   r&   r   r   _preprocess7  s<   

 "	"z3OvisProcessor.preprocess_image.<locals>._preprocessr6   c                 S   s   | j \}}||d  }||d  }g }t|d D ]<}t|d D ]3}|| }	|| }
||d d kr5|n|d | }||d d krE|n|d | }||	|
||f q!q|S Nr   r=   )rv   r{   rJ   )r   rZ   r   r   
row_height	col_width	partitionrowcolleftupperrightlowerr   r   r   
_partition[  s   
  z2OvisProcessor.preprocess_image.<locals>._partitionc                 S   sF   ||  }|| }t ||t||}}||kr|| | }|}|| S ry   )maxmin)r   r   r   r   r   r   r   r   r   r   _covering_areak  s   z6OvisProcessor.preprocess_image.<locals>._covering_areac                    s   | j d | j d  }g }tdd D ]}tdd D ]}|| kr+|||f qqg }g }|D ]-}| |}	t fdd|	D | }
|
dksNJ |||
f |
kr`|||
f q3t|dkrst|dd dd d S t|d	d dd d S )
Nr   r=   c                    s   g | ]} g |R  qS r   r   rj   p)r   r   r   r   rm     s    zJOvisProcessor.preprocess_image.<locals>._get_best_grid.<locals>.<listcomp>g      ?c                 S   s"   | d d | d d  | d  fS r   r   xr   r   r   <lambda>     " zHOvisProcessor.preprocess_image.<locals>._get_best_grid.<locals>.<lambda>)keyc                 S   s"   | d  | d d | d d  fS )Nr=   r   r   r   r   r   r   r     r   )rv   r{   rJ   sumrM   sorted)r   r   img_areacandidate_gridsrb   j	all_grids
good_gridsrZ   r   covering_ratio)r   r   r9   r7   )r   r   _get_best_gridt  s0   
z6OvisProcessor.preprocess_image.<locals>._get_best_gridRGBr   r=   z(get_image_size() returns non-square sizec                    s   g | ]}  |qS r   )cropr   )r;   r   r   rm         z2OvisProcessor.preprocess_image.<locals>.<listcomp>c                    s   g | ]} |qS r   r   )rj   r   )r   r   r   r   rm     r   )dim)PILImagerH   tupler   r   rx   rw   rM   insertrQ   catr   rR   )r&   r;   r7   r9   r   r   r   sidesrZ   r   cropsrB   r<   r   )	r   r   r   r9   r;   r7   r   r&   r   r   rI   /  s$   $	



zOvisProcessor.preprocess_imagec                 O      | j j|i |S )z
        This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        r   batch_decoder&   argsr'   r   r   r   r        zOvisProcessor.batch_decodec                 O   r   )z
        This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        )r   decoder   r   r   r   r     r   zOvisProcessor.decodec                 C   s   | j j|dddS )a  
        Post-process the output of the model to decode the text.
        Args:
            generated_outputs (`torch.Tensor` or `np.ndarray`):
                The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
                or `(sequence_length,)`.
        Returns:
            `list[str]`: The decoded text.
        TF)skip_special_tokensclean_up_tokenization_spacesr   )r&   generated_outputsr   r   r   post_process_image_text_to_text  s
   
z-OvisProcessor.post_process_image_text_to_textc                 C   s,   | j j}| jj}tt|| }|dg S )Nsecond_per_grid_ts)r   model_input_namesr   rH   r   fromkeys)r&   tokenizer_input_namesimage_processor_input_namesnames_from_processorr   r   r   r     s   
zOvisProcessor.model_input_names)NNNNr"   )NN)$r   r   r   __doc__
attributesvalid_kwargsimage_processor_classtokenizer_classr%   r   r3   r   r
   r	   rH   r   r   r   rg   strrQ   
LongTensorrK   rx   rL   r~   r   r   r   rI   r   r   r   propertyr   __classcell__r   r   r(   r   r   4   sZ    

 
s)	functoolsr   r   rQ   transformersr   r   transformers.image_utilsr   transformers.processing_utilsr   r   r   $transformers.tokenization_utils_baser	   r
   vllm.multimodal.imager   __all__	IGNORE_IDr   r   registerr   r   r   r   <module>   s       