o
    ίi9                     @   sp   d dl Z d dlZd dlmZmZmZmZmZ d dlZ	d dl
Z
d dlmZ d dlmZ e eZG dd dZdS )    N)AnyCallableDictListOptional)Image)DatasetConfc                
   @   s   e Zd Zdee dedee defddZdee	e
f dejjfd	d
Zdee	e
f dejjfddZ	dde	de	deee	e	f  fddZdS )VisionPreprocessor	transform	tokenizermax_video_framesdataset_configc                 C   s*   || _ || _d| _|jr|j| _|| _d S )N )mllm_tokenizerr
   root_dirr   )selfr
   r   r   r    r   J/home/ubuntu/.local/lib/python3.10/site-packages/core/data/preprocessor.py__init__   s   
zVisionPreprocessor.__init__rowrngc              
   C   sF   z|  ||W S  ty" } ztd|  W Y d }~d S d }~ww )NzError processing row: )process	Exceptionloggererror)r   r   r   er   r   r   __call__   s   zVisionPreprocessor.__call__c                 C   sl  ~d|v r
|d }n	| j |d dd}d|v r:d|v rd|v s#J d|d |d }}|d }| jd	 ||||}d }d}d
|v rg }	|d
 }
t|
trP|
g}
g }|
D ]8}| jratj| j|}zt	|
d}|| W qT ty } ztd|  W Y d }~ d S d }~ww | jrt|dkr| jd
 }||d \}	}n| jd }||\}	}t|	jdkr|	d}	|	}t|
dkrdnd
}nNd|v r|d }|dd }|dd }|dd }| jrtj| j|}|| j|||f}| jd |\}}|}d}ntdd| jd
 j| jd
 j}d}| j|||d}|jr2||j|j|j|j|d}|S d }|S )Nconversationstextr   )captionpromptbboxwidthheightzibbox is present in the annotation, however image width or height is not specified, which is not expected.regionimageRGBz6loading image failed because of the following error:
    r   video   multi_image
start_timebbox_mapend_time)r   media
media_type)r.   text_idsresponse_pos	image_posnum_image_chunksr/   )get_conversationr
   
isinstancestrr   ospathjoinr   openconvertappendr   r   infolen_process_multiple_images_pilshape	unsqueezegetr   torchonessizer   is_validr0   r1   r2   num_media_chunks)r   r   r   r   whbboxesr.   r/   processed_imagesimage_files
pil_images
image_filer%   r   r
   _
video_filer+   r,   r-   
video_infor(   tokenized_sampleoutr   r   r   r   &   s   





zVisionPreprocessor.processNr   r    returnc                 C   s$   d|dur|nddd|dg}|S )z
        Converts plain caption to conversation.

        Args:
            caption (str): plain caption

        Returns:
            List[Dict[str, str]]: conversation
        humanNr   )fromvalue	assistantr   )r   r   r    r   r   r   r   r4      s   z#VisionPreprocessor.get_conversation)N)__name__
__module____qualname__r   r   intr   r   r   r6   r   nprandomRandomStater   r   r   r4   r   r   r   r   r	      s(    
Zr	   )loggingr7   typingr   r   r   r   r   numpyr]   rC   PILr   apps.plm.dataset_confr   	getLoggerrY   r   r	   r   r   r   r   <module>   s   
