o
    i'                     @   s   d Z ddlZddlmZmZ ddlZddlZddlm	Z	 ddl
mZmZmZmZmZmZ ddlmZmZ ddlmZmZ dd	lmZmZ dd
lmZmZmZ ddlmZ G dd deZ ededG dd deZ!dgZ"dS )z#video processor class for GLM-4.1V.    N)OptionalUnion   )BatchFeature)OPENAI_CLIP_MEANOPENAI_CLIP_STDChannelDimensionPILImageResamplingSizeDictget_image_size)UnpackVideosKwargs)
TensorTypeadd_start_docstrings)BASE_VIDEO_PROCESSOR_DOCSTRINGBaseVideoProcessor)VideoMetadatagroup_videos_by_shapereorder_videos   )smart_resizec                   @   sz   e Zd ZU dZeeef ed< dZe	e ed< dZ
e	e ed< dZe	e ed< dZe	ee  ed< dZe	ee  ed< dS )Glm4vVideoProcessorInitKwargsNmax_image_size
patch_sizetemporal_patch_size
merge_size
image_mean	image_std)__name__
__module____qualname__r   dictstrint__annotations__r   r   r   r   r   listfloatr    r'   r'   d/home/ubuntu/.local/lib/python3.10/site-packages/transformers/models/glm4v/video_processing_glm4v.pyr   '   s   
 r   zfConstructs a fast GLM-4V image processor that dynamically resizes videos based on the original videos.aj  
        patch_size (`int`, *optional*, defaults to 14):
            The spacial patch size of the vision encoder.
        temporal_patch_size (`int`, *optional*, defaults to 2):
            The temporal patch size of the vision encoder.
        merge_size (`int`, *optional*, defaults to 2):
            The merge size of the vision encoder to llm encoder.
    c                        s`  e Zd ZejZdddZddiZeZ	e
ZdZdZdZdZdZdZdZdZdZeZd	ZdZd
dgZdee f fddZ	d(dee def fddZ	d(de dee!e"e#f  fddZ$dddejdddddddddfde%e&j' de(de(dee dede(de#de(d ee!e#e%e# f  d!ee!e#e%e# f  d"ee" d#ee" d$ee" d%ee!e)e*f  fd&d'Z+  Z,S ))Glm4vVideoProcessori 1  i )shortest_edgelongest_edger+   T      i,     pixel_values_videosvideo_grid_thwkwargsc                    sP   t  jdi | | jd ur$| jdd d u s | jdd d u r&tdd S d S )Nr*   r+   :size must contain 'shortest_edge' and 'longest_edge' keys.r'   )super__init__sizeget
ValueError)selfr1   	__class__r'   r(   r4   Q   s   
$zGlm4vVideoProcessor.__init__Nr5   returnc                    s6   |durd|vsd|vrt dt jdd|i|S )z
        Update kwargs that need further processing before being validated
        Can be overridden by subclasses to customize the processing of kwargs.
        Nr*   r+   r2   r5   r'   )r7   r3   _further_process_kwargs)r8   r5   r1   r9   r'   r(   r<   X   s   	z+Glm4vVideoProcessor._further_process_kwargsmetadatafpsc                    s0  du st dddu rtdj}|dur|n| j|d  jp+t j d }|| jkrHtt	| } fddt
|D }n%t| j }||krZtt
|}ntjd||dd	}	 fd
d|	D }t g }
}|D ]}||
vr|
| || qut|d@ r||d  t|S )a  
        Args:
            metadata (`VideoMetadata`):
                Metadata of the video containing information about total duration, fps and total number of frames.
            fps (`int` or `float`, *optional*):
                Target frames to sample per second. Defaults to `self.fps`.
        Returns:
            np.ndarray:
                Indices to sample video frames.
        Nr>   zAsked to sample frames per second but no video metadata was provided which is required when sampling in GLM4V. Please pass in `VideoMetadata` object or set `do_sample_frames=False`r   c              	      s*   g | ]}t  tt|j  qS r'   minr#   mathceilr>   ).0imax_frame_idxr=   requested_fpsr'   r(   
<listcomp>   s   * z5Glm4vVideoProcessor.sample_frames.<locals>.<listcomp>r   T)endpointc              	      s&   g | ]}t  tt|j qS r'   r?   )rC   t)rF   r=   r'   r(   rH      s   & )getattrr7   total_num_framesr>   durationroundmax_durationr#   rA   floorranger%   nplinspacesetaddappendlenarray)r8   r=   r>   r1   total_framesrN   nframe_indicesnum_samplestarget_secondsseenuniqidxr'   rE   r(   sample_framesf   s2   



z!Glm4vVideoProcessor.sample_framesgp?videosdo_convert_rgb	do_resizeinterpolation
do_rescalerescale_factordo_normalizer   r   r   r   r   return_tensorsc           .      K   sD  t |\}}i }| D ]J\}}|j\}}}}}|||}}}|rRt|||||| |j|jd\}}||| |||}| j|t||d|d}||||||}|||< qt	||}t |\}}i } i }!| D ]\}}t
|d tjd\}}| |||||	|
}|}"|"jd | dkr|"d d dd f d|d ddd}#tj|"|#gdd}"|"jd d	 \}$}%}&|%| }%|| || }'}(|"|$|%||&|'| |||(| ||
}"|"ddd
dddd	ddd
}"|"|$|%|' |( |&| | | })|)| |< |%|'|(gg|$ |!|< qjt	| |}*t	|!|}!tj|*dd}+t|!},|+|,d}-t|-|dS )N)
num_framesheightwidthtemporal_factorfactor
min_pixels
max_pixels)rl   rm   )r5   rf   r   )channel_dimr   rK   )dimr               r-      	   )r/   r0   )datatensor_type)r   itemsshaper   r*   r+   viewresizer
   r   r   r   FIRSTrescale_and_normalizerepeattorchcatpermutereshapetensorr   ).r8   rc   rd   re   r5   rf   rg   rh   ri   r   r   r   r   r   rj   r1   grouped_videosgrouped_videos_indexresized_videos_groupedr}   stacked_videosBTCHWrk   rl   rm   resized_heightresized_widthresized_videosprocessed_videos_groupedprocessed_gridspatchesrepeats
batch_sizegrid_tchannelgrid_hgrid_wflatten_patchesprocessed_videosr/   r0   rz   r'   r'   r(   _preprocess   s   
	


&



zGlm4vVideoProcessor._preprocess)N)-r   r   r    r	   BICUBICresampler5   r   r   r   r   r   re   rg   ri   rd   do_sample_framesr   r   rP   r   r   valid_kwargsrk   r>   model_input_namesr   r4   r   r
   r!   r<   r   r   r#   r&   rb   r%   r   Tensorboolr"   r   r   __classcell__r'   r'   r9   r(   r)   0   s    
	
5	
r)   )#__doc__rA   typingr   r   numpyrS   r   image_processing_utilsr   image_utilsr   r   r   r	   r
   r   processing_utilsr   r   utilsr   r   video_processing_utilsr   r   video_utilsr   r   r   image_processing_glm4vr   r   r)   __all__r'   r'   r'   r(   <module>   s*    	 
A