o
    QiMR                     @   s  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlm	Z	 d dl
mZ d dlmZ d dlmZmZmZmZmZmZ d dlZd dlZd dlZd dlZd dlmZ d dlmZ d dlmZmZ d d	lm Z  d
Z!dZ"dZ#dZ$dZ%dZ&dZ'dZ(dZ)dZ*dZ+e,e-ej./ddZ0e1e2Z3de,de,de,fddZ4de,de,de,fddZ5de,de,de,fddZ6dLde,de,de,dee, d ee, dee,e,f fd!d"Z7d#ejdejfd$d%Z8dMd'ee9ee9ejf f d(e,dejfd)d*Z:d'ee9ef d+e,d,ee,e-f de,fd-d.Z;d'ee9ef deej<e-f fd/d0Z=de>fd1d2Z?d'ee9ef d+e,d,e-dee,e,e,f fd3d4Z@d'ee9ef deej<e-f fd5d6ZAde>fd7d8ZBd'ee9ef deej<e-f fd9d:ZCeAe=eCd;ZDeEd<dZFed=d>de9fd?d@ZG	A	AdNd'ee9ef d(e,dBe>dCe>deej<eej f f
dDdEZHdFeeee9ef  eeee9ef   f deee9ef  fdGdHZI	A	A	&dOdFeeee9ef  eeee9ef   f dIe>dCe>d(e,deeeej  eeeej<eej f   eee9ef  f f
dJdKZJdS )P    N)ThreadPoolExecutor)	lru_cache)BytesIO)AnyDictListOptionalTupleUnion)version)Image)io
transforms)InterpolationMode         i @     i          @   MODEL_SEQ_LENi  numberfactorreturnc                 C   s   t | | | S )zFReturns the closest integer to 'number' that is divisible by 'factor'.)roundr   r    r   W/home/ubuntu/.local/lib/python3.10/site-packages/qwen_omni_utils/v2_5/vision_process.pyround_by_factor)   s   r   c                 C      t | | | S )z]Returns the smallest integer greater than or equal to 'number' that is divisible by 'factor'.)mathceilr   r   r   r   ceil_by_factor.      r"   c                 C   r   )zYReturns the largest integer less than or equal to 'number' that is divisible by 'factor'.)r    floorr   r   r   r   floor_by_factor3   r#   r%   heightwidth
min_pixels
max_pixelsc                 C   s   |dur|nt |d  }|dur|nt|d  }||ks J dt| |t| | tkr>tdt dt| |t| |  t|t| |}t|t||}|| |krot| | | }t	| | |}t	|| |}||fS || |k rt|| |  }t
| | |}t
|| |}||fS )a+  
    Rescales the image so that the following conditions are met:

    1. Both dimensions (height and width) are divisible by 'factor'.
    2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
    3. The aspect ratio of the image is maintained as closely as possible.
    Nr   zDThe max_pixels of image must be greater than or equal to min_pixels.z+absolute aspect ratio must be smaller than z, got )IMAGE_MAX_TOKEN_NUMIMAGE_MIN_TOKEN_NUMmaxmin	MAX_RATIO
ValueErrorr   r    sqrtr%   r"   )r&   r'   r   r(   r)   h_barw_barbetar   r   r   smart_resize8   s&   r4   	pil_imagec                 C   s>   | j dkrtd| jd}|j| |  d d |S | dS )NRGBARGB)   r8   r8      )mask)moder   newsizepastesplitconvert)r5   white_backgroundr   r   r   to_rgbT   s
   

rB      eleimage_patch_sizec              	   C   s  d| v r	| d }n| d }d }t |t }t|tjr|}n|ds(|dratj|dd)}|  t|j	}t
t|}W d    n1 sLw   Y  W d    n1 s[w   Y  nG|drpt|dd  }n8|d	rd
|v r|d
d\}}t|}	t|	}t
t|}W d    n1 sw   Y  nt|}|d u rtd| t|}d| v rd| v rt| d | d |d\}
}n$|j\}}| dt|d  }| dt|d  }t|||||d\}
}|||
f}|S )Nimage	image_urlhttp://https://T)streamfile://   z
data:imagezbase64,   zRUnrecognized image input, support local path, http url, base64 and PIL.Image, got resized_heightresized_widthr   r(   r   r)   r   r(   r)   )intSPATIAL_MERGE_SIZE
isinstancer   
startswithrequestsgetraise_for_statusr   contentcopydeepcopyopenr?   base64	b64decoder/   rB   r4   r=   r+   r*   resize)rD   rE   rF   	image_objpatch_factorresponsebio_base64_datadatarN   rO   r'   r&   r(   r)   r   r   r   fetch_image]   s`   







rg   total_frames	video_fpsc                 C   s   d| v rd| v rJ dd| v rt | d t}nA| dt}t| dtt}t| dtt|t}|| | }||krIt	
d| d| d ttt||||}t|t}t|kra||ksotd	t d
| d| d|S )aa  calculate the number of frames for video used for model inputs.

    Args:
        ele (dict): a dict contains the configuration of video.
            support either `fps` or `nframes`:
                - nframes: the number of frames to extract for model inputs.
                - fps: the fps to extract frames for model inputs.
                    - min_frames: the minimum number of frames of the video, only used when fps is provided.
                    - max_frames: the maximum number of frames of the video, only used when fps is provided.
        total_frames (int): the original total number of frames of the video.
        video_fps (int | float): the original fps of the video.

    Raises:
        ValueError: nframes should in interval [FRAME_FACTOR, total_frames].

    Returns:
        int: the number of frames for video used for model inputs.
    fpsnframesz%Only accept either `fps` or `nframes`
min_frames
max_frameszsmart_nframes: nframes[z] > total_frames[]znframes should in interval [z, z], but got .)r   FRAME_FACTORrW   FPSr"   FPS_MIN_FRAMESr%   r-   FPS_MAX_FRAMESloggerwarningr,   r/   )rD   rh   ri   rk   rj   rl   rm   r   r   r   smart_nframes   s   
rv   c                 C   s  | d }t tjt dk r&d|v sd|v rtd d|v r&|dd }t }tj|| 	d	d
| 	ddddd\}}}|
d|d }}td|d|d|dt | dd	 t| ||d}td|d |  }	|t|d | }
||	 }t||	|dd}|||
fS )a  read video using torchvision.io.read_video

    Args:
        ele (dict): a dict contains the configuration of video.
        support keys:
            - video: the path of video. support "file://", "http://", "https://" and local path.
            - video_start: the start time of video.
            - video_end: the end time of video.
    Returns:
        torch.Tensor: the video tensor with shape (T, C, H, W).
    videoz0.19.0rH   rI   zVtorchvision < 0.19.0 does not support http/https video path, please upgrade to 0.19.0.rK   rL   Nvideo_start        	video_endsecTCHW)	start_ptsend_ptspts_unitoutput_formatr   ri   ztorchvision:  video_path=, total_frames=, video_fps=, time=.3fsrh   ri   rM   ư>torchvisionrj   frames_indicestotal_num_framesvideo_backend)r   parser   __version__warningswarntimer   
read_videorW   r=   rt   inforv   torchlinspacer   longr,   dict)rD   
video_pathstrw   audior   rh   ri   rk   idx
sample_fpsvideo_metadatar   r   r   _read_video_torchvision   s6   


.
r   c                  C      dd l } | jdd uS )Nr   decordimportlib.utilutil	find_spec	importlibr   r   r   is_decord_available      r   c           
      C   s`  |dkrt d|dkrt d| dd}| dd}|du r+|du r+d|d |fS || }|durCtdt||}t|| }nd}|dur`tdt||}t|| }	t|	|d }	n|d }	||	krt d	| d
|durs|nd d|	 d
|dur|n| d|dd| d| dtd|d|	d|d|d|d|d ||	|	| d fS )a  
    Calculate the start and end frame indices based on the given time range.

    Args:
        ele (dict): A dictionary containing optional 'video_start' and 'video_end' keys (in seconds).
        total_frames (int): Total number of frames in the video.
        video_fps (float): Frames per second of the video.

    Returns:
        tuple: A tuple containing (start_frame, end_frame, frame_count).

    Raises:
        ValueError: If input parameters are invalid or the time range is inconsistent.
    r   z#video_fps must be a positive numberz'total_frames must be a positive integerrx   Nrz   rM   ry   z Invalid time range: Start frame z (at zs) exceeds end frame zs). Video duration: z.2fzs (z
 frames @ zfps)z)calculate video frame range: start_frame=z, end_frame=r   z from video_start=z, video_end=r   r   )	r/   rW   r,   r-   r    r!   r$   rt   r   )
rD   rh   ri   rx   rz   max_durationvideo_start_clampedstart_framevideo_end_clamped	end_framer   r   r   calculate_video_frame_range   sD   0r   c                 C   s   ddl }| d }t }||}t|| }}t| ||\}}}t| ||d}	t|||		 
  }
||
 }t|dddd}td|d	|d
|dt | dd	 |	t|d | }t||
|dd}|||fS )a  read video using decord.VideoReader

    Args:
        ele (dict): a dict contains the configuration of video.
        support keys:
            - video: the path of video. support "file://", "http://", "https://" and local path.
            - video_start: the start time of video.
            - video_end: the end time of video.
    Returns:
        torch.Tensor: the video tensor with shape (T, C, H, W).
    r   Nrw   r   r9   rM   r   zdecord:  video_path=r   r   r   r   r   r   r   r   )r   r   VideoReaderlenget_avg_fpsr   rv   r   r   r   r   tolist	get_batchasnumpytensorpermutert   r   r,   r   )rD   r   r   r   vrrh   ri   r   r   rk   r   rw   r   r   r   r   r   _read_video_decord$  s.   

.
r   c                  C   r   )Nr   
torchcodecr   r   r   r   r   is_torchcodec_availableL  r   r   c                 C   s   ddl m} ttjdd}td|  | d }t }|||d}|j	j
}|j	j}t| ||\}}	}t| ||d}
t||	|
   }|
t|d	 | }|j|d
j}td|d|d|dt | dd	 t|||dd}|||fS )a  read video using torchcodec.decoders.VideoDecoder

    Args:
        ele (dict): a dict contains the configuration of video.
        support keys:
            - video: the path of video. support "file://", "http://", "https://" and local path.
            - video_start: the start time of video.
            - video_end: the end time of video.
    Returns:
        torch.Tensor: the video tensor with shape (T, C, H, W).
    r   )VideoDecoderTORCHCODEC_NUM_THREADSr   zset TORCHCODEC_NUM_THREADS: rw   )num_ffmpeg_threadsr   r   )indicesztorchcodec:  video_path=r   r   r   r   r   r   r   )torchcodec.decodersr   rR   osenvironrW   rt   r   r   metadataaverage_fps
num_framesr   rv   r   r   r   r   r   r,   get_frames_atrf   r   )rD   r   r   r   r   decoderri   rh   r   r   rk   r   r   rw   r   r   r   r   _read_video_torchcodecR  s2   
.
r   )r   r   r   FORCE_QWENVL_VIDEO_READERrM   )maxsizec                  C   sD   t d urt } nt rd} nt rd} nd} td|  dtjd | S )Nr   r   r   zqwen-vl-utils using z to read video.)file)r   r   r   printsysstderr)video_reader_backendr   r   r   get_video_reader_backend  s   r   Freturn_video_sample_fpsreturn_video_metadatac              
      s  |t  t  }t  }t| d trLt }zt| | \}}}	W n tyK }
 zt	d| d|
  td | \}}}	W Y d }
~
nd }
~
ww t| d t
tfsWJ |  dd  dd  ttt| d }t|d  fdd| d D }d	d |D }W d    n1 sw   Y  tt|t}t||k r||d
 g|t|   | dd}	tdd |D }d|	}t|dd tt|D ||	 | d}|j\}}}}| d|}| dt  d }tt||| t t|d }| d|}||krt	d| d| d t||}d| v r;d| v r;t| d | d d\}}nt||||d\}}tj j!|||gt"j#dd$ }|r\||fn|}|re||	fS |S )Nrw   zvideo_reader_backend z) error, use torchvision as default, msg: r   type)max_workersc                    s"   g | ]}  td |iqS )rF   )submitrg   ).0video_elementexecutorimage_factorprocess_infor   r   
<listcomp>  s    zfetch_video.<locals>.<listcomp>c                 S   s   g | ]}|  qS r   )result)r   futurer   r   r   r     s    r   r   c              	   S   s&   g | ]}t t|d ddqS )r   r   rM   )r   
from_numpynparray	transpose)r   rF   r   r   r   r     s    raw_fpsc                 S   s   g | ]}|qS r   r   )r   ir   r   r   r     s    )rj   r   r   r(   total_pixelsg?g?r)   zThe given max_pixels[z] exceeds limit[z].rN   rO   rP   rQ   T)interpolation	antialias)%rS   VIDEO_MIN_TOKEN_NUMVIDEO_MAX_TOKEN_NUMrT   strr   VIDEO_READER_BACKENDS	Exceptionrt   ru   listtuplerZ   popr-   MAX_NUM_WORKERS_FETCH_VIDEOr   r   r"   rp   extendrW   r   stackr   rangeshaper   r,   rR   r4   r   
functionalr_   r   BICUBICfloat)rD   rE   r   r   VIDEO_FRAME_MIN_PIXELSVIDEO_FRAME_MAX_PIXELSr   rw   r   r   er   futures
image_listrk   r   rd   r&   r'   r(   r   r)   max_pixels_supposedrN   rO   final_videor   r   r   fetch_video  s   

 


r   conversationsc                 C   s~   g }t | d tr| g} | D ].}|D ])}t |d tr;|d D ]}d|v s5d|v s5d|v s5|dddv r:|| qqq|S )	Nr   rY   rF   rG   rw   r   text)rF   rG   rw   )rT   r   r   rW   append)r   vision_infosconversationmessagerD   r   r   r   extract_vision_info  s   

r  return_video_kwargsc                 C   s   t | }g }g }g }|D ]0}d|v sd|v r |t||d qd|v r9t|d||d\}	}
||
 ||	 qtdt|dkrEd }t|dkrMd }d	d
i}|sZ|d|i |ra|||fS ||fS )NrF   rG   )rE   rw   T)r   rE   r   z,image, image_url or video should in content.r   do_sample_framesFrj   )r  r  rg   r   r/   r   update)r   r  r   rE   r  image_inputsvideo_inputsvideo_sample_fps_listvision_infovideo_inputvideo_sample_fpsvideo_kwargsr   r   r   process_vision_info  s0   


r  )NN)rC   )rC   FF)FFrC   )Kr]   rZ   loggingr    r   r   r   r   concurrent.futuresr   	functoolsr   r   r   typingr   r   r   r   r	   r
   numpyr   rV   r   r   	packagingr   PILr   r   torchvision.transformsr   r.   rS   r+   r*   r   r   rq   rp   rr   rs   r   rR   r   r   rW   r   	getLogger__name__rt   r   r"   r%   r4   rB   r   rg   rv   Tensorr   boolr   r   r   r   r   r   getenvr   r   r   r  r  r   r   r   r   <module>   s     
4,	3


(

,

:

(

+
BP&: