o
    -i-k                     @   sJ  d dl Z d dlmZ d dlmZ d dlmZmZmZ d dl	Z
d dlmZ er*d dlZd dlmZ d dlmZ eeZdejdeeef d	ejfd
dZdejded	ejfddZdejded	ejfddZG dd dZe ZedG dd deZedG dd deZedG dd deZ edG dd deZ!dS )     N)abstractmethod)BytesIO)TYPE_CHECKINGAnycast)init_logger)ExtensionManagerframessizereturnc                 C   sd   | j \}}}}|\}}tj||||f| jd}dd l}t| D ]\}	}
||
||f}|||	< q|S )Ndtyper   )shapenpemptyr   cv2	enumerateresize)r	   r
   
num_frames_channels
new_height	new_widthresized_framesr   iframeresized_frame r   R/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/multimodal/video.pyresize_video   s   
r   size_factorc                 C   s4   | j \}}}}t|| }t|| }t| ||fS N)r   intr   )r	   r    r   heightwidthr   r   r   r   r   rescale_video_size#   s   r%   r   c                 C   s<   | j d }|dkr| S tjd|d |td}| |df }|S )Nr      r   .)r   r   linspacer"   )r	   r   total_framesframe_indicessampled_framesr   r   r   sample_frames_from_video+   s   
r,   c                   @   s   e Zd Zee	ddededeej	e
eef f fddZededee d	e
eef d
edef
ddZedddee d
edeej	ee e
eef f fddZedee dededeej	eee f fddZdS )VideoLoaderr&   datar   r   c                 K   s   t r!   )NotImplementedErrorclsr.   r   kwargsr   r   r   
load_bytes6   s   zVideoLoader.load_bytesidxfailed_framesnext_target_mapr)   c                 C   s$   |sdS |d }| ||}| |k S )z;Check if current frame can recover the oldest failed frame.Fr   )get)r4   r5   r6   r)   oldest_failedlimitr   r   r   _can_use_for_recovery=   s
   z!VideoLoader._can_use_for_recoverycapzcv2.VideoCapturer*   c                 C   s  ddl }t| |j}t| |j}|dkr|dks&J d| d| t|}|r0|d nd}i }tt|d D ]}	||	d  |||	 < q<|||d < g }
g }g }i }d}t|d D ]o}||v }|  }|sy|rxt	
d| || q_t||||}|s|r|  \}}|r|dur|jdkr|||j}|
| || |d7 }|r|d}|||< t	d||||  q_|rt	
d	| || q_|D ]}t	
d
| q|
rt|
}ntjd||dftjd}|||fS )a  
        Read frames with dynamic window forward-scan recovery.

        When a target frame fails to load, the next successfully grabbed
        frame (before the next target frame) will be used to recover it.

        Args:
            cap: OpenCV VideoCapture object
            frame_indices: Sorted list of target frame indices to load
            total_frames: Total number of frames in the video

        Returns:
            Tuple of (frames_array, valid_frame_indices, recovered_map)
            - frames_array: Array of loaded frames
            - valid_frame_indices: List of frame indices that were loaded
            - recovered_map: Dict mapping recovered_idx -> source_idx
        r   Nz Invalid video frame size: width=z	, height=r&   r'   z-Failed to grab frame %d during video loading.z-Recovered frame %d using frame %d (delay: %d)z1Failed to retrieve frame %d during video loading.z/Frame %d could not be recovered (end of video).   r   )r   r"   r7   CAP_PROP_FRAME_WIDTHCAP_PROP_FRAME_HEIGHTsetrangelengrabloggerwarningappendr-   r:   retriever
   cvtColorCOLOR_BGR2RGBpopinfor   stackr   uint8)r;   r*   r)   r   r$   r#   frame_idx_setmax_frame_idxr6   kframes_listvalid_frame_indicesfailed_frames_idxrecovered_mapr   r4   is_target_frameokcan_recoverretr   	rgb_framerecovered_idx
failed_idxr	   r   r   r   _read_frames_with_recoveryK   s   





z&VideoLoader._read_frames_with_recoverynum_expected_framesrN   c                 C   s   dd l }t| |j}t| |j}tj|||dftjd}d}g }	t|d D ]8}
| 	 }|s=|
|v r<t
d|
 q*|
|v rb|  \}}|r\|||j||< |	|
 |d7 }q*t
d|
 q*t|	}||k rut
d|| || |d | ||	fS )Nr   r<   r   r'   zIFailed to grab frame %d during video loading. This frame will be skipped.zMFailed to retrieve frame %d during video loading. This frame will be skipped.zgVideo loading completed with %d broken/unreadable frames. Expected %d frames but only loaded %d frames.)r   r"   r7   r=   r>   r   r   rL   r@   rB   rC   rD   rF   rG   rH   rE   rA   )r;   r*   r\   rN   r   r$   r#   r	   r   rQ   r4   rU   rW   r   valid_num_framesr   r   r   _read_frames   sF   

zVideoLoader._read_framesNr&   )__name__
__module____qualname__classmethodr   bytesr"   tuplenptNDArraydictstrr   r3   staticmethodlistboolr:   r[   r?   r^   r   r   r   r   r-   5   sV    
jr-   identityc                   @   s:   e Zd ZdZe	d
dedededeeef fddZ	d	S )IdentityVideoLoaderaJ  IdentityVideoLoader returns raw video bytes without decoding.

    This allows the model processor to handle video decoding and
    is required for models like Kimi-K2.5 that need custom video chunk splitting.

    NOTE: This is temporary for Kimi-K2.5 testing. Remember to change back
    to opencv before release if needed.
    r&   r.   r   r2   r   c                 K   s   |d fS r!   r   r0   r   r   r   r3      s   zIdentityVideoLoader.load_bytesNr_   )
r`   ra   rb   __doc__rc   rd   r"   r   re   r3   r   r   r   r   rn      s    	
rn   opencvc                   @   sV   e Zd Zdd Ze				ddededed	ed
edee	j
eeef f fddZdS )OpenCVVideoBackendc                 C   j   dd l m} d }| D ]&}||sq||s.||\}}}|dk s-|dkr.|dk r.q|} |S |S Nr   r'      cv2.videoio_registryvideoio_registrygetStreamBufferedBackends
hasBackendisBackendBuiltIn%getStreamBufferedBackendPluginVersionselfvrapi_prefbackendr   abiapir   r   r   get_cv2_video_api     

z$OpenCVVideoBackend.get_cv2_video_apir&   ,  Fr.   r   fpsmax_durationframe_recoveryr   c                 K   sL  ddl }|   }|t||g }	|	 stdt|	|j}
|	|j	}|dkr0|
| nd}|
}|dkr=t
||
}|dkrKt
|t|| }td|}||
kr\ttd|}ntjd|
d |td}| }|r| |	||
\}}}t|}|rtdt| nt|}| |	||t|\}}}|
||d|||
kd}||fS )	a  
        Load video frames from bytes.

        Args:
            data: Raw video bytes
            num_frames: Target number of frames to sample (-1 for all)
            fps: Target FPS for sampling (-1 for original)
            max_duration: Maximum duration (unused in base backend)
            frame_recovery: Enable forward-scan recovery for failed frames

        Returns:
            Tuple of (frames_array, metadata_dict)
        r   NCould not open video streamr'   r   7Frame recovery: %d frames recovered using forward scan.rp   total_num_framesr   durationvideo_backendframes_indicesdo_sample_frames)r   r   VideoCapturer   isOpened
ValueErrorr"   r7   CAP_PROP_FRAME_COUNTCAP_PROP_FPSminmathfloormaxrk   r@   r   r(   tolistr[   rA   rC   rJ   r?   r^   )r1   r.   r   r   r   r   r2   r   r   r;   total_frames_numoriginal_fpsr   num_frames_to_sample	frame_idxuniform_sampled_framesr	   rQ   rS   r]   rM   metadatar   r   r   r3     sV   




zOpenCVVideoBackend.load_bytesN)r&   r&   r   F)r`   ra   rb   r   rc   rd   r"   rl   re   rf   rg   rh   ri   r   r3   r   r   r   r   rq     s(    rq   opencv_dynamicc                   @   sN   e Zd Ze				ddedededed	ed
eej	e
eef f fddZdS )OpenCVDynamicVideoBackendr&   rt   r   Fr.   r   r   r   r   r   c                    s|  ddl }|   }|t||g }	|	 stdt|	|j}
|	|j	dkr0|
 nd}|
d |p?t
 d }||kr]tt|  }t fddt|D }n&t|  }||
krntt|
}ntjd||dd}tfd	d|D }|r| |	||
\}}}t|}|rtd
t| nt|}| |	|t||
d \}}}|
|d|dd}||fS )a  
        Load video frames with dynamic sampling based on duration.

        Args:
            data: Raw video bytes
            num_frames: Not used in dynamic backend
            fps: Target FPS for sampling (default: 2)
            max_duration: Maximum video duration to process (default: 300s)
            frame_recovery: Enable forward-scan recovery for failed frames

        Returns:
            Tuple of (frames_array, metadata_dict)
        r   Nr   r'   c              	      s(   h | ]}t tt|   qS r   r   r"   r   ceil).0r   r   rN   r   r   r   	<setcomp>  s    z7OpenCVDynamicVideoBackend.load_bytes.<locals>.<setcomp>T)endpointc              	      s$   h | ]}t  tt| qS r   r   )r   t)rN   r   r   r   r     s    r   r   Fr   )r   r   r   r   r   r   r"   r7   r   r   roundr   r   sortedr@   rk   r   r(   r[   rA   rC   rJ   r?   r^   )r1   r.   r   r   r   r   r2   r   r   r;   r   r   nframe_indices_listnum_samplestarget_secondsr	   rQ   rS   r]   frame_indices_setr   r   r   r   r3   k  sb   


	z$OpenCVDynamicVideoBackend.load_bytesN)r&   rt   r   F)r`   ra   rb   rc   rd   r"   rl   re   rf   rg   rh   ri   r   r3   r   r   r   r   r   i  s&    r   molmo2c                   @   sz  e Zd Zdd Ze	d"dedededee fdd	Zeded
edede	dee dedB fddZ
ededB ded
ededeedB ejf f
ddZe	d#ded
ede	dedB dee dB dejfddZedededede	dedededejfddZe				d$dede	dB dedededeejee	ef f fddZe	d%dededeejee	ef f fd d!ZdS )&Molmo2VideoBackendc                 C   rr   rs   ru   r|   r   r   r   r     r   z$Molmo2VideoBackend.get_cv2_video_api       @	video_fpssampling_fpsmax_fpsr   c                 C   s   t |}t |}t |}|du rtd|dks|dkr'td| d| d|| dkr8td| d| d	g }t||d
 |D ]}||krK |S || dkrX|t| qB|S )a  
        Return the subset of `video_fps` factors that remain multiples
        of `sampling_fps`.

        Examples:
            >>> get_candidate_target_fps(video_fps=6, sampling_fps=2)
            [2, 6]
            >>> get_candidate_target_fps(video_fps=5, sampling_fps=1)
            [1, 5]
            >>> get_candidate_target_fps(video_fps=2, sampling_fps=2)
            [2]
            >>> get_candidate_target_fps(video_fps=5, sampling_fps=2)
            Traceback (most recent call last):
                ...
            ValueError: sampling_fps=2 must divide video_fps=5 to produce
                consistent frame steps.
        Nzsampling_fps must be providedr   z1video_fps and sampling_fps must be positive (got z, )zsampling_fps=z must divide video_fps=.r'   )r"   r   r@   rE   float)r1   r   r   r   
candidates	candidater   r   r   get_candidate_target_fps  s4   z+Molmo2VideoBackend.get_candidate_target_fps
max_framesr)   frame_sample_modecandidate_target_fpsNc                 C   s   d}d}|D ]8}t t|| d}	t||	 }
|dkr+d|v r&|
|kr& |S |}|
}q||
ks1J |
|kr6q|
|kr>|}|
}q|S )z]
        Get the target fps that best spans the videoand has the most frames sampled
        r   Nr'   uniform)r   r"   )r1   r   r   r)   r   r   num_frames_sampledselected_target_fps
target_fps	step_sizenum_frames_sampled_at_fpsr   r   r   get_target_fps	  s(   z!Molmo2VideoBackend.get_target_fpsr   c                 C   s^   |d u rt jd||dtd}ntt|| d}t d||}t||kr+|d | }||fS )Nr   F)r   r   r'   )r   r(   r"   r   arangerA   )r1   r   r)   r   r   r*   r   r   r   r   get_frame_times_and_chosen_fps1  s   
z1Molmo2VideoBackend.get_frame_times_and_chosen_fpsr   c                 K   s   |dkr1|d us
J |d }|dd  D ]}|| |k r n|}qt d|| }	|	|	|k  }	|	S |dkr{|d urn|d | }
|
|k rPt jd||dt jd}	|	S t jd|d| d}	t j|	|ggdd	}	t|	|kslJ |	S t jd||dt jd}	|	S t|)
Nr   r   r'   uniform_last_frameT)numr   r           stopstepaxis)r   r   r(   float64concatenaterA   r/   )r1   r   r   r   r   r   r2   r   candidate_fpstimesr   r   r   r   sample_timesD  s:   
zMolmo2VideoBackend.sample_timesr   r   c                 C   sT  |dkrp|d urp|dkrt |t}|S ||d | kr1t jd|d t||ddt}|S t jd|d t|| d}	t |	d	 |d krVt j|	|d ggdd
}	t |	t}|d	 |k sfJ t	|	|ksnJ |S |dkrt jd|d t||ddt}|S |dkr| 
||}
| |||||
}| ||||\}}|S t|)Nr   rt   r'   r   T)r   r   r   r   r&   r   r   )r   r   astyper"   r(   r   r   r   r   rA   r   r   r   r/   )r1   r   r   r   r   r   r   r   indicesfloat_indicesr   r   r   r   r   r   _sample_framesn  sl   /,
	z!Molmo2VideoBackend._sample_framesr&   rt   r.   c              	   K   s  dd l }|   }|t||g }	|	 stdt|	|j}
|	|j	}|dkr0|
| nd}|d u rdt
td|
}t|}| |	||
t|\}}}||
k}|
||d|d}|s`||d< ||fS | |
|||||| }| |	t|t||
d \}}}|
||d|dd}||fS )	Nr   r   rp   )r   r   r   r   r   r   r'   Fr   )r   r   r   r   r   r   r"   r7   r   r   rk   r@   r?   r^   r   r   r   rA   )r1   r.   r   r   r   r   r2   r   r   r;   r   r   r   r   rM   r	   r]   rQ   r   r   r   r   r   load_bytes_opencv  sb   




	z$Molmo2VideoBackend.load_bytes_opencvc                 K   sX   t td B |dd }t t|dd}t t|dd}| j|||||fi |}|S )Nr   r   rt   r   )r   ri   rI   r"   r   )r1   r.   r   r2   r   r   r   outr   r   r   r3     s   zMolmo2VideoBackend.load_bytes)r   r!   )Nr&   rt   rt   r_   )r`   ra   rb   r   rc   r   rk   r   r"   ri   r   re   rf   rg   r   r   r   rd   rh   r   r   r3   r   r   r   r   r     s    0'
)	=Cr   )"r   abcr   ior   typingr   r   r   numpyr   numpy.typingrf   r   vllm.loggerr   vllm.utils.registryr   r`   rC   rg   re   r"   r   r   r%   r,   r-   VIDEO_LOADER_REGISTRYregisterrn   rq   r   r   r   r   r   r   <module>   s2   "
 7e]