o
    iD                     @   s@  U d dl Z d dlZd dlmZmZ d dlmZ d dlmZ d dl	Z	d dl
Zd dlZd dlmZ d dlmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZ e eZdejdedejfddZdejdededejfddZdejdej dej!dejfddZ"efde#dededej!dej dedejfddZ$deej dededej!dej dejfdd Z%de#dej&fd!d"Z'd#e	j(j)d$e	j*j+d%eddfd&d'Z,d#e	j(j)d(ede	j*j+fd)d*Z-d#e	j(j)d$e	j*j+d+e	j.ddfd,d-Z/d.ejeej B d/ed%edB d0e#d1eddfd2d3Z0d4d4d5d5d6d6d7Z1e2e#e3f e4d8< d9e	j.dej&fd:d;Z5d<e#de3fd=d>Z6d<e#defd?d@Z7	dUd<e#dej dBe3dCe3dB dedB f
dDdEZ8	 	dVd<e#dedFedGedB deej f
dHdIZ9	A	dUd<e#dedBe3dCe3dB deej f
dJdKZ:dLe#dMej&de3ddfdNdOZ;dPe#dej<fdQdRZ=efdej<de3dej<fdSdTZ>dS )W    N)	GeneratorIterator)Fraction)BytesIO)	rearrange)Image)DeviceLikeType)tqdm)AudioVideoPixelShape)DEFAULT_IMAGE_CRFimage	long_sidereturnc           
      C   st   | j dd \}}t||}|t| }t|| }t|| }t| ||}t|dd }	|	j d dkr8|	d S |	S )a[  
    Resize image preserving aspect ratio (filling target long side).
    Preserves the input dimensions order.
    Args:
        image: Input image tensor with shape (F (optional), H, W, C)
        long_side: Target long side size.
    Returns:
        Tensor with shape (F (optional), H, W, C) F = 1 if input is 3D, otherwise input shape[0]
       zb c f h w -> b f h w cr      )shapemaxfloatintresize_and_center_cropr   )
r   r   heightwidthmax_sidescaletarget_heighttarget_widthresizedresult r    M/home/ubuntu/LTX-2/packages/ltx-pipelines/src/ltx_pipelines/utils/media_io.pyresize_aspect_ratio_preserving   s   

r"   tensorr   r   c                 C   s   | j dkrt| d} n| j dkrt| d} n	td| j d| j\}}}}t|| || }t|| }t|| }tjj	j
| ||fddd	} || d
 }	|| d
 }
| dddd|	|	| |
|
| f } t| d} | S )ac  
    Resize tensor preserving aspect ratio (filling target), then center crop to exact dimensions.
    Args:
        latent: Input tensor with shape (H, W, C) or (F, H, W, C)
        height: Target height
        width: Target width
    Returns:
        Tensor with shape (1, C, 1, height, width) for 3D input or (1, C, F, height, width) for 4D input
       zh w c -> 1 c h w   zf h w c -> f c h wz1Expected input with 3 or 4 dimensions; got shape .bilinearF)sizemodealign_cornersr   Nzf c h w -> 1 c f h w)ndimr   
ValueErrorr   r   mathceiltorchnn
functionalinterpolate)r#   r   r   _src_hsrc_wr   new_hnew_wcrop_top	crop_leftr    r    r!   r   +   s   


(
r   latentdevicedtypec                 C   s   | d d j ||dS )Ng     _@      ?)r;   r<   )to)r:   r;   r<   r    r    r!   normalize_latentN   s   r?   
image_pathcrfc                 C   sD   t | d}t||d}tj|tj|d}t|||}t|||}|S )z
    Loads an image from a path and preprocesses it for conditioning.
    Note: The image is resized to the nearest multiple of 2 for compatibility with video codecs.
    )r@   )r   rA   r<   r;   )decode_image
preprocessr/   r#   float32r   r?   )r@   r   r   r<   r;   rA   r   r    r    r!   load_image_and_preprocessR   s   
rF   framesc                 C   sP   d}| D ]!}t |tj||}t|||}|du r|ntj||gdd}q|S )a  Preprocesses a video frame generator for conditioning.
    Args:
        frames: Generator of video frames as tensors of shape (1, H, W, C), dtype uint8.
        height: Target height in pixels.
        width: Target width in pixels.
        dtype: Target dtype for the output tensor.
        device: Target device for the output tensor.
    Returns:
        Tensor of shape (1, C, F, height, width) with values in [-1, 1].
    Nr   )dim)r   r>   r/   rE   r?   cat)rG   r   r   r<   r;   r   fframer    r    r!   video_preprocessf   s    rL   c                 C   s$   t | }t|dd df }|S )N.r$   )r   opennparray)r@   r   np_arrayr    r    r!   rC      s   
rC   	containeraudio_streamaudioc                 C   s   |j }|jdkr|d d d f }|jd dkr!|jd dkr!|j}|jd dkr1td|j d|jtjkrFt|dd}|d 	tj}t
jj| dd	  d
dd}|j|_t| || d S )Nr   r   r   z,Expected samples with 2 channels; got shape r&   g      r=   g    @s16stereo)formatlayout)waveformr+   r   Tr,   r<   r/   int16clipr>   av
AudioFramefrom_ndarray
contiguousreshapecpunumpysampling_ratesample_rate_resample_audio)rQ   rR   rS   samplesframe_inr    r    r!   _write_audio   s"   
ri   audio_sample_ratec                 C   s0   | j d|d}||j_d|j_td||j_|S )z/
    Prepare the audio stream for writing.
    aacraterV   r   )
add_streamcodec_contextre   rX   r   	time_base)rQ   rj   rR   r    r    r!   _prepare_audio_stream   s
   rq   rh   c                 C   s   |j }|jpd}|jpd}|jp|j}tjjj|||d}d}||D ]}	|	j	d u r.||	_	||	j
7 }|j|	_| ||	 q$| D ]}
| |
 qDd S )NfltprV   )rW   rX   rm   r   )ro   rW   rX   re   r]   rS   	resamplerAudioResamplerresampleptsrg   muxencode)rQ   rR   rh   cctarget_formattarget_layouttarget_rateaudio_resampleraudio_next_ptsrframepacketr    r    r!   rf      s&   



rf   videofpsoutput_pathvideo_chunks_numberc              	   C   sT  t | tjrt| g} t| }|j\}}}}tj|dd}	|	jdt	|d}
||
_
||
_d|
_|d ur9t|	|j}dtjdtttjt	f d d f dtttjt	f d d f fd	d
}t||| |dD ]#}|d }|D ]}tjj|dd}|
|D ]}|	| q|qmqb|
 D ]}|	| q|d urt|	|| |	  td|  d S )Nw)r)   libx264rl   yuv420pfirst_chunktiles_generatorr   c                 s   s    | V  |E d H  d S )Nr    )r   r   r    r    r!   	all_tiles   s   zencode_video.<locals>.all_tiles)totalrb   rgb24rW   zVideo saved to )
isinstancer/   Tensoriternextr   r]   rM   rn   r   r   r   pix_fmtrq   rd   r   tupler	   r>   rc   
VideoFramer_   rx   rw   ri   closeloggerinfo)r   r   rS   r   r   r   r3   r   r   rQ   streamrR   r   video_chunkvideo_chunk_cpuframe_arrayrK   r   r    r    r!   encode_video   s@   

r   g      `@g      @g      A)u8u8prU   s16ps32s32p_INT_FORMAT_MAXrK   c                 C   sR   | j j}|  tj}|tv r|t|  }| j js't| j	j
}|d|j}|S )zaConvert an audio frame to a float32 ndarray with values in [-1, 1] and shape (channels, samples).rT   )rW   name
to_ndarrayastyperN   rE   r   	is_planarlenrX   channelsra   rZ   )rK   fmtarrr   r    r    r!   _audio_frame_to_float   s   r   pathc                 C   s>   t | }ztdd |jD }t|jW |  S |  w )zRead video stream FPS.c                 s       | ]
}|j d kr|V  qdS r   Ntype.0sr    r    r!   	<genexpr>      z&get_videostream_fps.<locals>.<genexpr>)r]   rM   r   streamsr   average_rater   )r   rQ   video_streamr    r    r!   get_videostream_fps  s
   

r   c                 C   s   t | }z:tdd |jD }t|j}|jpd}|dkr*tdd ||D }|j	j
}|j	j}td||||dW |  S |  w )a3  Read video stream metadata as a VideoPixelShape with batch=1.
    If frame count is missing in the container, decodes the stream to count frames.
    Args:
        path: Path to the video file.
    Returns:
        VideoPixelShape with batch=1, frames, height, width, and fps populated from the stream.
    c                 s   r   r   r   r   r    r    r!   r   !  r   z+get_videostream_metadata.<locals>.<genexpr>r   c                 s   s    | ]}d V  qdS )r   Nr    )r   r3   r    r    r!   r   %  s    r   )batchrG   r   r   r   )r]   rM   r   r   r   r   rG   sumdecodero   r   r   r   r   )r   rQ   r   r   
num_framesr   r   r    r    r!   get_videostream_metadata  s   


r           
start_timemax_durationc                 C   s|  t | }ztdd |jD }W n ty   |  Y dS w |j}t||j }|r0|| n|j	|j }|j
||d g }	d}
|jddD ]0}|jdu rOqGt|j|j }||j|j  }||k rdqG||krj n|
du rp|}
|	t| qG|  |	sdS tj|	dd}t||
 | }|dkr|d	|df }|durt|| }|d	d|f }t||d}t||d
S )a  Decodes audio from a file, optionally seeking to a start time and limiting duration.
    Args:
        path: Path to the audio/video file containing an audio stream.
        device: Device to place the resulting tensor on.
        start_time: Start time in seconds to begin reading audio from.
        max_duration: Maximum audio duration in seconds. If None, reads to end of stream.
    Returns:
        An Audio object with waveform of shape (1, channels, samples), or None if no audio stream.
    c                 s   r   )rS   Nr   r   r    r    r!   r   ;  r   z)decode_audio_from_file.<locals>.<genexpr>Nr   r   )rS   rT   )axis.)rY   rd   )r]   rM   r   r   StopIterationr   rm   r   rp   durationseekr   rv   r   rg   re   appendr   rN   concatenateroundr/   
from_numpyr>   	unsqueezer
   )r   r;   r   r   rQ   rR   re   	start_ptsend_timerg   first_frame_timerK   
frame_time	frame_endrS   skip_samplesmax_samplesrY   r    r    r!   decode_audio_from_file-  sJ   

r   starting_frame	frame_capc           	      c   s    t | }zJtdd |jD }t||D ]*\}}||k r!qtj| 	 tj
|dd}|V  |durB|d8 }|dkrB nqW |  dS W |  dS |  w )a  Decodes video from a file by sequential frame index, without relying on pts.
    Args:
        path: Path to the video file.
        device: Device to place the resulting tensors on.
        starting_frame: Number of leading frames to skip (default 0).
        frame_cap: Maximum number of frames to yield. If None, no frame limit (default None).
    Yields:
        Frames as tensors of shape (1, H, W, C), dtype uint8.
    c                 s   r   r   r   r   r    r    r!   r   }  r   z(decode_video_by_frame.<locals>.<genexpr>rB   r   Nr   )r]   rM   r   r   	enumerater   r/   r#   to_rgbr   uint8r   r   )	r   r;   r   r   rQ   r   indexrK   r#   r    r    r!   decode_video_by_framel  s$   
 
r   c                 c   s<   t | }ztdd |jD }t|j}|dkr%|jt|| |d |dur-|| nd}||D ]V}|j	du ret|j
}	t||	 }
|durPt||	 nd}t| ||
|dE dH   W |  dS |j	| }||k roq4|dury||kry ntj|  tj|ddV  q4W |  dS W |  dS |  w )a7  Decodes video from a file using presentation timestamps for time-based trimming.
    If a frame with no pts is encountered, falls back to :func:`decode_video_by_frame`
    using FPS-derived frame indices.
    Args:
        path: Path to the video file.
        device: Device to place the resulting tensors on.
        start_time: Start time in seconds (default 0.0).
        max_duration: Maximum duration in seconds to decode. If None, reads to end of
            stream (default None).
    Yields:
        Frames as tensors of shape (1, H, W, C), dtype uint8.
    c                 s   r   r   r   r   r    r    r!   r     r   z)decode_video_from_file.<locals>.<genexpr>r   r   N)r   r;   r   r   rB   )r]   rM   r   r   r   rp   r   r   r   rv   r   r   r   r   r/   r#   r   r   r   r   )r   r;   r   r   rQ   r   rp   r   rK   r   r   r   r   r    r    r!   decode_video_from_file  s8   




$r   output_fileimage_arrayc                 C   s   t j| ddd}zQ|jddt|ddd}|jd	 d
 d
 }|jd d
 d
 }|d |d |f }||_||_t jj|ddj	dd}|
|| |
|  W |  d S |  w )Nr   mp4r   r   r   veryfast)rA   preset)rm   optionsr   r   r   r   )r]   rM   rn   strr   r   r   r   r_   reformatrw   rx   r   )r   r   rA   rQ   r   r   r   av_framer    r    r!   encode_single_frame  s   r   
video_filec                 C   sP   t | }ztdd |jD }t||}W |  n|  w |jddS )Nc                 s   r   r   r   r   r    r    r!   r     r   z&decode_single_frame.<locals>.<genexpr>r   r   )r]   rM   r   r   r   r   r   )r   rQ   r   rK   r    r    r!   decode_single_frame  s   
r   c                 C   s|   |dkr| S t  }t|| | | }W d    n1 sw   Y  t |}t|}W d    |S 1 s7w   Y  |S )Nr   )r   r   getvaluer   )r   rA   r   video_bytesr   r   r    r    r!   rD     s   



rD   )r   N)r   N)?loggingr-   collections.abcr   r   	fractionsr   ior   r]   rc   rN   r/   einopsr   PILr   torch._prims_commonr   r	   ltx_core.typesr
   r   ltx_pipelines.utils.constantsr   	getLogger__name__r   r   r   r"   r   r;   r<   r?   r   rF   rL   ndarrayrC   rQ   	ContainerrS   AudioStreamri   rq   r^   rf   r   r   dictr   __annotations__r   r   r   r   r   r   r   rO   r   rD   r    r    r    r!   <module>   s   
 
"#


"

0


B
"
4"
