o
    i0                     @   sB  d dl Z d dlZd dlZd dlmZ d dlmZmZmZ d dl	m
Z
 d dlmZ d dlmZmZ d dlmZ d dlmZ d d	lmZmZmZmZ d d
lmZ d dlmZmZmZmZm Z m!Z!m"Z" dej#fddZ$dXddZ%dej&de'dej&fddZ(			dYdede)dedej#dej*de+de+dB dedB dej&dB fddZ,		dZd ej-j.de)dedej#dej*de+de+dB dej&dB fd!d"Z/d#e0e d$e'd%e'dedej*dej#de0e fd&d'Z1d#e0e d$e'd%e'dedej*dej#de0e fd(d)Z2d#e0e d$e'd%e'dedej*dej#de0e fd*d+Z3	,	d[d-ed.e0e d/edej*dej#d0e+d1ej&dB defd2d3Z4d4ed5e0e d6edefd7d8Z5d9ej&d:ej&d;ej&dej&fd<d=Z6	>d\d?ed@ej&dAej&dBe7def
dCdDZ8d:ej&dAe+ej&B dej&fdEdFZ9e):dGdHZ;dIe)de)fdJdKZ<		L	Md]dNedOe)dPe)dB dQe'dRe'de)fdSdTZ=d$e'd%e'dUe7ddfdVdWZ>dS )^    N)Noiser)ConditioningItemVideoConditionByKeyframeIndexVideoConditionByLatentIndex)encode_audio)Modality)TilingConfigVideoEncoder)GemmaTextEncoder)LatentTools)AudioLatentShapeLatentStateVideoLatentShapeVideoPixelShape)ImageConditioningInput)decode_audio_from_filedecode_imagedecode_video_from_fileget_videostream_fpsload_image_and_preprocessresize_aspect_ratio_preservingvideo_preprocessreturnc                   C   s&   t j rt dt j S t dS )Ncudacpu)torchr   is_availabledevicecurrent_device r   r   L/home/ubuntu/LTX-2/packages/ltx-pipelines/src/ltx_pipelines/utils/helpers.py
get_device   s   

r!   c                   C   s    t   tj  tj  d S )N)gccollectr   r   empty_cachesynchronizer   r   r   r    cleanup_memory$   s   
r&   latentexpected_frames_countc                 C   sx   | j d }||kr| d d d d d |f } | S ||k r:t| j }|| |d< tj|| j| jd}tj| |gdd} | S )N   )r   dtype)dim)shapelistr   zerosr   r*   cat)r'   r(   actual_framesshape_as_listpadr   r   r    _conform_latent_length*   s   


r3           video_encoder	file_pathoutput_shaper   r*   
start_timemax_durationtiling_configc                 C   s   t |}||jkrtd| d|j d|p|j| }t||||d}	t|	|j|j||}
| |
|p6t	
 }t|j}t||S )a  Load video from a file, and construct the video latent conforming to video output shape.
    Args:
        video_encoder: Model used to encode pixel frames to latent space.
        file_path: Path to the video file.
        output_shape: Target pixel shape (height, width, frames, fps) for the conditioning.
        device: Device to run the encoder and hold tensors on.
        dtype: Dtype for the output latents.
        start_time: Start time in seconds to begin reading the video (default 0.0).
        max_duration: Maximum duration in seconds. If None, uses output_shape.frames at
            output_shape.fps (default None).
        tiling_config: Tiling configuration for the encoder. Defaults to TilingConfig.default().
    Returns:
        Encoded video latents of shape (1, C, T, H, W) with T = required_latent_frames, or
        None (currently this function always returns a tensor).
    zInput video FPS z does not match output FPS z, not supported)pathr   r8   r9   )r   fps
ValueErrorframesr   r   heightwidthtiled_encoder   defaultr   from_pixel_shaper3   )r5   r6   r7   r   r*   r8   r9   r:   r<   	frame_genr>   latentsrequired_latent_framesr   r   r    video_latent_from_file:   s   

rG   audio_encoderc           
      C   sT   |p|j |j }t||||}|du rdS t|| d||}t|j }	t||	S )a  Load audio from a file, and construct the audio latent conforming to video output shape.
    Args:
        audio_encoder: Model used to encode audio to latent space.
        file_path: Path to the audio or video file containing an audio stream.
        output_shape: Target video pixel shape; used to derive required latent frames
            and, when max_duration is None, the audio duration (output_shape.frames / fps).
        device: Device to run the encoder and hold tensors on.
        dtype: Dtype for the output latents.
        start_time: Start time in seconds to begin reading the audio (default 0.0).
        max_duration: Maximum duration in seconds. If None, uses the full span implied
            by output_shape (default None).
    Returns:
        Encoded audio latents of shape (1, C, T, ...) with T = required_latent_frames, or
        None if the file has no audio stream.
    N)r>   r<   r   r   tor   from_video_pixel_shaper3   )
rH   r6   r7   r   r*   r8   r9   audio_inrE   rF   r   r   r    audio_latent_from_file^   s   
rL   imagesr?   r@   c              	   C   sj   g }| D ].}t |j|||||jd}||}	|jdkr$t|	|jdd}
n	t|	|j|jd}
||
 q|S )zCreate a list of conditionings by replacing the latent at the first frame with the encoded image if present
    and using other encoded images as the keyframe conditionings.
image_pathr?   r@   r*   r   crfr   r'   strength
latent_idx)	keyframesrR   	frame_idx)r   r;   rP   rU   r   rR   r   append)rM   r?   r@   r5   r*   r   conditioningsimgimageencoded_imageconditioningr   r   r    combined_image_conditionings   s0   

r\   c           
   	   C   J   g }| D ]}t |j|||||jd}||}	|t|	|j|jd q|S )NrN   rQ   )r   r;   rP   rV   r   rR   rU   
rM   r?   r@   r5   r*   r   rW   rX   rY   rZ   r   r   r    'image_conditionings_by_replacing_latent   s&   r_   c           
   	   C   r]   )NrN   )rT   rU   rR   )r   r;   rP   rV   r   rU   rR   r^   r   r   r    ,image_conditionings_by_adding_guiding_latent   s   r`         ?toolsrW   noisernoise_scaleinitial_latentc                 C   s(   |  |||}t||| }|||}|S )zCreate a noised latent state from empty state, conditionings, and noiser.
    Creates an empty latent state, applies conditionings, and then adds noise
    using the provided noiser. Returns the final noised state ready for diffusion.
    )create_initial_statestate_with_conditionings)rb   rW   rc   r*   r   rd   re   stater   r   r    create_noised_state   s   
ri   latent_stateconditioning_itemslatent_toolsc                 C   s   |D ]	}|j | |d} q| S )zApply a list of conditionings to a latent state.
    Iterates through the conditioning items and applies each one to the latent
    state in sequence. Returns the modified state with all conditionings applied.
    )rj   rl   )apply_to)rj   rk   rl   r[   r   r   r    rg      s   rg   denoiseddenoise_maskcleanc                 C   s    | | |  d|   | jS )z5Blend denoised output with clean state based on mask.   )floatrI   r*   )rn   ro   rp   r   r   r    post_process_latent   s    rs   Trh   contextsigmaenabledc              
   C   s&   t || j|t| j|| j|d| jdS )zCreate a Modality from a latent state.
    Constructs a Modality object with the latent state's data, timesteps derived
    from the denoise mask and sigma, positions, and the provided context.
    N)rv   r'   ru   	timesteps	positionsrt   context_maskattention_mask)r   r'   timesteps_from_maskro   rx   rz   )rh   rt   ru   rv   r   r   r    modality_from_latent_state  s   

r|   c                 C   s@   t |tjr| dkr|jdgdg|  d  R  }| | S )aX  Compute timesteps from a denoise mask and sigma value.
    Multiplies the denoise mask by sigma to produce timesteps for each position
    in the latent state. Areas where the mask is 0 will have zero timesteps.
    When sigma is ``(B,)`` it is reshaped to ``(B, 1, ...)`` so the batch
    dimension aligns correctly with ``denoise_mask``.
    rq   )
isinstancer   Tensorr+   view)ro   ru   r   r   r    r{     s    r{   u   ‘’“”—– ′−z	''""-- '-textc                 C   s8   |  t} t| D ]\}}| r| |d   S q	| S )zaClean a response from curly quotes and leading non-letter characters which Gemma tends to insert.N)	translate_UNICODE_REPLACEMENTS	enumerateisalpha)r   icharr   r   r    clean_response&  s   
r     *   text_encoderpromptrO   image_long_sideseedc                 C   sf   d}|r t |d}t|}t||tj}| j|||d}n| j||d}t	d|  t
|S )z=Generate an enhanced prompt from a text encoder and a prompt.N)rO   )r   zEnhanced prompt: )r   r   tensorr   rI   uint8enhance_i2venhance_t2vlogginginfor   )r   r   rO   r   r   rY   r   r   r    generate_enhanced_prompt1  s   

r   is_two_stagec                 C   sX   |rdnd}| | dks|| dkr*t d|  d| d| d|r!dnd	 d
| ddS )zAssert that the resolution is divisible by the required divisor.
    For two-stage pipelines, the resolution must be divisible by 64.
    For one-stage pipelines, the resolution must be divisible by 32.
    @       r   zResolution (xz) is not divisible by z. For z	two-stagez	one-stagez2 pipelines, height and width must be multiples of .N)r=   )r?   r@   r   divisorr   r   r    assert_resolutionE  s   
r   )r   N)r4   NN)r4   N)ra   N)T)Nr   r   )?r"   r   r   ltx_core.components.noisersr   ltx_core.conditioningr   r   r   ltx_core.model.audio_vaer   ltx_core.model.transformerr   ltx_core.model.video_vaer   r	   ltx_core.text_encoders.gemmar
   ltx_core.toolsr   ltx_core.typesr   r   r   r   ltx_pipelines.utils.argsr   ltx_pipelines.utils.media_ior   r   r   r   r   r   r   r   r!   r&   r   intr3   strr*   rr   rG   nnModulerL   r-   r\   r_   r`   ri   rg   rs   boolr|   r{   	maketransr   r   r   r   r   r   r   r    <module>   sF   $
	
*
!
%



"	
 
