o
    i1                     @  sN  d dl mZ d dlZd dlmZ d dlZd dlmZmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZ d dlmZ d dlmZ d dlmZ d dlmZmZmZm Z m!Z!m"Z" d dl#m$Z$m%Z% d dl&m'Z'm(Z( d dl)m*Z*m+Z+m,Z, d dl-m.Z.m/Z/ d dl0m1Z1 G dd dZ2e3 dddZ4e5dkre4  dS dS )    )annotationsN)Iterator)MultiModalGuiderMultiModalGuiderParams)GaussianNoiser)LTX2Scheduler)TemporalRegionMask)LoraPathStrengthAndSDOps)Registry)TilingConfigget_video_chunks_number)QuantizationPolicy)SpatioTemporalScaleFactors)video_editing_arg_parser)AudioConditionerAudioDecoderDiffusionStageImageConditionerPromptEncoderVideoDecoder)DISTILLED_SIGMA_VALUESdetect_params)GuidedDenoiserSimpleDenoiser)audio_latent_from_file
get_devicevideo_latent_from_file)encode_videoget_videostream_metadata)ModalitySpecc                   @  sH   e Zd ZdZ					d1d2ddZddddddddddd
d3d/d0ZdS )4RetakePipelinea=  Regenerate a time region (retake) of an existing video.
    Given a source video file and a time window ``[start_time, end_time]``
    (in seconds), this pipeline keeps the video/audio outside that window
    unchanged and *regenerates* the content inside the window from a text
    prompt using the LTX-2 diffusion model.
    Parameters
    ----------
    checkpoint_path : str
        Path to the LTX-2 model checkpoint.
    gemma_root : str
        Root directory containing Gemma text-encoder weights.
    loras : list[LoraPathStrengthAndSDOps]
        Optional LoRA configs applied to the transformer.
    device : torch.device
        Target device (default: CUDA if available).
    quantization : QuantizationPolicy | None
        Optional quantization policy for the transformer.
    distilled : bool
        Set to ``True`` if using distilled model or passing distillation
        lora with full model. If set to ``True``, distilled sigma schedule
        (``DISTILLED_SIGMA_VALUES``) and a simple (non-guided) denoising
        function will be used during ``__call__``.
    NTFcheckpoint_pathstr
gemma_rootloraslist[LoraPathStrengthAndSDOps]devicetorch.device | NonequantizationQuantizationPolicy | NoneregistryRegistry | None	distilledbooltorch_compilec	           	   	   C  s   |pt  | _tj| _|| _t||| j| j|d| _t|| j| j|d| _	t
|| j| j|d| _t|| j| jt||||d| _t|| j| j|d| _t|| j| j|d| _d S )N)r!   r#   dtyper&   r*   )r!   r/   r&   r*   )r!   r/   r&   r$   r(   r*   r.   )r   r&   torchbfloat16r/   r,   r   prompt_encoderr   image_conditionerr   audio_conditionerr   tuplestager   video_decoderr   audio_decoder)	selfr!   r#   r$   r&   r(   r*   r,   r.    r:   E/home/ubuntu/LTX-2/packages/ltx-pipelines/src/ltx_pipelines/retake.py__init__C   sV   	zRetakePipeline.__init__ (      )
negative_promptnum_inference_stepsvideo_guider_paramsaudio_guider_paramsregenerate_videoregenerate_audioenhance_prompttiling_configstreaming_prefetch_countmax_batch_size
video_pathprompt
start_timefloatend_timeseedintr@   rA   rB   MultiModalGuiderParams | NonerC   rD   rE   rF   rG   TilingConfig | NonerH   
int | NonerI   return+tuple[Iterator[torch.Tensor], torch.Tensor]c       
   $        s  ||krt d| d| dtjjd|}t|d}j t fdd}	 fdd}j
rC|gn||g}j||||d	}|d
 j|d
 j}}t||
rht||jdgng ||
 d}t||dur|rt||jdgng ||duo| d}j
rttjtjjd}t||d}n-t j|djtjjd}|d j|d j}}t||d}t|	|d}t||||d}j|||jjjj||||d\} }!| j||}" |!j}#|"|#fS )a  Regenerate ``[start_time, end_time]`` of the source video (retake).
        Parameters
        ----------
        video_path : str
            Path to the source video file (must contain video; audio is optional).
        prompt : str
            Text prompt describing the *regenerated* section.
        start_time, end_time : float
            Time window (in seconds) of the section to regenerate.
        seed : int
            Random seed for reproducibility.
        negative_prompt : str
            Negative prompt for CFG guidance (ignored in distilled mode).
        num_inference_steps : int
            Number of Euler denoising steps (ignored in distilled mode which
            uses a fixed 8-step schedule).
        video_guider_params, audio_guider_params : MultiModalGuiderParams | None
            Guidance parameters for video and audio modalities.  Ignored in
            distilled mode.
        regenerate_video : bool
            If ``True`` (default), regenerate video inside ``[start_time, end_time]``.
            If ``False``, video is preserved as-is (no regeneration).
        regenerate_audio : bool
            If True, regenerate audio in the [start_time, end_time] window; if False,
            audio is preserved as-is (no regeneration).
        enhance_prompt : bool
            Whether to enhance the prompt via the text encoder.
        Returns
        -------
        tuple[Iterator[torch.Tensor], torch.Tensor]
            ``(video_frames_iterator, audio_waveform)``
        zstart_time (z) must be less than end_time ())r&   )	generatorc                      t |  jdS )N)video_encoder	file_pathoutput_shaper/   r&   )r   r&   encr/   r[   r9   rJ   r:   r;   <lambda>       z)RetakePipeline.__call__.<locals>.<lambda>c                   rX   )N)audio_encoderrZ   r[   r/   r&   )r   r&   r\   r^   r:   r;   r_      r`   )enhance_first_promptenhance_prompt_seedrH   r   )rL   rN   fps)contextconditioningsinitial_latentfrozenN)r/   r&   )	v_context	a_context)stepsr?   )paramsnegative_context)ri   rj   video_guideraudio_guider)denoisersigmasnoiserwidthheightframesrd   videoaudiorH   rI   )!
ValueErrorr0   	Generatorr&   manual_seedr   r/   r   r3   r4   r,   r2   video_encodingaudio_encodingr   r   rd   tensorr   tofloat32r   r   executer   r   r6   rs   rt   ru   r7   latentr8   )$r9   rJ   rK   rL   rN   rO   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rW   rr   initial_video_latentinitial_audio_latentprompts_to_encodecontextsv_context_pa_context_pvideo_modality_specaudio_modality_specrq   rp   v_context_na_context_nrn   ro   video_stateaudio_statedecoded_videodecoded_audior:   r^   r;   __call__~   s   3


	
zRetakePipeline.__call__)NNNTF)r!   r"   r#   r"   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r-   ) rJ   r"   rK   r"   rL   rM   rN   rM   rO   rP   r@   r"   rA   rP   rB   rQ   rC   rQ   rD   r-   rE   r-   rF   r-   rG   rR   rH   rS   rI   rP   rT   rU   )__name__
__module____qualname____doc__r<   r   r:   r:   r:   r;   r    *   s&    Cr    rT   Nonec                  C  sf  t  t j tdd} d| _|  }|j|jkrt	dt
 }t|j}|jd |j dkrJ|jd |j |j d }t	d|j d| d	|jd
 dksX|jd
 dkret	d|j d|j dt|j|j|jrrt|jnd|j|j|jd}t|j}t }||j|j|j|j|j|j|j||j |j!d
\}}	t"|j|}
t#|t$|j%|	|j&|
d dS )z6CLI entry point for retake (regenerate a time region).T)r,   z7Retake: regenerate a time region of a video with LTX-2.z%start_time must be less than end_timer?   r   z8Video frame count must satisfy 8k+1 (e.g. 97, 193). Got z; use a video with z frames.    z4Video width and height must be multiples of 32. Got x.r:   )r!   r#   r$   r(   r,   r.   )
rJ   rK   rL   rN   rO   rB   rC   rG   rH   rI   )rv   rd   rw   output_pathvideo_chunks_numberN)'logging	getLoggersetLevelINFOr   description
parse_argsrL   rN   rx   r   defaultr   rJ   ru   timers   rt   r    distilled_checkpoint_pathr#   lorar5   r(   r,   compiler   r   rK   rO   rB   rC   rH   rI   r   r   rP   rd   r   )parserargsvideo_scalesrcsnappedpipelinerl   rG   
video_iterrw   r   r:   r:   r;   main  sZ   




r   __main__)rT   r   )6
__future__r   r   collections.abcr   r0   ltx_core.components.guidersr   r   ltx_core.components.noisersr   ltx_core.components.schedulersr   +ltx_core.conditioning.types.noise_mask_condr   ltx_core.loaderr	   ltx_core.loader.registryr
   ltx_core.model.video_vaer   r   ltx_core.quantizationr   ltx_core.typesr   ltx_pipelines.utils.argsr   ltx_pipelines.utils.blocksr   r   r   r   r   r   ltx_pipelines.utils.constantsr   r   ltx_pipelines.utils.denoisersr   r   ltx_pipelines.utils.helpersr   r   r   ltx_pipelines.utils.media_ior   r   ltx_pipelines.utils.typesr   r    inference_moder   r   r:   r:   r:   r;   <module>   s6      l5
