o
    i)                     @   sB  d dl Z d dlmZ d dlZd dlmZmZmZ d dlm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZmZ d d	lmZ d d
lmZmZ d dlmZmZmZ d dlmZmZmZm Z m!Z!m"Z" d dl#m$Z$m%Z% d dl&m'Z'm(Z( d dl)m*Z*m+Z+m,Z, d dl-m.Z. d dl/m0Z0 G dd dZ1e2 dddZ3e4dkre3  dS dS )    N)Iterator)MultiModalGuiderFactoryMultiModalGuiderParams create_multimodal_guider_factory)GaussianNoiser)LTX2Scheduler)LoraPathStrengthAndSDOps)Registry)TilingConfigget_video_chunks_number)QuantizationPolicy)AudioVideoPixelShape)ImageConditioningInputdefault_2_stage_arg_parserdetect_checkpoint_path)AudioDecoderDiffusionStageImageConditionerPromptEncoderVideoDecoderVideoUpsampler)STAGE_2_DISTILLED_SIGMA_VALUESdetect_params)FactoryGuidedDenoiserSimpleDenoiser)assert_resolution
get_device,image_conditionings_by_adding_guiding_latent)encode_video)ModalitySpecc                #   @   s   e Zd ZdZ				d"dedee dededee d	ejdB d
e	dB de
dB defddZ				d#dededededededededeeB deeB dee dedB dededB dedeeej ef f d d!ZdS )$KeyframeInterpolationPipelinea  
    Keyframe-based Two-stage video interpolation pipeline.
    Interpolates between keyframes to generate a video with smoother transitions.
    Stage 1 generates video at half of the target resolution, then Stage 2 upsamples
    by 2x and refines with additional denoising steps for higher quality output.
    Stage 1 uses full model while Stage 2 uses distilled LORA for efficiency,
    as the upsampled video already has good quality and just needs refinement.
    NFcheckpoint_pathdistilled_loraspatial_upsampler_path
gemma_rootlorasdevicequantizationregistrytorch_compilec
              	   C   s   |pt  | _tj| _t||| j| j|d| _t|| j| j|d| _t	|| j| jt
||||	d| _g t
|t
|R }
t	|| j| j|
|||	d| _t||| j| j|d| _t|| j| j|d| _t|| j| j|d| _d S )N)r)   )r&   r(   r)   r*   )r   r'   torchbfloat16dtyper   prompt_encoderr   image_conditionerr   tuplestage_1stage_2r   	upsamplerr   video_decoderr   audio_decoder)selfr"   r#   r$   r%   r&   r'   r(   r)   r*   stage_2_loras r8   U/home/ubuntu/LTX-2/packages/ltx-pipelines/src/ltx_pipelines/keyframe_interpolation.py__init__3   s8   		z&KeyframeInterpolationPipeline.__init__   promptnegative_promptseedheightwidth
num_frames
frame_ratenum_inference_stepsvideo_guider_paramsaudio_guider_paramsimagestiling_configenhance_promptstreaming_prefetch_countmax_batch_sizereturnc           #         s  t ||dd tjjd|}t|d}tj j||g|tdkr+d d nd ||d\}}|j	|j
}}|j	|j
}}t j|djtjjd}td	||d
 |d
 |d fdd}t|	|d}t|
|d}jt||||d||jj||t||dt|d||d\}}|jd d	 }ttj}td	||||d fdd} jt||||||||t|| |d  |dt||d  |jd|d
\}}|j||}!|j}"|!|"fS )NT)r?   r@   is_two_stage)r'   )	generatorr   )enhance_first_promptenhance_prompt_imageenhance_prompt_seedrI   )steps)r-   r'   r;      )batchframesr@   r?   fpsc                       t jj|  jdS N)rF   r?   r@   video_encoderr-   r'   r   r?   r@   r'   enc)r-   rF   r6   stage_1_output_shaper8   r9   <lambda>       z8KeyframeInterpolationPipeline.__call__.<locals>.<lambda>)paramsnegative_context)	v_context	a_contextvideo_guider_factoryaudio_guider_factory)contextconditionings)re   )denoisersigmasnoiserr@   r?   rT   rU   videoaudiorI   rJ   c                    rV   rW   rY   rZ   )r-   rF   r6   stage_2_output_shaper8   r9   r]      r^   )re   rf   noise_scaleinitial_latent)re   rm   rn   )
rg   rh   ri   r@   r?   rT   rU   rj   rk   rI   ) r   r+   	Generatorr'   manual_seedr   r,   r.   lenvideo_encodingaudio_encodingr   executetofloat32r   r/   r   r1   r   r@   r?   r    r3   latentTensorr   r2   r   itemr4   r5   )#r6   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rM   ri   ctx_pctx_nv_context_pa_context_pv_context_na_context_nrh   stage_1_conditioningsrc   rd   video_stateaudio_stateupscaled_video_latentdistilled_sigmasstage_2_conditioningsdecoded_videodecoded_audior8   )r-   rF   r6   r\   rl   r9   __call__]   s   





z&KeyframeInterpolationPipeline.__call__)NNNF)NFNr;   )__name__
__module____qualname____doc__strlistr   r+   r'   r   r	   boolr:   intfloatr   r   r   r
   r0   r   rx   r   r   r8   r8   r8   r9   r!   )   s|    	

7	
r!   rK   c            	      C   s  t  t j t } t| }t|d}| }t|j	|j
|j|j|jr)t|jnd|j|jd}t }t|j|}||j|j|j|j|j|j|j|jt|j|j|j|j |j!|j"dt|j#|j$|j%|j&|j'|j(d|j)||j*|j+d\}}t,||j||j-|d d S )N)r_   r8   )r"   r#   r$   r%   r&   r(   r*   )	cfg_scale	stg_scalerescale_scalemodality_scale	skip_step
stg_blocks)r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rI   rJ   )rj   rU   rk   output_pathvideo_chunks_number).logging	getLoggersetLevelINFOr   r   r   
parse_argsr!   r"   r#   r$   r%   lorar0   r(   compiler
   defaultr   rA   r<   r=   r>   r?   r@   rB   rC   r   video_cfg_guidance_scalevideo_stg_guidance_scalevideo_rescale_scalea2v_guidance_scalevideo_skip_stepvideo_stg_blocksaudio_cfg_guidance_scaleaudio_stg_guidance_scaleaudio_rescale_scalev2a_guidance_scaleaudio_skip_stepaudio_stg_blocksrF   rI   rJ   r   r   )	r"   r_   parserargspipelinerG   r   rj   rk   r8   r8   r9   main   sj   
	

r   __main__)rK   N)5r   collections.abcr   r+   ltx_core.components.guidersr   r   r   ltx_core.components.noisersr   ltx_core.components.schedulersr   ltx_core.loaderr   ltx_core.loader.registryr	   ltx_core.model.video_vaer
   r   ltx_core.quantizationr   ltx_core.typesr   r   ltx_pipelines.utils.argsr   r   r   ltx_pipelines.utils.blocksr   r   r   r   r   r   ltx_pipelines.utils.constantsr   r   ltx_pipelines.utils.denoisersr   r   ltx_pipelines.utils.helpersr   r   r   ltx_pipelines.utils.media_ior   ltx_pipelines.utils.typesr    r!   inference_moder   r   r8   r8   r8   r9   <module>   s2      89
