o
    0#ÑiüA  ã                   @   sô  d Z ddlZddlZddlZddlZej dd¡ ddlZddlZ	ddl
mZ ddlmZ ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZmZmZmZ ddlm Z m!Z!m"Z" ddl#m$Z$m%Z% ddl&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z, ddl-m.Z. ddl/m0Z0m1Z1m2Z2m3Z3m4Z4m5Z5 ddl6m7Z7m8Z8 ddl9m:Z:m;Z;m<Z<m=Z= ddl>m?Z? ddlm@Z@ ddlAmBZBmCZC ddlDmEZEmFZFmGZG ddlHmIZI ddlJmKZK ddlLmMZM ddlNmOZO ejPejQdd e ReS¡ZTG dd„ dƒZUdd „ ZVeSd!krøeVƒ  dS dS )"z®
LTX-2 Persistent Inference Server
Loads all models ONCE into GPU memory, reuses across generations.
Avoids the default pipeline behavior of rebuilding models on every call.
é    NÚPYTORCH_ALLOC_CONFzexpandable_segments:True)ÚBatchSplitAdapter)ÚEulerDiffusionStep)ÚGaussianNoiser)ÚAudioPatchifierÚVideoLatentPatchifier)ÚDummyRegistry)ÚSingleGPUModelBuilder)Ú#AUDIO_VAE_DECODER_COMFY_KEYS_FILTERÚVOCODER_COMFY_KEYS_FILTERÚAudioDecoderConfiguratorÚVocoderConfiguratorÚdecode_audio)ÚLTXV_MODEL_COMFY_RENAMING_MAPÚLTXModelConfiguratorÚX0Model)ÚLatentUpsamplerConfiguratorÚupsample_video)ÚVAE_DECODER_COMFY_KEYS_FILTERÚVAE_ENCODER_COMFY_KEYS_FILTERÚTilingConfigÚVideoDecoderConfiguratorÚVideoEncoderConfiguratorÚget_video_chunks_number)ÚQuantizationPolicy)ÚEMBEDDINGS_PROCESSOR_KEY_OPSÚGEMMA_LLM_KEY_OPSÚGEMMA_MODEL_OPSÚEmbeddingsProcessorConfiguratorÚGemmaTextEncoderConfiguratorÚmodule_ops_from_gemma_root)ÚAudioLatentToolsÚVideoLatentTools)ÚAudioLatentShapeÚLatentStateÚVideoLatentShapeÚVideoPixelShape)Úfind_matching_file)ÚSDOps)ÚDISTILLED_SIGMA_VALUESÚSTAGE_2_DISTILLED_SIGMA_VALUES)Úassert_resolutionÚcombined_image_conditioningsÚcreate_noised_state)Úeuler_denoising_loop)ÚSimpleDenoiser)ÚModalitySpec)Úencode_videoz%(asctime)s %(message)s)ÚlevelÚformatc                   @   s‚   e Zd ZdZ		ddededededB dejdB f
dd	„Ze 	¡ 	
					ddede
de
de
de
dededB defdd„ƒZdS )ÚLTX2PersistentEnginez=Load all models once, keep in GPU memory, run inference fast.NÚdistilled_checkpoint_pathÚspatial_upsampler_pathÚ
gemma_rootÚquantizationÚdevicec                 C   sP  |pt  d¡| _t j| _tƒ }t ¡ }t d¡ t ¡ }t|ƒ}	t	|dƒj
}
dd„ |
 d¡D ƒ}tt|ƒtttg|	¢R |d}|j| j| jd ¡ | _t d	t ¡ | d
›dt j ¡ d d
›d¡ t d¡ t ¡ }t|tt|d}|j| j| jd | j¡ ¡ | _t dt ¡ | d
›dt j ¡ d d
›d¡ t d¡ t ¡ }t|tt|d}|j| j| jd | j¡ ¡ | _t dt ¡ | d
›dt j ¡ d d
›d¡ t d¡ t ¡ }t|t|d}|j| j| jd | j¡ ¡ | _t dt ¡ | d
›dt j ¡ d d
›d¡ t d¡ t ¡ }t|tt d|d}|j!}|j"}|d urEg |¢|j"¢R }t#d|j$› d|j%j$› g |j&¢|j%j&¢R d}| '|¡ (|¡}t)|j| jdƒ | j¡ ¡ | _*t dt ¡ | d
›dt j ¡ d d
›d¡ t d¡ t ¡ }t|t+t,|d}|j| j| jd | j¡ ¡ | _-t dt ¡ | d
›dt j ¡ d d
›d¡ t d ¡ t ¡ }t|t.t/|d}|j| j| jd | j¡ ¡ | _0t|t1t2|d}|j| j| jd | j¡ ¡ | _3t d!t ¡ | d
›dt j ¡ d d
›d¡ t ¡ | }t d"|d
›d#¡ t d$t j ¡ d d%›d&t j 4d'¡j5d d
›d(¡ d S ))NÚcudazLoading Gemma text encoder...zmodel*.safetensorsc                 S   s   g | ]}t |ƒ‘qS © )Ústr)Ú.0Úpr;   r;   ú/home/ubuntu/ltx2_server.pyÚ
<listcomp>_   s    z1LTX2PersistentEngine.__init__.<locals>.<listcomp>z*.safetensors)Ú
model_pathÚmodel_class_configuratorÚmodel_sd_opsÚ
module_opsÚregistry)r9   Údtypez  Text encoder loaded in z.1fz
s | VRAM: é   @ÚGBzLoading embeddings processor...)rA   rB   rC   rE   z!  Embeddings processor loaded in zLoading video encoder...z  Video encoder loaded in zLoading spatial upsampler...)rA   rB   rE   z  Upsampler loaded in zLoading transformer...r;   )rA   rB   rC   ÚlorasrE   Úchain_ú+)ÚnameÚmapping©r9   z  Transformer loaded in zLoading video decoder...z  Video decoder loaded in z"Loading audio decoder + vocoder...z"  Audio decoder+vocoder loaded in zALL MODELS LOADED in ÚszTotal VRAM: ú.2fz GB / r   ú GB)6Útorchr9   Úbfloat16rF   r   ÚtimeÚloggerÚinfor    r'   ÚparentÚrglobÚBuilderÚtupler   r   r   ÚbuildÚevalÚtext_encoderr:   Úmemory_allocatedr   r   ÚtoÚembeddings_processorr   r   Úvideo_encoderr   Ú	upsamplerr   r   rC   rD   r(   rL   Úsd_opsrM   Úwith_module_opsÚwith_sd_opsr   Útransformerr   r   Úvideo_decoderr   r
   Úaudio_decoderr   r   ÚvocoderÚget_device_propertiesÚtotal_memory)Úselfr5   r6   r7   r8   r9   rE   Út0Út1rD   Úmodel_folderÚweight_pathsÚtext_enc_builderÚemb_builderÚenc_builderÚ
up_builderÚtrans_builderrc   Úmodule_ops_transÚdec_builderÚaudio_dec_builderÚvocoder_builderÚ
total_loadr;   r;   r?   Ú__init__L   sÂ   

û.
ü .
ü .
ý .
û
þ .
ü .
ü ü .8zLTX2PersistentEngine.__init__é*   é   é   é)   ç      8@ÚpromptÚseedÚheightÚwidthÚ
num_framesÚ
frame_rateÚoutput_pathÚreturnc           3   	   C   sÊ  t ||dd i }t ¡ }	tj| jd |¡}
t|
d}t ¡ }| j |¡\}}| j	 
||¡}|j|j}}t ¡ | |d< t ¡ }t t¡ | j¡}|d |d }}tg ||| j| j| jd}td||||d	}t |¡}ttdd
||ƒ}t |¡}ttdd
|ƒ}t||d}t|d}t||j|| j| jd}t||j|| j| jd}t||ƒ}t ƒ } t!| j"dd}!t#|||| |!|d\}}| $|¡}| %|¡}| $|¡}| %|¡}t ¡ | |d< t ¡ }t&|j'dd… | j| j(d}"t ¡ | |d< t ¡ }t t)¡ | j¡}#tg ||| j| j| jd}$td||||d	}%t |%¡}&ttdd
|&|ƒ}'t |%¡}(ttdd
|(ƒ})t||$|#d  *¡ |"d}*t||#d  *¡ |j'd}+t|'|*j|| j| j|*j+|*j,d},t|)|+j|| j| j|+j+|+j,d}-t||ƒ}.t#|#|,|-| |!|.d\},}-|' $|,¡},|' %|,¡},|) $|-¡}-|) %|-¡}-t ¡ | |d< t ¡ }t- .¡ }/| j/ 0|,j'|/|
¡}0t1|-j'| j2| j3ƒ}1t ¡ | |d< |rÛt ¡ }t4||/ƒ}2t5|0||1||2d t ¡ | |d< t ¡ |	 |d< |S )z&Generate a video. Returns timing dict.T)rƒ   r„   Úis_two_stagerN   )Ú	generatorÚprompt_encodeé   )Úimagesrƒ   r„   ra   rF   r9   é   )ÚbatchÚframesrƒ   r„   Úfps)Ú
patch_size)ÚcontextÚconditionings)r“   )Útoolsr”   ÚnoiserrF   r9   )Úmax_batch_size)ÚsigmasÚvideo_stateÚaudio_stateÚstepperrf   ÚdenoiserÚstage1_denoiseN)Úlatentra   rb   Úupsampler   )r“   r”   Únoise_scaleÚinitial_latent)r“   r    r¡   )r•   r”   r–   rF   r9   r    r¡   Ústage2_denoiseÚdecode)Úvideor‘   Úaudior‡   Úvideo_chunks_numberÚfile_encodeÚtotal)6r+   rT   rR   Ú	Generatorr9   Úmanual_seedr   r]   Úencoder`   Úprocess_hidden_statesÚvideo_encodingÚaudio_encodingÚTensorr)   r_   r,   ra   rF   r&   r%   Úfrom_pixel_shaper"   r   r#   Úfrom_video_pixel_shaper!   r   r0   r-   r”   r/   r   r   rf   r.   Úclear_conditioningÚ
unpatchifyr   rž   rb   r*   Úitemr    r¡   r   Údefaultrg   Údecode_videoÚvae_decode_audiorh   ri   r   r1   )3rl   r   r‚   rƒ   r„   r…   r†   r‡   ÚtimingsÚt_totalrŠ   r–   rm   Úraw_hsÚmaskÚctxÚvideo_contextÚaudio_contextÚstage_1_sigmasÚs1_wÚs1_hÚstage_1_conditioningsÚpixel_shapeÚv_shapeÚvideo_toolsÚa_shapeÚaudio_toolsÚ
video_specÚ
audio_specr™   rš   rœ   r›   rf   Úupscaled_video_latentÚstage_2_sigmasÚstage_2_conditioningsÚpixel_shape_2Ú	v_shape_2Úvideo_tools_2Ú	a_shape_2Úaudio_tools_2Úvideo_spec_2Úaudio_spec_2Úvideo_state_2Úaudio_state_2Ú
denoiser_2Útiling_configÚvideo_chunksr¥   r¦   r;   r;   r?   ÚgenerateÈ   sà   
þ



þ
þ

ú
	


ýþ

þþ
ý
ý

ú
	



ÿzLTX2PersistentEngine.generate)NN)r|   r}   r~   r   r€   N)Ú__name__Ú
__module__Ú__qualname__Ú__doc__r<   r   rR   r9   r{   Úinference_modeÚintÚfloatÚdictrÙ   r;   r;   r;   r?   r4   I   sN    úþýüû
ú|øþýüûúùø	÷r4   c               	   C   sP  d} | › d}| › d}| › d}t |||t ¡ d}g d¢}tjddd	 t d
¡ t d¡ t d
¡ |j|d dddd}t d|› ¡ t|ƒD ]J\}}t d
¡ t d|d › ¡ t d
¡ |j|d| dd|› dd}t d|d › d|› ¡ | 	¡ D ]\}	}
t d|	› d|
d›d¡ q„qLt dt
j ¡ d d›d¡ d S )Nz/home/ubuntu/ltx2-modelsz&/ltx-2.3-22b-distilled-fp8.safetensorsz,/ltx-2.3-spatial-upscaler-x2-1.0.safetensorsz/gemma3)r5   r6   r7   r8   )zWA golden retriever running through a field of wildflowers at sunset, cinematic lightingz]A futuristic cityscape at night with neon lights reflecting on wet streets, aerial drone shotzVOcean waves crashing against rocky cliffs, slow motion, dramatic sky with storm cloudsz/home/ubuntu/ltx2_bench_outputT)Úexist_okz<============================================================z
WARMUP RUNr   r|   r   z4/home/ubuntu/ltx2_bench_output/persistent_warmup.mp4)r   r‚   r…   r‡   zWarmup: zRUN rŽ   z*/home/ubuntu/ltx2_bench_output/persistent_z.mp4zRun z
 timings: z  z: rP   rO   zPeak VRAM: rG   rQ   )r4   r   Úfp8_castÚosÚmakedirsrU   rV   rÙ   Ú	enumerateÚitemsrR   r:   Úmax_memory_allocated)Ú	MODEL_DIRÚDISTILLED_CKPTÚ
SPATIAL_UPÚ
GEMMA_ROOTÚengineÚpromptsÚtÚir   ÚkÚvr;   r;   r?   Úmain`  sB   


ü



þ



þÿ"ró   Ú__main__)WrÝ   Úgcrä   rT   ÚloggingÚenvironÚ
setdefaultrR   Últx_core.loaderÚltx_coreÚltx_core.batch_splitr   Ú#ltx_core.components.diffusion_stepsr   Últx_core.components.noisersr   Últx_core.components.patchifiersr   r   Últx_core.loader.registryr   Ú(ltx_core.loader.single_gpu_model_builderr	   rY   Últx_core.model.audio_vaer
   r   r   r   r   r·   Últx_core.model.transformerr   r   r   Últx_core.model.upsamplerr   r   Últx_core.model.video_vaer   r   r   r   r   r   Últx_core.quantizationr   Últx_core.text_encoders.gemmar   r   r   r   r   r    Últx_core.toolsr!   r"   Últx_core.typesr#   r$   r%   r&   Últx_core.utilsr'   r(   Últx_pipelines.utils.constantsr)   r*   Últx_pipelines.utils.helpersr+   r,   r-   Últx_pipelines.utils.samplersr.   Últx_pipelines.utils.denoisersr/   Últx_pipelines.utils.typesr0   Últx_pipelines.utils.media_ior1   ÚbasicConfigÚINFOÚ	getLoggerrÚ   rU   r4   ró   r;   r;   r;   r?   Ú<module>   sN      
  /
ÿ