o
    iV                     @  s  d Z ddlmZ ddlZddlmZ ddlmZmZ ddl	m
Z
 ddlmZmZ ddlZddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ ddlmZ ddlmZ ddlmZ ddl m!Z!m"Z" ddl#m$Z% ddl&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z, ddl&m-Z. ddl/m0Z0m1Z1m2Z2 ddl3m4Z4m5Z5 ddl6m7Z7m8Z8 ddl9m:Z:m;Z;m<Z<m=Z=m>Z>m?Z? ddl@mAZA ddlBmCZCmDZDmEZEmFZFmGZGmHZH ddlImJZJ ddlKmLZLmMZMmNZN ddlOmPZPmQZQmRZRmSZSmTZT ddlUmVZV ddlWmXZX ddlYmZZZm[Z[m\Z\ dd l]m^Z^ dd!l_m`Z`maZa ebecZded"Zeed#ejfjgd$ZhedOd.d/ZidPd:d;ZjdQd?d@ZkG dAdB dBZlG dCdD dDZmG dEdF dFZnG dGdH dHZoG dIdJ dJZpG dKdL dLZqG dMdN dNZrdS )Ru   Pipeline blocks — each block owns its model lifecycle.
Blocks build a model on each ``__call__``, use it, then free GPU memory.
This eliminates manual ``del model; cleanup_memory()`` in pipelines and
removes the need for :class:`ModelLedger`.
    )annotationsN)Iterator)AbstractContextManagercontextmanager)replace)CallableTypeVar)BatchSplitAdapter)EulerDiffusionStep)Noiser)AudioPatchifierVideoLatentPatchifier)DiffusionStepProtocol)LayerStreamingWrapper)SDOps)LoraPathStrengthAndSDOps)DummyRegistryRegistry)SingleGPUModelBuilder)#AUDIO_VAE_DECODER_COMFY_KEYS_FILTER#AUDIO_VAE_ENCODER_COMFY_KEYS_FILTERVOCODER_COMFY_KEYS_FILTERAudioDecoderConfiguratorAudioEncoderConfiguratorVocoderConfigurator)decode_audio)LTXV_MODEL_COMFY_RENAMING_MAPLTXModelConfiguratorX0Model)COMPILE_TRANSFORMERmodify_sd_ops_for_compilation)LatentUpsamplerConfiguratorupsample_video)VAE_DECODER_COMFY_KEYS_FILTERVAE_ENCODER_COMFY_KEYS_FILTERTilingConfigVideoDecoderConfiguratorVideoEncoderVideoEncoderConfigurator)QuantizationPolicy)EMBEDDINGS_PROCESSOR_KEY_OPSGEMMA_LLM_KEY_OPSGEMMA_MODEL_OPSEmbeddingsProcessorConfiguratorGemmaTextEncoderConfiguratormodule_ops_from_gemma_root)EmbeddingsProcessorOutput)AudioLatentToolsLatentToolsVideoLatentTools)AudioAudioLatentShapeLatentStateVideoLatentShapeVideoPixelShape)find_matching_file	gpu_model)cleanup_memorycreate_noised_stategenerate_enhanced_prompt)euler_denoising_loop)DenoiserModalitySpecT_M)boundmodellayers_attrstrtarget_devicetorch.deviceprefetch_countintreturnIterator[_M]c                 c  s    t | |||d}z:|V  W |  |d t  tjj|d zttjdr0tj	  W dS W dS  t
yC   tjddd Y dS w |  |d t  tjj|d zttjdretj	  W w W w  t
yv   tjddd Y w w )	zKWrap *model* with :class:`LayerStreamingWrapper`, yield it, then tear down.rF   rH   rJ   metadevice_host_emptyCachez*Host empty cache cleanup failed; ignoring.T)exc_infoN)r   teardowntor<   torchcudasynchronizehasattr_CrR   	Exceptionloggerwarning)rE   rF   rH   rJ   wrapped r_   K/home/ubuntu/LTX-2/packages/ltx-pipelines/src/ltx_pipelines/utils/blocks.py_streaming_modelV   s>   

ra   specrA   toolsr2   noiserr   dtypetorch.dtyperQ   r6   c              	   C  s:   t || j|||| j| jd}| jrt|t|jd}|S )z<Create a noised latent state from a modality spec and tools.)rc   conditioningsrd   re   rQ   noise_scaleinitial_latent)denoise_mask)	r=   rg   rh   ri   frozenr   rV   
zeros_likerj   )rb   rc   rd   re   rQ   stater_   r_   r`   _build_statev   s   	rn   itIterator[torch.Tensor]torch.nn.Modulec                 c  s:    t | | E dH  W d   dS 1 sw   Y  dS )zNWrap an iterator to clean up *model* memory once it is exhausted or abandoned.Nr:   )ro   rE   r_   r_   r`   _cleanup_iter   s   
"rr   c                   @  sV   e Zd ZdZ				d;d<ddZddd=ddZd>d!d"Z						#d?d@d9d:ZdS )ADiffusionStagezOwns transformer lifecycle. Builds on each call, frees on exit.
    Replaces the manual ``model_ledger.transformer()`` / ``del transformer``
    pattern in every pipeline.
    r_   NFcheckpoint_pathrG   re   rf   rQ   rI   loras$tuple[LoraPathStrengthAndSDOps, ...]quantizationQuantizationPolicy | NoneregistryRegistry | Nonetorch_compileboolrL   Nonec                 C  s:   || _ || _|| _|| _t|ttt||pt d| _	d S )N)
model_pathmodel_class_configuratormodel_sd_opsru   ry   )
_dtype_device_quantization_torch_compileBuilderr   r   tupler   _transformer_builder)selfrt   re   rQ   ru   rw   ry   r{   r_   r_   r`   __init__   s   
zDiffusionStage.__init__rP   torch.device | Nonekwargsobjectr   c                  s   |p| j }| jj}| jj}| jj}| jr4g |tR }| j d d  t| }t	 fdd|D }| j
d ur[g || j
jR }td|j d| j
jj g |j| j
jjR d}| j|||}t|jd	d|i|| S )
Ntransformer
num_layersc                 3  s<    | ]}t |j|jt|jd ur|jntdd V  qd S )Nidentity)name)r   pathstrengthr    sd_opsr   ).0loranumber_of_layersr_   r`   	<genexpr>   s    
z4DiffusionStage._build_transformer.<locals>.<genexpr>sd_ops_chain_+)r   mappingrQ   r_   )r   r   r   
module_opsru   r   r   model_configr    r   r   r   r   r   r   with_module_opswith_sd_ops
with_lorasr   buildrU   eval)r   rQ   r   targetr   r   ru   builderr_   r   r`   _build_transformer   s&   



"z!DiffusionStage._build_transformerstreaming_prefetch_count
int | Noner   c                 K  sD   |d urt | jddtdi|d| j|dS t| jdi |S )NrQ   cpuz!velocity_model.transformer_blocksrN   r_   )ra   r   rV   rQ   r   r;   )r   r   r   r_   r_   r`   _transformer_ctx   s   zDiffusionStage._transformer_ctx   denoiserr@   sigmastorch.Tensorrd   r   widthrK   heightframesfpsfloatvideoModalitySpec | NoneaudiostepperDiffusionStepProtocol | NoneloopCCallable[..., tuple[LatentState | None, LatentState | None]] | Nonemax_batch_size-tuple[LatentState | None, LatentState | None]c              	   C  sd  |du r|	du rt d|du rt}|
du rt }
td||||d}d}d}|durBt|}ttdd||}t|||| j	| j
}d}d}|	durat|}ttdd|}t|	||| j	| j
}| j||d}t||d}|||||
||d\}}W d   n1 sw   Y  |dur|dur||}||}|dur|dur||}||}||fS )	u  Build transformer → run denoising loop → free transformer.
        Args:
            width: Output width in pixels.
            height: Output height in pixels.
            frames: Number of output frames.
            fps: Frame rate.
            loop: Denoising loop function. Must accept
                ``(sigmas, video_state, audio_state, stepper, transformer, denoiser)``
                as the first six positional arguments. When ``None``, resolves to
                :func:`euler_denoising_loop` at call time.
            streaming_prefetch_count: When set, build the transformer on CPU and
                wrap with :class:`LayerStreamingWrapper` for memory-efficient
                inference, prefetching this many layers ahead.
            max_batch_size: Maximum batch size per transformer forward pass.
                Guided denoisers make up to 4 transformer calls per step.
                When set to a value > 1, the transformer batches multiple
                calls together, reducing layer-streaming PCIe transfers.
                Default ``1`` preserves sequential behavior.
        Returns ``(video_state | None, audio_state | None)`` with cleared
        conditionings and unpatchified latents for present modalities.
        Nz3At least one of `video` or `audio` must be providedr   )batchr   r   r   r   )
patch_size)video_tools)r   )r   video_stateaudio_stater   r   r   )
ValueErrorr?   r
   r8   r7   from_pixel_shaper3   r   rn   r   r   r5   from_video_pixel_shaper1   r   r   r	   clear_conditioning
unpatchify)r   r   r   rd   r   r   r   r   r   r   r   r   r   r   pixel_shaper   r   v_shaper   audio_toolsa_shapebase_transformerr   r_   r_   r`   __call__   sJ   %





zDiffusionStage.__call__)r_   NNF)rt   rG   re   rf   rQ   rI   ru   rv   rw   rx   ry   rz   r{   r|   rL   r}   )rQ   r   r   r   rL   r   )r   r   r   r   rL   r   )NNNNNr   )r   r@   r   r   rd   r   r   rK   r   rK   r   rK   r   r   r   r   r   r   r   r   r   r   r   r   r   rK   rL   r   )__name__
__module____qualname____doc__r   r   r   r   r_   r_   r_   r`   rs      s     

rs   c                   @  s>   e Zd ZdZ	d#d$ddZd%ddZdddddd&d!d"ZdS )'PromptEncoderzOwns text encoder + embeddings processor lifecycle.
    Loads Gemma, encodes prompts, frees Gemma, then loads the embeddings
    processor to produce final outputs.
    Nrt   rG   
gemma_rootre   rf   rQ   rI   ry   rz   rL   r}   c           	      C  sv   || _ || _t|}t|dj}dd |dD }tt|tt	t
g|R |p)t d| _t|tt|p5t d| _d S )Nzmodel*.safetensorsc                 S  s   g | ]}t |qS r_   )rG   r   pr_   r_   r`   
<listcomp>K  s    z*PromptEncoder.__init__.<locals>.<listcomp>z*.safetensors)r~   r   r   r   ry   r~   r   r   ry   )r   r   r/   r9   parentrglobr   r   r.   r+   r,   r   _text_encoder_builderr-   r*   _embeddings_processor_builder)	r   rt   r   re   rQ   ry   r   model_folderweight_pathsr_   r_   r`   r   >  s$   
zPromptEncoder.__init__r   r   r   c                 C  sN   |d urt | jjtd| jd d| j|dS t| jj| j| jd S )Nr   rQ   re   z!model.model.language_model.layersrN   )	ra   r   r   rV   rQ   r   r   r   r;   )r   r   r_   r_   r`   _text_encoder_ctx[  s   zPromptEncoder._text_encoder_ctxF*   )enhance_first_promptenhance_prompt_imageenhance_prompt_seedr   prompts	list[str]r   r|   r   
str | Noner   rK   list[EmbeddingsProcessorOutput]c                  s   |  |#|rt|}t|d ||d|d< fdd|D }W d   n1 s+w   Y  t| jj| j| jd| j	   fdd|D W  d   S 1 sVw   Y  dS )uV   Encode *prompts* through Gemma → embeddings processor, freeing each model after use.r   )seedc                   s   g | ]}  |qS r_   )encoder   )text_encoderr_   r`   r   x  s    z*PromptEncoder.__call__.<locals>.<listcomp>Nr   c                   s   g | ]
\}}  ||qS r_   )process_hidden_states)r   hsmask)embeddings_processorr_   r`   r   }  s    )
r   listr>   r;   r   r   r   r   rU   r   )r   r   r   r   r   r   raw_outputsr_   )r   r   r`   r   h  s   

$zPromptEncoder.__call__N)rt   rG   r   rG   re   rf   rQ   rI   ry   rz   rL   r}   )r   r   rL   r   )r   r   r   r|   r   r   r   rK   r   r   rL   r   )r   r   r   r   r   r   r   r_   r_   r_   r`   r   8  s    
r   c                   @  s2   e Zd ZdZ	ddddZdddZdddZdS )ImageConditionerzrOwns video encoder lifecycle.
    Builds the encoder, passes it to the user-supplied callable, then frees it.
    Nrt   rG   re   rf   rQ   rI   ry   rz   rL   r}   c                 C  (   || _ || _t|tt|pt d| _d S Nr   )r   r   r   r(   r$   r   _encoder_builderr   rt   re   rQ   ry   r_   r_   r`   r        zImageConditioner.__init__r'   c                 C  s    | j j| j| jd| j S )Nr   )r   r   r   r   rU   r   )r   r_   r_   r`   _build_encoder  s    zImageConditioner._build_encoderfnCallable[[VideoEncoder], T]rB   c                 C  s8   t |  }||W  d   S 1 sw   Y  dS )u<   Build video encoder → call *fn(encoder)* → free encoder.N)r;   r   r   r   encoderr_   r_   r`   r     s   $zImageConditioner.__call__r   
rt   rG   re   rf   rQ   rI   ry   rz   rL   r}   )rL   r'   )r   r   rL   rB   )r   r   r   r   r   r   r   r_   r_   r_   r`   r     s    	
r   c                   @  s(   e Zd ZdZ	ddddZdddZdS )VideoUpsamplerz1Owns video encoder + spatial upsampler lifecycle.Nrt   rG   upsampler_pathre   rf   rQ   rI   ry   rz   rL   r}   c                 C  s>   || _ || _t|tt|pt d| _t|t|pt d| _d S )Nr   )r~   r   ry   )	r   r   r   r(   r$   r   r   r!   _upsampler_builder)r   rt   r   re   rQ   ry   r_   r_   r`   r     s   zVideoUpsampler.__init__latentr   c              	   C  s   t | jj| j| jd| j 9}t | jj| j| jd| j }t|||dW  d   W  d   S 1 s>w   Y  W d   dS 1 sNw   Y  dS )zJUpsample *latent* using video encoder + spatial upsampler, then free both.r   )r   video_encoder	upsamplerN)	r;   r   r   r   r   rU   r   r   r"   )r   r   r   r  r_   r_   r`   r     s   RzVideoUpsampler.__call__r   )rt   rG   r   rG   re   rf   rQ   rI   ry   rz   rL   r}   )r   r   rL   r   r   r   r   r   r   r   r_   r_   r_   r`   r     s
    r   c                   @  s.   e Zd ZdZ	ddddZ		ddddZdS )VideoDecoderztOwns video decoder lifecycle.
    Returns an iterator that cleans up the decoder after all chunks are consumed.
    Nrt   rG   re   rf   rQ   rI   ry   rz   rL   r}   c                 C  r   r   )r   r   r   r&   r#   r   _decoder_builderr   r_   r_   r`   r     r   zVideoDecoder.__init__r   r   tiling_configTilingConfig | None	generatortorch.Generator | Nonerp   c                 C  s4   | j j| j| jd| j }t|||||S )zLDecode *latent* to pixel-space video chunks. Decoder freed after exhaustion.r   )r  r   r   r   rU   r   rr   decode_video)r   r   r  r  decoderr_   r_   r`   r     s    zVideoDecoder.__call__r   r   )NN)r   r   r  r  r  r  rL   rp   r  r_   r_   r_   r`   r    s    	r  c                   @  (   e Zd ZdZ	ddddZdddZdS )AudioDecoderz'Owns audio decoder + vocoder lifecycle.Nrt   rG   re   rf   rQ   rI   ry   rz   rL   r}   c                 C  s@   || _ || _t|tt|pt d| _t|tt|pt d| _	d S r   )
r   r   r   r   r   r   r  r   r   _vocoder_builderr   r_   r_   r`   r     s   zAudioDecoder.__init__r   r   r4   c              	   C  s   t | jj| j| jd| j 8}t | jj| j| jd| j }t|||W  d   W  d   S 1 s=w   Y  W d   dS 1 sMw   Y  dS )zDDecode audio *latent* through VAE decoder + vocoder, then free both.r   N)	r;   r  r   r   r   rU   r   r  vae_decode_audio)r   r   r
  vocoderr_   r_   r`   r     s   
RzAudioDecoder.__call__r   r   )r   r   rL   r4   r  r_   r_   r_   r`   r    s
    r  c                   @  r  )AudioConditionerzOwns audio encoder lifecycle.
    Builds the encoder, passes it to the user-supplied callable, then frees it.
    Mirrors :class:`ImageConditioner` for the audio modality.
    Nrt   rG   re   rf   rQ   rI   ry   rz   rL   r}   c                 C  r   r   )r   r   r   r   r   r   r   r   r_   r_   r`   r   )  r   zAudioConditioner.__init__r   Callable[[torch.nn.Module], T]rB   c                 C  sP   t | jj| j| jd| j }||W  d   S 1 s!w   Y  dS )u<   Build audio encoder → call *fn(encoder)* → free encoder.r   N)r;   r   r   r   r   rU   r   r   r_   r_   r`   r   9  s   $zAudioConditioner.__call__r   r   )r   r  rL   rB   r  r_   r_   r_   r`   r  #  s
    
r  )
rE   rC   rF   rG   rH   rI   rJ   rK   rL   rM   )rb   rA   rc   r2   rd   r   re   rf   rQ   rI   rL   r6   )ro   rp   rE   rq   rL   rp   )sr   
__future__r   loggingcollections.abcr   
contextlibr   r   dataclassesr   typingr   r   rV   ltx_core.batch_splitr	   #ltx_core.components.diffusion_stepsr
   ltx_core.components.noisersr   ltx_core.components.patchifiersr   r   ltx_core.components.protocolsr   ltx_core.layer_streamingr   ltx_core.loaderr   ltx_core.loader.primitivesr   ltx_core.loader.registryr   r   (ltx_core.loader.single_gpu_model_builderr   r   ltx_core.model.audio_vaer   r   r   r   r   r   r   r  ltx_core.model.transformerr   r   r   $ltx_core.model.transformer.compilingr   r    ltx_core.model.upsamplerr!   r"   ltx_core.model.video_vaer#   r$   r%   r&   r'   r(   ltx_core.quantizationr)   ltx_core.text_encoders.gemmar*   r+   r,   r-   r.   r/   1ltx_core.text_encoders.gemma.embeddings_processorr0   ltx_core.toolsr1   r2   r3   ltx_core.typesr4   r5   r6   r7   r8   ltx_core.utilsr9   ltx_pipelines.utils.gpu_modelr;   ltx_pipelines.utils.helpersr<   r=   r>   ltx_pipelines.utils.samplersr?   ltx_pipelines.utils.typesr@   rA   	getLoggerr   r\   rB   nnModulerC   ra   rn   rr   rs   r   r   r   r  r  r  r_   r_   r_   r`   <module>   sb       


 "M#+%+