o
    Gi"                 
   @   s  d dl Z d dlZd dlmZ d dlmZmZ d dlZd dl	Z
d dlZd dlmZmZmZ ddlmZmZ ddlmZmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z, e rd dl-m.  m/Z0 dZ1ndZ1e2e3Z4dZ5eG dd dZ6	d4dej7dej8dB de9fddZ:		 	!	"d5d#e;d$e;d%e<d&e<fd'd(Z=				d6d)e;dB d*e9ej>B dB d+e?e; dB d,e?e< dB fd-d.Z@d7d0d1ZAG d2d3 d3e&eeZBdS )8    N)	dataclass)AnyCallable)Gemma3ForConditionalGenerationGemmaTokenizerGemmaTokenizerFast   )MultiPipelineCallbacksPipelineCallback)FromSingleFileMixinLTX2LoraLoaderMixin)AutoencoderKLLTX2AudioAutoencoderKLLTX2Video)LTX2VideoTransformer3DModel)FlowMatchEulerDiscreteScheduler)is_torch_xla_availableloggingreplace_example_docstring)randn_tensor)VideoProcessor   )DiffusionPipeline   )LTX2TextConnectors)LTX2PipelineOutput)LTX2VocoderTFaZ  
    Examples:
        ```py
        >>> import torch
        >>> from diffusers import LTX2ConditionPipeline
        >>> from diffusers.pipelines.ltx2.export_utils import encode_video
        >>> from diffusers.pipelines.ltx2.pipeline_ltx2_condition import LTX2VideoCondition
        >>> from diffusers.utils import load_image

        >>> pipe = LTX2ConditionPipeline.from_pretrained("Lightricks/LTX-2", torch_dtype=torch.bfloat16)
        >>> pipe.enable_model_cpu_offload()

        >>> first_image = load_image(
        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flf2v_input_first_frame.png"
        ... )
        >>> last_image = load_image(
        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flf2v_input_last_frame.png"
        ... )
        >>> first_cond = LTX2VideoCondition(frames=first_image, index=0, strength=1.0)
        >>> last_cond = LTX2VideoCondition(frames=last_image, index=-1, strength=1.0)
        >>> conditions = [first_cond, last_cond]
        >>> prompt = "CG animation style, a small blue bird takes off from the ground, flapping its wings."
        >>> negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted, static"

        >>> frame_rate = 24.0
        >>> video = pipe(
        ...     conditions=conditions,
        ...     prompt=prompt,
        ...     negative_prompt=negative_prompt,
        ...     width=768,
        ...     height=512,
        ...     num_frames=121,
        ...     frame_rate=frame_rate,
        ...     num_inference_steps=40,
        ...     guidance_scale=4.0,
        ...     output_type="np",
        ...     return_dict=False,
        ... )
        >>> video = (video * 255).round().astype("uint8")
        >>> video = torch.from_numpy(video)

        >>> encode_video(
        ...     video[0],
        ...     fps=frame_rate,
        ...     audio=audio[0].float().cpu(),
        ...     audio_sample_rate=pipe.vocoder.config.output_sampling_rate,  # should be 24000
        ...     output_path="video.mp4",
        ... )
        ```
c                   @   sN   e Zd ZU dZejjeejj B ejB e	j
B ed< dZeed< dZeed< dS )LTX2VideoConditiona  
    Defines a single frame-conditioning item for LTX-2 Video - a single frame or a sequence of frames.

    Attributes:
        frames (`PIL.Image.Image` or `List[PIL.Image.Image]` or `np.ndarray` or `torch.Tensor`):
            The image (or video) to condition the video on. Accepts any type that can be handled by
            VideoProcessor.preprocess_video.
        index (`int`, defaults to `0`):
            The index at which the image or video will conditionally affect the video generation.
        strength (`float`, defaults to `1.0`):
            The strength of the conditioning effect. A value of `1.0` means the conditioning effect is fully applied.
    framesr   index      ?strengthN)__name__
__module____qualname____doc__PILImagelistnpndarraytorchTensor__annotations__r   intr    float r/   r/   d/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/pipelines/ltx2/pipeline_ltx2_condition.pyr   d   s
   
 $r   sampleencoder_output	generatorsample_modec                 C   sR   t | dr|dkr| j|S t | dr|dkr| j S t | dr%| jS td)Nlatent_distr1   argmaxlatentsz3Could not access latents of provided encoder_output)hasattrr5   r1   moder7   AttributeError)r2   r3   r4   r/   r/   r0   retrieve_latentsy   s   

r;               ?ffffff?base_seq_lenmax_seq_len
base_shift	max_shiftc                 C   s,   || ||  }|||  }| | | }|S Nr/   )image_seq_lenr@   rA   rB   rC   mbmur/   r/   r0   calculate_shift   s   rI   num_inference_stepsdevice	timestepssigmasc                 K   s  |dur|durt d|dur>dtt| jj v }|s(t d| j d| jd||d| | j}t	|}||fS |durpdtt| jj v }|sZt d| j d| jd||d	| | j}t	|}||fS | j|fd
|i| | j}||fS )a  
    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler to get timesteps from.
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
            must be `None`.
        device (`str` or `torch.device`, *optional*):
            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
        timesteps (`list[int]`, *optional*):
            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
            `num_inference_steps` and `sigmas` must be `None`.
        sigmas (`list[float]`, *optional*):
            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
            `num_inference_steps` and `timesteps` must be `None`.

    Returns:
        `tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
        second element is the number of inference steps.
    NzYOnly one of `timesteps` or `sigmas` can be passed. Please choose one to set custom valuesrL   zThe current scheduler class zx's `set_timesteps` does not support custom timestep schedules. Please check whether you are using the correct scheduler.)rL   rK   rM   zv's `set_timesteps` does not support custom sigmas schedules. Please check whether you are using the correct scheduler.)rM   rK   rK   r/   )

ValueErrorsetinspect	signatureset_timesteps
parameterskeys	__class__rL   len)	schedulerrJ   rK   rL   rM   kwargsaccepts_timestepsaccept_sigmasr/   r/   r0   retrieve_timesteps   s2   r[           c                 C   sX   |j ttd|jdd}| j ttd| jdd}| ||  }|| d| |   } | S )a  
    Rescales `noise_cfg` tensor based on `guidance_rescale` to improve image quality and fix overexposure. Based on
    Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
    Flawed](https://huggingface.co/papers/2305.08891).

    Args:
        noise_cfg (`torch.Tensor`):
            The predicted noise tensor for the guided diffusion process.
        noise_pred_text (`torch.Tensor`):
            The predicted noise tensor for the text-guided diffusion process.
        guidance_rescale (`float`, *optional*, defaults to 0.0):
            A rescale factor applied to the noise predictions.

    Returns:
        noise_cfg (`torch.Tensor`): The rescaled noise prediction tensor.
    r   Tdimkeepdim)stdr'   rangendim)	noise_cfgnoise_pred_textguidance_rescalestd_textstd_cfgnoise_pred_rescaledr/   r/   r0   rescale_noise_cfg   s
   ri   c                =       s`  e Zd ZdZdZg Zg dZdedede	de
deeB d	ed
edef fddZe			ddejdejdeejB dedededejfddZ					ddeee B dedededejdB d ejdB fd!d"Z		#									ddeee B d$eee B dB d%eded&ejdB d'ejdB d(ejdB d)ejdB dededejdB d ejdB fd*d+Z							dd,d-Zedd.ejd/ed0edejfd1d2Ze	dd.ejd3ed4ed5ed/ed0edejfd6d7Z e	8dd.ejd9ejd:ejd;edejf
d<d=Z!e	8dd.ejd9ejd:ejd;edejf
d>d?Z"e	dd.ejd@eejB dAej#dB fdBdCZ$e	dd.ejd/edB d0edB dejfdDdEZ%e		dd.ejdFedGed/edB d0edB dejfdHdIZ&ed.ejd9ejd:ejfdJdKZ'ed.ejd9ejd:ejfdLdMZ(dNedOedPedefdQdRZ)		S	T	U	ddVe*ee* B dB d4ed5ed3edejdB de+eej ee ee f fdWdXZ,d.ejdYejdZeej d[ee d\ee d]ed^ede+ejejejf fd_d`Z-			a	S	T	U	8				ddVe*ee* B dB dbedced4ed5ed3ed@ed ejdB dejdB dAej#dB d.ejdB de+ejejejf fdddeZ.				f	g				ddbedcedhedGed@ed ejdB dejdB dAej#dB d.ejdB dejfdidjZ/e0dkdl Z1e0dmdn Z2e0dodp Z3e0dqdr Z4e0dsdt Z5e0dudv Z6e0dwdx Z7e8 e9e:ddddSdTdUdydzddd{dgddddddddddgdd|d#ddd.gdfdVe*ee* B dB deee B d$eee B dB d4ed5ed3ed}ed~edee dB dee dB deded@edB dedB dAej#eej# B dB d.ejdB dejdB d&ejdB d(ejdB d'ejdB d)ejdB deee B deee B dB dedede;ee<f dB de=eegdf dB dee def:ddZ>  Z?S )LTX2ConditionPipelinez
    Pipeline for video generation which allows image conditions to be inserted at arbitary parts of the video.

    Reference: https://github.com/Lightricks/LTX-Video

    TODO
    z>text_encoder->connectors->transformer->vae->audio_vae->vocoder)r7   prompt_embedsnegative_prompt_embedsrW   vae	audio_vaetext_encoder	tokenizer
connectorstransformervocoderc	           	   
      sR  t    | j||||||||d t| dd d ur| jjnd| _t| dd d ur-| jjnd| _t| dd d ur<| j	j
nd| _t| dd d urK| j	jnd| _t| dd d ur[| jjjnd| _t| dd urj| jjjnd| _t| dd d urz| j	jjnd	| _t| dd d ur| j	jjnd
| _t| jdd| _t| dd d ur| jj| _d S d| _d S )N)rm   rn   ro   rp   rq   rr   rs   rW   rm          rn      rr   r   i>     bilinear)vae_scale_factorresamplerp      )super__init__register_modulesgetattrrm   spatial_compression_ratiovae_spatial_compression_ratiotemporal_compression_ratiovae_temporal_compression_ratiorn   mel_compression_ratioaudio_vae_mel_compression_ratio$audio_vae_temporal_compression_ratiorr   config
patch_sizetransformer_spatial_patch_sizepatch_size_ttransformer_temporal_patch_sizesample_rateaudio_sampling_ratemel_hop_lengthaudio_hop_lengthr   video_processorrp   model_max_lengthtokenizer_max_length)	selfrW   rm   rn   ro   rp   rq   rr   rs   rU   r/   r0   r}      s@   
zLTX2ConditionPipeline.__init__leftru   ư>text_hidden_statessequence_lengthsrK   padding_sidescale_factorepsreturnc                 C   s^  | j \}}}}	| j}
tj||dd}|dkr#||dddf k }n|dkr6||dddf  }||k}ntd| |ddddddf }| | d}|| |ddd}|jd	d
d||  }| | t	dj
d	d
d}| | t	djd	d
d}| | || |  }|| }|d}|ddd||	 }|| d}|j|
d}|S )a&  
        Packs and normalizes text encoder hidden states, respecting padding. Normalization is performed per-batch and
        per-layer in a masked fashion (only over non-padded positions).

        Args:
            text_hidden_states (`torch.Tensor` of shape `(batch_size, seq_len, hidden_dim, num_layers)`):
                Per-layer hidden_states from a text encoder (e.g. `Gemma3ForConditionalGeneration`).
            sequence_lengths (`torch.Tensor of shape `(batch_size,)`):
                The number of valid (non-padded) tokens for each batch instance.
            device: (`str` or `torch.device`, *optional*):
                torch device to place the resulting embeddings on
            padding_side: (`str`, *optional*, defaults to `"left"`):
                Whether the text tokenizer performs padding on the `"left"` or `"right"`.
            scale_factor (`int`, *optional*, defaults to `8`):
                Scaling factor to multiply the normalized hidden states by.
            eps (`float`, *optional*, defaults to `1e-6`):
                A small positive value for numerical stability when performing normalization.

        Returns:
            `torch.Tensor` of shape `(batch_size, seq_len, hidden_dim * num_layers)`:
                Normed and flattened text encoder hidden states.
        rK   r   rightNr   z,padding_side must be 'left' or 'right', got r\   r   )r   r   Tr]   infz-infr   dtype)shaper   r*   arange	unsqueezerN   masked_fillviewsumr.   aminamaxflattensqueezeexpandto)r   r   rK   r   r   r   
batch_sizeseq_len
hidden_dim
num_layersoriginal_dtypetoken_indicesmaskstart_indicesmasked_text_hidden_statesnum_valid_positionsmasked_meanx_minx_maxnormalized_hidden_states	mask_flatr/   r/   r0   _pack_text_embeds/  s,    

z'LTX2ConditionPipeline._pack_text_embedsr   r{   Npromptnum_videos_per_promptmax_sequence_lengthr   c                 C   sF  |p| j }|p
| jj}t|tr|gn|}t|}t| dddur1d| j_| jj	du r1| jj
| j_	dd |D }| j|d|dddd	}|j}	|j}
|	|}	|
|}
| j|	|
dd
}|j}tj|dd}|
jdd}| j|||| jj|d}|j|d}|j\}}}|d|d}||| |d}|
|d}
|
|d}
||
fS )a  
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `list[str]`, *optional*):
                prompt to be encoded
            device: (`str` or `torch.device`):
                torch device to place the resulting embeddings on
            dtype: (`torch.dtype`):
                torch dtype to cast the prompt embeds to
            max_sequence_length (`int`, defaults to 1024): Maximum sequence length to use for the prompt.
        rp   Nr   c                 S   s   g | ]}|  qS r/   )strip).0pr/   r/   r0   
<listcomp>  s    zBLTX2ConditionPipeline._get_gemma_prompt_embeds.<locals>.<listcomp>
max_lengthTpt)paddingr   
truncationadd_special_tokensreturn_tensors)	input_idsattention_maskoutput_hidden_statesr   r^   )rK   r   r   r   r   )_execution_devicero   r   
isinstancestrrV   r   rp   r   	pad_token	eos_tokenr   r   r   hidden_statesr*   stackr   r   r   repeatr   )r   r   r   r   r   rK   r   r   text_inputstext_input_idsprompt_attention_masktext_encoder_outputstext_encoder_hidden_statesr   rk   _r   r/   r/   r0   _get_gemma_prompt_embedst  sR   


z.LTX2ConditionPipeline._get_gemma_prompt_embedsTnegative_promptdo_classifier_free_guidancerk   rl   r   negative_prompt_attention_maskc              
   C   s  |p| j }t|tr|gn|}|durt|}n|jd }|du r.| j|||	|
||d\}}|r|du r|p7d}t|trB||g n|}|dur_t|t|ur_tdt| dt| d|t|krxtd| d	t| d
| d	| d	| j|||	|
||d\}}||||fS )a"  
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `list[str]`, *optional*):
                prompt to be encoded
            negative_prompt (`str` or `list[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
                Whether to use classifier free guidance or not.
            num_videos_per_prompt (`int`, *optional*, defaults to 1):
                Number of videos that should be generated per prompt. torch device to place the resulting embeddings on
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            device: (`torch.device`, *optional*):
                torch device
            dtype: (`torch.dtype`, *optional*):
                torch dtype
        Nr   )r   r   r   r   rK   r    z?`negative_prompt` should be the same type to `prompt`, but got z != .z`negative_prompt`: z has batch size z, but `prompt`: zT. Please make sure that passed `negative_prompt` matches the batch size of `prompt`.)	r   r   r   rV   r   r   type	TypeErrorrN   )r   r   r   r   r   rk   rl   r   r   r   r   rK   r   r   r/   r/   r0   encode_prompt  sP   
)


	
	z#LTX2ConditionPipeline.encode_promptc                    s  |d dks|d dkrt d| d| d|d ur8t fdd|D s8t d j d	 fd
d|D  |d urK|d urKt d| d| d|d u rW|d u rWt d|d urnt|tsnt|tsnt dt| |d urz|d u rzt d|d ur|d u rt d|d ur|d ur|j|jkrt d|j d|j d|j|jkrt d|j d|j d|	d ur|	jdkrt d|	j d|
d ur|
jdkrt d|	j dd S d S )Nrt   r   z8`height` and `width` have to be divisible by 32 but are z and r   c                 3   s    | ]}| j v V  qd S rD   _callback_tensor_inputsr   kr   r/   r0   	<genexpr>$  s    

z5LTX2ConditionPipeline.check_inputs.<locals>.<genexpr>z2`callback_on_step_end_tensor_inputs` has to be in z, but found c                    s   g | ]	}| j vr|qS r/   r   r   r   r/   r0   r   (  s    z6LTX2ConditionPipeline.check_inputs.<locals>.<listcomp>zCannot forward both `prompt`: z and `prompt_embeds`: z2. Please make sure to only forward one of the two.zeProvide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined.z2`prompt` has to be of type `str` or `list` but is zEMust provide `prompt_attention_mask` when specifying `prompt_embeds`.zWMust provide `negative_prompt_attention_mask` when specifying `negative_prompt_embeds`.zu`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but got: `prompt_embeds` z != `negative_prompt_embeds` z`prompt_attention_mask` and `negative_prompt_attention_mask` must have the same shape when passed directly, but got: `prompt_attention_mask` z% != `negative_prompt_attention_mask`    zOnly unpacked (5D) video latents of shape `[batch_size, latent_channels, latent_frames, latent_height, latent_width] are supported, but got ze dims. If you have packed (3D) latents, please unpack them (e.g. using the `_unpack_latents` method).rv   zuOnly unpacked (4D) audio latents of shape `[batch_size, num_channels, audio_length, mel_bins] are supported, but got zk dims. If you have packed (3D) latents, please unpack them (e.g. using the `_unpack_audio_latents` method).)	rN   allr   r   r   r'   r   r   rb   )r   r   heightwidth"callback_on_step_end_tensor_inputsrk   rl   r   r   r7   audio_latentsr/   r   r0   check_inputs  sh   z"LTX2ConditionPipeline.check_inputsr7   r   r   c              
   C   sl   | j \}}}}}|| }|| }	|| }
| |d|||	||
|} | dddddddd	dd	dd} | S )
Nr   r   r   rv      r   r   r      )r   reshapepermuter   )r7   r   r   r   num_channels
num_framesr   r   post_patch_num_framespost_patch_heightpost_patch_widthr/   r/   r0   _pack_latentsX  s    (
z#LTX2ConditionPipeline._pack_latentsr   r   r   c              
   C   sV   |  d}| ||||d|||} | dddddddd	dd	dddd} | S )
Nr   r   rv   r   r   r   r   r   r   )sizer   r   r   )r7   r   r   r   r   r   r   r/   r/   r0   _unpack_latentsp  s   
0z%LTX2ConditionPipeline._unpack_latentsr   latents_meanlatents_stdscaling_factorc                 C   sP   | ddddd| j| j}| ddddd| j| j}| | | | } | S Nr   r   r   r   rK   r   r7   r   r  r  r/   r/   r0   _normalize_latents}     z(LTX2ConditionPipeline._normalize_latentsc                 C   sP   | ddddd| j| j}| ddddd| j| j}| | | | } | S r  r  r  r/   r/   r0   _denormalize_latents  r  z*LTX2ConditionPipeline._denormalize_latentsnoise_scaler3   c                 C   s.   t | j|| j| jd}|| d| |   }|S )Nr3   rK   r   r   )r   r   rK   r   )r7   r	  r3   noisenoised_latentsr/   r/   r0   _create_noised_state  s   z*LTX2ConditionPipeline._create_noised_statec           	      C   s   |d ur5|d ur5| j \}}}}|| }|| }| |d||||} | dddddddddd} | S | dddd} | S )Nr   r   r   rv   r   r   r   )r   r   r   r   	transpose)	r7   r   r   r   r   latent_lengthlatent_mel_binspost_patch_latent_lengthpost_patch_mel_binsr/   r/   r0   _pack_audio_latents  s   $z)LTX2ConditionPipeline._pack_audio_latentsr  num_mel_binsc                 C   sr   |d ur+|d ur+|  d}| |||d||} | dddddddddd} | S | dd|fdd} | S )Nr   r   r   r   rv   r   r   )r   r   r   r   	unflattenr  )r7   r  r  r   r   r   r/   r/   r0   _unpack_audio_latents  s   
$z+LTX2ConditionPipeline._unpack_audio_latentsc                 C   s,   | | j| j}| | j| j}| | | S rD   r   rK   r   r7   r   r  r/   r/   r0   _normalize_audio_latents     z.LTX2ConditionPipeline._normalize_audio_latentsc                 C   s,   | | j| j}| | j| j}| | | S rD   r  r  r/   r/   r0   _denormalize_audio_latents  r  z0LTX2ConditionPipeline._denormalize_audio_latentsstart_framesequence_num_framestarget_num_framesc                 C   s,   | j }t||| }|d | | d }|S )a  
        Trim a conditioning sequence to the allowed number of frames.

        Args:
            start_frame (int): The target frame number of the first frame in the sequence.
            sequence_num_frames (int): The number of frames in the sequence.
            target_num_frames (int): The target number of frames in the generated video.
        Returns:
            int: updated sequence length
        r   )r   min)r   r  r  r  r   r   r/   r/   r0   trim_conditioning_sequence  s   z0LTX2ConditionPipeline.trim_conditioning_sequence      y   
conditionsc              
   C   s  g g g }}}|du rg }t |tr|g}| j}	|d |	 d }
t|D ]\}}t |jtjjr6|jg}n-t |jtjrL|jj	dkrLtj
|jdd}nt |jtjr`|jj	dkr`|jd}n|j}| jj|||dd}|j}|dk rx||
 }||
krtd| d	| d
|
 d q%|d}t|d |	 d d}| |||}|ddddd|f }||j| jj|d ||j || q%|||fS )a  
        Preprocesses the condition images/videos to torch tensors.

        Args:
            conditions (`LTX2VideoCondition` or `List[LTX2VideoCondition]`, *optional*, defaults to `None`):
                A list of image/video condition instances.
            height (`int`, *optional*, defaults to `512`):
                The desired height in pixels.
            width (`int`, *optional*, defaults to `768`):
                The desired width in pixels.
            num_frames (`int`, *optional*, defaults to `121`):
                The desired number of frames in the generated video.
            device (`torch.device`, *optional*, defaults to `None`):
                The device on which to put the preprocessed image/video tensors.

        Returns:
            `Tuple[List[torch.Tensor], List[float], List[int]]`:
                Returns a 3-tuple of lists of length `len(conditions)` as follows:
                    1. The first list is a list of preprocessed video tensors of shape [batch_size=1, num_channels,
                       num_frames, height, width].
                    2. The second list is a list of conditioning strengths.
                    3. The third list is a list of indices in latent space to insert the corresponding condition.
        Nr   r   r   )axiscrop)resize_modezThe starting latent index z of condition z6 is too big for the specified number of latent frames z!. This condition will be skipped.r   )r   rK   )r   r   r   	enumerater   r%   r&   r(   r)   rb   expand_dimsr*   r+   r   r   preprocess_videor   loggerwarningr   maxr   appendr   rm   r   r    )r   r$  r   r   r   rK   conditioning_framesconditioning_strengthsconditioning_indicesframe_scale_factorlatent_num_framesi	conditionvideo_like_condcondition_pixelslatent_start_idxcond_num_frames	start_idxtruncated_cond_framesr/   r/   r0   preprocess_conditions  sH   



z+LTX2ConditionPipeline.preprocess_conditionsconditioning_maskcondition_latentscondition_strengthscondition_indiceslatent_heightlatent_widthc                 C   s   t |}t|||D ]2\}	}
}|	d}|| | }|| }|	|dd||f< |
|dd||f< |	|dd||f< q|||fS )az  
        Applies visual conditioning frames to an initial latent.

        Args:
            latents (`torch.Tensor`):
                Initial packed (patchified) latents of shape [batch_size, patch_seq_len, hidden_dim].
            conditioning_mask (`torch.Tensor`, *optional*):
                Initial packed (patchified) conditioning mask of shape [batch_size, patch_seq_len, 1] with values in
                [0, 1] where 0 means that the denoising model output will be fully used and 1 means that the condition
                will be fully used (with intermediate values specifying a blend of the denoised and latent values).

        Returns:
            `Tuple[torch.Tensor, torch.Tensor, torch.Tensor]`:
                Returns a 3-tuple of tensors where:
                    1. The first element is the packed video latents (with unchanged shape [batch_size, patch_seq_len,
                       hidden_dim]) with the conditions applied
                    2. The second element is the packed conditioning mask with conditioning strengths applied
                    3. The third element holds the clean conditioning latents.
        r   N)r*   
zeros_likezipr   )r   r7   r=  r>  r?  r@  rA  rB  clean_latentscondr    
latent_idxnum_cond_tokensstart_token_idxend_token_idxr/   r/   r0   apply_visual_conditioning8  s   


z/LTX2ConditionPipeline.apply_visual_conditioning   r   num_channels_latentsc              	   C   s  || j  }|| j  }|d | j d }|||||f}|d|||f}|d ur5| || jj| jj| jjj}ntj	||	|d}|
|}| || j| j}| || j| j}|jdksg|jd d |jd d kr{td|j d|jd d |f  dt|
trt| jj d |
d	 }
| j|||||	d
\}}}g }|D ]+}t| j||
dd}| || jj| jjj|	|d}| || j| j}|| q| j|||||||d\}}}t|j|
|j|jd}d| | }|| |d|   }|||fS )Nr   rK   r   r   r   z$Provided `latents` tensor has shape z, but the expected shape is r   z~ does not support using a list of generators. The first generator in the list will be used for all (pseudo-)random operations.r   r   r6   )r3   r4   )rA  rB  r
  r   ) r   r   r  rm   r   r  r   r  r*   zeros	new_zerosr   r   r   rb   r   rN   r   r'   r+  r,  rU   r!   r<  r;   encoder   r.  rK  r   rK   r   )r   r$  r   rM  r   r   r   r	  r   rK   r3   r7   rA  rB  r3  r   
mask_shaper=  condition_framesr?  r@  r>  condition_tensorcondition_latentrE  r  scaled_maskr/   r/   r0   prepare_latentsc  sp   




&"



z%LTX2ConditionPipeline.prepare_latents@   r\   audio_latent_lengthc
                 C   s   |	d ur"|  |	}	| |	| jj| jj}	| |	||}	|	j||dS || j }
||||
f}t|t	rEt
||krEtdt
| d| dt||||d}	|  |	}	|	S )NrN  z/You have passed a list of generators of length z+, but requested an effective batch size of z@. Make sure the batch size matches the length of the generators.r
  )r  r  rn   r   r  r  r   r   r   r'   rV   rN   r   )r   r   rM  rY  r  r	  r   rK   r3   r7   r  r   r/   r/   r0   prepare_audio_latents  s    


z+LTX2ConditionPipeline.prepare_audio_latentsc                 C      | j S rD   _guidance_scaler   r/   r/   r0   guidance_scale     z$LTX2ConditionPipeline.guidance_scalec                 C   r[  rD   )_guidance_rescaler   r/   r/   r0   re     r_  z&LTX2ConditionPipeline.guidance_rescalec                 C   s
   | j dkS )Nr   r\  r   r/   r/   r0   r     s   
z1LTX2ConditionPipeline.do_classifier_free_guidancec                 C   r[  rD   )_num_timestepsr   r/   r/   r0   num_timesteps  r_  z#LTX2ConditionPipeline.num_timestepsc                 C   r[  rD   )_current_timestepr   r/   r/   r0   current_timestep  r_  z&LTX2ConditionPipeline.current_timestepc                 C   r[  rD   )_attention_kwargsr   r/   r/   r0   attention_kwargs  r_  z&LTX2ConditionPipeline.attention_kwargsc                 C   r[  rD   )
_interruptr   r/   r/   r0   	interrupt  r_  zLTX2ConditionPipeline.interruptg      8@(   g      @pil
frame_raterJ   rM   rL   r^  re   r   decode_timestepdecode_noise_scaleoutput_typereturn_dictrf  callback_on_step_endr   c           Q      C   s  t |ttfr
|j}| j||||||||||d
 || _|| _|| _d| _d| _	|dur4t |t
r4d}n|durBt |trBt|}n|jd }|durSt |tsS|g}|du ra|	dur_|	d nd}| j}| j||| j|||||||d
\}}}}| jrtj||gdd}tj||gdd}d||j d	 } | j|| d
d\}!}"}#|d | j d }$|| j }%|| j }&|durtd |j\}'}'}$}%}&|$|% |& }(| jjj})| ||| |)||||tj|||\}}*}+| jrt|*|*g}*|| },| j| j  t!| j" }-t#|,|- }.|durtd |j\}'}'}.}'t$| dddur$| j%jj&nd}/|/| j' }0t$| dddur9| j%jj(nd}1| j)|| |1|.|/|tj|||d	}|	du rZt*+dd| |n|	}	t,|(| j-j.dd| j-j.dd| j-j.dd| j-j.dd}2t/0| j-}3t1|3|||
|	|2d\}'}'t1| j-|||
|	|2d\}
}t2t|
|| j-j3  d}4t|
| _4| j| | j| jf}5| jj5j6|jd |$|%|&|j7|d}6| jj89|jd |.|j7}7| jr|6:dd|6j;d   }6|7:dd|7j;d   }7| j<|d}8t=|
D ]u\}9}:| j>rq|:| _	| jrt|gd n|};|;|j};| jr+t|gd n|}<|<|j}<|:?|;jd }=|=@d d|*Ad   }>| jBd!C | jd=i d"|;d#|<d$|!d%|"d&|>d'|=d(|#d)|#d*|$d+|%d,|&d-|d.|.d/|6d0|7d1|d2d\}?}@W d   n	1 sw   Y  |?! }?|@! }@| jr|?Cd\}A}B|A| jD|B|A   }?|@Cd\}C}D|C| jD|D|C   }@| jEdkrtF|?|B| jEd3}?tF|@|D| jEd3}@|?Gd}E| j-jH|9 }F||?|F  }G|Gd|*d|E   |+! |*d|E   |?j}H||H |F |?j}I| j-jI|I|:|dd4d }|3jI|@|:|dd4d }|durLi }J|D ]
}KtJ |K |J|K< q.|| |9|:|J}L|LKd5|}|LKd6|}|9t|
d ksg|9d |4krk|9d | j-j3 dkrk|8L  tMrrtNO  qW d   n	1 sw   Y  | P||$|%|&| jQ| jR}| S|| jTjU| jTjV| jTjjW}| X|| j%jU| j%jV}| jY||.|0d7}|d8kr|}M|}Nn||j}| jTjjZsd}=nMt[|j|||jd9}Ot |ts|g| }|du r|}nt |ts|g| }tj\|||jd:}=tj\|||jd:ddddddf }d| | ||O  }|| jTj}| jTj]||=dd4d }M| j^j_|M|d;}M|| j%j}| j%j]|dd4d }P| `|P}N| a  |sS|M|NfS tb|M|Nd<S )>u  
        Function invoked when calling the pipeline for generation.

        Args:
            conditions (`List[LTXVideoCondition], *optional*`):
                The list of frame-conditioning items for the video generation.
            prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
                instead.
            height (`int`, *optional*, defaults to `512`):
                The height in pixels of the generated image. This is set to 480 by default for the best results.
            width (`int`, *optional*, defaults to `768`):
                The width in pixels of the generated image. This is set to 848 by default for the best results.
            num_frames (`int`, *optional*, defaults to `121`):
                The number of video frames to generate
            frame_rate (`float`, *optional*, defaults to `24.0`):
                The frames per second (FPS) of the generated video.
            num_inference_steps (`int`, *optional*, defaults to 40):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            sigmas (`List[float]`, *optional*):
                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
                will be used.
            timesteps (`List[int]`, *optional*):
                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
                passed will be used. Must be in descending order.
            guidance_scale (`float`, *optional*, defaults to `4.0`):
                Guidance scale as defined in [Classifier-Free Diffusion
                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
                `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
                the text `prompt`, usually at the expense of lower image quality.
            guidance_rescale (`float`, *optional*, defaults to 0.0):
                Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
                Flawed](https://huggingface.co/papers/2305.08891) `guidance_scale` is defined as `φ` in equation 16. of
                [Common Diffusion Noise Schedules and Sample Steps are
                Flawed](https://huggingface.co/papers/2305.08891). Guidance rescale factor should fix overexposure when
                using zero terminal SNR.
            noise_scale (`float`, *optional*, defaults to `None`):
                The interpolation factor between random noise and denoised latents at each timestep. Applying noise to
                the `latents` and `audio_latents` before continue denoising. If not set, will be inferred from the
                sigma schedule.
            num_videos_per_prompt (`int`, *optional*, defaults to 1):
                The number of videos to generate per prompt.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for video
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will be generated by sampling using the supplied random `generator`.
            audio_latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for audio
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will be generated by sampling using the supplied random `generator`.
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            prompt_attention_mask (`torch.Tensor`, *optional*):
                Pre-generated attention mask for text embeddings.
            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated negative text embeddings. For PixArt-Sigma this negative prompt should be "". If not
                provided, negative_prompt_embeds will be generated from `negative_prompt` input argument.
            negative_prompt_attention_mask (`torch.FloatTensor`, *optional*):
                Pre-generated attention mask for negative text embeddings.
            decode_timestep (`float`, defaults to `0.0`):
                The timestep at which generated video is decoded.
            decode_noise_scale (`float`, defaults to `None`):
                The interpolation factor between random noise and denoised latents at the decode timestep.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generate image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.ltx.LTX2PipelineOutput`] instead of a plain tuple.
            attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            callback_on_step_end (`Callable`, *optional*):
                A function that calls at the end of each denoising steps during the inference. The function is called
                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
                `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                `._callback_tensor_inputs` attribute of your pipeline class.
            max_sequence_length (`int`, *optional*, defaults to `1024`):
                Maximum sequence length to use with the `prompt`.

        Examples:

        Returns:
            [`~pipelines.ltx.LTX2PipelineOutput`] or `tuple`:
                If `return_dict` is `True`, [`~pipelines.ltx.LTX2PipelineOutput`] is returned, otherwise a `tuple` is
                returned where the first element is a list with the generated images.
        )
r   r   r   r   rk   rl   r   r   r7   r   FNr   r   r   )
r   r   r   r   rk   rl   r   r   r   rK   r   g    .T)additive_maskzGot latents of shape [batch_size, latent_dim, latent_frames, latent_height, latent_width], `latent_num_frames`, `latent_height`, `latent_width` will be inferred.zwGot audio_latents of shape [batch_size, num_channels, audio_num_frames, mel_bins], `audio_num_frames` will be inferred.rn   rX  ru   )rM  rY  r  r	  r   rK   r3   r7   base_image_seq_lenr{   max_image_seq_lenr=   rB   gffffff?rC   gffffff @)rM   rH   )fps)r   )r   )totalr   r   cond_uncondr   audio_hidden_statesencoder_hidden_statesaudio_encoder_hidden_statestimestepaudio_timestepencoder_attention_maskaudio_encoder_attention_maskr   r   r   rt  audio_num_framesvideo_coordsaudio_coordsrf  ro  )re   )ro  r7   rk   )r  latentr
  rN  )rn  )r   audior/   )cr   r
   r	   tensor_inputsr   r]  r`  re  rg  rc  r   r'   rV   r   r   r   r   r*   catr   r   rq   r   r   r+  inforr   r   in_channelsrW  float32r   r   r.   r   roundr   rn   mel_binsr   latent_channelsrZ  r(   linspacerI   rW   getcopydeepcopyr[   r-  orderra  ropeprepare_video_coordsrK   
audio_ropeprepare_audio_coordsr   rb   progress_barr(  rh  r   r   r   cache_contextchunkr^  re   ri   r   rM   steplocalspopupdateXLA_AVAILABLExm	mark_stepr   r   r   r  rm   r   r  r  r  r  timestep_conditioningr   tensordecoder   postprocess_videors   maybe_free_model_hooksr   )Qr   r$  r   r   r   r   r   rk  rJ   rM   rL   r^  re   r	  r   r3   r7   r   rk   r   rl   r   rl  rm  rn  ro  rf  rp  r   r   r   rK   additive_attention_maskconnector_prompt_embedsconnector_audio_prompt_embedsconnector_attention_maskr3  rA  rB  r   video_sequence_lengthrM  r=  rE  
duration_saudio_latents_per_secondr~  r  r  num_channels_latents_audiorH   audio_schedulernum_warmup_stepsrope_interpolation_scaler  r  r  r4  tlatent_model_inputaudio_latent_model_inputrz  video_timestepnoise_pred_videonoise_pred_audionoise_pred_video_uncondnoise_pred_video_textnoise_pred_audio_uncondnoise_pred_audio_textbszsigmadenoised_sampledenoised_sample_conddenoised_latents_condcallback_kwargsr   callback_outputsvideor  r  generated_mel_spectrogramsr/   r/   r0   __call__  s(   






 
 	



	


&
6b




zLTX2ConditionPipeline.__call__)r   ru   r   )r   r{   ru   NN)NTr   NNNNr{   ru   NN)NNNNNNN)r   r   )r   rD   )NN)Nr!  r"  r#  N)Nr   rL  r!  r"  r#  r   NNNN)	r   ru   r   rX  r\   NNNN)@r!   r"   r#   r$   model_cpu_offload_seq_optional_componentsr   r   r   r   r   r   r   r   r   r   r}   staticmethodr*   r+   r   rK   r-   r.   r   r'   r   r   boolr   r   r   r   r  r  	Generatorr  r  r  r  r  r   r   tupler<  rK  rW  rZ  propertyr^  re   r   rb  rd  rf  rh  no_gradr   EXAMPLE_DOC_STRINGdictr   r   r  __classcell__r/   r/   r   r0   rj      s   	7F

M
	

[
D"		
R	
-	

Y	

!







	



rj   )Nr1   )r<   r=   r>   r?   )NNNN)r\   )Cr  rP   dataclassesr   typingr   r   numpyr(   	PIL.Imager%   r*   transformersr   r   r   	callbacksr	   r
   loadersr   r   models.autoencodersr   r   models.transformersr   
schedulersr   utilsr   r   r   utils.torch_utilsr   r   r   pipeline_utilsr   rq   r   pipeline_outputr   rs   r   torch_xla.core.xla_modelcore	xla_modelr  r  
get_loggerr!   r+  r  r   r+   r  r   r;   r-   r.   rI   rK   r'   r[   ri   rj   r/   r/   r/   r0   <module>   s   
4





<