o
    ۷i                  
   @   s  d dl Z d dlZd dlmZmZ d dlZd dlZd dlm	Z	m
Z
mZ ddlmZmZ ddlmZ ddlmZmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z* e rd dl+m,  m-Z. dZ/ndZ/e0e1Z2dZ3	d2dej4dej5dB de6fddZ7				 d3d!e8d"e8d#e9d$e9fd%d&Z:				d4d'e8dB d(e6ej;B dB d)e<e8 dB d*e<e9 dB fd+d,Z=d5d.d/Z>G d0d1 d1e$eeZ?dS )6    N)AnyCallable)Gemma3ForConditionalGenerationGemmaTokenizerGemmaTokenizerFast   )MultiPipelineCallbacksPipelineCallback)PipelineImageInput)FromSingleFileMixinLTX2LoraLoaderMixin)AutoencoderKLLTX2AudioAutoencoderKLLTX2Video)LTX2VideoTransformer3DModel)FlowMatchEulerDiscreteScheduler)is_torch_xla_availableloggingreplace_example_docstring)randn_tensor)VideoProcessor   )DiffusionPipeline   )LTX2TextConnectors)LTX2PipelineOutput)LTX2VocoderTFa  
    Examples:
        ```py
        >>> import torch
        >>> from diffusers import LTX2ImageToVideoPipeline
        >>> from diffusers.pipelines.ltx2.export_utils import encode_video
        >>> from diffusers.utils import load_image

        >>> pipe = LTX2ImageToVideoPipeline.from_pretrained("Lightricks/LTX-2", torch_dtype=torch.bfloat16)
        >>> pipe.enable_model_cpu_offload()

        >>> image = load_image(
        ...     "https://huggingface.co/datasets/a-r-r-o-w/tiny-meme-dataset-captioned/resolve/main/images/8.png"
        ... )
        >>> prompt = "A young girl stands calmly in the foreground, looking directly at the camera, as a house fire rages in the background."
        >>> negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"

        >>> frame_rate = 24.0
        >>> video, audio = pipe(
        ...     image=image,
        ...     prompt=prompt,
        ...     negative_prompt=negative_prompt,
        ...     width=768,
        ...     height=512,
        ...     num_frames=121,
        ...     frame_rate=frame_rate,
        ...     num_inference_steps=40,
        ...     guidance_scale=4.0,
        ...     output_type="np",
        ...     return_dict=False,
        ... )

        >>> encode_video(
        ...     video[0],
        ...     fps=frame_rate,
        ...     audio=audio[0].float().cpu(),
        ...     audio_sample_rate=pipe.vocoder.config.output_sampling_rate,  # should be 24000
        ...     output_path="video.mp4",
        ... )
        ```
sampleencoder_output	generatorsample_modec                 C   sR   t | dr|dkr| j|S t | dr|dkr| j S t | dr%| jS td)Nlatent_distr   argmaxlatentsz3Could not access latents of provided encoder_output)hasattrr    r   moder"   AttributeError)r   r   r    r&   h/home/ubuntu/vllm_env/lib/python3.10/site-packages/diffusers/pipelines/ltx2/pipeline_ltx2_image2video.pyretrieve_latents[   s   

r(               ?ffffff?base_seq_lenmax_seq_len
base_shift	max_shiftc                 C   s,   || ||  }|||  }| | | }|S Nr&   )image_seq_lenr-   r.   r/   r0   mbmur&   r&   r'   calculate_shifti   s   r6   num_inference_stepsdevice	timestepssigmasc                 K   s  |dur|durt d|dur>dtt| jj v }|s(t d| j d| jd||d| | j}t	|}||fS |durpdtt| jj v }|sZt d| j d| jd||d	| | j}t	|}||fS | j|fd
|i| | j}||fS )a  
    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler to get timesteps from.
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
            must be `None`.
        device (`str` or `torch.device`, *optional*):
            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
        timesteps (`list[int]`, *optional*):
            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
            `num_inference_steps` and `sigmas` must be `None`.
        sigmas (`list[float]`, *optional*):
            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
            `num_inference_steps` and `timesteps` must be `None`.

    Returns:
        `tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
        second element is the number of inference steps.
    NzYOnly one of `timesteps` or `sigmas` can be passed. Please choose one to set custom valuesr9   zThe current scheduler class zx's `set_timesteps` does not support custom timestep schedules. Please check whether you are using the correct scheduler.)r9   r8   r:   zv's `set_timesteps` does not support custom sigmas schedules. Please check whether you are using the correct scheduler.)r:   r8   r8   r&   )

ValueErrorsetinspect	signatureset_timesteps
parameterskeys	__class__r9   len)	schedulerr7   r8   r9   r:   kwargsaccepts_timestepsaccept_sigmasr&   r&   r'   retrieve_timestepsw   s2   rH           c                 C   sX   |j ttd|jdd}| j ttd| jdd}| ||  }|| d| |   } | S )a  
    Rescales `noise_cfg` tensor based on `guidance_rescale` to improve image quality and fix overexposure. Based on
    Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
    Flawed](https://huggingface.co/papers/2305.08891).

    Args:
        noise_cfg (`torch.Tensor`):
            The predicted noise tensor for the guided diffusion process.
        noise_pred_text (`torch.Tensor`):
            The predicted noise tensor for the text-guided diffusion process.
        guidance_rescale (`float`, *optional*, defaults to 0.0):
            A rescale factor applied to the noise predictions.

    Returns:
        noise_cfg (`torch.Tensor`): The rescaled noise prediction tensor.
    r   Tdimkeepdim)stdlistrangendim)	noise_cfgnoise_pred_textguidance_rescalestd_textstd_cfgnoise_pred_rescaledr&   r&   r'   rescale_noise_cfg   s
   rW   c                =       sv  e Zd ZdZdZg Zg dZdedede	de
deeB d	ed
edef fddZe			ddejdejdeejB dedededejfddZ					ddeee B dedededejdB d ejdB fd!d"Z		#									ddeee B d$eee B dB d%eded&ejdB d'ejdB d(ejdB d)ejdB dededejdB d ejdB fd*d+Z					dd,d-Zedd.ejd/ed0edejfd1d2Ze	dd.ejd3ed4ed5ed/ed0edejfd6d7Z e	8dd.ejd9ejd:ejd;edejf
d<d=Z!e	8dd.ejd9ejd:ejd;edejf
d>d?Z"e	dd.ejd@eejB dAej#dB fdBdCZ$e	dd.ejd/edB d0edB dejfdDdEZ%e		dd.ejdFedGed/edB d0edB dejfdHdIZ&ed.ejd9ejd:ejfdJdKZ'ed.ejd9ejd:ejfdLdMZ(			N	O	P	Q	R				ddSejdB dTedUed4ed5ed3ed@ed ejdB dejdB dAej#dB d.ejdB dejfdVdWZ)				X	R				ddTedUedYedGed@ed ejdB dejdB dAej#dB d.ejdB dejfdZd[Z*e+d\d] Z,e+d^d_ Z-e+d`da Z.e+dbdc Z/e+ddde Z0e+dfdg Z1e+dhdi Z2e3 e4e5ddddOdjdkdldmdddndRdRdddddddddRddod#ddd.gdfdSe6deee B d$eee B dB d4ed5ed3edpedqedree dB dsee dB dtedued@ededAej#eej# B dB d.ejdB dvejdB d&ejdB d(ejdB d'ejdB d)ejdB dweee B dxeee B dB dyedzed{e7ee8f dB d|e9eegdf dB d}ee def:d~dZ:  Z;S )LTX2ImageToVideoPipelinezs
    Pipeline for image-to-video generation.

    Reference: https://github.com/Lightricks/LTX-Video

    TODO
    z>text_encoder->connectors->transformer->vae->audio_vae->vocoder)r"   prompt_embedsnegative_prompt_embedsrD   vae	audio_vaetext_encoder	tokenizer
connectorstransformervocoderc	           	   
      sR  t    | j||||||||d t| dd d ur| jjnd| _t| dd d ur-| jjnd| _t| dd d ur<| j	j
nd| _t| dd d urK| j	jnd| _t| dd d ur[| jjjnd| _t| dd urj| jjjnd| _t| dd d urz| j	jjnd	| _t| dd d ur| j	jjnd
| _t| jdd| _t| dd d ur| jj| _d S d| _d S )N)r[   r\   r]   r^   r_   r`   ra   rD   r[          r\      r`   r   i>     bilinear)vae_scale_factorresampler^      )super__init__register_modulesgetattrr[   spatial_compression_ratiovae_spatial_compression_ratiotemporal_compression_ratiovae_temporal_compression_ratior\   mel_compression_ratioaudio_vae_mel_compression_ratio$audio_vae_temporal_compression_ratior`   config
patch_sizetransformer_spatial_patch_sizepatch_size_ttransformer_temporal_patch_sizesample_rateaudio_sampling_ratemel_hop_lengthaudio_hop_lengthr   video_processorr^   model_max_lengthtokenizer_max_length)	selfrD   r[   r\   r]   r^   r_   r`   ra   rB   r&   r'   rk      s@   
z!LTX2ImageToVideoPipeline.__init__leftrc   ư>text_hidden_statessequence_lengthsr8   padding_sidescale_factorepsreturnc                 C   s^  | j \}}}}	| j}
tj||dd}|dkr#||dddf k }n|dkr6||dddf  }||k}ntd| |ddddddf }| | d}|| |ddd}|jd	d
d||  }| | t	dj
d	d
d}| | t	djd	d
d}| | || |  }|| }|d}|ddd||	 }|| d}|j|
d}|S )a&  
        Packs and normalizes text encoder hidden states, respecting padding. Normalization is performed per-batch and
        per-layer in a masked fashion (only over non-padded positions).

        Args:
            text_hidden_states (`torch.Tensor` of shape `(batch_size, seq_len, hidden_dim, num_layers)`):
                Per-layer hidden_states from a text encoder (e.g. `Gemma3ForConditionalGeneration`).
            sequence_lengths (`torch.Tensor of shape `(batch_size,)`):
                The number of valid (non-padded) tokens for each batch instance.
            device: (`str` or `torch.device`, *optional*):
                torch device to place the resulting embeddings on
            padding_side: (`str`, *optional*, defaults to `"left"`):
                Whether the text tokenizer performs padding on the `"left"` or `"right"`.
            scale_factor (`int`, *optional*, defaults to `8`):
                Scaling factor to multiply the normalized hidden states by.
            eps (`float`, *optional*, defaults to `1e-6`):
                A small positive value for numerical stability when performing normalization.

        Returns:
            `torch.Tensor` of shape `(batch_size, seq_len, hidden_dim * num_layers)`:
                Normed and flattened text encoder hidden states.
        )r8   r   rightNr   z,padding_side must be 'left' or 'right', got rI   r   )r   r   TrJ   infz-infr   dtype)shaper   torcharange	unsqueezer;   masked_fillviewsumfloataminamaxflattensqueezeexpandto)r   r   r8   r   r   r   
batch_sizeseq_len
hidden_dim
num_layersoriginal_dtypetoken_indicesmaskstart_indicesmasked_text_hidden_statesnum_valid_positionsmasked_meanx_minx_maxnormalized_hidden_states	mask_flatr&   r&   r'   _pack_text_embeds  s,    

z*LTX2ImageToVideoPipeline._pack_text_embedsr   ri   Npromptnum_videos_per_promptmax_sequence_lengthr   c                 C   sF  |p| j }|p
| jj}t|tr|gn|}t|}t| dddur1d| j_| jj	du r1| jj
| j_	dd |D }| j|d|dddd	}|j}	|j}
|	|}	|
|}
| j|	|
dd
}|j}tj|dd}|
jdd}| j|||| jj|d}|j|d}|j\}}}|d|d}||| |d}|
|d}
|
|d}
||
fS )a  
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `list[str]`, *optional*):
                prompt to be encoded
            device: (`str` or `torch.device`):
                torch device to place the resulting embeddings on
            dtype: (`torch.dtype`):
                torch dtype to cast the prompt embeds to
            max_sequence_length (`int`, defaults to 1024): Maximum sequence length to use for the prompt.
        r^   Nr   c                 S   s   g | ]}|  qS r&   )strip).0pr&   r&   r'   
<listcomp>x  s    zELTX2ImageToVideoPipeline._get_gemma_prompt_embeds.<locals>.<listcomp>
max_lengthTpt)paddingr   
truncationadd_special_tokensreturn_tensors)	input_idsattention_maskoutput_hidden_statesr   rK   )r8   r   r   r   r   )_execution_devicer]   r   
isinstancestrrC   rm   r^   r   	pad_token	eos_tokenr   r   r   hidden_statesr   stackr   r   r   repeatr   )r   r   r   r   r   r8   r   r   text_inputstext_input_idsprompt_attention_masktext_encoder_outputstext_encoder_hidden_statesr   rY   _r   r&   r&   r'   _get_gemma_prompt_embedsW  sR   


z1LTX2ImageToVideoPipeline._get_gemma_prompt_embedsTnegative_promptdo_classifier_free_guidancerY   rZ   r   negative_prompt_attention_maskc              
   C   s  |p| j }t|tr|gn|}|durt|}n|jd }|du r.| j|||	|
||d\}}|r|du r|p7d}t|trB||g n|}|dur_t|t|ur_tdt| dt| d|t|krxtd| d	t| d
| d	| d	| j|||	|
||d\}}||||fS )a"  
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `list[str]`, *optional*):
                prompt to be encoded
            negative_prompt (`str` or `list[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
                Whether to use classifier free guidance or not.
            num_videos_per_prompt (`int`, *optional*, defaults to 1):
                Number of videos that should be generated per prompt. torch device to place the resulting embeddings on
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            device: (`torch.device`, *optional*):
                torch device
            dtype: (`torch.dtype`, *optional*):
                torch dtype
        Nr   )r   r   r   r   r8   r    z?`negative_prompt` should be the same type to `prompt`, but got z != .z`negative_prompt`: z has batch size z, but `prompt`: zT. Please make sure that passed `negative_prompt` matches the batch size of `prompt`.)	r   r   r   rC   r   r   type	TypeErrorr;   )r   r   r   r   r   rY   rZ   r   r   r   r   r8   r   r   r&   r&   r'   encode_prompt  sP   
)


	
	z&LTX2ImageToVideoPipeline.encode_promptc	           	         st  |d dks|d dkrt d| d| d|d ur8t fdd|D s8t d j d	 fd
d|D  |d urK|d urKt d| d| d|d u rW|d u rWt d|d urnt|tsnt|tsnt dt| |d urz|d u rzt d|d ur|d u rt d|d ur|d ur|j|jkrt d|j d|j d|j|jkrt d|j d|j dd S d S d S )Nrb   r   z8`height` and `width` have to be divisible by 32 but are z and r   c                 3   s    | ]}| j v V  qd S r1   _callback_tensor_inputsr   kr   r&   r'   	<genexpr>  s    

z8LTX2ImageToVideoPipeline.check_inputs.<locals>.<genexpr>z2`callback_on_step_end_tensor_inputs` has to be in z, but found c                    s   g | ]	}| j vr|qS r&   r   r   r   r&   r'   r   
  s    z9LTX2ImageToVideoPipeline.check_inputs.<locals>.<listcomp>zCannot forward both `prompt`: z and `prompt_embeds`: z2. Please make sure to only forward one of the two.zeProvide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined.z2`prompt` has to be of type `str` or `list` but is zEMust provide `prompt_attention_mask` when specifying `prompt_embeds`.zWMust provide `negative_prompt_attention_mask` when specifying `negative_prompt_embeds`.zu`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but got: `prompt_embeds` z != `negative_prompt_embeds` z`prompt_attention_mask` and `negative_prompt_attention_mask` must have the same shape when passed directly, but got: `prompt_attention_mask` z% != `negative_prompt_attention_mask` )r;   allr   r   r   rN   r   r   )	r   r   heightwidth"callback_on_step_end_tensor_inputsrY   rZ   r   r   r&   r   r'   check_inputs  sR   z%LTX2ImageToVideoPipeline.check_inputsr"   rv   rx   c              
   C   sl   | j \}}}}}|| }|| }	|| }
| |d|||	||
|} | dddddddd	dd	dd} | S )
Nr   r   r   rd      r   r         )r   reshapepermuter   )r"   rv   rx   r   num_channels
num_framesr   r   post_patch_num_framespost_patch_heightpost_patch_widthr&   r&   r'   _pack_latents-  s    (
z&LTX2ImageToVideoPipeline._pack_latentsr   r   r   c              
   C   sV   |  d}| ||||d|||} | dddddddd	dd	dddd} | S )
Nr   r   rd   r   r   r   r   r   r   )sizer   r   r   )r"   r   r   r   rv   rx   r   r&   r&   r'   _unpack_latentsE  s   
0z(LTX2ImageToVideoPipeline._unpack_latents      ?latents_meanlatents_stdscaling_factorc                 C   sP   | ddddd| j| j}| ddddd| j| j}| | | | } | S Nr   r   r   r   r8   r   r"   r   r   r   r&   r&   r'   _normalize_latentsR  s   z+LTX2ImageToVideoPipeline._normalize_latentsc                 C   sP   | ddddd| j| j}| ddddd| j| j}| | | | } | S r   r   r   r&   r&   r'   _denormalize_latents\  s   z-LTX2ImageToVideoPipeline._denormalize_latentsnoise_scaler   c                 C   s.   t | j|| j| jd}|| d| |   }|S )Nr   r8   r   r   )r   r   r8   r   )r"   r   r   noisenoised_latentsr&   r&   r'   _create_noised_stateg  s   z-LTX2ImageToVideoPipeline._create_noised_statec           	      C   s   |d ur5|d ur5| j \}}}}|| }|| }| |d||||} | dddddddddd} | S | dddd} | S )Nr   r   r   rd   r   r   r   )r   r   r   r   	transpose)	r"   rv   rx   r   r   latent_lengthlatent_mel_binspost_patch_latent_lengthpost_patch_mel_binsr&   r&   r'   _pack_audio_latentsp  s   $z,LTX2ImageToVideoPipeline._pack_audio_latentsr   num_mel_binsc                 C   sr   |d ur+|d ur+|  d}| |||d||} | dddddddddd} | S | dd|fdd} | S )Nr   r   r   r   rd   r   r   )r   r   r   r   	unflattenr   )r"   r   r  rv   rx   r   r&   r&   r'   _unpack_audio_latents  s   
$z.LTX2ImageToVideoPipeline._unpack_audio_latentsc                 C   s,   | | j| j}| | j| j}| | | S r1   r   r8   r   r"   r   r   r&   r&   r'   _normalize_audio_latents     z1LTX2ImageToVideoPipeline._normalize_audio_latentsc                 C   s,   | | j| j}| | j| j}| | | S r1   r  r  r&   r&   r'   _denormalize_audio_latents  r  z3LTX2ImageToVideoPipeline._denormalize_audio_latents           rI   imager   num_channels_latentsc                    sz  |j  }|j  }|d j d }|||||f}|d|||f}|d ur|jdkrm|j\}}}}}|d|||f}||}d|d d d d df< |jjjjjj	j
}||d|   }|jj}n||}d|d d d d df< |jjd}|jdks|jd d |jkrtd|j d	|j|f  d
|j|	|d|fS t trt |krtdt  d| d fddt|D }n
 fddD }tj|dd|}|jjjj}|dd|dd}tj||	|d}d|d d d d df< t| |	|d}|| |d|   }|jjd}|jj}||fS )Nr   r   r   r   r   r   r   $Provided `latents` tensor has shape z, but the expected shape is r   r8   r   /You have passed a list of generators of length +, but requested an effective batch size of @. Make sure the batch size matches the length of the generators.c                    s4   g | ]}t j| d d | dqS r   r   r!   r(   r[   encoder   )r   ir   r  r   r&   r'   r     s    &z<LTX2ImageToVideoPipeline.prepare_latents.<locals>.<listcomp>c                    s,   g | ]}t j|d d dqS r  r  )r   img)r   r   r&   r'   r     s     r   r   )ro   rq   rP   r   	new_zerosr   r[   r   r   ru   r   r   r   rw   ry   r   r;   r   r   rN   rC   rO   r   catr   zerosr   )r   r  r   r  r   r   r   r   r   r8   r   r"   r   
mask_shaper   conditioning_maskinit_latentsr   r&   r  r'   prepare_latents  sv   









z(LTX2ImageToVideoPipeline.prepare_latents@   audio_latent_lengthc
                 C   s   |	d ur5|	j dkr| |	}	|	j dkrtd|	j d| |	| jj| jj}	| |	||}	|	j	||dS || j
 }
||||
f}t|trXt||krXtdt| d| dt||||d	}	| |	}	|	S )
Nrd   r   r  z@, but the expected shape is [batch_size, num_seq, num_features].r  r  r  r  r   )rP   r  r;   r   r  r\   r   r   r   r   rs   r   rN   rC   r   )r   r   r  r#  r  r   r   r8   r   r"   r   r   r&   r&   r'   prepare_audio_latents  s*   




z.LTX2ImageToVideoPipeline.prepare_audio_latentsc                 C      | j S r1   _guidance_scaler   r&   r&   r'   guidance_scale&     z'LTX2ImageToVideoPipeline.guidance_scalec                 C   r%  r1   )_guidance_rescaler   r&   r&   r'   rS   *  r)  z)LTX2ImageToVideoPipeline.guidance_rescalec                 C   s
   | j dkS )Nr   r&  r   r&   r&   r'   r   .  s   
z4LTX2ImageToVideoPipeline.do_classifier_free_guidancec                 C   r%  r1   )_num_timestepsr   r&   r&   r'   num_timesteps2  r)  z&LTX2ImageToVideoPipeline.num_timestepsc                 C   r%  r1   )_current_timestepr   r&   r&   r'   current_timestep6  r)  z)LTX2ImageToVideoPipeline.current_timestepc                 C   r%  r1   )_attention_kwargsr   r&   r&   r'   attention_kwargs:  r)  z)LTX2ImageToVideoPipeline.attention_kwargsc                 C   r%  r1   )
_interruptr   r&   r&   r'   	interrupt>  r)  z"LTX2ImageToVideoPipeline.interrupti   y   g      8@(   g      @pil
frame_rater7   r:   r9   r(  rS   audio_latentsdecode_timestepdecode_noise_scaleoutput_typereturn_dictr0  callback_on_step_endr   c           M      C   sV	  t |ttfr
|j}| j||||||||d || _|| _|| _d| _d| _	|dur2t |t
r2d}n|dur@t |tr@t|}n|jd }| j}| j||| j|||||||d
\}}}}| jrqtj||gdd}tj||gdd}d||j d } | j|| d	d
\}!}"}#|d | j d }$|| j }%|| j }&|dur|jdkrtd |j\}'}'}$}%}&n|jdkrtd|j d n	td|j d|$|% |& }(|du r| jj|||d}|j||jd}| jj j!})| "||| |)||||tj#|||\}}*| jrt|*|*g}*|| }+| j$| j% t&| j' },t(|+|, }-|durO|jdkr5td |j\}'}'}-}'n|jdkrFtd|j d n	td|j dt)| dddur]| j*j j+nd}.|.| j, }/t)| dddurr| j*j j-nd}0| j.|| |0|-|.|tj#|||d	}|	du rt/0dd| |n|	}	t1|(| j2j 3dd | j2j 3d!d"| j2j 3d#d$| j2j 3d%d&}1t45| j2}2t6|2|||
|	|1d'\}'}'t6| j2|||
|	|1d'\}
}t7t|
|| j2j8  d}3t|
| _9| j| | j| jf}4| jj:j;|jd |$|%|&|j<|d(}5| jj=>|jd |-|j<}6| jr*|5?d)d*|5jd   }5|6?d)d*|6jd   }6| j@|d+}7tA|
D ]\}8}9| jBrAq6|9| _	| jrPt|gd, n|}:|:|j}:| jrdt|gd, n|};|;|j};|9C|:jd }<|<Dd-d|*  }=| jEd.C | jdIi d/|:d0|;d1|!d2|"d3|=d4|<d5|#d6|#d7|$d8|%d9|&d:|d;|-d<|5d=|6d>|d?d\}>}?W d   n	1 sw   Y  |>& }>|?& }?| jr|>Fd,\}@}A|@| jG|A|@   }>|?Fd,\}B}C|B| jG|C|B   }?| jHdkrtI|>|A| jHd@}>tI|?|C| jHd@}?| J|>|$|%|&| jK| jL}>| J||$|%|&| jK| jL}|>ddddddf }>|ddddddf }D| j2jM|>|9|DddAd }Etj|ddddddf |Egd,d}| N|| jK| jL}|2jM|?|9|ddAd }|duri }F|D ]
}GtO |G |F|G< q|| |8|9|F}H|HPdB|}|HPdC|}|8t|
d ks|8d |3kr|8d | j2j8 dkr|7Q  tRrtST  q6W d   n	1 sw   Y  | J||$|%|&| jK| jL}| U|| jVjW| jVjX| jVj jY}| Z|| j*jW| j*jX}| j[||-|/dD}|dEkr|}I|}Jn||j}| jVj j\sd}<nMt]|j|||jdF}Kt |ts2|g| }|du r:|}nt |tsE|g| }tj^|||jd}<tj^|||jdddddddf }d| | ||K  }|| jVj}| jVj_||<ddAd }I| jj`|I|dG}I|| j*j}| j*j_|ddAd }L| a|L}J| b  |s|I|JfS tc|I|JdHS )Ju  
        Function invoked when calling the pipeline for generation.

        Args:
            image (`PipelineImageInput`):
                The input image to condition the generation on. Must be an image, a list of images or a `torch.Tensor`.
            prompt (`str` or `list[str]`, *optional*):
                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
                instead.
            height (`int`, *optional*, defaults to `512`):
                The height in pixels of the generated image. This is set to 480 by default for the best results.
            width (`int`, *optional*, defaults to `768`):
                The width in pixels of the generated image. This is set to 848 by default for the best results.
            num_frames (`int`, *optional*, defaults to `121`):
                The number of video frames to generate
            frame_rate (`float`, *optional*, defaults to `24.0`):
                The frames per second (FPS) of the generated video.
            num_inference_steps (`int`, *optional*, defaults to 40):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            sigmas (`List[float]`, *optional*):
                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
                will be used.
            timesteps (`List[int]`, *optional*):
                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
                passed will be used. Must be in descending order.
            guidance_scale (`float`, *optional*, defaults to `4.0`):
                Guidance scale as defined in [Classifier-Free Diffusion
                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
                `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
                the text `prompt`, usually at the expense of lower image quality.
            guidance_rescale (`float`, *optional*, defaults to 0.0):
                Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
                Flawed](https://huggingface.co/papers/2305.08891) `guidance_scale` is defined as `φ` in equation 16. of
                [Common Diffusion Noise Schedules and Sample Steps are
                Flawed](https://huggingface.co/papers/2305.08891). Guidance rescale factor should fix overexposure when
                using zero terminal SNR.
            noise_scale (`float`, *optional*, defaults to `0.0`):
                The interpolation factor between random noise and denoised latents at each timestep. Applying noise to
                the `latents` and `audio_latents` before continue denoising.
            num_videos_per_prompt (`int`, *optional*, defaults to 1):
                The number of videos to generate per prompt.
            generator (`torch.Generator` or `list[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for video
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will be generated by sampling using the supplied random `generator`.
            audio_latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for audio
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will be generated by sampling using the supplied random `generator`.
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            prompt_attention_mask (`torch.Tensor`, *optional*):
                Pre-generated attention mask for text embeddings.
            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated negative text embeddings. For PixArt-Sigma this negative prompt should be "". If not
                provided, negative_prompt_embeds will be generated from `negative_prompt` input argument.
            negative_prompt_attention_mask (`torch.FloatTensor`, *optional*):
                Pre-generated attention mask for negative text embeddings.
            decode_timestep (`float`, defaults to `0.0`):
                The timestep at which generated video is decoded.
            decode_noise_scale (`float`, defaults to `None`):
                The interpolation factor between random noise and denoised latents at the decode timestep.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generate image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.ltx.LTX2PipelineOutput`] instead of a plain tuple.
            attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            callback_on_step_end (`Callable`, *optional*):
                A function that calls at the end of each denoising steps during the inference. The function is called
                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
                `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                `._callback_tensor_inputs` attribute of your pipeline class.
            max_sequence_length (`int`, *optional*, defaults to `1024`):
                Maximum sequence length to use with the `prompt`.

        Examples:

        Returns:
            [`~pipelines.ltx.LTX2PipelineOutput`] or `tuple`:
                If `return_dict` is `True`, [`~pipelines.ltx.LTX2PipelineOutput`] is returned, otherwise a `tuple` is
                returned where the first element is a list with the generated images.
        )r   r   r   r   rY   rZ   r   r   FNr   r   )
r   r   r   r   rY   rZ   r   r   r   r8   r   g    .T)additive_maskr   zGot latents of shape [batch_size, latent_dim, latent_frames, latent_height, latent_width], `latent_num_frames`, `latent_height`, `latent_width` will be inferred.r   z,You have supplied packed `latents` of shape zp, so the latent dims cannot be inferred. Make sure the supplied `height`, `width`, and `num_frames` are correct.r  z, but the expected shape is either [batch_size, seq_len, num_features] or [batch_size, latent_dim, latent_frames, latent_height, latent_width].)r   r   r  rd   zsGot audio_latents of shape [batch_size, num_channels, audio_length, mel_bins], `audio_num_frames` will be inferred.z2You have supplied packed `audio_latents` of shape zj, so the latent dims cannot be inferred. Make sure the supplied `num_frames` and `frame_rate` are correct.z*Provided `audio_latents` tensor has shape z}, but the expected shape is either [batch_size, seq_len, num_features] or [batch_size, num_channels, audio_length, mel_bins].r\   r"  rc   )r  r#  r  r   r   r8   r   r"   r   base_image_seq_lenri   max_image_seq_lenr*   r/   gffffff?r0   gffffff @)r:   r5   )fps)r   )r   )totalr   r   cond_uncondr   audio_hidden_statesencoder_hidden_statesaudio_encoder_hidden_statestimestepaudio_timestepencoder_attention_maskaudio_encoder_attention_maskr   r   r   r@  audio_num_framesvideo_coordsaudio_coordsr0  r;  )rS   )r;  r"   rY   )r  latentr   )r:  )framesaudior&   )dr   r	   r   tensor_inputsr   r'  r*  r/  r1  r-  r   rN   rC   r   r   r   r   r   r  r   r   r_   rq   ro   rP   loggerinfowarningr;   r~   
preprocessr`   ru   in_channelsr!  float32r{   r}   r   rt   roundrm   r\   mel_binsrs   latent_channelsr$  nplinspacer6   rD   getcopydeepcopyrH   maxorderr+  ropeprepare_video_coordsr8   
audio_ropeprepare_audio_coordsr   progress_bar	enumerater2  r   r   cache_contextchunkr(  rS   rW   r   rw   ry   stepr   localspopupdateXLA_AVAILABLExm	mark_stepr   r[   r   r   r   r	  r  timestep_conditioningr   tensordecodepostprocess_videora   maybe_free_model_hooksr   )Mr   r  r   r   r   r   r   r6  r7   r:   r9   r(  rS   r   r   r   r"   r7  rY   r   rZ   r   r8  r9  r:  r;  r0  r<  r   r   r   r8   additive_attention_maskconnector_prompt_embedsconnector_audio_prompt_embedsconnector_attention_masklatent_num_frameslatent_heightlatent_widthr   video_sequence_lengthr  r  
duration_saudio_latents_per_secondrJ  r  r   num_channels_latents_audior5   audio_schedulernum_warmup_stepsrope_interpolation_scalerK  rL  re  r  tlatent_model_inputaudio_latent_model_inputrF  video_timestepnoise_pred_videonoise_pred_audionoise_pred_video_uncondnoise_pred_video_textnoise_pred_audio_uncondnoise_pred_audio_textnoise_latentspred_latentscallback_kwargsr   callback_outputsvideorO  r   generated_mel_spectrogramsr&   r&   r'   __call__B  s`   







 
 	



	

	(

6l




z!LTX2ImageToVideoPipeline.__call__)r   rc   r   )r   ri   rc   NN)NTr   NNNNri   rc   NN)NNNNN)r   r   )r   r1   )NN)Nr   r
  r  r  r  rI   NNNN)	r   rc   r   r"  rI   NNNN)<__name__
__module____qualname____doc__model_cpu_offload_seq_optional_componentsr   r   r   r   r   r   r   r   r   r   rk   staticmethodr   Tensorr   r8   intr   r   rN   r   r   boolr   r   r   r   r   r   	Generatorr   r  r  r  r	  r!  r$  propertyr(  rS   r   r,  r.  r0  r2  no_gradr   EXAMPLE_DOC_STRINGr
   dictr   r   r  __classcell__r&   r&   r   r'   rX      s   	8F

M
	

\
5"			

Y	

'







	



rX   )Nr   )r)   r*   r+   r,   )NNNN)rI   )@r]  r=   typingr   r   numpyrZ  r   transformersr   r   r   	callbacksr   r	   image_processorr
   loadersr   r   models.autoencodersr   r   models.transformersr   
schedulersr   utilsr   r   r   utils.torch_utilsr   r~   r   pipeline_utilsr   r_   r   pipeline_outputr   ra   r   torch_xla.core.xla_modelcore	xla_modelrn  rm  
get_loggerr  rQ  r  r  r  r   r(   r  r   r6   r8   rN   rH   rW   rX   r&   r&   r&   r'   <module>   sz   
-





<