o
    Gi,N                     @   s   d dl Z ddlmZ ddlmZ ddlmZmZ ddlm	Z	 ddl
mZ dd	lmZ dd
lmZ ddlmZ eeZdZ	dde jde jdB defddZG dd deZdS )    N   )PipelineImageInput)AutoencoderKLLTX2Video)
get_loggerreplace_example_docstring)randn_tensor)VideoProcessor   )LTXPipelineOutput)DiffusionPipeline   )LTX2LatentUpsamplerModela  
    Examples:
        ```py
        >>> import torch
        >>> from diffusers import LTX2ImageToVideoPipeline, LTX2LatentUpsamplePipeline
        >>> from diffusers.pipelines.ltx2.export_utils import encode_video
        >>> from diffusers.pipelines.ltx2.latent_upsampler import LTX2LatentUpsamplerModel
        >>> from diffusers.utils import load_image

        >>> pipe = LTX2ImageToVideoPipeline.from_pretrained("Lightricks/LTX-2", torch_dtype=torch.bfloat16)
        >>> pipe.enable_model_cpu_offload()

        >>> image = load_image(
        ...     "https://huggingface.co/datasets/a-r-r-o-w/tiny-meme-dataset-captioned/resolve/main/images/8.png"
        ... )
        >>> prompt = "A young girl stands calmly in the foreground, looking directly at the camera, as a house fire rages in the background."
        >>> negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"

        >>> frame_rate = 24.0
        >>> video, audio = pipe(
        ...     image=image,
        ...     prompt=prompt,
        ...     negative_prompt=negative_prompt,
        ...     width=768,
        ...     height=512,
        ...     num_frames=121,
        ...     frame_rate=frame_rate,
        ...     num_inference_steps=40,
        ...     guidance_scale=4.0,
        ...     output_type="pil",
        ...     return_dict=False,
        ... )

        >>> latent_upsampler = LTX2LatentUpsamplerModel.from_pretrained(
        ...     "Lightricks/LTX-2", subfolder="latent_upsampler", torch_dtype=torch.bfloat16
        ... )
        >>> upsample_pipe = LTX2LatentUpsamplePipeline(vae=pipe.vae, latent_upsampler=latent_upsampler)
        >>> upsample_pipe.vae.enable_tiling()
        >>> upsample_pipe.to(device="cuda", dtype=torch.bfloat16)

        >>> video = upsample_pipe(
        ...     video=video,
        ...     width=768,
        ...     height=512,
        ...     output_type="np",
        ...     return_dict=False,
        ... )[0]

        >>> encode_video(
        ...     video[0],
        ...     fps=frame_rate,
        ...     audio=audio[0].float().cpu(),
        ...     audio_sample_rate=pipe.vocoder.config.output_sampling_rate,  # should be 24000
        ...     output_path="video.mp4",
        ... )
        ```
sampleencoder_output	generatorsample_modec                 C   sR   t | dr|dkr| j|S t | dr|dkr| j S t | dr%| jS td)Nlatent_distr   argmaxlatentsz3Could not access latents of provided encoder_output)hasattrr   r   moder   AttributeError)r   r   r    r   j/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/pipelines/ltx2/pipeline_ltx2_latent_upsample.pyretrieve_latents[   s   

r   c                !       s  e Zd ZdZdededdf fddZ					
							d9dejdB de	de	de	de	de	de	dej
dB dejdB dejdB dejdB dejfddZd:dejdejdefddZdejdedejfdd Ze	d:dejd!ejd"ejd#edejf
d$d%Ze	d;dejde	de	de	d&e	d'e	dejfd(d)Zd*d+ Ze ee		
							,	-		-	-		.	/d<dee dB de	de	de	de	de	dejdB d0ed1eee B d2eee B dB d3ed4edejeej B dB d5edB d6efd7d8Z  ZS )=LTX2LatentUpsamplePipelinezvae->latent_upsamplervaelatent_upsamplerreturnNc                    sf   t    | j||d t| dd d ur| jjnd| _t| dd d ur'| jjnd| _t	| jd| _
d S )N)r   r   r          )vae_scale_factor)super__init__register_modulesgetattrr   spatial_compression_ratiovae_spatial_compression_ratiotemporal_compression_ratiovae_temporal_compression_ratior   video_processor)selfr   r   	__class__r   r   r#   k   s   
z#LTX2LatentUpsamplePipeline.__init__r   y         video
batch_size
num_framesheightwidthspatial_patch_sizetemporal_patch_sizedtypedevicer   r   c                    s   |d ur-|j dkr&|d j d }|j }|j }||||||}|j|	|dS j|	jjdt tr\t	 |krNt
dt	  d| d fddt|D }n
 fd	dD }tj|d
d|}|S )Nr   r   r9   r8   z/You have passed a list of generators of length z+, but requested an effective batch size of z@. Make sure the batch size matches the length of the generators.c                    s,   g | ]}t j| d  | qS r   r   r   encode	unsqueeze).0ir   r+   r1   r   r   
<listcomp>   s     z>LTX2LatentUpsamplePipeline.prepare_latents.<locals>.<listcomp>c                    s$   g | ]}t j|d  qS r;   r<   )r?   vid)r   r+   r   r   rB      s   $ r   dim)ndimr)   r'   _unpack_latentstor   r8   
isinstancelistlen
ValueErrorrangetorchcat)r+   r1   r2   r3   r4   r5   r6   r7   r8   r9   r   r   latent_num_frameslatent_heightlatent_widthinit_latentsr   rA   r   prepare_latents|   s.   



z*LTX2LatentUpsamplePipeline.prepare_latents      ?reference_latentsfactorc                 C   s   |  }t|dD ]8}t|dD ].}tj|||f dd\}}tj|||f dd\}	}
|||f |
 |	 | | |||f< qqt|||}|S )a5  
        Applies Adaptive Instance Normalization (AdaIN) to a latent tensor based on statistics from a reference latent
        tensor.

        Args:
            latent (`torch.Tensor`):
                Input latents to normalize
            reference_latents (`torch.Tensor`):
                The reference latents providing style statistics.
            factor (`float`):
                Blending factor between original and transformed latent. Range: -10.0 to 10.0, Default: 1.0

        Returns:
            torch.Tensor: The transformed latent tensor
        r   r   NrD   )clonerM   sizerN   std_meanlerp)r+   r   rV   rW   resultr@   cr_sdr_meani_sdi_meanr   r   r   adain_filter_latent   s   &z.LTX2LatentUpsamplePipeline.adain_filter_latentcompressionc                 C   sD   |d }t |}t d| |d  }dd| |  }|| }|S )aL  
        Applies a non-linear tone-mapping function to latent values to reduce their dynamic range in a perceptually
        smooth way using a sigmoid-based compression.

        This is useful for regularizing high-variance latents or for conditioning outputs during generation, especially
        when controlling dynamic behavior with a `compression` factor.

        Args:
            latents : torch.Tensor
                Input latent tensor with arbitrary shape. Expected to be roughly in [-1, 1] or [0, 1] range.
            compression : float
                Compression strength in the range [0, 1].
                - 0.0: No tone-mapping (identity transform)
                - 1.0: Full compression effect

        Returns:
            torch.Tensor
                The tone-mapped latent tensor of the same shape as input.
        g      ?g      @rU   g?)rN   abssigmoid)r+   r   rc   scale_factorabs_latentssigmoid_termscalesfilteredr   r   r   tone_map_latents   s   
z+LTX2LatentUpsamplePipeline.tone_map_latentslatents_meanlatents_stdscaling_factorc                 C   sP   | ddddd| j| j}| ddddd| j| j}| | | | } | S )Nr   )viewrH   r9   r8   )r   rl   rm   rn   r   r   r   _denormalize_latents   s   z/LTX2LatentUpsamplePipeline._denormalize_latents
patch_sizepatch_size_tc              
   C   sV   |  d}| ||||d|||} | dddddddd	dd	dddd} | S )
Nr   ro      r      r	      r      )rY   reshapepermuteflatten)r   r3   r4   r5   rr   rs   r2   r   r   r   rG      s   
0z*LTX2LatentUpsamplePipeline._unpack_latentsc                 C   s   || j  dks|| j  dkrtd| d| d|d ur%|d ur%td|d u r1|d u r1tdd|  kr>dksCtd tdd S )	Nr   z8`height` and `width` have to be divisible by 32 but are z and .z1Only one of `video` or `latents` can be provided.z/One of `video` or `latents` has to be provided.r   z8`tone_map_compression_ratio` must be in the range [0, 1])r'   rL   )r+   r1   r4   r5   r   tone_map_compression_ratior   r   r   check_inputs   s   z'LTX2LatentUpsamplePipeline.check_inputsF        pilTlatents_normalizeddecode_timestepdecode_noise_scaleadain_factorr|   output_typereturn_dictc                 C   sD  | j |||||d |durd}n|jd }| j}|dur[t|}|| j dkrJ|| j | j d }|d| }td| j dt| d| d | jj|||d	}|j	|t
jd
}|du}| j|||||||t
j|||d}|r|r| || jj| jj| jjj}|	| jj}| |}|dkr| |||}n|}|dkr| ||}|dkr|}ne| jjjsd}nJt|j|||jd}t|	ts|	g| }	|
du r|	}
n
t|
ts|
g| }
t
j|	||jd
}t
j|
||jd
ddddddf }
d|
 | |
|  }| jj||ddd }| jj||d}|   |s|fS t|dS )a  
        Function invoked when calling the pipeline for generation.

        Args:
            video (`list[PipelineImageInput]`, *optional*)
                The video to be upsampled (such as a LTX 2.0 first stage output). If not supplied, `latents` should be
                supplied.
            height (`int`, *optional*, defaults to `512`):
                The height in pixels of the input video (not the generated video, which will have a larger resolution).
            width (`int`, *optional*, defaults to `768`):
                The width in pixels of the input video (not the generated video, which will have a larger resolution).
            num_frames (`int`, *optional*, defaults to `121`):
                The number of frames in the input video.
            spatial_patch_size (`int`, *optional*, defaults to `1`):
                The spatial patch size of the video latents. Used when `latents` is supplied if unpacking is necessary.
            temporal_patch_size (`int`, *optional*, defaults to `1`):
                The temporal patch size of the video latents. Used when `latents` is supplied if unpacking is
                necessary.
            latents (`torch.Tensor`, *optional*):
                Pre-generated video latents. This can be supplied in place of the `video` argument. Can either be a
                patch sequence of shape `(batch_size, seq_len, hidden_dim)` or a video latent of shape `(batch_size,
                latent_channels, latent_frames, latent_height, latent_width)`.
            latents_normalized (`bool`, *optional*, defaults to `False`)
                If `latents` are supplied, whether the `latents` are normalized using the VAE latent mean and std. If
                `True`, the `latents` will be denormalized before being supplied to the latent upsampler.
            decode_timestep (`float`, defaults to `0.0`):
                The timestep at which generated video is decoded.
            decode_noise_scale (`float`, defaults to `None`):
                The interpolation factor between random noise and denoised latents at the decode timestep.
            adain_factor (`float`, *optional*, defaults to `0.0`):
                Adaptive Instance Normalization (AdaIN) blending factor between the upsampled and original latents.
                Should be in [-10.0, 10.0]; supplying 0.0 (the default) means that AdaIN is not performed.
            tone_map_compression_ratio (`float`, *optional*, defaults to `0.0`):
                The compression strength for tone mapping, which will reduce the dynamic range of the latent values.
                This is useful for regularizing high-variance latents or for conditioning outputs during generation.
                Should be in [0, 1], where 0.0 (the default) means tone mapping is not applied and 1.0 corresponds to
                the full compression effect.
            generator (`torch.Generator` or `list[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generate image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.ltx.LTXPipelineOutput`] instead of a plain tuple.

        Examples:

        Returns:
            [`~pipelines.ltx.LTXPipelineOutput`] or `tuple`:
                If `return_dict` is `True`, [`~pipelines.ltx.LTXPipelineOutput`] is returned, otherwise a `tuple` is
                returned where the first element is the upsampled video.
        )r1   r4   r5   r   r|   Nr   r   z-Video length expected to be of the form `k * z + 1` but is z. Truncating to z frames.)r4   r5   r:   )r1   r2   r3   r4   r5   r6   r7   r8   r9   r   r   r~   latent)r   r9   r8   F)r   )r   )frames) r}   shape_execution_devicerK   r)   loggerwarningr*   preprocess_videorH   rN   float32rT   rq   r   rl   rm   configrn   r   r8   rb   rk   timestep_conditioningr   rI   rJ   tensordecodepostprocess_videomaybe_free_model_hooksr
   )r+   r1   r4   r5   r3   r6   r7   r   r   r   r   r   r|   r   r   r   r2   r9   latents_suppliedlatents_upsampledtimestepnoiser   r   r   __call__  s   J







z#LTX2LatentUpsamplePipeline.__call__)Nr   r.   r/   r0   r   r   NNNN)rU   )r   r   )Nr/   r0   r.   r   r   NFr~   Nr~   r~   Nr   T)__name__
__module____qualname__model_cpu_offload_seqr   r   r#   rN   Tensorintr8   r9   	GeneratorrT   floatrb   rk   staticmethodrq   rG   r}   no_gradr   EXAMPLE_DOC_STRINGrJ   r   boolstrr   __classcell__r   r   r,   r   r   h   s    	

, 	
	

r   )Nr   )rN   image_processorr   modelsr   utilsr   r   utils.torch_utilsr   r*   r   ltx.pipeline_outputr
   pipeline_utilsr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   <module>   s(   =
