o
    pi                     @   s8  d dl Z d dlZd dlmZ d dlmZmZmZmZm	Z	m
Z
 d dlZd dlmZmZ ddlmZmZ ddlmZmZ ddlmZ dd	lmZmZ dd
lmZmZmZ ddlmZ ddl m!Z! e"e#Z$dZ%				ddee& dee
e'ej(f  deee&  deee)  fddZ*eG dd deZ+G dd deZ,dS )    N)	dataclass)CallableDictListOptionalTupleUnion)T5EncoderModelT5Tokenizer   )MultiPipelineCallbacksPipelineCallback)AutoencoderKLCogVideoXCogVideoXTransformer3DModel)DiffusionPipeline)CogVideoXDDIMSchedulerCogVideoXDPMScheduler)
BaseOutputloggingreplace_example_docstring)randn_tensor)VideoProcessora  
    Examples:
        ```python
        >>> import torch
        >>> from diffusers import CogVideoXPipeline
        >>> from diffusers.utils import export_to_video

        >>> pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-2b", torch_dtype=torch.float16).to("cuda")
        >>> prompt = (
        ...     "A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest. "
        ...     "The panda's fluffy paws strum a miniature acoustic guitar, producing soft, melodic tunes. Nearby, a few other "
        ...     "pandas gather, watching curiously and some clapping in rhythm. Sunlight filters through the tall bamboo, "
        ...     "casting a gentle glow on the scene. The panda's face is expressive, showing concentration and joy as it plays. "
        ...     "The background includes a small, flowing stream and vibrant green foliage, enhancing the peaceful and magical "
        ...     "atmosphere of this unique musical performance."
        ... )
        >>> video = pipe(prompt=prompt, guidance_scale=6, num_inference_steps=50).frames[0]
        >>> export_to_video(video, "output.mp4", fps=8)
        ```
num_inference_stepsdevice	timestepssigmasc                 K   s  |dur|durt d|dur>dtt| jj v }|s(t d| j d| jd||d| | j}t	|}||fS |durpdtt| jj v }|sZt d| j d| jd||d	| | j}t	|}||fS | j|fd
|i| | j}||fS )a  
    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler to get timesteps from.
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
            must be `None`.
        device (`str` or `torch.device`, *optional*):
            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
        timesteps (`List[int]`, *optional*):
            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
            `num_inference_steps` and `sigmas` must be `None`.
        sigmas (`List[float]`, *optional*):
            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
            `num_inference_steps` and `timesteps` must be `None`.

    Returns:
        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
        second element is the number of inference steps.
    NzYOnly one of `timesteps` or `sigmas` can be passed. Please choose one to set custom valuesr   zThe current scheduler class zx's `set_timesteps` does not support custom timestep schedules. Please check whether you are using the correct scheduler.)r   r   r   zv's `set_timesteps` does not support custom sigmas schedules. Please check whether you are using the correct scheduler.)r   r   r    )

ValueErrorsetinspect	signatureset_timesteps
parameterskeys	__class__r   len)	schedulerr   r   r   r   kwargsaccepts_timestepsaccept_sigmasr   r   m/home/ubuntu/SoloSpeech/.venv/lib/python3.10/site-packages/diffusers/pipelines/cogvideo/pipeline_cogvideox.pyretrieve_timesteps;   s2   r+   c                   @   s   e Zd ZU dZejed< dS )CogVideoXPipelineOutputa  
    Output class for CogVideo pipelines.

    Args:
        frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
            List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing
            denoised PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape
            `(batch_size, num_frames, channels, height, width)`.
    framesN)__name__
__module____qualname____doc__torchTensor__annotations__r   r   r   r*   r,   v   s   
 
r,   c                1       st  e Zd ZdZg ZdZg dZdedede	de
deeef f
 fd	d
Z					dFdeeee f dededeej deej f
ddZ								dGdeeee f deeeee f  dededeej deej dedeej deej fddZ	dHddZdejdefd d!Zd"d# Z		dId$d%Zed&d' Zed(d) Z ed*d+ Z!e" e#e$ddd,d-d.d/d0dd1d2dd3ddddd4dddgdfdeeeee f  deeeee f  d5ed6ed7ed8ed9ed:eee  d;e%d<eded=e%d>eeej&eej& f  deej' deej' deej' d?ed@edAeee(eee)gdf e*e+f  dBee dedCee,e-f f,dDdEZ.  Z/S )JCogVideoXPipelinea  
    Pipeline for text-to-video generation using CogVideoX.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

    Args:
        vae ([`AutoencoderKL`]):
            Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.
        text_encoder ([`T5EncoderModel`]):
            Frozen text-encoder. CogVideoX uses
            [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel); specifically the
            [t5-v1_1-xxl](https://huggingface.co/PixArt-alpha/PixArt-alpha/tree/main/t5-v1_1-xxl) variant.
        tokenizer (`T5Tokenizer`):
            Tokenizer of class
            [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
        transformer ([`CogVideoXTransformer3DModel`]):
            A text conditioned `CogVideoXTransformer3DModel` to denoise the encoded video latents.
        scheduler ([`SchedulerMixin`]):
            A scheduler to be used in combination with `transformer` to denoise the encoded video latents.
    ztext_encoder->transformer->vae)latentsprompt_embedsnegative_prompt_embeds	tokenizertext_encodervaetransformerr&   c                    s   t    | j|||||d t| dr$| jd ur$dt| jjjd  nd| _t| dr6| jd ur6| jjj	nd| _
t| jd| _d S )N)r9   r:   r;   r<   r&   r;               )vae_scale_factor)super__init__register_moduleshasattrr;   r%   configblock_out_channelsvae_scale_factor_spatialtemporal_compression_ratiovae_scale_factor_temporalr   video_processor)selfr9   r:   r;   r<   r&   r$   r   r*   rC      s   

, zCogVideoXPipeline.__init__Nr>      promptnum_videos_per_promptmax_sequence_lengthr   dtypec                 C   s  |p| j }|p
| jj}t|tr|gn|}t|}| j|d|dddd}|j}| j|dddj}	|	jd |jd kr[t	
||	s[| j|	d d |d df }
td	| d
|
  | ||d }|j||d}|j\}}}|d|d}||| |d}|S )N
max_lengthTpt)paddingrS   
truncationadd_special_tokensreturn_tensorslongest)rU   rX   r>   zXThe following part of your input was truncated because `max_sequence_length` is set to  z	 tokens: r   )rR   r   )_execution_devicer:   rR   
isinstancestrr%   r9   	input_idsshaper2   equalbatch_decodeloggerwarningtorepeatview)rL   rO   rP   rQ   r   rR   
batch_sizetext_inputstext_input_idsuntruncated_idsremoved_textr7   _seq_lenr   r   r*   _get_t5_prompt_embeds   s:   
  z'CogVideoXPipeline._get_t5_prompt_embedsTnegative_promptdo_classifier_free_guidancer7   r8   c
              
   C   s  |p| j }t|tr|gn|}|durt|}
n|jd }
|du r+| j|||||	d}|r|du r|p4d}t|tr?|
|g n|}|dur\t|t|ur\tdt| dt| d|
t|krutd| d	t| d
| d	|
 d	| j|||||	d}||fS )a"  
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                prompt to be encoded
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
                Whether to use classifier free guidance or not.
            num_videos_per_prompt (`int`, *optional*, defaults to 1):
                Number of videos that should be generated per prompt. torch device to place the resulting embeddings on
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            device: (`torch.device`, *optional*):
                torch device
            dtype: (`torch.dtype`, *optional*):
                torch dtype
        Nr   )rO   rP   rQ   r   rR    z?`negative_prompt` should be the same type to `prompt`, but got z != .z`negative_prompt`: z has batch size z, but `prompt`: zT. Please make sure that passed `negative_prompt` matches the batch size of `prompt`.)	r[   r\   r]   r%   r_   rn   type	TypeErrorr   )rL   rO   ro   rp   rP   r7   r8   rQ   r   rR   rg   r   r   r*   encode_prompt   sL   
&

zCogVideoXPipeline.encode_promptc
                 C   s   ||d | j  d ||| j || j f}
t|tr,t||kr,tdt| d| d|	d u r9t|
|||d}	n|	|}	|	| jj	 }	|	S )Nr>   z/You have passed a list of generators of length z+, but requested an effective batch size of z@. Make sure the batch size matches the length of the generators.)	generatorr   rR   )
rJ   rH   r\   listr%   r   r   rd   r&   init_noise_sigma)rL   rg   num_channels_latents
num_framesheightwidthrR   r   rv   r6   r_   r   r   r*   prepare_latents6  s"   
z!CogVideoXPipeline.prepare_latentsr6   num_secondsc                 C   s   | ddddd}d| jjj | }g }t|D ]-}|dkr dnd| d d| d f\}}| j|d d d d ||f j}|| q| j  t	j
|dd}|S )Nr   r=   r>   r   r@   )r   r   dim)permuter;   rF   scaling_factorrangedecodesampleappend!clear_fake_context_parallel_cacher2   cat)rL   r6   r~   r-   istart_frame	end_framecurrent_framesr   r   r*   decode_latentsO  s   ($
z CogVideoXPipeline.decode_latentsc                 C   sX   dt t| jjj v }i }|r||d< dt t| jjj v }|r*||d< |S )Netarv   )r   r   r    r&   stepr"   r#   )rL   rv   r   accepts_etaextra_step_kwargsaccepts_generatorr   r   r*   prepare_extra_step_kwargs`  s   z+CogVideoXPipeline.prepare_extra_step_kwargsc                    sj  |d dks|d dkrt d| d| d|d ur8t fdd|D s8t d j d	 fd
d|D  |d urK|d urKt d| d| d|d u rW|d u rWt d|d urnt|tsnt|tsnt dt| |d ur|d urt d| d| d|d ur|d urt d| d| d|d ur|d ur|j|jkrt d|j d|j dd S d S d S )Nr?   r   z7`height` and `width` have to be divisible by 8 but are z and rr   c                 3   s    | ]}| j v V  qd S N_callback_tensor_inputs.0krL   r   r*   	<genexpr>  s    

z1CogVideoXPipeline.check_inputs.<locals>.<genexpr>z2`callback_on_step_end_tensor_inputs` has to be in z, but found c                    s   g | ]	}| j vr|qS r   r   r   r   r   r*   
<listcomp>  s    z2CogVideoXPipeline.check_inputs.<locals>.<listcomp>zCannot forward both `prompt`: z and `prompt_embeds`: z2. Please make sure to only forward one of the two.zeProvide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined.z2`prompt` has to be of type `str` or `list` but is z and `negative_prompt_embeds`: z'Cannot forward both `negative_prompt`: zu`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but got: `prompt_embeds` z != `negative_prompt_embeds` )r   allr   r\   r]   rw   rs   r_   )rL   rO   r{   r|   ro   "callback_on_step_end_tensor_inputsr7   r8   r   r   r*   check_inputsr  sR   
zCogVideoXPipeline.check_inputsc                 C      | j S r   )_guidance_scaler   r   r   r*   guidance_scale     z CogVideoXPipeline.guidance_scalec                 C   r   r   )_num_timestepsr   r   r   r*   num_timesteps  r   zCogVideoXPipeline.num_timestepsc                 C   r   r   )
_interruptr   r   r   r*   	interrupt  r   zCogVideoXPipeline.interrupti  i  0   r?   2      Fg        pilr{   r|   rz   fpsr   r   r   use_dynamic_cfgr   rv   output_typereturn_dictcallback_on_step_endr   returnc           )      C   s  |dkr|| dkr|dksJ d|dt |ttfr |j}|p)| jjj| j }|p3| jjj| j }d}| ||||||| |	| _	d| _
|durSt |trSd}n|durat |trat|}n|jd }| j}|	d	k}| j||||||||d
\}}|rtj||gdd}t| j|||\}}t|| _| jjj}|d7 }| || |||||j|||	}| ||}tt||| jj  d}| j|d}d}t|D ]\}}| jrq|rt|gd n|} | j| |} | | jd }!| j| ||!ddd }"|"! }"|
rd|	dt"#t"j$||%  | d   d   | _	|r0|"&d\}#}$|#| j'|$|#   }"t | jt(sJ| jj)|"||fi |ddid }n| jj)|"|||dkr[||d  nd|fi |ddi\}}|*|j}|duri }%|D ]
}&t+ |& |%|&< qx|| |||%}'|',d|}|',d|}|',d|}|t|d ks|d |kr|d | jj dkr|-  qW d   n	1 sw   Y  |dks| .||| }(| j/j0|(|d}(n|}(| 1  |s|(fS t2|(dS )a  
        Function invoked when calling the pipeline for generation.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
                instead.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                The height in pixels of the generated image. This is set to 1024 by default for the best results.
            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                The width in pixels of the generated image. This is set to 1024 by default for the best results.
            num_frames (`int`, defaults to `48`):
                Number of frames to generate. Must be divisible by self.vae_scale_factor_temporal. Generated video will
                contain 1 extra frame because CogVideoX is conditioned with (num_seconds * fps + 1) frames where
                num_seconds is 6 and fps is 4. However, since videos can be saved at any fps, the only condition that
                needs to be satisfied is that of divisibility mentioned above.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            timesteps (`List[int]`, *optional*):
                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
                passed will be used. Must be in descending order.
            guidance_scale (`float`, *optional*, defaults to 7.0):
                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                `guidance_scale` is defined as `w` of equation 2. of [Imagen
                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                usually at the expense of lower image quality.
            num_videos_per_prompt (`int`, *optional*, defaults to 1):
                The number of videos to generate per prompt.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            latents (`torch.FloatTensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will ge generated by sampling using the supplied random `generator`.
            prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generate image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
                of a plain tuple.
            callback_on_step_end (`Callable`, *optional*):
                A function that calls at the end of each denoising steps during the inference. The function is called
                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
                `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                `._callback_tensor_inputs` attribute of your pipeline class.
            max_sequence_length (`int`, defaults to `226`):
                Maximum sequence length in encoded prompt. Must be consistent with
                `self.transformer.config.max_text_seq_length` otherwise may lead to poor results.

        Examples:

        Returns:
            [`~pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipelineOutput`] or `tuple`:
            [`~pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipelineOutput`] if `return_dict` is True, otherwise a
            `tuple`. When returning a tuple, the first element is a list with the generated images.
        r   r   r?   z.The number of frames must be divisible by fps=zP and less than 48 frames (for now). Other values are not supported in CogVideoX.r>   FNg      ?)rP   r7   r8   rQ   r   r   )totalr=   )hidden_statesencoder_hidden_statestimestepr   g      @r   r6   r7   r8   latent)videor   )r-   )3r\   r   r   tensor_inputsr<   rF   sample_sizerH   r   r   r   r]   rw   r%   r_   r[   ru   r2   r   r+   r&   r   in_channelsr}   rR   r   maxorderprogress_bar	enumerater   scale_model_inputexpandfloatmathcospiitemchunkr   r   r   rd   localspopupdater   rK   postprocess_videomaybe_free_model_hooksr,   ))rL   rO   ro   r{   r|   rz   r   r   r   r   r   rP   r   rv   r6   r7   r8   r   r   r   r   rQ   rg   r   rp   latent_channelsr   num_warmup_stepsr   old_pred_original_sampler   tlatent_model_inputr   
noise_prednoise_pred_uncondnoise_pred_textcallback_kwargsr   callback_outputsr   r   r   r*   __call__  s   i
	





&&	
6
<
zCogVideoXPipeline.__call__)Nr>   rN   NN)NTr>   NNrN   NNr   )NN)0r.   r/   r0   r1   _optional_componentsmodel_cpu_offload_seqr   r
   r	   r   r   r   r   r   rC   r]   r   intr   r2   r   rR   rn   boolr3   ru   r}   r   r   r   propertyr   r   r   no_gradr   EXAMPLE_DOC_STRINGr   	GeneratorFloatTensorr   r   r   r   r,   r   r   __classcell__r   r   rM   r*   r5      s   

-	

R

3



	

r5   )NNNN)-r   r   dataclassesr   typingr   r   r   r   r   r   r2   transformersr	   r
   	callbacksr   r   modelsr   r   pipelines.pipeline_utilsr   
schedulersr   r   utilsr   r   r   utils.torch_utilsr   rK   r   
get_loggerr.   rb   r   r   r]   r   r   r+   r,   r5   r   r   r   r*   <module>   s>    



;