o
    pi                     @   s  d dl Z d dlmZmZmZmZmZmZmZ d dl	Z	d dl
mZmZmZmZmZ ddlmZ ddlmZmZmZmZ ddlmZmZmZmZmZ ddlmZmZm Z  dd	l!m"Z" dd
l#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z) ddl*m+Z+m,Z,m-Z-m.Z.m/Z/ ddl0m1Z1 ddl2m3Z3 ddl4m5Z5 ddl6m7Z7m8Z8 ddl9m:Z: e,;e<Z=dZ>dddZ?				d dee@ deeeAe	jBf  deee@  deeeC  fddZDG dd de7e8eeeee5	ZEdS )!    N)AnyCallableDictListOptionalTupleUnion)CLIPImageProcessorCLIPTextModelCLIPTextModelWithProjectionCLIPTokenizerCLIPVisionModelWithProjection   )PipelineImageInput)FromSingleFileMixinIPAdapterMixin StableDiffusionXLLoraLoaderMixinTextualInversionLoaderMixin)AutoencoderKLImageProjectionMotionAdapterUNet2DConditionModelUNetMotionModel)AttnProcessor2_0FusedAttnProcessor2_0XFormersAttnProcessor)adjust_lora_scale_text_encoder)DDIMSchedulerDPMSolverMultistepSchedulerEulerAncestralDiscreteSchedulerEulerDiscreteSchedulerLMSDiscreteSchedulerPNDMScheduler)USE_PEFT_BACKENDloggingreplace_example_docstringscale_lora_layersunscale_lora_layers)randn_tensor)VideoProcessor   )FreeInitMixin)DiffusionPipelineStableDiffusionMixin   )AnimateDiffPipelineOutputa$  
    Examples:
        ```py
        >>> import torch
        >>> from diffusers.models import MotionAdapter
        >>> from diffusers import AnimateDiffSDXLPipeline, DDIMScheduler
        >>> from diffusers.utils import export_to_gif

        >>> adapter = MotionAdapter.from_pretrained(
        ...     "a-r-r-o-w/animatediff-motion-adapter-sdxl-beta", torch_dtype=torch.float16
        ... )

        >>> model_id = "stabilityai/stable-diffusion-xl-base-1.0"
        >>> scheduler = DDIMScheduler.from_pretrained(
        ...     model_id,
        ...     subfolder="scheduler",
        ...     clip_sample=False,
        ...     timestep_spacing="linspace",
        ...     beta_schedule="linear",
        ...     steps_offset=1,
        ... )
        >>> pipe = AnimateDiffSDXLPipeline.from_pretrained(
        ...     model_id,
        ...     motion_adapter=adapter,
        ...     scheduler=scheduler,
        ...     torch_dtype=torch.float16,
        ...     variant="fp16",
        ... ).to("cuda")

        >>> # enable memory savings
        >>> pipe.enable_vae_slicing()
        >>> pipe.enable_vae_tiling()

        >>> output = pipe(
        ...     prompt="a panda surfing in the ocean, realistic, high quality",
        ...     negative_prompt="low quality, worst quality",
        ...     num_inference_steps=20,
        ...     guidance_scale=8,
        ...     width=1024,
        ...     height=1024,
        ...     num_frames=16,
        ... )

        >>> frames = output.frames[0]
        >>> export_to_gif(frames, "animation.gif")
        ```
        c                 C   sX   |j ttd|jdd}| j ttd| jdd}| ||  }|| d| |   } | S )z
    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
    r.   T)dimkeepdim)stdlistrangendim)	noise_cfgnoise_pred_textguidance_rescalestd_textstd_cfgnoise_pred_rescaled r=   w/home/ubuntu/SoloSpeech/.venv/lib/python3.10/site-packages/diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.pyrescale_noise_cfgs   s
   r?   num_inference_stepsdevice	timestepssigmasc                 K   s  |dur|durt d|dur>dtt| jj v }|s(t d| j d| jd||d| | j}t	|}||fS |durpdtt| jj v }|sZt d| j d| jd||d	| | j}t	|}||fS | j|fd
|i| | j}||fS )a  
    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler to get timesteps from.
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
            must be `None`.
        device (`str` or `torch.device`, *optional*):
            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
        timesteps (`List[int]`, *optional*):
            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
            `num_inference_steps` and `sigmas` must be `None`.
        sigmas (`List[float]`, *optional*):
            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
            `num_inference_steps` and `timesteps` must be `None`.

    Returns:
        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
        second element is the number of inference steps.
    NzYOnly one of `timesteps` or `sigmas` can be passed. Please choose one to set custom valuesrB   zThe current scheduler class zx's `set_timesteps` does not support custom timestep schedules. Please check whether you are using the correct scheduler.)rB   rA   rC   zv's `set_timesteps` does not support custom sigmas schedules. Please check whether you are using the correct scheduler.)rC   rA   rA   r=   )

ValueErrorsetinspect	signatureset_timesteps
parameterskeys	__class__rB   len)	schedulerr@   rA   rB   rC   kwargsaccepts_timestepsaccept_sigmasr=   r=   r>   retrieve_timesteps   s2   rQ   c                K       s  e Zd ZdZdZg dZg dZ			dldeded	e	d
e
de
deeef dedeeeeeeef dededef fddZ												dmdedee deej dededee dee deej deej deej deej d ee d!ee fd"d#Z dnd$d%Z!d&d' Z"d(d) Z#d*d+ Z$							dod,d-Z%	dnd.d/Z&	dnd0d1Z'd2d3 Z(d4ej)fd5ejd6ed7ej*d8ejfd9d:Z+e,d;d< Z-e,d=d> Z.e,d?d@ Z/e,dAdB Z0e,dCdD Z1e,dEdF Z2e,dGdH Z3e,dIdJ Z4e5 e6e7dddKdddLddddMddddNdddddddddOdddNddPdddPddddQgf#deee8e f deeee8e f  dRedSee dTee dUedVe8e dWe8e dXee dYedeeee8e f  deeee8e f  dee dZed[eeej9e8ej9 f  dQeej deej deej deej deej d\ee: d]ee8ej  d^ee d_ed`ee;ee<f  daedbee=eef  dce=eef ddee=eef  deee=eef  dfe=eef dgee=eef  d!ee dhee>eee;gdf  die8e fFdjdkZ?  Z@S )pAnimateDiffSDXLPipelinea
  
    Pipeline for text-to-video generation using Stable Diffusion XL.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

    The pipeline also inherits the following loading methods:
        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
        - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
        - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters

    Args:
        vae ([`AutoencoderKL`]):
            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
        text_encoder ([`CLIPTextModel`]):
            Frozen text-encoder. Stable Diffusion XL uses the text portion of
            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
        text_encoder_2 ([` CLIPTextModelWithProjection`]):
            Second frozen text-encoder. Stable Diffusion XL uses the text and pool portion of
            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
            specifically the
            [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)
            variant.
        tokenizer (`CLIPTokenizer`):
            Tokenizer of class
            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
        tokenizer_2 (`CLIPTokenizer`):
            Second Tokenizer of class
            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
        unet ([`UNet2DConditionModel`]):
            Conditional U-Net architecture to denoise the encoded image latents.
        scheduler ([`SchedulerMixin`]):
            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
        force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"True"`):
            Whether the negative prompt embeddings shall be forced to always be set to 0. Also see the config of
            `stabilityai/stable-diffusion-xl-base-1-0`.
    z6text_encoder->text_encoder_2->image_encoder->unet->vae)	tokenizertokenizer_2text_encodertext_encoder_2image_encoderfeature_extractor)latentsprompt_embedsnegative_prompt_embedsadd_text_embedsadd_time_idsnegative_pooled_prompt_embedsnegative_add_time_idsNTvaerU   rV   rS   rT   unetmotion_adapterrM   rW   rX   force_zeros_for_empty_promptc                    s   t    t|trt||}| j|||||||||	|
d
 | j|d dt| j	j
jd  | _t| jd| _| jj
j| _d S )N)
r`   rU   rV   rS   rT   ra   rb   rM   rW   rX   )rc   r*   r.   )vae_scale_factor)super__init__
isinstancer   r   from_unet2dregister_modulesregister_to_configrL   r`   configblock_out_channelsrd   r)   video_processorra   sample_sizedefault_sample_size)selfr`   rU   rV   rS   rT   ra   rb   rM   rW   rX   rc   rK   r=   r>   rf     s&   

z AnimateDiffSDXLPipeline.__init__r.   promptprompt_2rA   num_videos_per_promptdo_classifier_free_guidancenegative_promptnegative_prompt_2rZ   r[   pooled_prompt_embedsr^   
lora_scale	clip_skipc           !   
   C   s  |p| j }|dur9t| tr9|| _| jdur%tst| j| nt| j| | jdur9ts3t| j| nt| j| t|t	rA|gn|}|durLt
|}n|jd }| jdur\| j| jgn| jg}| jdurk| j| jgn| jg}|du r|pw|}t|t	r|gn|}g }||g}t|||D ]u\}}}t| tr| ||}||d|jddd}|j}||dddj}|jd	 |jd	 krt||s||dd|jd
 d	f }td|j d|  |||dd}|d }
|du r|jd }n|j|d   }|| qtj|d	d}|du o| jj}|r*|	du r*|r*t|}	t|
}n|r|	du r|p6d}|p;|}t|t	rG||g n|}t|t	rT||g n|}|durst|t|urst dt| dt| d|t
|krt!d| dt
| d| d| d	||g}g }t|||D ]8\}}}t| tr| ||}|jd
 }||d|ddd}||j|dd}	|	d }|	jd }	||	 qtj|d	d}	| jdur|j| jj"|d}n	|j| j#j"|d}|j\}}} |$d
|d
}|%|| |d	}|r9|	jd
 }| jdur |	j| jj"|d}	n	|	j| j#j"|d}	|	$d
|d
}	|	%|| |d	}	|
$d
|%|| d	}
|rT|$d
|%|| d	}| jdurit| tritrit&| j| | jdur~t| tr~tr~t&| j| ||	|
|fS )a\  
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                prompt to be encoded
            prompt_2 (`str` or `List[str]`, *optional*):
                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
                used in both text-encoders
            device: (`torch.device`):
                torch device
            num_videos_per_prompt (`int`):
                number of images that should be generated per prompt
            do_classifier_free_guidance (`bool`):
                whether to use classifier free guidance or not
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            negative_prompt_2 (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            pooled_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
                If not provided, pooled text embeddings will be generated from `prompt` input argument.
            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
                input argument.
            lora_scale (`float`, *optional*):
                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
            clip_skip (`int`, *optional*):
                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                the output of the pre-final layer will be used for computing the prompt embeddings.
        Nr   
max_lengthTpt)paddingr{   
truncationreturn_tensorslongest)r}   r   r.   z\The following part of your input was truncated because CLIP can only handle sequences up to z	 tokens: output_hidden_statesr*   r1    z?`negative_prompt` should be the same type to `prompt`, but got z != .z`negative_prompt`: z has batch size z, but `prompt`: zT. Please make sure that passed `negative_prompt` matches the batch size of `prompt`.)dtyperA   )'_execution_devicerg   r   _lora_scalerU   r#   r   r&   rV   strrL   shaperS   rT   zipr   maybe_convert_promptmodel_max_length	input_idstorchequalbatch_decodeloggerwarningtohidden_statesappendconcatrk   rc   
zeros_liketype	TypeErrorrD   r   ra   repeatviewr'   )!rp   rr   rs   rA   rt   ru   rv   rw   rZ   r[   rx   r^   ry   rz   
batch_size
tokenizerstext_encodersprompt_embeds_listpromptsrS   rU   text_inputstext_input_idsuntruncated_idsremoved_textzero_out_negative_promptuncond_tokensnegative_prompt_embeds_listr{   uncond_inputbs_embedseq_len_r=   r=   r>   encode_prompt0  s   
:





 






z%AnimateDiffSDXLPipeline.encode_promptc           
      C   s   t | j j}t|tjs| j|ddj}|j	||d}|rH| j|ddj
d }|j|dd}| jt|ddj
d }|j|dd}||fS | |j}|j|dd}t|}	||	fS )	Nr|   )r   rA   r   Tr   r   r   r   )nextrW   rI   r   rg   r   TensorrX   pixel_valuesr   r   repeat_interleaver   image_embeds)
rp   imagerA   num_images_per_promptr   r   image_enc_hidden_statesuncond_image_enc_hidden_statesr   uncond_image_embedsr=   r=   r>   encode_image  s(   

z$AnimateDiffSDXLPipeline.encode_imagec                 C   sl  g }|rg }|d u ret |ts|g}t|t| jjjkr/tdt| dt| jjj dt|| jjjD ],\}}	t |	t }
| 	||d|
\}}|
|d d d f  |rc|
|d d d f  q7n|D ]}|rw|d\}}|
| |
| qgg }t|D ]0\}}tj|g| dd}|rtj|| g| dd}tj||gdd}|j|d}|
| q|S )	NzK`ip_adapter_image` must have same length as the number of IP Adapters. Got z images and z IP Adapters.r.   r*   r   r   )rA   )rg   r4   rL   ra   encoder_hid_projimage_projection_layersrD   r   r   r   r   chunk	enumerater   catr   )rp   ip_adapter_imageip_adapter_image_embedsrA   r   ru   r   negative_image_embedssingle_ip_adapter_imageimage_proj_layeroutput_hidden_statesingle_image_embedssingle_negative_image_embedsir=   r=   r>   prepare_ip_adapter_image_embeds4  sH   


z7AnimateDiffSDXLPipeline.prepare_ip_adapter_image_embedsc           	      C   s   d| j jj | }|j\}}}}}|ddddd|| |||}| j |j}|d d d f ||df|jdd   ddddd}| }|S )Nr.   r   r*   r      r   )	r`   rk   scaling_factorr   permutereshapedecodesamplefloat)	rp   rY   r   channels
num_framesheightwidthr   videor=   r=   r>   decode_latentsb  s   "8z&AnimateDiffSDXLPipeline.decode_latentsc                 C   sX   dt t| jjj v }i }|r||d< dt t| jjj v }|r*||d< |S )Neta	generator)rE   rF   rG   rM   steprI   rJ   )rp   r   r   accepts_etaextra_step_kwargsaccepts_generatorr=   r=   r>   prepare_extra_step_kwargso  s   z1AnimateDiffSDXLPipeline.prepare_extra_step_kwargsc                    s  |d dks|d dkrt d| d| d|d ur8t fdd|D s8t d j d	 fd
d|D  |d urK|d urKt d| d| d|d ur^|d ur^t d| d| d|d u rj|d u rjt d|d urt|tst|tst dt| |d urt|tst|tst dt| |d ur|d urt d| d| d|d ur|d urt d| d| d|d ur|d ur|j|jkrt d|j d|j d|d ur|	d u rt d|d ur|
d u rt dd S d S )N   r   z7`height` and `width` have to be divisible by 8 but are z and r   c                 3   s    | ]}| j v V  qd S N_callback_tensor_inputs.0krp   r=   r>   	<genexpr>  s    

z7AnimateDiffSDXLPipeline.check_inputs.<locals>.<genexpr>z2`callback_on_step_end_tensor_inputs` has to be in z, but found c                    s   g | ]	}| j vr|qS r=   r   r   r   r=   r>   
<listcomp>  s    z8AnimateDiffSDXLPipeline.check_inputs.<locals>.<listcomp>zCannot forward both `prompt`: z and `prompt_embeds`: z2. Please make sure to only forward one of the two.z Cannot forward both `prompt_2`: zeProvide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined.z2`prompt` has to be of type `str` or `list` but is z4`prompt_2` has to be of type `str` or `list` but is z'Cannot forward both `negative_prompt`: z and `negative_prompt_embeds`: z)Cannot forward both `negative_prompt_2`: zu`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but got: `prompt_embeds` z != `negative_prompt_embeds` zIf `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`.zIf `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`.)rD   allr   rg   r   r4   r   r   )rp   rr   rs   r   r   rv   rw   rZ   r[   rx   r^   "callback_on_step_end_tensor_inputsr=   r   r>   check_inputs  sl   z$AnimateDiffSDXLPipeline.check_inputsc
                 C   s~   ||||| j  || j  f}
t|tr%t||kr%tdt| d| d|	d u r2t|
|||d}	n|	|}	|	| jj }	|	S )Nz/You have passed a list of generators of length z+, but requested an effective batch size of z@. Make sure the batch size matches the length of the generators.)r   rA   r   )	rd   rg   r4   rL   rD   r(   r   rM   init_noise_sigma)rp   r   num_channels_latentsr   r   r   r   rA   r   rY   r   r=   r=   r>   prepare_latents  s"   
z'AnimateDiffSDXLPipeline.prepare_latentsc           	      C   sd   t || | }| jjjt| | }| jjjj}||kr(td| d| dt	j
|g|d}|S )Nz7Model expects an added time embedding vector of length z, but a vector of z was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`.r   )r4   ra   rk   addition_time_embed_dimrL   add_embeddinglinear_1in_featuresrD   r   tensor)	rp   original_sizecrops_coords_top_lefttarget_sizer   text_encoder_projection_dimr]   passed_add_embed_dimexpected_add_embed_dimr=   r=   r>   _get_add_time_ids  s   z)AnimateDiffSDXLPipeline._get_add_time_idsc                 C   sp   | j j}| j jtjd t| j jjjd j	t
ttf}|r6| j j| | j jj| | j jj| d S d S )Nr   r   )r`   r   r   r   float32rg   decoder	mid_block
attentions	processorr   r   r   post_quant_convconv_in)rp   r   use_torch_2_0_or_xformersr=   r=   r>   
upcast_vae  s   
z"AnimateDiffSDXLPipeline.upcast_vaei   wembedding_dimr   returnc                 C   s   t |jdks	J |d }|d }ttd|d  }ttj||d|  }||dddf |dddf  }tjt	|t
|gdd}|d dkrZtjj|d}|j|jd	 |fksfJ |S )
a  
        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298

        Args:
            w (`torch.Tensor`):
                Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
            embedding_dim (`int`, *optional*, defaults to 512):
                Dimension of the embeddings to generate.
            dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
                Data type of the generated embeddings.

        Returns:
            `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
        r.   g     @@r*   g     @r   Nr   )r   r.   r   )rL   r   r   logr   exparanger   r   sincosnn
functionalpad)rp   r  r  r   half_dimembr=   r=   r>   get_guidance_scale_embedding  s   &z4AnimateDiffSDXLPipeline.get_guidance_scale_embeddingc                 C      | j S r   )_guidance_scaler   r=   r=   r>   guidance_scale%     z&AnimateDiffSDXLPipeline.guidance_scalec                 C   r  r   )_guidance_rescaler   r=   r=   r>   r9   )  r  z(AnimateDiffSDXLPipeline.guidance_rescalec                 C   r  r   )
_clip_skipr   r=   r=   r>   rz   -  r  z!AnimateDiffSDXLPipeline.clip_skipc                 C   s   | j dko| jjjd u S )Nr.   )r  ra   rk   time_cond_proj_dimr   r=   r=   r>   ru   4  s   z3AnimateDiffSDXLPipeline.do_classifier_free_guidancec                 C   r  r   )_cross_attention_kwargsr   r=   r=   r>   cross_attention_kwargs8  r  z.AnimateDiffSDXLPipeline.cross_attention_kwargsc                 C   r  r   )_denoising_endr   r=   r=   r>   denoising_end<  r  z%AnimateDiffSDXLPipeline.denoising_endc                 C   r  r   )_num_timestepsr   r=   r=   r>   num_timesteps@  r  z%AnimateDiffSDXLPipeline.num_timestepsc                 C   r  r   )
_interruptr   r=   r=   r>   	interruptD  r  z!AnimateDiffSDXLPipeline.interrupt   2   g      @r0   pil)r   r   rY   r   r   r   r@   rB   rC   r  r  r   r   r   r   output_typereturn_dictr  r9   r   r   r   negative_original_sizenegative_crops_coords_top_leftnegative_target_sizecallback_on_step_endr   c$           @         s  |p| j | j }|p| j | j }d}|p||f}|p||f}| |||||||||||# |
| _|| _|!| _|| _|	| _d| _|durKt	|t
rKd}$n|durYt	|trYt|}$n|jd }$| j}%| jdurm| jddnd}&| j|||%|| j|||||||&| jd\}}}}t| j||%||\}}| jjj}'| |$| |'||||j|%||	}| ||}(|})| jdu rt|jd }*n| jjj}*| j||||j|*d}+|dur| dur| j||| |j|*d},n|+},| jrt j!||gdd	}t j!||)gdd	})t j!|,|+gdd	}+|"|%}|)"|%})|+"|%#|$| d}+|dus |dur,| $|||%|$| | j}-| j%durit	| j%t&ri| j%dkri| j%dk ritt'| jjj(| j%| jjj(   ttt) fd
d|}|d| }d}.| jjj*durt +| j,d #|$| }/| j-|/| jjj*dj"|%|jd}.| j.r| j/nd}0t0|0D ]}1| j.r| 1||1||%|j|\}}t|| _2| j3| j2d}2t4|D ]\}3}4| j5rːq| jrt !|gd n|}5| j6|5|4}5|)|+d}6|dus|r|-|6d< | j|5|4||.| j|6ddd }7| jr|77d\}8}9|8| j,|9|8   }7| jr&| j8dkr&t9|7|9| j8d}7| jj:|7|4|fi |(ddid }|"dur}i }:|#D ]
};t; |; |:|;< qA|"| |3|4|:}<|<<d|}|<<d|}|<<d|}|<<d|)})|<<d|}|<<d|+}+|<<d|,},|2=  qW d   n	1 sw   Y  q| j>jt j?ko| j>jj@}=|=r| A  |"tBtC| j>jDE j}|dkr|}>n| F|}?| jGjH|?|d}>|=r| j>j"t j?d | I  |s|>fS tJ|>d S )!u0  
        Function invoked when calling the pipeline for generation.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide the video generation. If not defined, one has to pass `prompt_embeds`.
                instead.
            prompt_2 (`str` or `List[str]`, *optional*):
                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
                used in both text-encoders
            num_frames:
                The number of video frames that are generated. Defaults to 16 frames which at 8 frames per seconds
                amounts to 2 seconds of video.
            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                The height in pixels of the generated video. This is set to 1024 by default for the best results.
                Anything below 512 pixels won't work well for
                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
                and checkpoints that are not specifically fine-tuned on low resolutions.
            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                The width in pixels of the generated video. This is set to 1024 by default for the best results.
                Anything below 512 pixels won't work well for
                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
                and checkpoints that are not specifically fine-tuned on low resolutions.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality video at the
                expense of slower inference.
            timesteps (`List[int]`, *optional*):
                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
                passed will be used. Must be in descending order.
            sigmas (`List[float]`, *optional*):
                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
                will be used.
            denoising_end (`float`, *optional*):
                When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
                completed before it is intentionally prematurely terminated. As a result, the returned sample will
                still retain a substantial amount of noise as determined by the discrete timesteps selected by the
                scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a
                "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
                Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output)
            guidance_scale (`float`, *optional*, defaults to 5.0):
                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                `guidance_scale` is defined as `w` of equation 2. of [Imagen
                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                usually at the expense of lower video quality.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the video generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            negative_prompt_2 (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the video generation to be sent to `tokenizer_2` and
                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
            num_videos_per_prompt (`int`, *optional*, defaults to 1):
                The number of videos to generate per prompt.
            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
                [`schedulers.DDIMScheduler`], will be ignored for others.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for video
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will ge generated by sampling using the supplied random `generator`.
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            pooled_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
                If not provided, pooled text embeddings will be generated from `prompt` input argument.
            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
                input argument.
            ip_adapter_image: (`PipelineImageInput`, *optional*):
                Optional image input to work with IP Adapters.
            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
                Pre-generated image embeddings for IP-Adapter. If not provided, embeddings are computed from the
                `ip_adapter_image` input argument.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated video. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.stable_diffusion_xl.AnimateDiffPipelineOutput`] instead of a
                plain tuple.
            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            guidance_rescale (`float`, *optional*, defaults to 0.0):
                Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
                Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
                [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
                Guidance rescale factor should fix overexposure when using zero terminal SNR.
            original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
                If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
                `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
                explained in section 2.2 of
                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
            crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
                `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
                `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
                `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
            target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
                For most cases, `target_size` should be set to the desired height and width of the generated image. If
                not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
                section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
            negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
                To negatively condition the generation process based on a specific image resolution. Part of SDXL's
                micro-conditioning as explained in section 2.2 of
                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
            negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
                To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's
                micro-conditioning as explained in section 2.2 of
                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
            negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
                To negatively condition the generation process based on a target image resolution. It should be as same
                as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
            callback_on_step_end (`Callable`, *optional*):
                A function that calls at the end of each denoising steps during the inference. The function is called
                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
                `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                `._callback_tensor_inputs` attribute of your pipeline class.

        Examples:

        Returns:
            [`~pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] or `tuple`:
                If `return_dict` is `True`, [`~pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] is
                returned, otherwise a `tuple` is returned where the first element is a list with the generated frames.
        r.   FNr   scale)rr   rs   rA   rt   ru   rv   rw   rZ   r[   rx   r^   ry   rz   r   )r   r   r   c                    s   |  kS r   r=   )tsdiscrete_timestep_cutoffr=   r>   <lambda>  s    z2AnimateDiffSDXLPipeline.__call__.<locals>.<lambda>)r  r   )totalr*   )text_embedstime_idsr   )encoder_hidden_statestimestep_condr  added_cond_kwargsr&  r0   )r9   r&  rY   rZ   r[   r\   r^   r]   r_   latent)r   r%  r   )frames)Kro   rd   r   r  r  r  r  r  r   rg   r   r4   rL   r   r   r  getr   ru   rz   rQ   rM   ra   rk   in_channelsr   r   r   rV   intprojection_dimr   r   r   r   r   r   r  r   roundnum_train_timestepsfilterr  r   r  r  free_init_enabled_free_init_num_itersr5   _apply_free_initr  progress_barr   r!  scale_model_inputr   r9   r?   r   localspopupdater`   float16force_upcastr  r   iterr  rI   r   rm   postprocess_videomaybe_free_model_hooksr/   )@rp   rr   rs   r   r   r   r@   rB   rC   r  r  rv   rw   rt   r   r   rY   rZ   r[   rx   r^   r   r   r%  r&  r  r9   r   r   r   r'  r(  r)  rz   r*  r   r   rA   ry   r   r   r\   r   r]   r_   r   r4  guidance_scale_tensornum_free_init_itersfree_init_iterrB  r   tlatent_model_inputr5  
noise_prednoise_pred_uncondr8   callback_kwargsr   callback_outputsneeds_upcastingr   video_tensorr=   r-  r>   __call__H  s   =

	









$
:


z AnimateDiffSDXLPipeline.__call__)NNT)NNr.   TNNNNNNNNr   )NNNNNNN)A__name__
__module____qualname____doc__model_cpu_offload_seq_optional_componentsr   r   r
   r   r   r   r   r   r   r   r"   r!   r    r   r   r   r	   boolrf   r   r   r   rA   r:  r   r   r   r   r   r   r   r   r   r   r  r   r   r  propertyr  r9   rz   ru   r  r  r  r!  no_gradr%   EXAMPLE_DOC_STRINGr   	Generatorr   r   r   r   r   rW  __classcell__r=   r=   rq   r>   rR      s   	*
	0	

 
l.
J










	


 !"#$rR   )r0   )NNNN)FrF   typingr   r   r   r   r   r   r   r   transformersr	   r
   r   r   r   image_processorr   loadersr   r   r   r   modelsr   r   r   r   r   models.attention_processorr   r   r   models.lorar   
schedulersr   r   r   r    r!   r"   utilsr#   r$   r%   r&   r'   utils.torch_utilsr(   rm   r)   free_init_utilsr+   pipeline_utilsr,   r-   pipeline_outputr/   
get_loggerrX  r   ra  r?   r:  r   rA   r   rQ   rR   r=   r=   r=   r>   <module>   sR   $ 

2



;