o
    ۷i                     @   s  d dl Z d dlmZmZ d dlZd dlmZmZmZm	Z	 ddl
mZ ddlmZmZmZmZ ddlmZmZmZmZ ddlmZ dd	lmZ dd
lmZmZmZmZmZm Z  ddl!m"Z"m#Z#m$Z$m%Z%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z, ddl-m.Z. ddl/m0Z0m1Z1 ddl2m3Z3 e# rd dl4m5  m6Z7 dZ8ndZ8e$9e:Z;dZ<	d%dej=dej>dB de?fddZ@				d&deAdB de?ejBB dB deCeA dB d eCeD dB fd!d"ZEG d#d$ d$e0e1eeee,e.e
ZFdS )'    N)AnyCallable)CLIPImageProcessorCLIPTextModelCLIPTokenizerCLIPVisionModelWithProjection   )PipelineImageInput)FromSingleFileMixinIPAdapterMixinStableDiffusionLoraLoaderMixinTextualInversionLoaderMixin)AutoencoderKLImageProjectionUNet2DConditionModelUNetMotionModel)adjust_lora_scale_text_encoder)MotionAdapter)DDIMSchedulerDPMSolverMultistepSchedulerEulerAncestralDiscreteSchedulerEulerDiscreteSchedulerLMSDiscreteSchedulerPNDMScheduler)USE_PEFT_BACKENDis_torch_xla_availableloggingscale_lora_layersunscale_lora_layers)randn_tensor)VideoProcessor   )FreeInitMixin)AnimateDiffFreeNoiseMixin)DiffusionPipelineStableDiffusionMixin   )AnimateDiffPipelineOutputTFa  
    Examples:
        ```py
        >>> import imageio
        >>> import requests
        >>> import torch
        >>> from diffusers import AnimateDiffVideoToVideoPipeline, DDIMScheduler, MotionAdapter
        >>> from diffusers.utils import export_to_gif
        >>> from io import BytesIO
        >>> from PIL import Image

        >>> adapter = MotionAdapter.from_pretrained(
        ...     "guoyww/animatediff-motion-adapter-v1-5-2", torch_dtype=torch.float16
        ... )
        >>> pipe = AnimateDiffVideoToVideoPipeline.from_pretrained(
        ...     "SG161222/Realistic_Vision_V5.1_noVAE", motion_adapter=adapter
        ... ).to("cuda")
        >>> pipe.scheduler = DDIMScheduler(
        ...     beta_schedule="linear", steps_offset=1, clip_sample=False, timespace_spacing="linspace"
        ... )


        >>> def load_video(file_path: str):
        ...     images = []

        ...     if file_path.startswith(("http://", "https://")):
        ...         # If the file_path is a URL
        ...         response = requests.get(file_path)
        ...         response.raise_for_status()
        ...         content = BytesIO(response.content)
        ...         vid = imageio.get_reader(content)
        ...     else:
        ...         # Assuming it's a local file path
        ...         vid = imageio.get_reader(file_path)

        ...     for frame in vid:
        ...         pil_image = Image.fromarray(frame)
        ...         images.append(pil_image)

        ...     return images


        >>> video = load_video(
        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-vid2vid-input-1.gif"
        ... )
        >>> output = pipe(
        ...     video=video, prompt="panda playing a guitar, on a boat, in the ocean, high quality", strength=0.5
        ... )
        >>> frames = output.frames[0]
        >>> export_to_gif(frames, "animation.gif")
        ```
sampleencoder_output	generatorsample_modec                 C   sR   t | dr|dkr| j|S t | dr|dkr| j S t | dr%| jS td)Nlatent_distr(   argmaxlatentsz3Could not access latents of provided encoder_output)hasattrr,   r(   moder.   AttributeError)r)   r*   r+    r2   v/home/ubuntu/vllm_env/lib/python3.10/site-packages/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.pyretrieve_latentsl   s   

r4   num_inference_stepsdevice	timestepssigmasc                 K   s  |dur|durt d|dur>dtt| jj v }|s(t d| j d| jd||d| | j}t	|}||fS |durpdtt| jj v }|sZt d| j d| jd||d	| | j}t	|}||fS | j|fd
|i| | j}||fS )a  
    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler to get timesteps from.
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
            must be `None`.
        device (`str` or `torch.device`, *optional*):
            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
        timesteps (`list[int]`, *optional*):
            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
            `num_inference_steps` and `sigmas` must be `None`.
        sigmas (`list[float]`, *optional*):
            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
            `num_inference_steps` and `timesteps` must be `None`.

    Returns:
        `tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
        second element is the number of inference steps.
    NzYOnly one of `timesteps` or `sigmas` can be passed. Please choose one to set custom valuesr7   zThe current scheduler class zx's `set_timesteps` does not support custom timestep schedules. Please check whether you are using the correct scheduler.)r7   r6   r8   zv's `set_timesteps` does not support custom sigmas schedules. Please check whether you are using the correct scheduler.)r8   r6   r6   r2   )

ValueErrorsetinspect	signatureset_timesteps
parameterskeys	__class__r7   len)	schedulerr5   r6   r7   r8   kwargsaccepts_timestepsaccept_sigmasr2   r2   r3   retrieve_timestepsz   s2   rF   c                6       s$  e Zd ZdZdZg dZg dZ		d]dedede	d	e
eB d
edeeB eB eB eB eB dedef fddZ					d^dejdB dejdB dedB dedB fddZd_ddZdd Zd`dedejfddZd`defdd Zd!d" Z								dad#d$Z d%d& Z!		'	'	(	)							*dbd+ejdB d,ed-ed.ed/ed0edB d1ej"dB d2ej#dB d3ej$e%ej$ B dB d4ejdB ded5e&dejfd6d7Z'e(d8d9 Z)e(d:d; Z*e(d<d= Z+e(d>d? Z,e(d@dA Z-e(dBdC Z.e/ dddddDd*dddEdFdd)dGdddddddHdIdddd4gdfd+e%e%e0  dJe1e%e1 B dB d,edB d-edB dKedLe&dMe%e dB dNe%e dB dOedPedQe1e%e1 B dB dRedB dSed3ej$e%ej$ B dB d4ejdB dejdB dejdB dTe0dB dUe%ej dB dVe1dB dWe&dXe2e1e3f dB dedB dYe4eegdf dB dZe%e1 def4d[d\Z5  Z6S )cAnimateDiffVideoToVideoPipelineaw  
    Pipeline for video-to-video generation.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
    implemented for all pipelines (downloading, saving, running on a particular device, etc.).

    The pipeline also inherits the following loading methods:
        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters

    Args:
        vae ([`AutoencoderKL`]):
            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
        text_encoder ([`CLIPTextModel`]):
            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
        tokenizer (`CLIPTokenizer`):
            A [`~transformers.CLIPTokenizer`] to tokenize text.
        unet ([`UNet2DConditionModel`]):
            A [`UNet2DConditionModel`] used to create a UNetMotionModel to denoise the encoded video latents.
        motion_adapter ([`MotionAdapter`]):
            A [`MotionAdapter`] to be used in combination with `unet` to denoise the encoded video latents.
        scheduler ([`SchedulerMixin`]):
            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
    z&text_encoder->image_encoder->unet->vae)feature_extractorimage_encodermotion_adapter)r.   prompt_embedsnegative_prompt_embedsNvaetext_encoder	tokenizerunetrJ   rB   rH   rI   c	           	   
      st   t    t|trt||}| j||||||||d t| dd r.dt| j	j
jd  nd| _t| jd| _d S )N)rM   rN   rO   rP   rJ   rB   rH   rI   rM   r!   r&      )vae_scale_factor)super__init__
isinstancer   r   from_unet2dregister_modulesgetattrrA   rM   configblock_out_channelsrR   r    video_processor)	selfrM   rN   rO   rP   rJ   rB   rH   rI   r@   r2   r3   rT      s   

(
z(AnimateDiffVideoToVideoPipeline.__init__rK   rL   
lora_scale	clip_skipc
              
   C   s  |durt | tr|| _tst| j| nt| j| |dur)t |ttfr)d}
n|dur7t |t	r7t
|}
n|jd }
|du rt | trL| || j}| j|d| jjddd}|j}| j|ddd	j}|jd
 |jd
 krt||s| j|dd| jjd d
f }td| jj d|  t| jjdr| jjjr|j|}nd}|	du r| j|||d}|d }n| j|||dd}|d
 |	d   }| jj|}| jdur| jj}n| jdur| jj}n|j}|j||d}|j\}}}|d|d}| || |d
}|r|du r|du rdg|
 }nC|dur:t!|t!|ur:t"dt!| dt!| dt |trD|g}n|
t
|kr^t#d| dt
| d| d|
 d	|}t | trm| || j}|jd }| j|d|ddd}t| jjdr| jjjr|j|}nd}| j|j||d}|d }|r|jd }|j||d}|d|d}| |
| |d
}| jdurt | trtrt$| j| ||fS )a  
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `list[str]`, *optional*):
                prompt to be encoded
            device: (`torch.device`):
                torch device
            num_images_per_prompt (`int`):
                number of images that should be generated per prompt
            do_classifier_free_guidance (`bool`):
                whether to use classifier free guidance or not
            negative_prompt (`str` or `list[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            lora_scale (`float`, *optional*):
                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
            clip_skip (`int`, *optional*):
                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                the output of the pre-final layer will be used for computing the prompt embeddings.
        Nr&   r   
max_lengthTpt)paddingr`   
truncationreturn_tensorslongest)rb   rd   z\The following part of your input was truncated because CLIP can only handle sequences up to z	 tokens: use_attention_mask)attention_mask)rh   output_hidden_states)dtyper6    z?`negative_prompt` should be the same type to `prompt`, but got z != .z`negative_prompt`: z has batch size z, but `prompt`: zT. Please make sure that passed `negative_prompt` matches the batch size of `prompt`.)%rU   r   _lora_scaler   r   rN   r   strdictlistrA   shaper   maybe_convert_promptrO   model_max_length	input_idstorchequalbatch_decodeloggerwarningr/   rY   rg   rh   to
text_modelfinal_layer_normrj   rP   repeatviewtype	TypeErrorr9   r   )r\   promptr6   num_images_per_promptdo_classifier_free_guidancenegative_promptrK   rL   r^   r_   
batch_sizetext_inputstext_input_idsuntruncated_idsremoved_textrh   prompt_embeds_dtypebs_embedseq_len_uncond_tokensr`   uncond_inputr2   r2   r3   encode_prompt   s   +











z-AnimateDiffVideoToVideoPipeline.encode_promptc           
      C   s   t | j j}t|tjs| j|ddj}|j	||d}|rH| j|ddj
d }|j|dd}| jt|ddj
d }|j|dd}||fS | |j}|j|dd}t|}	||	fS )	Nra   )rd   r6   rj   T)ri   r   dim)nextrI   r>   rj   rU   ru   TensorrH   pixel_valuesrz   hidden_statesrepeat_interleave
zeros_likeimage_embeds)
r\   imager6   r   ri   rj   image_enc_hidden_statesuncond_image_enc_hidden_statesr   uncond_image_embedsr2   r2   r3   encode_image  s(   

z,AnimateDiffVideoToVideoPipeline.encode_imagec                 C   sl  g }|rg }|d u ret |ts|g}t|t| jjjkr/tdt| dt| jjj dt|| jjjD ],\}}	t |	t }
| 	||d|
\}}|
|d d d f  |rc|
|d d d f  q7n|D ]}|rw|d\}}|
| |
| qgg }t|D ]0\}}tj|g| dd}|rtj|| g| dd}tj||gdd}|j|d}|
| q|S )	NzK`ip_adapter_image` must have same length as the number of IP Adapters. Got z images and z IP Adapters.r&   r!   r   r   )r6   )rU   rp   rA   rP   encoder_hid_projimage_projection_layersr9   zipr   r   appendchunk	enumerateru   catrz   )r\   ip_adapter_imageip_adapter_image_embedsr6   r   r   r   negative_image_embedssingle_ip_adapter_imageimage_proj_layeroutput_hidden_statesingle_image_embedssingle_negative_image_embedsir2   r2   r3   prepare_ip_adapter_image_embeds  sH   


z?AnimateDiffVideoToVideoPipeline.prepare_ip_adapter_image_embeds   decode_chunk_sizereturnc                 C   sR   g }t dt||D ]}||||  }t| j||d}|| q
t|S )Nr   )r*   )rangerA   r4   rM   encoder   ru   r   )r\   videor*   r   r.   r   batch_videor2   r2   r3   encode_video  s   
z,AnimateDiffVideoToVideoPipeline.encode_videoc                 C   s   d| j jj | }|j\}}}}}|ddddd|| |||}g }td|jd |D ]}	||	|	|  }
| j |
j}
|	|
 q-t
|}|d d d f ||df|jdd   ddddd}| }|S )Nr&   r   r!   r      rf   )rM   rY   scaling_factorrq   permutereshaper   decoder(   r   ru   r   float)r\   r.   r   r   channels
num_framesheightwidthr   r   batch_latentsr2   r2   r3   decode_latents  s   "
8z.AnimateDiffVideoToVideoPipeline.decode_latentsc                 C   sX   dt t| jjj v }i }|r||d< dt t| jjj v }|r*||d< |S )Netar*   )r:   r;   r<   rB   stepr>   r?   )r\   r*   r   accepts_etaextra_step_kwargsaccepts_generatorr2   r2   r3   prepare_extra_step_kwargs  s   z9AnimateDiffVideoToVideoPipeline.prepare_extra_step_kwargsc                    s  |dk s|dkrt d| |d dks|d dkr&t d| d| d|d urGt fdd	|D sGt d
 j d fdd|D  |d urZ|d urZt d| d| d|d u rf|d u rft d|d ur{t|tttfs{t dt| |d ur|	d urt d| d|	 d|d ur|	d ur|j|	jkrt d|j d|	j d|d ur|d urt d|
d ur|d urt d|d urt|tst dt| |d j	dvrt d|d j	 dd S d S )Nr   r&   z2The value of strength should in [0.0, 1.0] but is rQ   z7`height` and `width` have to be divisible by 8 but are z and rl   c                 3   s    | ]}| j v V  qd S N_callback_tensor_inputs.0kr\   r2   r3   	<genexpr>?  s    

z?AnimateDiffVideoToVideoPipeline.check_inputs.<locals>.<genexpr>z2`callback_on_step_end_tensor_inputs` has to be in z, but found c                    s   g | ]	}| j vr|qS r2   r   r   r   r2   r3   
<listcomp>C  s    z@AnimateDiffVideoToVideoPipeline.check_inputs.<locals>.<listcomp>zCannot forward both `prompt`: z and `prompt_embeds`: z2. Please make sure to only forward one of the two.zeProvide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined.z:`prompt` has to be of type `str`, `list` or `dict` but is z'Cannot forward both `negative_prompt`: z and `negative_prompt_embeds`: zu`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but got: `prompt_embeds` z != `negative_prompt_embeds` z3Only one of `video` or `latents` should be providedzProvide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined.z:`ip_adapter_image_embeds` has to be of type `list` but is )r   r   zF`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is D)
r9   allr   rU   rn   rp   ro   r   rq   ndim)r\   r   strengthr   r   r   r.   r   rK   rL   r   r   "callback_on_step_end_tensor_inputsr2   r   r3   check_inputs*  sh   
z,AnimateDiffVideoToVideoPipeline.check_inputsc                 C   s@   t t|| |}t|| d}||| jj d  }||| fS )Nr   )minintmaxrB   order)r\   r5   r7   r   r6   init_timestept_startr2   r2   r3   get_timestepsr  s   z-AnimateDiffVideoToVideoPipeline.get_timesteps@   r   r&   Fr   r   r   num_channels_latentsr   timesteprj   r6   r*   r.   	add_noisec                    s"  |
d u r	j d n|
j d }||||j |j f}ttr3t|kr3tdt d| d|
d u rjjjrH	 jj
tjd ttr\ fddt|D }n fd	dD }tj|d
d}jjjryj
| |
|}jjj| }||j d
 kr||j d
  d
krd| d|j d
  d}t|||j d
 kr||j d
  d
krtd|j d
  d| dtj|gd
d}t|j ||d}j|||d
dddd}
|
S ||
j krtd|d|
j |
j
||d}
|rt|||d}j|
||}
|
S )Nr&   r!   z/You have passed a list of generators of length z+, but requested an effective batch size of z@. Make sure the batch size matches the length of the generators.)rj   c                    s(   g | ]} | |  d qS r   r   	unsqueeze)r   r   r   r*   r\   r   r2   r3   r     s    zCAnimateDiffVideoToVideoPipeline.prepare_latents.<locals>.<listcomp>c                    s    g | ]} | d qS r   r   )r   vid)r   r*   r\   r2   r3   r     s     r   r   zYou have passed z# text prompts (`prompt`), but only zp initial images (`image`). Please make sure to update your script to pass as many initial images as text promptsz'Cannot duplicate `image` of batch size z to z text prompts.)r*   r6   rj   r   r   z!`latents` expected to have shape=z, but found latents.shape=)rq   rR   rU   rp   rA   r9   rM   rY   force_upcastr   rz   ru   float32r   r   r   r   rB   r   r   )r\   r   r   r   r   r   r   rj   r6   r*   r.   r   r   r   rq   init_latentserror_messagenoiser2   r   r3   prepare_latents{  s\   



  
z/AnimateDiffVideoToVideoPipeline.prepare_latentsc                 C      | j S r   _guidance_scaler   r2   r2   r3   guidance_scale     z.AnimateDiffVideoToVideoPipeline.guidance_scalec                 C   r   r   )
_clip_skipr   r2   r2   r3   r_     r   z)AnimateDiffVideoToVideoPipeline.clip_skipc                 C   s
   | j dkS )Nr&   r   r   r2   r2   r3   r     s   
z;AnimateDiffVideoToVideoPipeline.do_classifier_free_guidancec                 C   r   r   )_cross_attention_kwargsr   r2   r2   r3   cross_attention_kwargs  r   z6AnimateDiffVideoToVideoPipeline.cross_attention_kwargsc                 C   r   r   )_num_timestepsr   r2   r2   r3   num_timesteps  r   z-AnimateDiffVideoToVideoPipeline.num_timestepsc                 C   r   r   )
_interruptr   r2   r2   r3   	interrupt  r   z)AnimateDiffVideoToVideoPipeline.interrupt2   g      @g?g        pilTr   r5   enforce_inference_stepsr7   r8   r   r   r   num_videos_per_promptr   r   r   output_typereturn_dictr   callback_on_step_endr   c           5      C   s  |p	| j jj| j }|p| j jj| j }d}| j||
||||||||||d |	| _|| _|| _d| _|durAt	|t
tfrAd}n|durOt	|trOt|}n|jd }| j}| j}tr_d}n|}|st| j||||\}}| |||
|\}}|dd || }n#t||
 } t| j| |||\}} || d }|dd || }|du r| jj|||d}|dddd	d
}|j||d}| j jj}!| j||||!|| |||||||d}| jdur| jddnd}"|jd }#| jr| j ||#||| j!||||"| j"d
\}}n$| j#|||| j!||||"| j"d	\}}| j!r$t$%||g}|j&|#dd}|dus5|durA| '||||| | j!}$| (||}%|dusQ|durUd|$ind}&| j)r^| j*nd}'t+|'D ]}(| j)r| ,||(|||j|\}}t|}| |||
|\}}t|| _-t||| jj.  })| j/| j-d}*t0|D ]\}+},| j1rq| j!rt$%|gd n|}-| j2|-|,}-| j |-|,|| j|&dj3}.| j!r|.4d\}/}0|/|	|0|/   }.| jj5|.|,|fi |%j6}|duri }1|D ]
}2t7 |2 |1|2< q|| |+|,|1}3|38d|}|38d|}|38d|}|+t|d ks3|+d |)kr7|+d | jj. dkr7|*9  tr>t:;  qW d   n	1 sKw   Y  qd|dkrZ|}n| <||}4| jj=|4|d}| >  |sr|fS t?|dS )u  
        The call function to the pipeline for generation.

        Args:
            video (`list[PipelineImageInput]`):
                The input video to condition the generation on. Must be a list of images/frames of the video.
            prompt (`str` or `list[str]`, *optional*):
                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                The height in pixels of the generated video.
            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                The width in pixels of the generated video.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality videos at the
                expense of slower inference.
            timesteps (`list[int]`, *optional*):
                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
                passed will be used. Must be in descending order.
            sigmas (`list[float]`, *optional*):
                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
                will be used.
            strength (`float`, *optional*, defaults to 0.8):
                Higher strength leads to more differences between original video and generated video.
            guidance_scale (`float`, *optional*, defaults to 7.5):
                A higher guidance scale value encourages the model to generate images closely linked to the text
                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
            negative_prompt (`str` or `list[str]`, *optional*):
                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) from the [DDIM](https://huggingface.co/papers/2010.02502) paper. Only
                applies to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
            generator (`torch.Generator` or `list[torch.Generator]`, *optional*):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                generation deterministic.
            latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor is generated by sampling using the supplied random `generator`. Latents should be of shape
                `(batch_size, num_channel, num_frames, height, width)`.
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                provided, text embeddings are generated from the `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
            ip_adapter_image: (`PipelineImageInput`, *optional*):
                Optional image input to work with IP Adapters.
            ip_adapter_image_embeds (`list[torch.Tensor]`, *optional*):
                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
                provided, embeddings are computed from the `ip_adapter_image` input argument.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated video. Choose between `torch.Tensor`, `PIL.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`AnimateDiffPipelineOutput`] instead of a plain tuple.
            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            clip_skip (`int`, *optional*):
                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                the output of the pre-final layer will be used for computing the prompt embeddings.
            callback_on_step_end (`Callable`, *optional*):
                A function that calls at the end of each denoising steps during the inference. The function is called
                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
                `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`list`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                `._callback_tensor_inputs` attribute of your pipeline class.
            decode_chunk_size (`int`, defaults to `16`):
                The number of frames to decode at a time when calling `decode_latents` method.

        Examples:

        Returns:
            [`pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] or `tuple`:
                If `return_dict` is `True`, [`pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] is
                returned, otherwise a `tuple` is returned where the first element is a list with the generated frames.
        r&   )r   r   r   r   r   rK   rL   r   r.   r   r   r   FNr   cpu)r   r   r!   r   r   r   )r   r   r   r   r   r   rj   r6   r*   r.   r   r   scale)
r   r   r6   r   r   r   rK   rL   r^   r_   )rK   rL   r^   r_   )repeatsr   r   )total)encoder_hidden_statesr   added_cond_kwargsr.   rK   rL   latent)r   r  )frames)@rP   rY   sample_sizerR   r   r   r   r   r   rU   rn   ro   rp   rA   rq   _execution_devicerj   XLA_AVAILABLErF   rB   r   r}   r   r[   preprocess_videor   rz   in_channelsr   r   getfree_noise_enabled_encode_prompt_free_noiser   r_   r   ru   r   r   r   r   free_init_enabled_free_init_num_itersr   _apply_free_initr   r   progress_barr   r   scale_model_inputr(   r   r   prev_samplelocalspopupdatexm	mark_stepr   postprocess_videomaybe_free_model_hooksr'   )5r\   r   r   r   r   r5   r   r7   r8   r   r   r   r   r   r*   r.   rK   rL   r   r   r  r  r   r_   r  r   r   r   r6   rj   timestep_devicelatent_timestepdenoising_inference_stepsr   text_encoder_lora_scaler   r   r   r	  num_free_init_itersfree_init_iternum_warmup_stepsr  r   tlatent_model_input
noise_prednoise_pred_uncondnoise_pred_textcallback_kwargsr   callback_outputsvideo_tensorr2   r2   r3   __call__  s<  t




	
	
6
,
z(AnimateDiffVideoToVideoPipeline.__call__)NN)NNNNNr   )r   )NNNNNNNN)Nr   r   r   r&   NNNNNr   F)7__name__
__module____qualname____doc__model_cpu_offload_seq_optional_componentsr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rT   ru   r   r   r   r   r   r   r   r   r   r   r   rj   r6   	Generatorrp   boolr   propertyr   r_   r   r   r   r   no_gradr	   rn   ro   r   r   r0  __classcell__r2   r2   r]   r3   rG      s   
'	

 
8-	
H	

R








	
rG   )Nr(   )NNNN)Gr;   typingr   r   ru   transformersr   r   r   r   image_processorr	   loadersr
   r   r   r   modelsr   r   r   r   models.lorar   models.unets.unet_motion_modelr   
schedulersr   r   r   r   r   r   utilsr   r   r   r   r   utils.torch_utilsr   r[   r    free_init_utilsr"   free_noise_utilsr#   pipeline_utilsr$   r%   pipeline_outputr'   torch_xla.core.xla_modelcore	xla_modelr  r  
get_loggerr1  rx   EXAMPLE_DOC_STRINGr   r7  rn   r4   r   r6   rp   r   rF   rG   r2   r2   r2   r3   <module>   sl    
8




;