o
    Gi                    @   s  d dl Z d dlmZmZ d dlZd dlm  mZ d dl	m
Z
mZmZmZ ddlmZ ddlmZmZmZmZ ddlmZmZmZmZmZmZ ddlmZ dd	lmZ dd
l m!Z!m"Z"m#Z#m$Z$m%Z%m&Z& ddl'm(Z(m)Z)m*Z*m+Z+m,Z, ddl-m.Z.m/Z/ ddl0m1Z1 ddl2m3Z3 ddl4m5Z5 ddl6m7Z7m8Z8 ddl9m:Z: e) rd dl;m<  m=Z> dZ?ndZ?e*@eAZBdZC	d%dejDdejEdB deFfddZG				d&deHdB deFejIB dB deJeH dB d eJeK dB fd!d"ZLG d#d$ d$e7e8eeee3e5e
ZMdS )'    N)AnyCallable)CLIPImageProcessorCLIPTextModelCLIPTokenizerCLIPVisionModelWithProjection   )PipelineImageInput)FromSingleFileMixinIPAdapterMixinStableDiffusionLoraLoaderMixinTextualInversionLoaderMixin)AutoencoderKLControlNetModelImageProjectionMultiControlNetModelUNet2DConditionModelUNetMotionModel)adjust_lora_scale_text_encoder)MotionAdapter)DDIMSchedulerDPMSolverMultistepSchedulerEulerAncestralDiscreteSchedulerEulerDiscreteSchedulerLMSDiscreteSchedulerPNDMScheduler)USE_PEFT_BACKENDis_torch_xla_availableloggingscale_lora_layersunscale_lora_layers)is_compiled_modulerandn_tensor)VideoProcessor   )FreeInitMixin)AnimateDiffFreeNoiseMixin)DiffusionPipelineStableDiffusionMixin   )AnimateDiffPipelineOutputTFak
  
    Examples:
        ```py
        >>> import torch
        >>> from PIL import Image
        >>> from tqdm.auto import tqdm

        >>> from diffusers import AnimateDiffVideoToVideoControlNetPipeline
        >>> from diffusers.utils import export_to_gif, load_video
        >>> from diffusers import AutoencoderKL, ControlNetModel, MotionAdapter, LCMScheduler

        >>> controlnet = ControlNetModel.from_pretrained(
        ...     "lllyasviel/sd-controlnet-openpose", torch_dtype=torch.float16
        ... )
        >>> motion_adapter = MotionAdapter.from_pretrained("wangfuyun/AnimateLCM")
        >>> vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=torch.float16)

        >>> pipe = AnimateDiffVideoToVideoControlNetPipeline.from_pretrained(
        ...     "SG161222/Realistic_Vision_V5.1_noVAE",
        ...     motion_adapter=motion_adapter,
        ...     controlnet=controlnet,
        ...     vae=vae,
        ... ).to(device="cuda", dtype=torch.float16)

        >>> pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config, beta_schedule="linear")
        >>> pipe.load_lora_weights(
        ...     "wangfuyun/AnimateLCM", weight_name="AnimateLCM_sd15_t2v_lora.safetensors", adapter_name="lcm-lora"
        ... )
        >>> pipe.set_adapters(["lcm-lora"], [0.8])

        >>> video = load_video(
        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/dance.gif"
        ... )
        >>> video = [frame.convert("RGB") for frame in video]

        >>> from controlnet_aux.processor import OpenposeDetector

        >>> open_pose = OpenposeDetector.from_pretrained("lllyasviel/Annotators").to("cuda")
        >>> for frame in tqdm(video):
        ...     conditioning_frames.append(open_pose(frame))

        >>> prompt = "astronaut in space, dancing"
        >>> negative_prompt = "bad quality, worst quality, jpeg artifacts, ugly"

        >>> strength = 0.8
        >>> with torch.inference_mode():
        ...     video = pipe(
        ...         video=video,
        ...         prompt=prompt,
        ...         negative_prompt=negative_prompt,
        ...         num_inference_steps=10,
        ...         guidance_scale=2.0,
        ...         controlnet_conditioning_scale=0.75,
        ...         conditioning_frames=conditioning_frames,
        ...         strength=strength,
        ...         generator=torch.Generator().manual_seed(42),
        ...     ).frames[0]

        >>> video = [frame.resize(conditioning_frames[0].size) for frame in video]
        >>> export_to_gif(video, f"animatediff_vid2vid_controlnet.gif", fps=8)
        ```
sampleencoder_output	generatorsample_modec                 C   sR   t | dr|dkr| j|S t | dr|dkr| j S t | dr%| jS td)Nlatent_distr+   argmaxlatentsz3Could not access latents of provided encoder_output)hasattrr/   r+   moder1   AttributeError)r,   r-   r.    r5   /home/ubuntu/.local/lib/python3.10/site-packages/diffusers/pipelines/animatediff/pipeline_animatediff_video2video_controlnet.pyretrieve_latents~   s   

r7   num_inference_stepsdevice	timestepssigmasc                 K   s  |dur|durt d|dur>dtt| jj v }|s(t d| j d| jd||d| | j}t	|}||fS |durpdtt| jj v }|sZt d| j d| jd||d	| | j}t	|}||fS | j|fd
|i| | j}||fS )a  
    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler to get timesteps from.
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
            must be `None`.
        device (`str` or `torch.device`, *optional*):
            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
        timesteps (`list[int]`, *optional*):
            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
            `num_inference_steps` and `sigmas` must be `None`.
        sigmas (`list[float]`, *optional*):
            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
            `num_inference_steps` and `timesteps` must be `None`.

    Returns:
        `tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
        second element is the number of inference steps.
    NzYOnly one of `timesteps` or `sigmas` can be passed. Please choose one to set custom valuesr:   zThe current scheduler class zx's `set_timesteps` does not support custom timestep schedules. Please check whether you are using the correct scheduler.)r:   r9   r;   zv's `set_timesteps` does not support custom sigmas schedules. Please check whether you are using the correct scheduler.)r;   r9   r9   r5   )

ValueErrorsetinspect	signatureset_timesteps
parameterskeys	__class__r:   len)	schedulerr8   r9   r:   r;   kwargsaccepts_timestepsaccept_sigmasr5   r5   r6   retrieve_timesteps   s2   rI   c                @       s  e Zd ZdZdZg dZg dZ		dfdedede	d	e
eB d
edeee B ee B eB deeB eB eB eB eB dedef fddZ					dgdejdB dejdB dedB dedB fddZdhddZdd Z didedejfddZ!didefd d!Z"d"d# Z#										$	%	$djd&d'Z$d(d) Z%		*	*	+	,							-dkd.ejdB d/ed0ed1ed2ed3edB d4ej&dB d5ej'dB d6ej(eej( B dB d7ejdB ded8e)dejfd9d:Z*	-	-dld;d<Z+e,d=d> Z-e,d?d@ Z.e,dAdB Z/e,dCdD Z0e,dEdF Z1e,dGdH Z2e3 dddddId-dddJdKdd,d%ddddddddLdMdd$d-d%d$ddd7gdfd.eee4  dNe5ee5 B dB d/edB d0edB dOedPe)dQee dB dRee dB dSedTedUe5ee5 B dB dVedB dWed6ej(eej( B dB d7ejdB dejdB dejdB dXe4dB dYeej dB dZee4 dB d[e5dB d\e)d]e6e5e7f dB d^eee B d_e)d`eee B daeee B dedB dbe8eegdf dB dcee5 def>dddeZ9  Z:S )m)AnimateDiffVideoToVideoControlNetPipelinea  
    Pipeline for video-to-video generation with ControlNet guidance.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
    implemented for all pipelines (downloading, saving, running on a particular device, etc.).

    The pipeline also inherits the following loading methods:
        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters

    Args:
        vae ([`AutoencoderKL`]):
            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
        text_encoder ([`CLIPTextModel`]):
            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
        tokenizer (`CLIPTokenizer`):
            A [`~transformers.CLIPTokenizer`] to tokenize text.
        unet ([`UNet2DConditionModel`]):
            A [`UNet2DConditionModel`] used to create a UNetMotionModel to denoise the encoded video latents.
        motion_adapter ([`MotionAdapter`]):
            A [`MotionAdapter`] to be used in combination with `unet` to denoise the encoded video latents.
        controlnet ([`ControlNetModel`] or `list[ControlNetModel]` or `tuple[ControlNetModel]` or `MultiControlNetModel`):
            Provides additional conditioning to the `unet` during the denoising process. If you set multiple
            ControlNets as a list, the outputs from each ControlNet are added together to create one combined
            additional conditioning.
        scheduler ([`SchedulerMixin`]):
            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
    z&text_encoder->image_encoder->unet->vae)feature_extractorimage_encodermotion_adapter)r1   prompt_embedsnegative_prompt_embedsNvaetext_encoder	tokenizerunetrM   
controlnetrE   rK   rL   c
           
         s   t    t|trt||}t|ttfrt|}| j	|||||||||	d	 t
| dd r:dt| jjjd  nd| _t| jd| _t| jddd	| _d S )
N)	rP   rQ   rR   rS   rM   rT   rE   rK   rL   rP   r$   r)      )vae_scale_factorTF)rV   do_convert_rgbdo_normalize)super__init__
isinstancer   r   from_unet2dlisttupler   register_modulesgetattrrD   rP   configblock_out_channelsrV   r#   video_processorcontrol_video_processor)
selfrP   rQ   rR   rS   rM   rT   rE   rK   rL   rC   r5   r6   rZ      s*   

(z2AnimateDiffVideoToVideoControlNetPipeline.__init__rN   rO   
lora_scale	clip_skipc
              
   C   s  |durt | tr|| _tst| j| nt| j| |dur)t |ttfr)d}
n|dur7t |t	r7t
|}
n|jd }
|du rt | trL| || j}| j|d| jjddd}|j}| j|ddd	j}|jd
 |jd
 krt||s| j|dd| jjd d
f }td| jj d|  t| jjdr| jjjr|j|}nd}|	du r| j|||d}|d }n| j|||dd}|d
 |	d   }| jj|}| jdur| jj}n| jdur| jj}n|j}|j||d}|j\}}}|d|d}| || |d
}|r|du r|du rdg|
 }nC|dur:t!|t!|ur:t"dt!| dt!| dt |trD|g}n|
t
|kr^t#d| dt
| d| d|
 d	|}t | trm| || j}|jd }| j|d|ddd}t| jjdr| jjjr|j|}nd}| j|j||d}|d }|r|jd }|j||d}|d|d}| |
| |d
}| jdurt | trtrt$| j| ||fS )a  
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `list[str]`, *optional*):
                prompt to be encoded
            device: (`torch.device`):
                torch device
            num_images_per_prompt (`int`):
                number of images that should be generated per prompt
            do_classifier_free_guidance (`bool`):
                whether to use classifier free guidance or not
            negative_prompt (`str` or `list[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            lora_scale (`float`, *optional*):
                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
            clip_skip (`int`, *optional*):
                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                the output of the pre-final layer will be used for computing the prompt embeddings.
        Nr)   r   
max_lengthTpt)paddingri   
truncationreturn_tensorslongest)rk   rm   z\The following part of your input was truncated because CLIP can only handle sequences up to z	 tokens: use_attention_mask)attention_mask)rq   output_hidden_states)dtyper9    z?`negative_prompt` should be the same type to `prompt`, but got z != .z`negative_prompt`: z has batch size z, but `prompt`: zT. Please make sure that passed `negative_prompt` matches the batch size of `prompt`.)%r[   r   _lora_scaler   r   rQ   r   strdictr]   rD   shaper   maybe_convert_promptrR   model_max_length	input_idstorchequalbatch_decodeloggerwarningr2   ra   rp   rq   to
text_modelfinal_layer_normrs   rS   repeatviewtype	TypeErrorr<   r    )re   promptr9   num_images_per_promptdo_classifier_free_guidancenegative_promptrN   rO   rg   rh   
batch_sizetext_inputstext_input_idsuntruncated_idsremoved_textrq   prompt_embeds_dtypebs_embedseq_len_uncond_tokensri   uncond_inputr5   r5   r6   encode_prompt  s   +











z7AnimateDiffVideoToVideoControlNetPipeline.encode_promptc           
      C   s   t | j j}t|tjs| j|ddj}|j	||d}|rH| j|ddj
d }|j|dd}| jt|ddj
d }|j|dd}||fS | |j}|j|dd}t|}	||	fS )	Nrj   )rm   r9   rs   T)rr   r   dim)nextrL   rA   rs   r[   r}   TensorrK   pixel_valuesr   hidden_statesrepeat_interleave
zeros_likeimage_embeds)
re   imager9   r   rr   rs   image_enc_hidden_statesuncond_image_enc_hidden_statesr   uncond_image_embedsr5   r5   r6   encode_image  s(   

z6AnimateDiffVideoToVideoControlNetPipeline.encode_imagec                 C   sl  g }|rg }|d u ret |ts|g}t|t| jjjkr/tdt| dt| jjj dt|| jjjD ],\}}	t |	t }
| 	||d|
\}}|
|d d d f  |rc|
|d d d f  q7n|D ]}|rw|d\}}|
| |
| qgg }t|D ]0\}}tj|g| dd}|rtj|| g| dd}tj||gdd}|j|d}|
| q|S )	NzK`ip_adapter_image` must have same length as the number of IP Adapters. Got z images and z IP Adapters.r)   r$   r   r   )r9   )r[   r]   rD   rS   encoder_hid_projimage_projection_layersr<   zipr   r   appendchunk	enumerater}   catr   )re   ip_adapter_imageip_adapter_image_embedsr9   r   r   r   negative_image_embedssingle_ip_adapter_imageimage_proj_layeroutput_hidden_statesingle_image_embedssingle_negative_image_embedsir5   r5   r6   prepare_ip_adapter_image_embeds  sH   


zIAnimateDiffVideoToVideoControlNetPipeline.prepare_ip_adapter_image_embeds   decode_chunk_sizereturnc                 C   sR   g }t dt||D ]}||||  }t| j||d}|| q
t|S )Nr   )r-   )rangerD   r7   rP   encoder   r}   r   )re   videor-   r   r1   r   batch_videor5   r5   r6   encode_video  s   
z6AnimateDiffVideoToVideoControlNetPipeline.encode_videoc                 C   s   d| j jj | }|j\}}}}}|ddddd|| |||}g }td|jd |D ]}	||	|	|  }
| j |
j}
|	|
 q-t
|}|d d d f ||df|jdd   ddddd}| }|S )Nr)   r   r$   r      ro   )rP   ra   scaling_factorry   permutereshaper   decoder+   r   r}   r   float)re   r1   r   r   channels
num_framesheightwidthr   r   batch_latentsr5   r5   r6   decode_latents&  s   "
8z8AnimateDiffVideoToVideoControlNetPipeline.decode_latentsc                 C   sX   dt t| jjj v }i }|r||d< dt t| jjj v }|r*||d< |S )Netar-   )r=   r>   r?   rE   steprA   rB   )re   r-   r   accepts_etaextra_step_kwargsaccepts_generatorr5   r5   r6   prepare_extra_step_kwargs9  s   zCAnimateDiffVideoToVideoControlNetPipeline.prepare_extra_step_kwargs      ?        c              
      s*  |dk s|dkrt d| |d dks|d dkr&t d| d| d|d urGtfdd	|D sGt d
j dfdd|D  |d urZ|	d urZt d| d|	 d|d u rf|	d u rft d|d ur{t|tttfs{t dt| |d ur|
d urt d| d|
 d|	d ur|
d ur|	j|
jkrt d|	j d|
j d|d ur|d urt d|d ur|d urt d|d urt|tst dt| |d j	dvrt d|d j	 dtj
trt|trtdtj
j dt| d ttd otj
tjjj}|d u rt|n|jd! }tj
ts3|rVtj
jtrVt tsBtd"t  t |krUt d#| d$t nQtj
tsh|rtj
jtrt trvt d tstd%t t d |krt d&| d$t t fd'd	 D rt d(nJ tj
ts|rtj
jtrt|tstd*n?tj
ts|rtj
jtrt|trtd+d	 |D rt d,nt|tr t|tj
jkr t d-nJ t|ttfs|g}t|ttfs|g}t|t|kr1t d.t| d/t| d0tj
tr^t|tj
jkr^t d1| d2t| d3tj
j d4tj
j d	t||D ]/\}}||krwt d5| d6| d|d7k rt d5| d8|d9krt d:| d;qcd S )<Nr   r)   z2The value of strength should in [0.0, 1.0] but is rU   z7`height` and `width` have to be divisible by 8 but are z and ru   c                 3   s    | ]}| j v V  qd S N_callback_tensor_inputs.0kre   r5   r6   	<genexpr>c  s    

zIAnimateDiffVideoToVideoControlNetPipeline.check_inputs.<locals>.<genexpr>z2`callback_on_step_end_tensor_inputs` has to be in z, but found c                    s   g | ]	}| j vr|qS r5   r   r   r   r5   r6   
<listcomp>g  s    zJAnimateDiffVideoToVideoControlNetPipeline.check_inputs.<locals>.<listcomp>zCannot forward both `prompt`: z and `prompt_embeds`: z2. Please make sure to only forward one of the two.zeProvide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined.z:`prompt` has to be of type `str`, `list` or `dict` but is z'Cannot forward both `negative_prompt`: z and `negative_prompt_embeds`: zu`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but got: `prompt_embeds` z != `negative_prompt_embeds` z3Only one of `video` or `latents` should be providedzProvide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined.z:`ip_adapter_image_embeds` has to be of type `list` but is )r   r   zF`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is Dz	You have z! ControlNets and you have passed z= prompts. The conditionings will be fixed across the prompts.scaled_dot_product_attentionr$   z>For single controlnet, `image` must be of type `list` but got zExcepted image to have length z" but got len(conditioning_frames)=z_For multiple controlnets: `image` must be type list of lists but got type(conditioning_frames)=z$Expected length of image sublist as c                 3   s$    | ]}t |t  d  kV  qdS )r   N)rD   )r   img)conditioning_framesr5   r6   r     s   " zDAll conditioning frame batches for multicontrolnet must be same sizeFzLFor single controlnet: `controlnet_conditioning_scale` must be type `float`.c                 s   s    | ]}t |tV  qd S r   )r[   r]   r   r   r5   r5   r6   r     s    zEA single batch of multiple conditionings are supported at the moment.zFor multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have the same length as the number of controlnetsz`control_guidance_start` has z* elements, but `control_guidance_end` has zI elements. Make sure to provide the same number of elements to each list.z`control_guidance_start`: z has z elements but there are z- controlnets available. Make sure to provide zcontrol guidance start: z4 cannot be larger or equal to control guidance end: r   z can't be smaller than 0.r   zcontrol guidance end: z can't be larger than 1.0.)r<   allr   r[   rw   r]   rx   r   ry   ndimrT   r   r   r   rD   netsr2   Fr}   _dynamo
eval_frameOptimizedModuler   	_orig_modr   anyr   r^   r   )re   r   strengthr   r   r   r   r1   r   rN   rO   r   r   "callback_on_step_end_tensor_inputscontrolnet_conditioning_scalecontrol_guidance_startcontrol_guidance_endis_compiledr   startendr5   )r   re   r6   check_inputsJ  s  




0


z6AnimateDiffVideoToVideoControlNetPipeline.check_inputsc                 C   s@   t t|| |}t|| d}||| jj d  }||| fS )Nr   )minintmaxrE   order)re   r8   r:   r   r9   init_timestept_startr5   r5   r6   get_timesteps  s   z7AnimateDiffVideoToVideoControlNetPipeline.get_timesteps@   r   r)   Fr   r   r   num_channels_latentsr   timesteprs   r9   r-   r1   	add_noisec                    s"  |
d u r	j d n|
j d }||||j |j f}ttr3t|kr3tdt d| d|
d u rjjjrH	 jj
tjd ttr\ fddt|D }n fd	dD }tj|d
d}jjjryj
| |
|}jjj| }||j d
 kr||j d
  d
krd| d|j d
  d}t|||j d
 kr||j d
  d
krtd|j d
  d| dtj|gd
d}t|j ||d}j|||d
dddd}
|
S ||
j krtd|d|
j |
j
||d}
|rt|||d}j|
||}
|
S )Nr)   r$   z/You have passed a list of generators of length z+, but requested an effective batch size of z@. Make sure the batch size matches the length of the generators.rs   c                    s(   g | ]} | |  d qS r   r   	unsqueezer   r   r-   re   r   r5   r6   r   $  s    zMAnimateDiffVideoToVideoControlNetPipeline.prepare_latents.<locals>.<listcomp>c                    s    g | ]} | d qS r  r  )r   vid)r   r-   re   r5   r6   r   )  s     r   r   zYou have passed z# text prompts (`prompt`), but only zp initial images (`image`). Please make sure to update your script to pass as many initial images as text promptsz'Cannot duplicate `image` of batch size z to z text prompts.)r-   r9   rs   r   r   z!`latents` expected to have shape=z, but found latents.shape=)ry   rV   r[   r]   rD   r<   rP   ra   force_upcastr   r   r}   float32r   r   r   r"   rE   r   r   )re   r   r   r   r   r   r   rs   r9   r-   r1   r   r   r   ry   init_latentserror_messagenoiser5   r  r6   prepare_latents  s\   



  
z9AnimateDiffVideoToVideoControlNetPipeline.prepare_latentsc
                 C   s   | j j|||djtjd}|ddddddd}|jd }
|
dkr'|}n|}|j|dd}|j||d	}|rC|	sCt	|gd }|S )
Nr   r   r  r   r$   r)   r   r   r   r   )
rd   preprocess_videor   r}   r  r   flattenry   r   r   )re   r   r   r   r   num_videos_per_promptr9   rs   r   
guess_modevideo_batch_size	repeat_byr5   r5   r6   prepare_conditioning_framesR  s   
zEAnimateDiffVideoToVideoControlNetPipeline.prepare_conditioning_framesc                 C      | j S r   _guidance_scaler   r5   r5   r6   guidance_scaler     z8AnimateDiffVideoToVideoControlNetPipeline.guidance_scalec                 C   r  r   )
_clip_skipr   r5   r5   r6   rh   v  r  z3AnimateDiffVideoToVideoControlNetPipeline.clip_skipc                 C   s
   | j dkS )Nr)   r  r   r5   r5   r6   r   }  s   
zEAnimateDiffVideoToVideoControlNetPipeline.do_classifier_free_guidancec                 C   r  r   )_cross_attention_kwargsr   r5   r5   r6   cross_attention_kwargs  r  z@AnimateDiffVideoToVideoControlNetPipeline.cross_attention_kwargsc                 C   r  r   )_num_timestepsr   r5   r5   r6   num_timesteps  r  z7AnimateDiffVideoToVideoControlNetPipeline.num_timestepsc                 C   r  r   )
_interruptr   r5   r5   r6   	interrupt  r  z3AnimateDiffVideoToVideoControlNetPipeline.interrupt2   g      @g?pilTr   r8   enforce_inference_stepsr:   r;   r  r   r   r  r   r   r   r   output_typereturn_dictr  r   r  r   r   callback_on_step_endr   c            G         s  t | jr	| jjn| j} t|tst|trt||g }n3t|ts0t|tr0t||g }n!t|tsQt|tsQt| trDt| jnd}!|!|g |!|g }}|pZ| jj	j
| j }|pd| jj	j
| j }d}| jd,i d|d|
d|d|d|d|d|d	|d
|d|d|d|d|d|d|d| |	| _|| _|| _d| _|durt|ttfrd}"n|durt|trt|}"n|jd }"| j}#| j}$trd}%n|#}%|st| j||%|\}| ||
|#\}dd |"| }&n#t||
 }'t| j|'|%|\}'| d dd |"| }&|du r;| jj|||d}|ddddd}|j|#|$d}| jj	j }(| j!||||(|"| |&|$|#||||d}| j"dur`| j"#ddnd})|jd }*| j$r| j%||*|#|| j&||||)| j'd
\}}n$| j(||#|| j&||||)| j'd	\}}| j&rt)*||g}|j+|*dd}|dus|dur| ,|||#|"| | j&}+t| trt|t-r|gt| j }t| t.r| j	j/n| jd j	j/},|p|,}g }-t0tD ]  fd d!t1||D }.|-2t| t.r|.d n|. qt| t.r&| j3||||"| |* ||#| j| j&|d"	}n,t| trPg }/|D ]}0| j3|0|||"| |* ||#| j| j&|d"	}1|/2|1 q0|/}nJ | 4||}2|dusb|durfd#|+ind}3| j5ro| j6nd}4t0|4D ][}5| j5r| 7||5||#|j|\}t}| ||
|#\}t| _8t|| jj9  }6| j:| j8d$}7t;D ]\ }8| j<rq| j&rt)*|gd n|}9| j=|9|8}9|r| j&r|}:| j=|:|8}:|>dd };n|9}:|};t|-  trd%d! t1||-  D }<n|}=t|=tr|=d }=|=|-   }<t)?|:dd}:|:@d&|:jd |:jd |:jd f}:| j|:|8|;||<|dd'\}>}?| j|9|8|| j"|3|>|?d(jA}@| j&r^|@>d\}A}B|A|	|B|A   }@| jjB|@|8|fi |2jC}|duri }C|D ]
}DtD |D |C|D< qt||  |8|C}E|EEd|}|EEd|}|EEd|} td ks d |6kr d | jj9 dkr|7F  trtGH  qW d   n	1 sw   Y  qu|d)kr|}n| I||}F| jjJ|F|d*}| K  |s|fS tL|d+S )-u   
        The call function to the pipeline for generation.

        Args:
            video (`list[PipelineImageInput]`):
                The input video to condition the generation on. Must be a list of images/frames of the video.
            prompt (`str` or `list[str]`, *optional*):
                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                The height in pixels of the generated video.
            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                The width in pixels of the generated video.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality videos at the
                expense of slower inference.
            timesteps (`list[int]`, *optional*):
                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
                passed will be used. Must be in descending order.
            sigmas (`list[float]`, *optional*):
                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
                will be used.
            strength (`float`, *optional*, defaults to 0.8):
                Higher strength leads to more differences between original video and generated video.
            guidance_scale (`float`, *optional*, defaults to 7.5):
                A higher guidance scale value encourages the model to generate images closely linked to the text
                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
            negative_prompt (`str` or `list[str]`, *optional*):
                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) from the [DDIM](https://huggingface.co/papers/2010.02502) paper. Only
                applies to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
            generator (`torch.Generator` or `list[torch.Generator]`, *optional*):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                generation deterministic.
            latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor is generated by sampling using the supplied random `generator`. Latents should be of shape
                `(batch_size, num_channel, num_frames, height, width)`.
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                provided, text embeddings are generated from the `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
            ip_adapter_image: (`PipelineImageInput`, *optional*):
                Optional image input to work with IP Adapters.
            ip_adapter_image_embeds (`list[torch.Tensor]`, *optional*):
                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
                provided, embeddings are computed from the `ip_adapter_image` input argument.
            conditioning_frames (`list[PipelineImageInput]`, *optional*):
                The ControlNet input condition to provide guidance to the `unet` for generation. If multiple
                ControlNets are specified, images must be passed as a list such that each element of the list can be
                correctly batched for input to a single ControlNet.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated video. Choose between `torch.Tensor`, `PIL.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`AnimateDiffPipelineOutput`] instead of a plain tuple.
            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            controlnet_conditioning_scale (`float` or `list[float]`, *optional*, defaults to 1.0):
                The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added
                to the residual in the original `unet`. If multiple ControlNets are specified in `init`, you can set
                the corresponding scale as a list.
            guess_mode (`bool`, *optional*, defaults to `False`):
                The ControlNet encoder tries to recognize the content of the input image even if you remove all
                prompts. A `guidance_scale` value between 3.0 and 5.0 is recommended.
            control_guidance_start (`float` or `list[float]`, *optional*, defaults to 0.0):
                The percentage of total steps at which the ControlNet starts applying.
            control_guidance_end (`float` or `list[float]`, *optional*, defaults to 1.0):
                The percentage of total steps at which the ControlNet stops applying.
            clip_skip (`int`, *optional*):
                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                the output of the pre-final layer will be used for computing the prompt embeddings.
            callback_on_step_end (`Callable`, *optional*):
                A function that calls at the end of each denoising steps during the inference. The function is called
                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
                `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`list`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                `._callback_tensor_inputs` attribute of your pipeline class.
            decode_chunk_size (`int`, defaults to `16`):
                The number of frames to decode at a time when calling `decode_latents` method.

        Examples:

        Returns:
            [`pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] or `tuple`:
                If `return_dict` is `True`, [`pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] is
                returned, otherwise a `tuple` is returned where the first element is a list with the generated frames.
        r)   r   r   r   r   r   rN   rO   r   r   r1   r   r   r   r   r   r   FNr   cpur  r$   r   r   r   )r   r   r   r   r   r   rs   r9   r-   r1   r   r   scale)
r   r   r9   r  r   r   rN   rO   rg   rh   )rN   rO   rg   rh   )repeatsr   c                    s<   g | ]\}}d t  t |k p d t |k qS )r   r)   )r   rD   )r   ser   r:   r5   r6   r     s    *zFAnimateDiffVideoToVideoControlNetPipeline.__call__.<locals>.<listcomp>)	r   r   r   r   r  r9   rs   r   r  r   )totalc                 S   s   g | ]\}}|| qS r5   r5   )r   cr*  r5   r5   r6   r     s    ro   )encoder_hidden_statescontrolnet_condconditioning_scaler  r%  )r/  r  added_cond_kwargsdown_block_additional_residualsmid_block_additional_residuallatent)r   r$  )framesr5   )Mr!   rT   r   r[   r]   rD   r   r   rS   ra   sample_sizerV   r   r  r  r  r  rw   rx   ry   _execution_devicers   XLA_AVAILABLErI   rE   r   r   r   rc   r  r   r   in_channelsr  r  getfree_noise_enabled_encode_prompt_free_noiser   rh   r   r}   r   r   r   r   r   global_pool_conditionsr   r   r   r  r   free_init_enabled_free_init_num_iters_apply_free_initr  r   progress_barr   r   scale_model_inputr   	transposer   r+   r   prev_samplelocalspopupdatexm	mark_stepr   postprocess_videomaybe_free_model_hooksr*   )Gre   r   r   r   r   r8   r#  r:   r;   r  r   r   r  r   r-   r1   rN   rO   r   r   r   r$  r%  r  r   r  r   r   rh   r&  r   r   rT   multr   r9   rs   timestep_devicelatent_timestepdenoising_inference_stepsr   text_encoder_lora_scaler   r   r>  controlnet_keepkeepscond_prepared_videosframe_prepared_videor   r2  num_free_init_itersfree_init_iternum_warmup_stepsrB  tlatent_model_inputcontrol_model_inputcontrolnet_prompt_embeds
cond_scalecontrolnet_cond_scaledown_block_res_samplesmid_block_res_sample
noise_prednoise_pred_uncondnoise_pred_textcallback_kwargsr   callback_outputsvideo_tensorr5   r,  r6   __call__  s   
	






	
"




6
N
z2AnimateDiffVideoToVideoControlNetPipeline.__call__)NN)NNNNNr   )r   )NNNNNNNNNr   r   r   )Nr   r   r   r)   NNNNNr   F)FF);__name__
__module____qualname____doc__model_cpu_offload_seq_optional_componentsr   r   r   r   r   r   r   r   r]   r^   r   r   r   r   r   r   r   r   r   rZ   r}   r   r   r   r   r   r   r   r   r   r   r   rs   r9   	Generatorboolr  r  propertyr  rh   r   r  r  r   no_gradr	   rw   rx   r   r   rh  __classcell__r5   r5   rf   r6   rJ      s   
 0	

 
8.	
 ,	

\
 








	




 rJ   )Nr+   )NNNN)Nr>   typingr   r   r}   torch.nn.functionalnn
functionalr   transformersr   r   r   r   image_processorr	   loadersr
   r   r   r   modelsr   r   r   r   r   r   models.lorar   models.unets.unet_motion_modelr   
schedulersr   r   r   r   r   r   utilsr   r   r   r   r    utils.torch_utilsr!   r"   rc   r#   free_init_utilsr%   free_noise_utilsr&   pipeline_utilsr'   r(   pipeline_outputr*   torch_xla.core.xla_modelcore	xla_modelrI  r9  
get_loggerri  r   EXAMPLE_DOC_STRINGr   ro  rw   r7   r   r9   r]   r   rI   rJ   r5   r5   r5   r6   <module>   sn     
B




;