o
    ۷i                     @   sp  d dl Z d dlmZmZ d dlZd dlm  mZ d dl	m
Z
mZmZmZ ddlmZ ddlmZmZmZmZ ddlmZmZmZmZmZmZ ddlmZ dd	lmZ dd
l m!Z! ddl"m#Z#m$Z$m%Z%m&Z&m'Z' ddl(m)Z)m*Z* ddl+m,Z, ddl-m.Z. ddl/m0Z0 ddl1m2Z2m3Z3 ddl4m5Z5 e$ rd dl6m7  m8Z9 dZ:ndZ:e%;e<Z=dZ>G dd de2e3eeee.e0e
Z?dS )    N)AnyCallable)CLIPImageProcessorCLIPTextModelCLIPTokenizerCLIPVisionModelWithProjection   )PipelineImageInput)FromSingleFileMixinIPAdapterMixinStableDiffusionLoraLoaderMixinTextualInversionLoaderMixin)AutoencoderKLControlNetModelImageProjectionMultiControlNetModelUNet2DConditionModelUNetMotionModel)adjust_lora_scale_text_encoder)MotionAdapter)KarrasDiffusionSchedulers)USE_PEFT_BACKENDis_torch_xla_availableloggingscale_lora_layersunscale_lora_layers)is_compiled_modulerandn_tensor)VideoProcessor   )FreeInitMixin)AnimateDiffFreeNoiseMixin)DiffusionPipelineStableDiffusionMixin   )AnimateDiffPipelineOutputTFaT  
    Examples:
        ```py
        >>> import torch
        >>> from diffusers import (
        ...     AnimateDiffControlNetPipeline,
        ...     AutoencoderKL,
        ...     ControlNetModel,
        ...     MotionAdapter,
        ...     LCMScheduler,
        ... )
        >>> from diffusers.utils import export_to_gif, load_video

        >>> # Additionally, you will need a preprocess videos before they can be used with the ControlNet
        >>> # HF maintains just the right package for it: `pip install controlnet_aux`
        >>> from controlnet_aux.processor import ZoeDetector

        >>> # Download controlnets from https://huggingface.co/lllyasviel/ControlNet-v1-1 to use .from_single_file
        >>> # Download Diffusers-format controlnets, such as https://huggingface.co/lllyasviel/sd-controlnet-depth, to use .from_pretrained()
        >>> controlnet = ControlNetModel.from_single_file("control_v11f1p_sd15_depth.pth", torch_dtype=torch.float16)

        >>> # We use AnimateLCM for this example but one can use the original motion adapters as well (for example, https://huggingface.co/guoyww/animatediff-motion-adapter-v1-5-3)
        >>> motion_adapter = MotionAdapter.from_pretrained("wangfuyun/AnimateLCM")

        >>> vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=torch.float16)
        >>> pipe: AnimateDiffControlNetPipeline = AnimateDiffControlNetPipeline.from_pretrained(
        ...     "SG161222/Realistic_Vision_V5.1_noVAE",
        ...     motion_adapter=motion_adapter,
        ...     controlnet=controlnet,
        ...     vae=vae,
        ... ).to(device="cuda", dtype=torch.float16)
        >>> pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config, beta_schedule="linear")
        >>> pipe.load_lora_weights(
        ...     "wangfuyun/AnimateLCM", weight_name="AnimateLCM_sd15_t2v_lora.safetensors", adapter_name="lcm-lora"
        ... )
        >>> pipe.set_adapters(["lcm-lora"], [0.8])

        >>> depth_detector = ZoeDetector.from_pretrained("lllyasviel/Annotators").to("cuda")
        >>> video = load_video(
        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-vid2vid-input-1.gif"
        ... )
        >>> conditioning_frames = []

        >>> with pipe.progress_bar(total=len(video)) as progress_bar:
        ...     for frame in video:
        ...         conditioning_frames.append(depth_detector(frame))
        ...         progress_bar.update()

        >>> prompt = "a panda, playing a guitar, sitting in a pink boat, in the ocean, mountains in background, realistic, high quality"
        >>> negative_prompt = "bad quality, worst quality"

        >>> video = pipe(
        ...     prompt=prompt,
        ...     negative_prompt=negative_prompt,
        ...     num_frames=len(video),
        ...     num_inference_steps=10,
        ...     guidance_scale=2.0,
        ...     conditioning_frames=conditioning_frames,
        ...     generator=torch.Generator().manual_seed(42),
        ... ).frames[0]

        >>> export_to_gif(video, "animatediff_controlnet.gif", fps=8)
        ```
c                8       s  e Zd ZdZdZddgZg dZ		dSdeded	e	d
e
eB dedeee B ee B eB dededB dedB f fddZ					dTdejdB dejdB dedB dedB fddZdUddZdd ZdVdefddZdd Z						 	!	 dWd"d#Z	dUd$d%Z	&	&dXd'd(Z e!d)d* Z"e!d+d, Z#e!d-d. Z$e!d/d0 Z%e!d1d2 Z&e!d3d4 Z'e( ddddd5d6dd7d!dddddddd8d9dd d&d!d ddd:gdfd;e)ee) B d<edB d=edB d>edB d?ed@edAe)ee) B dB dBedB dCedDej*eej* B dB d:ejdB dejdB dejdB dEe+dB dFe+dB dGee+ dB dHe)dB dIe,dJe-e)e.f dB dKeee B dLe,dMeee B dNeee B dedB dOe/eegdf dB dPee) def6dQdRZ0  Z1S )YAnimateDiffControlNetPipelinea  
    Pipeline for text-to-video generation with ControlNet guidance.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
    implemented for all pipelines (downloading, saving, running on a particular device, etc.).

    The pipeline also inherits the following loading methods:
        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters

    Args:
        vae ([`AutoencoderKL`]):
            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
        text_encoder ([`CLIPTextModel`]):
            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
        tokenizer (`CLIPTokenizer`):
            A [`~transformers.CLIPTokenizer`] to tokenize text.
        unet ([`UNet2DConditionModel`]):
            A [`UNet2DConditionModel`] used to create a UNetMotionModel to denoise the encoded video latents.
        motion_adapter ([`MotionAdapter`]):
            A [`MotionAdapter`] to be used in combination with `unet` to denoise the encoded video latents.
        scheduler ([`SchedulerMixin`]):
            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
    ztext_encoder->unet->vaefeature_extractorimage_encoder)latentsprompt_embedsnegative_prompt_embedsNvaetext_encoder	tokenizerunetmotion_adapter
controlnet	schedulerc
           
         s   t    t|trt||}t|ttfrt|}| j	|||||||||	d	 t
| dd r:dt| jjjd  nd| _t| jd| _t| jddd	| _d S )
N)	r,   r-   r.   r/   r0   r1   r2   r'   r(   r,   r   r$      )vae_scale_factorTF)r4   do_convert_rgbdo_normalize)super__init__
isinstancer   r   from_unet2dlisttupler   register_modulesgetattrlenr,   configblock_out_channelsr4   r   video_processorcontrol_video_processor)
selfr,   r-   r.   r/   r0   r1   r2   r'   r(   	__class__ u/home/ubuntu/vllm_env/lib/python3.10/site-packages/diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.pyr8      s*   

(z&AnimateDiffControlNetPipeline.__init__r*   r+   
lora_scale	clip_skipc
              
   C   s  |durt | tr|| _tst| j| nt| j| |dur't |tr'd}
n|dur5t |tr5t	|}
n|j
d }
|du rt | trJ| || j}| j|d| jjddd}|j}| j|ddd	j}|j
d
 |j
d
 krt||s| j|dd| jjd d
f }td| jj d|  t| jjdr| jjjr|j|}nd}|	du r| j|||d}|d }n| j|||dd}|d
 |	d   }| jj|}| jdur| jj}n| jdur| jj}n|j}|j||d}|j
\}}}|d|d}||| |d
}|r|du r|du rdg|
 }nC|dur8t |t |ur8t!dt | dt | dt |trB|g}n|
t	|kr\t"d| dt	| d| d|
 d	|}t | trk| || j}|j
d }| j|d|ddd}t| jjdr| jjjr|j|}nd}| j|j||d}|d }|r|j
d }|j||d}|d|d}||
| |d
}| jdurt | trtrt#| j| ||fS )a  
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `list[str]`, *optional*):
                prompt to be encoded
            device: (`torch.device`):
                torch device
            num_images_per_prompt (`int`):
                number of images that should be generated per prompt
            do_classifier_free_guidance (`bool`):
                whether to use classifier free guidance or not
            negative_prompt (`str` or `list[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            lora_scale (`float`, *optional*):
                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
            clip_skip (`int`, *optional*):
                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                the output of the pre-final layer will be used for computing the prompt embeddings.
        Nr$   r   
max_lengthTpt)paddingrK   
truncationreturn_tensorslongest)rM   rO   z\The following part of your input was truncated because CLIP can only handle sequences up to z	 tokens: use_attention_mask)attention_mask)rS   output_hidden_states)dtypedevice z?`negative_prompt` should be the same type to `prompt`, but got z != .z`negative_prompt`: z has batch size z, but `prompt`: zT. Please make sure that passed `negative_prompt` matches the batch size of `prompt`.)$r9   r   _lora_scaler   r   r-   r   strr;   r?   shaper   maybe_convert_promptr.   model_max_length	input_idstorchequalbatch_decodeloggerwarninghasattrr@   rR   rS   to
text_modelfinal_layer_normrU   r/   repeatviewtype	TypeError
ValueErrorr   )rD   promptrV   num_images_per_promptdo_classifier_free_guidancenegative_promptr*   r+   rI   rJ   
batch_sizetext_inputstext_input_idsuntruncated_idsremoved_textrS   prompt_embeds_dtypebs_embedseq_len_uncond_tokensrK   uncond_inputrG   rG   rH   encode_prompt   s   +











z+AnimateDiffControlNetPipeline.encode_promptc           
      C   s   t | j j}t|tjs| j|ddj}|j	||d}|rH| j|ddj
d }|j|dd}| jt|ddj
d }|j|dd}||fS | |j}|j|dd}t|}	||	fS )	NrL   )rO   rV   rU   T)rT   r   dim)nextr(   
parametersrU   r9   r_   Tensorr'   pixel_valuesre   hidden_statesrepeat_interleave
zeros_likeimage_embeds)
rD   imagerV   rn   rT   rU   image_enc_hidden_statesuncond_image_enc_hidden_statesr   uncond_image_embedsrG   rG   rH   encode_image~  s(   

z*AnimateDiffControlNetPipeline.encode_imagec                 C   sl  g }|rg }|d u ret |ts|g}t|t| jjjkr/tdt| dt| jjj dt|| jjjD ],\}}	t |	t }
| 	||d|
\}}|
|d d d f  |rc|
|d d d f  q7n|D ]}|rw|d\}}|
| |
| qgg }t|D ]0\}}tj|g| dd}|rtj|| g| dd}tj||gdd}|j|d}|
| q|S )	NzK`ip_adapter_image` must have same length as the number of IP Adapters. Got z images and z IP Adapters.r$   r   r   r   rV   )r9   r;   r?   r/   encoder_hid_projimage_projection_layersrl   zipr   r   appendchunk	enumerater_   catre   )rD   ip_adapter_imageip_adapter_image_embedsrV   rn   ro   r   negative_image_embedssingle_ip_adapter_imageimage_proj_layeroutput_hidden_statesingle_image_embedssingle_negative_image_embedsirG   rG   rH   prepare_ip_adapter_image_embeds  sH   


z=AnimateDiffControlNetPipeline.prepare_ip_adapter_image_embeds   decode_chunk_sizec                 C   s   d| j jj | }|j\}}}}}|ddddd|| |||}g }td|jd |D ]}	||	|	|  }
| j |
j}
|	|
 q-t
|}|d d d f ||df|jdd   ddddd}| }|S )Nr$   r   r   r      rQ   )r,   r@   scaling_factorr[   permutereshaperangedecodesampler   r_   r   float)rD   r)   r   rq   channels
num_framesheightwidthvideor   batch_latentsrG   rG   rH   decode_latents  s   "
8z,AnimateDiffControlNetPipeline.decode_latentsc                 C   sX   dt t| jjj v }i }|r||d< dt t| jjj v }|r*||d< |S )Neta	generator)setinspect	signaturer2   stepr   keys)rD   r   r   accepts_etaextra_step_kwargsaccepts_generatorrG   rG   rH   prepare_extra_step_kwargs  s   z7AnimateDiffControlNetPipeline.prepare_extra_step_kwargs      ?        c              
      sl  |d dks|d dkrt d| d| d|d ur8t fdd|D s8t d j d	 fd
d|D  |d urK|d urKt d| d| d|d u rW|d u rWt d|d urlt|tttfslt dt| |d ur|d urt d| d| d|d ur|d ur|j|jkrt d|j d|j dt j	t
rt|trtdt j	j dt| d ttdot j	tjjj}t j	ts|rt j	jtrttstdt t|krt d| dtnSt j	t
s|rFt j	jt
rFttrtd tstdttd |kr5t d| dtd tfddD rEt d nJ t j	tsZ|ret j	jtret|
tsdtd"n?t j	t
sw|rt j	jt
rt|
trtd#d |
D rt d$nt|
trt|
t j	jkrt d%nJ t|ttfs|g}t|ttfs|g}t|t|krt d&t| d't| d(t j	t
rt|t j	jkrt d)| d*t| d+t j	j d,t j	j d	t||D ]/\}}||krt d-| d.| d|d/k r%t d-| d0|d1kr2t d2| d3qd S )4Nr3   r   z7`height` and `width` have to be divisible by 8 but are z and rX   c                 3   s    | ]}| j v V  qd S N_callback_tensor_inputs.0krD   rG   rH   	<genexpr>  s    

z=AnimateDiffControlNetPipeline.check_inputs.<locals>.<genexpr>z2`callback_on_step_end_tensor_inputs` has to be in z, but found c                    s   g | ]	}| j vr|qS rG   r   r   r   rG   rH   
<listcomp>  s    z>AnimateDiffControlNetPipeline.check_inputs.<locals>.<listcomp>zCannot forward both `prompt`: z and `prompt_embeds`: z2. Please make sure to only forward one of the two.zeProvide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined.z:`prompt` has to be of type `str`, `list` or `dict` but is z'Cannot forward both `negative_prompt`: z and `negative_prompt_embeds`: zu`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but got: `prompt_embeds` z != `negative_prompt_embeds` z	You have z! ControlNets and you have passed z= prompts. The conditionings will be fixed across the prompts.scaled_dot_product_attentionz>For single controlnet, `image` must be of type `list` but got zExcepted image to have length z but got len(video)=zQFor multiple controlnets: `image` must be type list of lists but got type(video)=z$Expected length of image sublist as z but got len(video[0])=c                 3   s$    | ]}t |t  d  kV  qdS )r   N)r?   )r   img)r   rG   rH   r   ;  s   " zDAll conditioning frame batches for multicontrolnet must be same sizeFzLFor single controlnet: `controlnet_conditioning_scale` must be type `float`.c                 s   s    | ]}t |tV  qd S r   )r9   r;   )r   r   rG   rG   rH   r   N  s    zEA single batch of multiple conditionings are supported at the moment.zFor multiple controlnets: When `controlnet_conditioning_scale` is specified as `list`, it must have the same length as the number of controlnetsz`control_guidance_start` has z* elements, but `control_guidance_end` has zI elements. Make sure to provide the same number of elements to each list.z`control_guidance_start`: z has z elements but there are z- controlnets available. Make sure to provide zcontrol guidance start: z4 cannot be larger or equal to control guidance end: r   z can't be smaller than 0.r   zcontrol guidance end: z can't be larger than 1.0.)rl   allr   r9   rZ   r;   dictrj   r[   r1   r   rb   rc   r?   netsrd   Fr_   _dynamo
eval_frameOptimizedModuler   	_orig_modrk   anyr   r<   r   )rD   rm   r   r   r   rp   r*   r+   "callback_on_step_end_tensor_inputsr   controlnet_conditioning_scalecontrol_guidance_startcontrol_guidance_endis_compiledstartendrG   )rD   r   rH   check_inputs  s   





0


z*AnimateDiffControlNetPipeline.check_inputsc
                 C   s   | j r| |||||||||		}	t|tr(t||kr(tdt| d| d||||| j || j f}
|	d u rBt|
|||d}	n|	|}	|	| j	j
 }	|	S )Nz/You have passed a list of generators of length z+, but requested an effective batch size of z@. Make sure the batch size matches the length of the generators.)r   rV   rU   )free_noise_enabled_prepare_latents_free_noiser9   r;   r?   rl   r4   r   re   r2   init_noise_sigma)rD   rq   num_channels_latentsr   r   r   rU   rV   r   r)   r[   rG   rG   rH   prepare_latentsv  s*   
z-AnimateDiffControlNetPipeline.prepare_latentsFc
                 C   s   | j j|||djtjd}|ddddddd}|jd }
|
dkr'|}n|}|j|dd}|j||d	}|rC|	sCt	|gd }|S )
N)r   r   )rU   r   r   r$   r   r   r   r}   )
rC   preprocess_videore   r_   float32r   flattenr[   r   r   )rD   r   r   r   rq   num_videos_per_promptrV   rU   ro   
guess_modevideo_batch_size	repeat_byrG   rG   rH   prepare_video  s   
z+AnimateDiffControlNetPipeline.prepare_videoc                 C      | j S r   _guidance_scaler   rG   rG   rH   guidance_scale     z,AnimateDiffControlNetPipeline.guidance_scalec                 C   r   r   )
_clip_skipr   rG   rG   rH   rJ     r   z'AnimateDiffControlNetPipeline.clip_skipc                 C   s
   | j dkS )Nr$   r   r   rG   rG   rH   ro     s   
z9AnimateDiffControlNetPipeline.do_classifier_free_guidancec                 C   r   r   )_cross_attention_kwargsr   rG   rG   rH   cross_attention_kwargs  r   z4AnimateDiffControlNetPipeline.cross_attention_kwargsc                 C   r   r   )_num_timestepsr   rG   rG   rH   num_timesteps  r   z+AnimateDiffControlNetPipeline.num_timestepsc                 C   r   r   )
_interruptr   rG   rG   rH   	interrupt  r   z'AnimateDiffControlNetPipeline.interrupt2   g      @r$   pilTr)   rm   r   r   r   num_inference_stepsr   rp   r   r   r   r   r   conditioning_framesoutput_typereturn_dictr   r   r   r   r   callback_on_step_endr   c           ?         s  t | jr	| jjn| j}t|tst|trt||g }n3t|ts0t|tr0t||g }n!t|tsQt|tsQt|trDt|jnd}||g ||g }}|pZ| jj	j
| j }|pd| jj	j
| j }d}| j||||||||||||d || _|| _|| _d| _|durt|ttfrd}n|durt|trt|}n|jd }| j}t|trt|tr|gt|j }t|tr|j	jn|jd j	j} |p| }|dur|ddnd}!| jr| j||||| j||||!| jd
\}}n$| j|||| j||||!| jd	\}}| jrt||g}|j |dd	}|dus|dur+| !||||| | j}"t|trF| j"||||| | |||j#| j|d
	}n,t|trpg }#|D ]}$| j"|$|||| | |||j#| j|d
	}%|#$|% qP|#}nJ | j%j&||d | j%j'| jj	j(}&| )|| |&||||j#||
|	}| *|
|	}'|dus|durd|"ind}(g })t+tD ]  fddt,||D }*|)$t|tr|*d n|* q| j-r| j.nd}+t+|+D ]M},| j-r| /||,|||j#|
\}t| _0t|| j%j1  }-| j2| j0d}.t3D ]\ }/| j4rq| jr%t|gd n|}0| j%5|0|/}0|rF| jrF|}1| j%5|1|/}1|6dd }2n|0}1|}2t|)  tr_dd t,||)  D }3n|}4t|4trk|4d }4|4|)   }3t7|1dd}1|18d|1jd |1jd |1jd f}1| j|1|/|2||3|dd\}5}6| j|0|/|| j9|(|5|6dj:}7| jr|76d\}8}9|8||9|8   }7| j%j;|7|/|fi |'j<}|duri }:|D ]
};t= |; |:|;< q||  |/|:}<|<>d|}|<>d|}|<>d|} td ks d |-kr d | j%j1 dkr|.?  t@rtAB  qW d   n	1 s&w   Y  q|dkr5|}=n| C||}>| jDjE|>|d}=| F  |sM|=fS tG|=dS )u  
        The call function to the pipeline for generation.

        Args:
            prompt (`str` or `list[str]`, *optional*):
                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                The height in pixels of the generated video.
            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                The width in pixels of the generated video.
            num_frames (`int`, *optional*, defaults to 16):
                The number of video frames that are generated. Defaults to 16 frames which at 8 frames per seconds
                amounts to 2 seconds of video.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality videos at the
                expense of slower inference.
            guidance_scale (`float`, *optional*, defaults to 7.5):
                A higher guidance scale value encourages the model to generate images closely linked to the text
                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
            negative_prompt (`str` or `list[str]`, *optional*):
                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) from the [DDIM](https://huggingface.co/papers/2010.02502) paper. Only
                applies to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
            generator (`torch.Generator` or `list[torch.Generator]`, *optional*):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                generation deterministic.
            latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor is generated by sampling using the supplied random `generator`. Latents should be of shape
                `(batch_size, num_channel, num_frames, height, width)`.
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                provided, text embeddings are generated from the `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
            ip_adapter_image (`PipelineImageInput`, *optional*):
                Optional image input to work with IP Adapters.
            ip_adapter_image_embeds (`list[torch.Tensor]`, *optional*):
                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
                provided, embeddings are computed from the `ip_adapter_image` input argument.
            conditioning_frames (`list[PipelineImageInput]`, *optional*):
                The ControlNet input condition to provide guidance to the `unet` for generation. If multiple
                ControlNets are specified, images must be passed as a list such that each element of the list can be
                correctly batched for input to a single ControlNet.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated video. Choose between `torch.Tensor`, `PIL.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] instead
                of a plain tuple.
            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            controlnet_conditioning_scale (`float` or `list[float]`, *optional*, defaults to 1.0):
                The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added
                to the residual in the original `unet`. If multiple ControlNets are specified in `init`, you can set
                the corresponding scale as a list.
            guess_mode (`bool`, *optional*, defaults to `False`):
                The ControlNet encoder tries to recognize the content of the input image even if you remove all
                prompts. A `guidance_scale` value between 3.0 and 5.0 is recommended.
            control_guidance_start (`float` or `list[float]`, *optional*, defaults to 0.0):
                The percentage of total steps at which the ControlNet starts applying.
            control_guidance_end (`float` or `list[float]`, *optional*, defaults to 1.0):
                The percentage of total steps at which the ControlNet stops applying.
            clip_skip (`int`, *optional*):
                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                the output of the pre-final layer will be used for computing the prompt embeddings.
            callback_on_step_end (`Callable`, *optional*):
                A function that calls at the end of each denoising steps during the inference. The function is called
                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
                `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`list`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                `._callback_tensor_inputs` attribute of your pipeline class.

        Examples:

        Returns:
            [`~pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] or `tuple`:
                If `return_dict` is `True`, [`~pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] is
                returned, otherwise a `tuple` is returned where the first element is a list with the generated frames.
        r$   )rm   r   r   r   rp   r   r*   r+   r   r   r   r   FNr   scale)
rm   r   rV   r   ro   rp   r*   r+   rI   rJ   )r*   r+   rI   rJ   )repeatsr   )	r   r   r   rq   r   rV   rU   ro   r   r   r   c                    s<   g | ]\}}d t  t |k p d t |k qS )r   r$   )r   r?   )r   ser   	timestepsrG   rH   r     s    *z:AnimateDiffControlNetPipeline.__call__.<locals>.<listcomp>)totalr   c                 S   s   g | ]\}}|| qS rG   rG   )r   cr   rG   rG   rH   r     s    rQ   r   r   )encoder_hidden_statescontrolnet_condconditioning_scaler   r   )r  r   added_cond_kwargsdown_block_additional_residualsmid_block_additional_residualr)   r*   r+   latent)r   r   )frames)Hr   r1   r   r9   r;   r?   r   r   r/   r@   sample_sizer4   r   r   r   r   r   rZ   r   r[   _execution_devicer   r   global_pool_conditionsgetr   _encode_prompt_free_noisero   rJ   r|   r_   r   r   r   r   rU   r   r2   set_timestepsr   in_channelsr   r   r   r   free_init_enabled_free_init_num_iters_apply_free_initr   orderprogress_barr   r   scale_model_inputr   	transposer   r   r   r   prev_samplelocalspopupdateXLA_AVAILABLExm	mark_stepr   rB   postprocess_videomaybe_free_model_hooksr%   )?rD   rm   r   r   r   r   r   rp   r   r   r   r)   r*   r+   r   r   r   r   r   r   r   r   r   r   rJ   r   r   r   r1   multrq   rV   r  text_encoder_lora_scaler   cond_prepared_videosframe_prepared_videor   r   r  controlnet_keepkeepsnum_free_init_itersfree_init_iternum_warmup_stepsr  tlatent_model_inputcontrol_model_inputcontrolnet_prompt_embeds
cond_scalecontrolnet_cond_scaledown_block_res_samplesmid_block_res_sample
noise_prednoise_pred_uncondnoise_pred_textcallback_kwargsr   callback_outputsr   video_tensorrG   r   rH   __call__  s  x






"


6
N
z&AnimateDiffControlNetPipeline.__call__)NN)NNNNNr   )r   )NNNNNr   r   r   )FF)2__name__
__module____qualname____doc__model_cpu_offload_seq_optional_componentsr   r   r   r   r   r   r   r   r;   r<   r   r   r   r   r8   r_   r   r   intr|   r   r   r   r   r   r   r   propertyr   rJ   ro   r   r   r   no_gradrZ   	Generatorr	   boolr   r   r   r9  __classcell__rG   rG   rE   rH   r&   x   s6   
	
+	

 
8.
 
)
 






	




r&   )@r   typingr   r   r_   torch.nn.functionalnn
functionalr   transformersr   r   r   r   image_processorr	   loadersr
   r   r   r   modelsr   r   r   r   r   r   models.lorar   models.unets.unet_motion_modelr   
schedulersr   utilsr   r   r   r   r   utils.torch_utilsr   r   rB   r   free_init_utilsr    free_noise_utilsr!   pipeline_utilsr"   r#   pipeline_outputr%   torch_xla.core.xla_modelcore	xla_modelr  r  
get_loggerr:  rb   EXAMPLE_DOC_STRINGr&   rG   rG   rG   rH   <module>   sD    

B