o
    ۷i                  
   @   s  d dl Z d dlmZmZ d dlZd dlZd dlZd dlm	  m
Z d dlmZmZmZmZ ddlmZmZ ddlmZmZmZmZ ddlmZmZmZmZ ddlmZ dd	l m!Z! dd
l"m#Z# ddl$m%Z% ddl&m'Z'm(Z(m)Z)m*Z*m+Z+m,Z, ddl-m.Z.m/Z/ ddl0m1Z1 ddl2m3Z3 ddl4m5Z5m6Z6 ddl7m8Z8 e( rd dl9m:  m;Z< dZ=ndZ=e)>e?Z@dZA	ddejBdejCdB deDfddZEG dd de5e6eeee3e	ZFdS )     N)AnyCallable)CLIPImageProcessorCLIPTextModelCLIPTokenizerCLIPVisionModelWithProjection   )PipelineImageInputVaeImageProcessor)FromSingleFileMixinIPAdapterMixinStableDiffusionLoraLoaderMixinTextualInversionLoaderMixin)AutoencoderKLImageProjectionUNet2DConditionModelUNetMotionModel)SparseControlNetModel)adjust_lora_scale_text_encoder)MotionAdapter)KarrasDiffusionSchedulers)USE_PEFT_BACKENDis_torch_xla_availableloggingreplace_example_docstringscale_lora_layersunscale_lora_layers)is_compiled_modulerandn_tensor)VideoProcessor   )FreeInitMixin)DiffusionPipelineStableDiffusionMixin   )AnimateDiffPipelineOutputTFa  
    Examples:
        ```python
        >>> import torch
        >>> from diffusers import AnimateDiffSparseControlNetPipeline
        >>> from diffusers.models import AutoencoderKL, MotionAdapter, SparseControlNetModel
        >>> from diffusers.schedulers import DPMSolverMultistepScheduler
        >>> from diffusers.utils import export_to_gif, load_image

        >>> model_id = "SG161222/Realistic_Vision_V5.1_noVAE"
        >>> motion_adapter_id = "guoyww/animatediff-motion-adapter-v1-5-3"
        >>> controlnet_id = "guoyww/animatediff-sparsectrl-scribble"
        >>> lora_adapter_id = "guoyww/animatediff-motion-lora-v1-5-3"
        >>> vae_id = "stabilityai/sd-vae-ft-mse"
        >>> device = "cuda"

        >>> motion_adapter = MotionAdapter.from_pretrained(motion_adapter_id, torch_dtype=torch.float16).to(device)
        >>> controlnet = SparseControlNetModel.from_pretrained(controlnet_id, torch_dtype=torch.float16).to(device)
        >>> vae = AutoencoderKL.from_pretrained(vae_id, torch_dtype=torch.float16).to(device)
        >>> scheduler = DPMSolverMultistepScheduler.from_pretrained(
        ...     model_id,
        ...     subfolder="scheduler",
        ...     beta_schedule="linear",
        ...     algorithm_type="dpmsolver++",
        ...     use_karras_sigmas=True,
        ... )
        >>> pipe = AnimateDiffSparseControlNetPipeline.from_pretrained(
        ...     model_id,
        ...     motion_adapter=motion_adapter,
        ...     controlnet=controlnet,
        ...     vae=vae,
        ...     scheduler=scheduler,
        ...     torch_dtype=torch.float16,
        ... ).to(device)
        >>> pipe.load_lora_weights(lora_adapter_id, adapter_name="motion_lora")
        >>> pipe.fuse_lora(lora_scale=1.0)

        >>> prompt = "an aerial view of a cyberpunk city, night time, neon lights, masterpiece, high quality"
        >>> negative_prompt = "low quality, worst quality, letterboxed"

        >>> image_files = [
        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-scribble-1.png",
        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-scribble-2.png",
        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-scribble-3.png",
        ... ]
        >>> condition_frame_indices = [0, 8, 15]
        >>> conditioning_frames = [load_image(img_file) for img_file in image_files]

        >>> video = pipe(
        ...     prompt=prompt,
        ...     negative_prompt=negative_prompt,
        ...     num_inference_steps=25,
        ...     conditioning_frames=conditioning_frames,
        ...     controlnet_conditioning_scale=1.0,
        ...     controlnet_frame_indices=condition_frame_indices,
        ...     generator=torch.Generator().manual_seed(1337),
        ... ).frames[0]
        >>> export_to_gif(video, "output.gif")
        ```
sampleencoder_output	generatorsample_modec                 C   sR   t | dr|dkr| j|S t | dr|dkr| j S t | dr%| jS td)Nlatent_distr&   argmaxlatentsz3Could not access latents of provided encoder_output)hasattrr*   r&   moder,   AttributeError)r'   r(   r)    r0   u/home/ubuntu/vllm_env/lib/python3.10/site-packages/diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.pyretrieve_latentsw   s   

r2   c                6       s  e Zd ZdZdZg dZg dZ		dXdedede	d	e
eB d
ededededef fddZ					dYdejdB dejdB dedB dedB fddZdZddZdd Zdd Zdd Z								d[d efd!d"Zd#d$ Z	dZd%d&Zd'd( Zd)ejd*ed+ed,ejd-ej d.e!ejejf fd/d0Z"e#d1d2 Z$e#d3d4 Z%e#d5d6 Z&e#d7d8 Z'e#d9d: Z(e) e*e+dddd;d<d=dd>d?dddddddd@dAdddBgdCdddDgfdEe,e-e, B dB dFedB dGedB d*edHedIedJe,e-e, B dB dKedLedMej.e-ej. B dB dDejdB dejdB dejdB dNe/dB dOe-ej dB d)e-e/ dB dPe,dQe0dRe1e,e2f dB d ee-e B d+e-e dSe0dedB dTe3eegdf dB dUe-e, f2dVdWZ4  Z5S )\#AnimateDiffSparseControlNetPipelinea  
    Pipeline for controlled text-to-video generation using the method described in [SparseCtrl: Adding Sparse Controls
    to Text-to-Video Diffusion Models](https://huggingface.co/papers/2311.16933).

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
    implemented for all pipelines (downloading, saving, running on a particular device, etc.).

    The pipeline also inherits the following loading methods:
        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters

    Args:
        vae ([`AutoencoderKL`]):
            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
        text_encoder ([`CLIPTextModel`]):
            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
        tokenizer (`CLIPTokenizer`):
            A [`~transformers.CLIPTokenizer`] to tokenize text.
        unet ([`UNet2DConditionModel`]):
            A [`UNet2DConditionModel`] used to create a UNetMotionModel to denoise the encoded video latents.
        motion_adapter ([`MotionAdapter`]):
            A [`MotionAdapter`] to be used in combination with `unet` to denoise the encoded video latents.
        scheduler ([`SchedulerMixin`]):
            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
    z&text_encoder->image_encoder->unet->vae)feature_extractorimage_encodermotion_adapter)r,   prompt_embedsnegative_prompt_embedsNvaetext_encoder	tokenizerunetr6   
controlnet	schedulerr4   r5   c
           
         s   t    t|trt||}| j|||||||||	d	 t| dd r/dt| j	j
jd  nd| _td| jd| _t| jddd	| _d S )
N)	r9   r:   r;   r<   r6   r=   r>   r4   r5   r9   r    r$      F)	do_resizevae_scale_factorT)rA   do_convert_rgbdo_normalize)super__init__
isinstancer   r   from_unet2dregister_modulesgetattrlenr9   configblock_out_channelsrA   r   video_processorr
   control_image_processor)
selfr9   r:   r;   r<   r6   r=   r>   r4   r5   	__class__r0   r1   rE      s&   

(z,AnimateDiffSparseControlNetPipeline.__init__r7   r8   
lora_scale	clip_skipc
              
   C   s  |durt | tr|| _tst| j| nt| j| |dur't |tr'd}
n|dur5t |tr5t	|}
n|j
d }
|du rt | trJ| || j}| j|d| jjddd}|j}| j|ddd	j}|j
d
 |j
d
 krt||s| j|dd| jjd d
f }td| jj d|  t| jjdr| jjjr|j|}nd}|	du r| j|||d}|d }n| j|||dd}|d
 |	d   }| jj|}| jdur| jj}n| jdur| jj}n|j}|j||d}|j
\}}}|d|d}||| |d
}|r|du r|du rdg|
 }nC|dur8t |t |ur8t!dt | dt | dt |trB|g}n|
t	|kr\t"d| dt	| d| d|
 d	|}t | trk| || j}|j
d }| j|d|ddd}t| jjdr| jjjr|j|}nd}| j|j||d}|d }|r|j
d }|j||d}|d|d}||
| |d
}| jdurt | trtrt#| j| ||fS )a  
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `list[str]`, *optional*):
                prompt to be encoded
            device: (`torch.device`):
                torch device
            num_images_per_prompt (`int`):
                number of images that should be generated per prompt
            do_classifier_free_guidance (`bool`):
                whether to use classifier free guidance or not
            negative_prompt (`str` or `list[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            lora_scale (`float`, *optional*):
                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
            clip_skip (`int`, *optional*):
                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                the output of the pre-final layer will be used for computing the prompt embeddings.
        Nr$   r   
max_lengthTpt)paddingrT   
truncationreturn_tensorslongest)rV   rX   z\The following part of your input was truncated because CLIP can only handle sequences up to z	 tokens: use_attention_mask)attention_mask)r\   output_hidden_statesdtypedevice z?`negative_prompt` should be the same type to `prompt`, but got z != .z`negative_prompt`: z has batch size z, but `prompt`: zT. Please make sure that passed `negative_prompt` matches the batch size of `prompt`.)$rF   r   _lora_scaler   r   r:   r   strlistrJ   shaper   maybe_convert_promptr;   model_max_length	input_idstorchequalbatch_decodeloggerwarningr-   rK   r[   r\   to
text_modelfinal_layer_normr_   r<   repeatviewtype	TypeError
ValueErrorr   )rO   promptr`   num_images_per_promptdo_classifier_free_guidancenegative_promptr7   r8   rR   rS   
batch_sizetext_inputstext_input_idsuntruncated_idsremoved_textr\   prompt_embeds_dtypebs_embedseq_len_uncond_tokensrT   uncond_inputr0   r0   r1   encode_prompt   s   +











z1AnimateDiffSparseControlNetPipeline.encode_promptc           
      C   s   t | j j}t|tjs| j|ddj}|j	||d}|rH| j|ddj
d }|j|dd}| jt|ddj
d }|j|dd}||fS | |j}|j|dd}t|}	||	fS )	NrU   )rX   )r`   r_   T)r]   r   dim)nextr5   
parametersr_   rF   rj   Tensorr4   pixel_valuesro   hidden_statesrepeat_interleave
zeros_likeimage_embeds)
rO   imager`   rx   r]   r_   image_enc_hidden_statesuncond_image_enc_hidden_statesr   uncond_image_embedsr0   r0   r1   encode_image  s(   

z0AnimateDiffSparseControlNetPipeline.encode_imagec                 C   sl  g }|rg }|d u ret |ts|g}t|t| jjjkr/tdt| dt| jjj dt|| jjjD ],\}}	t |	t }
| 	||d|
\}}|
|d d d f  |rc|
|d d d f  q7n|D ]}|rw|d\}}|
| |
| qgg }t|D ]0\}}tj|g| dd}|rtj|| g| dd}tj||gdd}|j|d}|
| q|S )	NzK`ip_adapter_image` must have same length as the number of IP Adapters. Got z images and z IP Adapters.r$   r    r   r   r`   )rF   re   rJ   r<   encoder_hid_projimage_projection_layersrv   zipr   r   appendchunk	enumeraterj   catro   )rO   ip_adapter_imageip_adapter_image_embedsr`   rx   ry   r   negative_image_embedssingle_ip_adapter_imageimage_proj_layeroutput_hidden_statesingle_image_embedssingle_negative_image_embedsir0   r0   r1   prepare_ip_adapter_image_embeds  sH   


zCAnimateDiffSparseControlNetPipeline.prepare_ip_adapter_image_embedsc           	      C   s   d| j jj | }|j\}}}}}|ddddd|| |||}| j |j}|d d d f ||df|jdd   ddddd}| }|S )Nr$   r   r    r      rZ   )	r9   rK   scaling_factorrf   permutereshapedecoder&   float)	rO   r,   r{   channels
num_framesheightwidthr   videor0   r0   r1   decode_latents  s   "8z2AnimateDiffSparseControlNetPipeline.decode_latentsc                 C   sX   dt t| jjj v }i }|r||d< dt t| jjj v }|r*||d< |S )Netar(   )setinspect	signaturer>   stepr   keys)rO   r(   r   accepts_etaextra_step_kwargsaccepts_generatorr0   r0   r1   prepare_extra_step_kwargs  s   z=AnimateDiffSparseControlNetPipeline.prepare_extra_step_kwargs      ?controlnet_conditioning_scalec                    sN  |d dks|d dkrt d| d| d|	d ur8t fdd|	D s8t d j d	 fd
d|	D  |d urK|d urKt d| d| d|d u rW|d u rWt d|d urnt|tsnt|tsnt dt| |d ur|d urt d| d| d|d ur|d ur|j|jkrt d|j d|j d|d ur|d urt d|d urt|tst dt| |d jdvrt d|d j dt	t
dot jtjjj}t jts|rt jjtrt|
tr|
D ]	} ||| qn
 |
|| nJ t jts|r%t jjtr%t|ts#tdd S J )Nr?   r   z7`height` and `width` have to be divisible by 8 but are z and rb   c                 3   s    | ]}| j v V  qd S N_callback_tensor_inputs.0krO   r0   r1   	<genexpr>  s    

zCAnimateDiffSparseControlNetPipeline.check_inputs.<locals>.<genexpr>z2`callback_on_step_end_tensor_inputs` has to be in z, but found c                    s   g | ]	}| j vr|qS r0   r   r   r   r0   r1   
<listcomp>  s    zDAnimateDiffSparseControlNetPipeline.check_inputs.<locals>.<listcomp>zCannot forward both `prompt`: z and `prompt_embeds`: z2. Please make sure to only forward one of the two.zeProvide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined.z2`prompt` has to be of type `str` or `list` but is z'Cannot forward both `negative_prompt`: z and `negative_prompt_embeds`: zu`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but got: `prompt_embeds` z != `negative_prompt_embeds` zProvide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined.z:`ip_adapter_image_embeds` has to be of type `list` but is )r   r   zF`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is Dscaled_dot_product_attentionFzLFor single controlnet: `controlnet_conditioning_scale` must be type `float`.)rv   allr   rF   rd   re   rt   rf   ndimr-   Fr=   rj   _dynamo
eval_frameOptimizedModuler   	_orig_modcheck_imager   ru   )rO   rw   r   r   rz   r7   r8   r   r   "callback_on_step_end_tensor_inputsr   r   is_compiledimage_r0   r   r1   check_inputs  s   



z0AnimateDiffSparseControlNetPipeline.check_inputsc                 C   s$  t |tjj}t |tj}t |tj}t |to t |d tjj}t |to-t |d tj}t |to:t |d tj}	|sP|sP|sP|sP|sP|	sPtdt	| |rUd}
nt
|}
|d uret |tred}n|d urst |trst
|}n	|d ur||jd }|
dkr|
|krtd|
 d| d S d S )Nr   zimage must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors, but is r$   zdIf image batch size is not 1, image batch size must be same as prompt batch size. image batch size: z, prompt batch size: )rF   PILImagerj   r   npndarrayre   ru   rt   rJ   rd   rf   rv   )rO   r   rw   r7   image_is_pilimage_is_tensorimage_is_npimage_is_pil_listimage_is_tensor_listimage_is_np_listimage_batch_sizeprompt_batch_sizer0   r0   r1   r   K  sF   

z/AnimateDiffSparseControlNetPipeline.check_imagec
                 C   s~   ||||| j  || j  f}
t|tr%t||kr%tdt| d| d|	d u r2t|
|||d}	n|	|}	|	| jj }	|	S )Nz/You have passed a list of generators of length z+, but requested an effective batch size of z@. Make sure the batch size matches the length of the generators.)r(   r`   r_   )	rA   rF   re   rJ   rv   r   ro   r>   init_noise_sigma)rO   r{   num_channels_latentsr   r   r   r_   r`   r(   r,   rf   r0   r0   r1   prepare_latentsq  s"   
z3AnimateDiffSparseControlNetPipeline.prepare_latentsc                 C   s   | j j|||d}|d||}|j\}}}	}}| dkr&| dks(J | jjrY|	|| |	||}d| d }t
| j|| jjj }
|
	||d|| j || j }
n|}
|
ddddd}
|
S )N)r   r   r   r$   r    r   r   )rN   
preprocess	unsqueezero   rf   minmaxr=   "use_simplified_condition_embeddingr   r2   r9   encoderK   r   rA   r   )rO   r   r   r   r`   r_   controlnet_imagesr{   r   r   conditioning_framesr0   r0   r1   prepare_image  s   z1AnimateDiffSparseControlNetPipeline.prepare_imager   r   controlnet_frame_indicesr`   r_   returnc                 C   s   |j d t|ksJ |j \}}}}	}
tj||||	|
f||d}tj|d||	|
f||d}|d d d d d t|f |d d d d |f< d|d d d d |f< ||fS )Nr    r^   r$   )rf   rJ   rj   zeros)rO   r   r   r   r`   r_   r{   r   r   r   r   controlnet_condcontrolnet_cond_maskr0   r0   r1   #prepare_sparse_control_conditioning  s   0zGAnimateDiffSparseControlNetPipeline.prepare_sparse_control_conditioningc                 C      | j S r   _guidance_scaler   r0   r0   r1   guidance_scale     z2AnimateDiffSparseControlNetPipeline.guidance_scalec                 C   r   r   )
_clip_skipr   r0   r0   r1   rS     r   z-AnimateDiffSparseControlNetPipeline.clip_skipc                 C   s
   | j dkS )Nr$   r   r   r0   r0   r1   ry     s   
z?AnimateDiffSparseControlNetPipeline.do_classifier_free_guidancec                 C   r   r   )_cross_attention_kwargsr   r0   r0   r1   cross_attention_kwargs  r   z:AnimateDiffSparseControlNetPipeline.cross_attention_kwargsc                 C   r   r   )_num_timestepsr   r0   r0   r1   num_timesteps  r   z1AnimateDiffSparseControlNetPipeline.num_timesteps   2   g      @r$   g        pilTr   Fr,   rw   r   r   num_inference_stepsr   rz   num_videos_per_promptr   r(   r   r   output_typereturn_dictr   
guess_modecallback_on_step_endr   c           9      C   sr  t | jr	| jjn| j}|p| jjj| j }|p| jjj| j }d}| j|||||||||||d || _|| _	|| _
|durGt|trGd}n|durUt|trUt|}n|jd }| j}t|trf|jjn|jd jj}|pp|}| jdur}| jddnd}| j|||| j||||| jd	\}}| jrt||g}|j|dd}|dus|dur| ||||| | j}| |||||j}| |||||j\} }!| j j!||d | j j"}"| jjj#}#| $|| |#||||j||
|	}| %|
|	}$|dus|durd	|ind}%| j&r| j'nd}&t(|&D ]}'| j&r"| )||'|||j|
\}}"t|"| _*t|"|| j j+  }(| j,| j*d
})t-|"D ]\}*}+| jrMt|gd n|},| j .|,|+},|rn| jrn|}-| j .|-|+}-|/dd }.n|,}-|}.| j|-|+|.| |!||dd\}/}0| j|,|+|||%|/|0dj0}1| jr|1/d\}2}3|2||3|2   }1| j j1|1|+|fi |$j2}|duri }4|D ]
}5t3 |5 |4|5< q|| |*|+|4}6|64d|}|64d|}|64d|}|*t|"d ks|*d |(kr|*d | j j+ dkr|)5  t6rt78  q=W d   n	1 sw   Y  q|dkr|}7n| 9|}8| j:j;|8|d}7| <  |s4|7fS t=|7dS )uI  
        The call function to the pipeline for generation.

        Args:
            prompt (`str` or `list[str]`, *optional*):
                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                The height in pixels of the generated video.
            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                The width in pixels of the generated video.
            num_frames (`int`, *optional*, defaults to 16):
                The number of video frames that are generated. Defaults to 16 frames which at 8 frames per seconds
                amounts to 2 seconds of video.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality videos at the
                expense of slower inference.
            guidance_scale (`float`, *optional*, defaults to 7.5):
                A higher guidance scale value encourages the model to generate images closely linked to the text
                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
            negative_prompt (`str` or `list[str]`, *optional*):
                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) from the [DDIM](https://huggingface.co/papers/2010.02502) paper. Only
                applies to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
            generator (`torch.Generator` or `list[torch.Generator]`, *optional*):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                generation deterministic.
            latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor is generated by sampling using the supplied random `generator`. Latents should be of shape
                `(batch_size, num_channel, num_frames, height, width)`.
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                provided, text embeddings are generated from the `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
            ip_adapter_image: (`PipelineImageInput`, *optional*):
                Optional image input to work with IP Adapters.
            ip_adapter_image_embeds (`list[torch.Tensor]`, *optional*):
                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
                provided, embeddings are computed from the `ip_adapter_image` input argument.
            conditioning_frames (`list[PipelineImageInput]`, *optional*):
                The SparseControlNet input to provide guidance to the `unet` for generation.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated video. Choose between `torch.Tensor`, `PIL.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] instead
                of a plain tuple.
            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            controlnet_conditioning_scale (`float` or `list[float]`, *optional*, defaults to 1.0):
                The outputs of the ControlNet are multiplied by `controlnet_conditioning_scale` before they are added
                to the residual in the original `unet`. If multiple ControlNets are specified in `init`, you can set
                the corresponding scale as a list.
            controlnet_frame_indices (`list[int]`):
                The indices where the conditioning frames must be applied for generation. Multiple frames can be
                provided to guide the model to generate similar structure outputs, where the `unet` can
                "fill-in-the-gaps" for interpolation videos, or a single frame could be provided for general expected
                structure. Must have the same length as `conditioning_frames`.
            clip_skip (`int`, *optional*):
                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                the output of the pre-final layer will be used for computing the prompt embeddings.
            callback_on_step_end (`Callable`, *optional*):
                A function that calls at the end of each denoising steps during the inference. The function is called
                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
                `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`list`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                `._callback_tensor_inputs` attribute of your pipeline class.

        Examples:

        Returns:
            [`~pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] or `tuple`:
                If `return_dict` is `True`, [`~pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] is
                returned, otherwise a `tuple` is returned where the first element is a list with the generated frames.
        r$   )rw   r   r   rz   r7   r8   r   r   r   r   r   Nr   scale)r7   r8   rR   rS   )repeatsr   r   r   )totalr    F)encoder_hidden_statesr   conditioning_maskconditioning_scaler  r  )r  r   added_cond_kwargsdown_block_additional_residualsmid_block_additional_residualr,   r7   r8   latent)r   r  )frames)>r   r=   r   r<   rK   sample_sizerA   r   r   r   r   rF   rd   re   rJ   rf   _execution_devicer   global_pool_conditionsnetsr   getr   ry   rS   rj   r   r   r   r   r_   r   r>   set_timesteps	timestepsin_channelsr   r   free_init_enabled_free_init_num_itersrange_apply_free_initr   orderprogress_barr   scale_model_inputr   r&   r   prev_samplelocalspopupdateXLA_AVAILABLExm	mark_stepr   rM   postprocess_videomaybe_free_model_hooksr%   )9rO   rw   r   r   r   r  r   rz   r  r   r(   r,   r7   r8   r   r   r   r  r  r   r   r   r  rS   r  r   r=   r{   r`   r  text_encoder_lora_scaler   r   r   r  r   r   r  num_free_init_itersfree_init_iternum_warmup_stepsr!  r   tlatent_model_inputcontrol_model_inputcontrolnet_prompt_embedsdown_block_res_samplesmid_block_res_sample
noise_prednoise_pred_uncondnoise_pred_textcallback_kwargsr   callback_outputsr   video_tensorr0   r0   r1   __call__  s"  s



	



6
?

z,AnimateDiffSparseControlNetPipeline.__call__)NN)NNNNNr   )NNNNNNNr   )6__name__
__module____qualname____doc__model_cpu_offload_seq_optional_componentsr   r   r   r   r   r   r   r   r   r   r   rE   rj   r   r   intr   r   r   r   r   r   r   r   r   r`   r_   tupler   propertyr   rS   ry   r   r   no_gradr   EXAMPLE_DOC_STRINGrd   re   	Generatorr	   booldictr   r   r<  __classcell__r0   r0   rP   r1   r3      sB   		
(	

 
8.
_'






	


r3   )Nr&   )Gr   typingr   r   numpyr   r   rj   torch.nn.functionalnn
functionalr   transformersr   r   r   r   image_processorr	   r
   loadersr   r   r   r   modelsr   r   r   r   (models.controlnets.controlnet_sparsectrlr   models.lorar   models.unets.unet_motion_modelr   
schedulersr   utilsr   r   r   r   r   r   utils.torch_utilsr   r   rM   r   free_init_utilsr!   pipeline_utilsr"   r#   pipeline_outputr%   torch_xla.core.xla_modelcore	xla_modelr(  r'  
get_loggerr=  rm   rG  r   rH  rd   r2   r3   r0   r0   r0   r1   <module>   sV    
@

