o
    Gi                     @   s^  d dl Z d dlmZmZ d dlZd dlmZmZmZm	Z	 ddl
mZ ddlmZmZmZ ddlmZmZmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZmZm Z m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z' ddl(m)Z) ddl*m+Z+ ddl,m-Z-m.Z. ddl/m0Z0 e rd dl1m2  m3Z4 dZ5ndZ5e6e7Z8dZ9G dd de-e.eeee)e+e0
Z:dS )    N)AnyCallable)CLIPImageProcessorCLIPTextModelCLIPTokenizerCLIPVisionModelWithProjection   )PipelineImageInput)IPAdapterMixinStableDiffusionLoraLoaderMixinTextualInversionLoaderMixin)AutoencoderKLImageProjectionUNet2DConditionModelUNetMotionModel)adjust_lora_scale_text_encoder)MotionAdapter)KarrasDiffusionSchedulers)USE_PEFT_BACKENDis_torch_xla_availableloggingreplace_example_docstringscale_lora_layersunscale_lora_layers)randn_tensor)VideoProcessor   )AnimateDiffPipelineOutput)FreeInitMixin)AnimateDiffFreeNoiseMixin)DiffusionPipelineStableDiffusionMixin   )PAGMixinTFa  
    Examples:
        ```py
        >>> import torch
        >>> from diffusers import AnimateDiffPAGPipeline, MotionAdapter, DDIMScheduler
        >>> from diffusers.utils import export_to_gif

        >>> model_id = "SG161222/Realistic_Vision_V5.1_noVAE"
        >>> motion_adapter_id = "guoyww/animatediff-motion-adapter-v1-5-2"
        >>> motion_adapter = MotionAdapter.from_pretrained(motion_adapter_id)
        >>> scheduler = DDIMScheduler.from_pretrained(
        ...     model_id, subfolder="scheduler", beta_schedule="linear", steps_offset=1, clip_sample=False
        ... )
        >>> pipe = AnimateDiffPAGPipeline.from_pretrained(
        ...     model_id,
        ...     motion_adapter=motion_adapter,
        ...     scheduler=scheduler,
        ...     pag_applied_layers=["mid"],
        ...     torch_dtype=torch.float16,
        ... ).to("cuda")

        >>> video = pipe(
        ...     prompt="car, futuristic cityscape with neon lights, street, no human",
        ...     negative_prompt="low quality, bad quality",
        ...     num_inference_steps=25,
        ...     guidance_scale=6.0,
        ...     pag_scale=3.0,
        ...     generator=torch.Generator().manual_seed(42),
        ... ).frames[0]

        >>> export_to_gif(video, "animatediff_pag.gif")
        ```
c                3       s^  e Zd ZdZdZg dZg dZ			dMdeded	e	d
e
eB dededededeee B f fddZ					dNdejdB dejdB dedB dedB fddZdOddZdd ZdPdefddZd d! Z						dQd"d#Z	dOd$d%Zed&d' Zed(d) Z ed*d+ Z!ed,d- Z"ed.d/ Z#e$ e%e&ddddd0d1dd2d3ddddddd4d5dddd6gdd7d3fd8eee B dB d9edB d:edB d;edB d<ed=ed>eee B dB d?edB d@edAej'eej' B dB d6ejdB dejdB dejdB dBe(dB dCeej dB dDedB dEe)dFe*ee+f dB dedB dGe,eegdf dB dHee dedIedJef0dKdLZ-  Z.S )RAnimateDiffPAGPipelinea7  
    Pipeline for text-to-video generation using
    [AnimateDiff](https://huggingface.co/docs/diffusers/en/api/pipelines/animatediff) and [Perturbed Attention
    Guidance](https://huggingface.co/docs/diffusers/en/using-diffusers/pag).

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
    implemented for all pipelines (downloading, saving, running on a particular device, etc.).

    The pipeline also inherits the following loading methods:
        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters

    Args:
        vae ([`AutoencoderKL`]):
            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
        text_encoder ([`CLIPTextModel`]):
            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
        tokenizer (`CLIPTokenizer`):
            A [`~transformers.CLIPTokenizer`] to tokenize text.
        unet ([`UNet2DConditionModel`]):
            A [`UNet2DConditionModel`] used to create a UNetMotionModel to denoise the encoded video latents.
        motion_adapter ([`MotionAdapter`]):
            A [`MotionAdapter`] to be used in combination with `unet` to denoise the encoded video latents.
        scheduler ([`SchedulerMixin`]):
            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
    z&text_encoder->image_encoder->unet->vae)feature_extractorimage_encodermotion_adapter)latentsprompt_embedsnegative_prompt_embedsNmid_block.*attn1vaetext_encoder	tokenizerunetr'   	schedulerr%   r&   pag_applied_layersc
           
   
      s   t    t|trt||}| j||||||||d t| dd r.dt| j	j
jd  nd| _td| jd| _| |	 d S )N)r,   r-   r.   r/   r'   r0   r%   r&   r,   r   r"      F)	do_resizevae_scale_factor)super__init__
isinstancer   r   from_unet2dregister_modulesgetattrlenr,   configblock_out_channelsr4   r   video_processorset_pag_applied_layers)
selfr,   r-   r.   r/   r'   r0   r%   r&   r1   	__class__ g/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/pipelines/pag/pipeline_pag_sd_animatediff.pyr6      s    

(
zAnimateDiffPAGPipeline.__init__r)   r*   
lora_scale	clip_skipc
              
   C   s  |durt | tr|| _tst| j| nt| j| |dur't |tr'd}
n|dur5t |tr5t	|}
n|j
d }
|du rt | trJ| || j}| j|d| jjddd}|j}| j|ddd	j}|j
d
 |j
d
 krt||s| j|dd| jjd d
f }td| jj d|  t| jjdr| jjjr|j|}nd}|	du r| j|||d}|d }n| j|||dd}|d
 |	d   }| jj|}| jdur| jj}n| jdur| jj}n|j}|j||d}|j
\}}}|d|d}||| |d
}|r|du r|du rdg|
 }nC|dur8t |t |ur8t!dt | dt | dt |trB|g}n|
t	|kr\t"d| dt	| d| d|
 d	|}t | trk| || j}|j
d }| j|d|ddd}t| jjdr| jjjr|j|}nd}| j|j||d}|d }|r|j
d }|j||d}|d|d}||
| |d
}| jdurt | trtrt#| j| ||fS )a  
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `list[str]`, *optional*):
                prompt to be encoded
            device: (`torch.device`):
                torch device
            num_images_per_prompt (`int`):
                number of images that should be generated per prompt
            do_classifier_free_guidance (`bool`):
                whether to use classifier free guidance or not
            negative_prompt (`str` or `list[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            lora_scale (`float`, *optional*):
                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
            clip_skip (`int`, *optional*):
                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                the output of the pre-final layer will be used for computing the prompt embeddings.
        Nr"   r   
max_lengthTpt)paddingrG   
truncationreturn_tensorslongest)rI   rK   z\The following part of your input was truncated because CLIP can only handle sequences up to z	 tokens: use_attention_mask)attention_mask)rO   output_hidden_states)dtypedevice z?`negative_prompt` should be the same type to `prompt`, but got z != .z`negative_prompt`: z has batch size z, but `prompt`: zT. Please make sure that passed `negative_prompt` matches the batch size of `prompt`.)$r7   r   _lora_scaler   r   r-   r   strlistr;   shaper   maybe_convert_promptr.   model_max_length	input_idstorchequalbatch_decodeloggerwarninghasattrr<   rN   rO   to
text_modelfinal_layer_normrQ   r/   repeatviewtype	TypeError
ValueErrorr   )r@   promptrR   num_images_per_promptdo_classifier_free_guidancenegative_promptr)   r*   rE   rF   
batch_sizetext_inputstext_input_idsuntruncated_idsremoved_textrO   prompt_embeds_dtypebs_embedseq_len_uncond_tokensrG   uncond_inputrC   rC   rD   encode_prompt   s   +











z$AnimateDiffPAGPipeline.encode_promptc           
      C   s   t | j j}t|tjs| j|ddj}|j	||d}|rH| j|ddj
d }|j|dd}| jt|ddj
d }|j|dd}||fS | |j}|j|dd}t|}	||	fS )	NrH   )rK   )rR   rQ   T)rP   r   dim)nextr&   
parametersrQ   r7   r\   Tensorr%   pixel_valuesrb   hidden_statesrepeat_interleave
zeros_likeimage_embeds)
r@   imagerR   rk   rP   rQ   image_enc_hidden_statesuncond_image_enc_hidden_statesr   uncond_image_embedsrC   rC   rD   encode_image\  s(   

z#AnimateDiffPAGPipeline.encode_imagec                 C   sl  g }|rg }|d u ret |ts|g}t|t| jjjkr/tdt| dt| jjj dt|| jjjD ],\}}	t |	t }
| 	||d|
\}}|
|d d d f  |rc|
|d d d f  q7n|D ]}|rw|d\}}|
| |
| qgg }t|D ]0\}}tj|g| dd}|rtj|| g| dd}tj||gdd}|j|d}|
| q|S )	NzK`ip_adapter_image` must have same length as the number of IP Adapters. Got z images and z IP Adapters.r"   r   r   r{   rR   )r7   rW   r;   r/   encoder_hid_projimage_projection_layersri   zipr   r   appendchunk	enumerater\   catrb   )r@   ip_adapter_imageip_adapter_image_embedsrR   rk   rl   r   negative_image_embedssingle_ip_adapter_imageimage_proj_layeroutput_hidden_statesingle_image_embedssingle_negative_image_embedsirC   rC   rD   prepare_ip_adapter_image_embedsu  sH   


z6AnimateDiffPAGPipeline.prepare_ip_adapter_image_embeds   decode_chunk_sizec                 C   s   d| j jj | }|j\}}}}}|ddddd|| |||}g }td|jd |D ]}	||	|	|  }
| j |
j}
|	|
 q-t
|}|d d d f ||df|jdd   ddddd}| }|S )Nr"   r   r   r      rM   )r,   r<   scaling_factorrX   permutereshaperangedecodesampler   r\   r   float)r@   r(   r   rn   channels
num_framesheightwidthvideor   batch_latentsrC   rC   rD   decode_latents  s   "
8z%AnimateDiffPAGPipeline.decode_latentsc                 C   sX   dt t| jjj v }i }|r||d< dt t| jjj v }|r*||d< |S )Neta	generator)setinspect	signaturer0   stepr~   keys)r@   r   r   accepts_etaextra_step_kwargsaccepts_generatorrC   rC   rD   prepare_extra_step_kwargs  s   z0AnimateDiffPAGPipeline.prepare_extra_step_kwargsc
           
         s  |d dks|d dkrt d| d| d|	d ur8t fdd|	D s8t d j d	 fd
d|	D  |d urK|d urKt d| d| d|d u rW|d u rWt d|d urnt|tsnt|tsnt dt| |d ur|d urt d| d| d|d ur|d ur|j|jkrt d|j d|j d|d ur|d urt d|d urt|tst dt| |d jdvrt d|d j dd S d S )Nr2   r   z7`height` and `width` have to be divisible by 8 but are z and rT   c                 3   s    | ]}| j v V  qd S N_callback_tensor_inputs.0kr@   rC   rD   	<genexpr>  s    

z6AnimateDiffPAGPipeline.check_inputs.<locals>.<genexpr>z2`callback_on_step_end_tensor_inputs` has to be in z, but found c                    s   g | ]	}| j vr|qS rC   r   r   r   rC   rD   
<listcomp>  s    z7AnimateDiffPAGPipeline.check_inputs.<locals>.<listcomp>zCannot forward both `prompt`: z and `prompt_embeds`: z2. Please make sure to only forward one of the two.zeProvide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined.z2`prompt` has to be of type `str` or `list` but is z'Cannot forward both `negative_prompt`: z and `negative_prompt_embeds`: zu`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but got: `prompt_embeds` z != `negative_prompt_embeds` zProvide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined.z:`ip_adapter_image_embeds` has to be of type `list` but is )r   r   zF`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is D)	ri   allr   r7   rV   rW   rg   rX   ndim)
r@   rj   r   r   rm   r)   r*   r   r   "callback_on_step_end_tensor_inputsrC   r   rD   check_inputs  s`   
z#AnimateDiffPAGPipeline.check_inputsc
                 C   s   | j r| |||||||||		}	t|tr(t||kr(tdt| d| d||||| j || j f}
|	d u rBt|
|||d}	n|	|}	|	| j	j
 }	|	S )Nz/You have passed a list of generators of length z+, but requested an effective batch size of z@. Make sure the batch size matches the length of the generators.)r   rR   rQ   )free_noise_enabled_prepare_latents_free_noiser7   rW   r;   ri   r4   r   rb   r0   init_noise_sigma)r@   rn   num_channels_latentsr   r   r   rQ   rR   r   r(   rX   rC   rC   rD   prepare_latents  s*   
z&AnimateDiffPAGPipeline.prepare_latentsc                 C      | j S r   _guidance_scaler   rC   rC   rD   guidance_scale(     z%AnimateDiffPAGPipeline.guidance_scalec                 C   r   r   )
_clip_skipr   rC   rC   rD   rF   ,  r   z AnimateDiffPAGPipeline.clip_skipc                 C   s
   | j dkS )Nr"   r   r   rC   rC   rD   rl   3  s   
z2AnimateDiffPAGPipeline.do_classifier_free_guidancec                 C   r   r   )_cross_attention_kwargsr   rC   rC   rD   cross_attention_kwargs7  r   z-AnimateDiffPAGPipeline.cross_attention_kwargsc                 C   r   r   )_num_timestepsr   rC   rC   rD   num_timesteps;  r   z$AnimateDiffPAGPipeline.num_timesteps2   g      @r"   g        pilTr(   g      @rj   r   r   r   num_inference_stepsr   rm   num_videos_per_promptr   r   r   r   output_typereturn_dictr   callback_on_step_endr   	pag_scalepag_adaptive_scalec           2      C   s  |p	| j jj| j }|p| j jj| j }d}| |||||||||	 || _|| _|| _|| _|| _	|dur>t
|tr>d}n|durLt
|trLt|}n|jd }| j}| jdur`| jddnd}| j|||| j||||| jd	\}}| jr| ||| j}n
| jrt||g}|j|dd}|dus|dur| ||||| | j}t|D ]1\}}d}| jr|d\}}| jr| ||| j}n| jrtj||gdd}||}|||< q| jj||d	 | jj }| j jj!} | "|| | ||||j#||
|	}| $|
|	}!|dus|durd
|ind}"| jr#| j j%}#| j&| j'| jd | j(r*| j)nd}$t*|$D ]}%| j(rC| +||%|||j#|
\}}t|| _,t||| jj-  }&| j.| j,d}'t|D ]\}}(t|g|jd | |jd   })| j/|)|(})| j |)|(|||"dj0}*| jr| 1|*| j| j2|(}*n| jr|*d\}+},|+||,|+   }*| jj3|*|(|fi |!j4}|duri }-|D ]
}.t5 |. |-|.< q|| ||(|-}/|/6d|}|/6d|}|/6d|}|t|d ks|d |&kr|d | jj- dkr|'7  t8rt9:  q^W d   n	1 sw   Y  q0|dkr$|}0n| ;||}1| j<j=|1|d}0| >  | jr@| j ?|# |sF|0fS t@|0dS )u  
        The call function to the pipeline for generation.

        Args:
            prompt (`str` or `list[str]`, *optional*):
                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                The height in pixels of the generated video.
            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                The width in pixels of the generated video.
            num_frames (`int`, *optional*, defaults to 16):
                The number of video frames that are generated. Defaults to 16 frames which at 8 frames per seconds
                amounts to 2 seconds of video.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality videos at the
                expense of slower inference.
            guidance_scale (`float`, *optional*, defaults to 7.5):
                A higher guidance scale value encourages the model to generate images closely linked to the text
                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
            negative_prompt (`str` or `list[str]`, *optional*):
                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) from the [DDIM](https://huggingface.co/papers/2010.02502) paper. Only
                applies to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
            generator (`torch.Generator` or `list[torch.Generator]`, *optional*):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                generation deterministic.
            latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor is generated by sampling using the supplied random `generator`. Latents should be of shape
                `(batch_size, num_channel, num_frames, height, width)`.
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                provided, text embeddings are generated from the `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
            ip_adapter_image: (`PipelineImageInput`, *optional*):
                Optional image input to work with IP Adapters.
            ip_adapter_image_embeds (`list[torch.Tensor]`, *optional*):
                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
                provided, embeddings are computed from the `ip_adapter_image` input argument.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated video. Choose between `torch.Tensor`, `PIL.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] instead
                of a plain tuple.
            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            clip_skip (`int`, *optional*):
                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                the output of the pre-final layer will be used for computing the prompt embeddings.
            callback_on_step_end (`Callable`, *optional*):
                A function that calls at the end of each denoising steps during the inference. The function is called
                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
                `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`list`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                `._callback_tensor_inputs` attribute of your pipeline class.
            pag_scale (`float`, *optional*, defaults to 3.0):
                The scale factor for the perturbed attention guidance. If it is set to 0.0, the perturbed attention
                guidance will not be used.
            pag_adaptive_scale (`float`, *optional*, defaults to 0.0):
                The adaptive scale factor for the perturbed attention guidance. If it is set to 0.0, `pag_scale` is
                used.

        Examples:

        Returns:
            [`~pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] or `tuple`:
                If `return_dict` is `True`, [`~pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] is
                returned, otherwise a `tuple` is returned where the first element is a list with the generated frames.
        r"   Nr   scale)r)   r*   rE   rF   )repeatsr|   r   r{   r   r   )r1   rl   )total)encoder_hidden_statesr   added_cond_kwargsr(   r)   r*   latent)r   r   )frames)Ar/   r<   sample_sizer4   r   r   r   r   
_pag_scale_pag_adaptive_scaler7   rV   rW   r;   rX   _execution_devicer   getry   rl   rF   do_perturbed_attention_guidance%_prepare_perturbed_attention_guidancer\   r   r   r   r   r   rb   r0   set_timesteps	timestepsin_channelsr   rQ   r   attn_processors_set_pag_attn_processorr1   free_init_enabled_free_init_num_itersr   _apply_free_initr   orderprogress_barscale_model_inputr   #_apply_perturbed_attention_guidancer   r   prev_samplelocalspopupdateXLA_AVAILABLExm	mark_stepr   r>   postprocess_videomaybe_free_model_hooksset_attn_processorr   )2r@   rj   r   r   r   r   r   rm   r   r   r   r(   r)   r*   r   r   r   r   r   rF   r   r   r   r   r   rn   rR   text_encoder_lora_scaler   r   r   r   r   r   r   original_attn_procnum_free_init_itersfree_init_iternum_warmup_stepsr   tlatent_model_input
noise_prednoise_pred_uncondnoise_pred_textcallback_kwargsr   callback_outputsr   video_tensorrC   rC   rD   __call__?  s&  o






	
6
/
zAnimateDiffPAGPipeline.__call__)NNr+   )NNNNNr   )r   )NNNNNN)/__name__
__module____qualname____doc__model_cpu_offload_seq_optional_componentsr   r   r   r   r   r   r   r   r   r   rV   rW   r6   r\   r   r   intry   r   r   r   r   r   r   propertyr   rF   rl   r   r   no_gradr   EXAMPLE_DOC_STRING	Generatorr	   booldictr   r   r  __classcell__rC   rC   rA   rD   r$   Y   s   

	

&	

 
8.
A
 




	
r$   );r   typingr   r   r\   transformersr   r   r   r   image_processorr	   loadersr
   r   r   modelsr   r   r   r   models.lorar   models.unets.unet_motion_modelr   
schedulersr   utilsr   r   r   r   r   r   utils.torch_utilsr   r>   r   animatediff.pipeline_outputr   free_init_utilsr   free_noise_utilsr   pipeline_utilsr    r!   	pag_utilsr#   torch_xla.core.xla_modelcore	xla_modelr   r   
get_loggerr  r_   r  r$   rC   rC   rC   rD   <module>   sD    

#