o
    ۷i\                  
   @   s  d dl Z d dlmZ d dlZd dlZd dlm  mZ	 d dl
mZ d dlmZmZmZmZ ddlmZmZ ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z& e rzd dl'm(  m)Z* dZ+ndZ+e,e-Z.dZ/dd Z0dd Z1dd Z2d(ddZ3d)ddZ4d*ddZ5				d+d e6dB d!e7ej8B dB d"e9e6 dB d#e9e: dB fd$d%Z;G d&d' d'eZ<dS ),    N)Callable)Image)	BertModelBertTokenizerQwen2TokenizerQwen2VLForConditionalGeneration   )MultiPipelineCallbacksPipelineCallback)VaeImageProcessor)AutoencoderKLMagvitEasyAnimateTransformer3DModel)DiffusionPipeline)FlowMatchEulerDiscreteScheduler)is_torch_xla_availableloggingreplace_example_docstring)randn_tensor)VideoProcessor   )EasyAnimatePipelineOutputTFaw  
    Examples:
        ```py
        >>> import torch
        >>> from diffusers import EasyAnimateInpaintPipeline
        >>> from diffusers.pipelines.easyanimate.pipeline_easyanimate_inpaint import get_image_to_video_latent
        >>> from diffusers.utils import export_to_video, load_image

        >>> pipe = EasyAnimateInpaintPipeline.from_pretrained(
        ...     "alibaba-pai/EasyAnimateV5.1-12b-zh-InP-diffusers", torch_dtype=torch.bfloat16
        ... )
        >>> pipe.to("cuda")

        >>> prompt = "An astronaut hatching from an egg, on the surface of the moon, the darkness and depth of space realised in the background. High quality, ultrarealistic detail and breath-taking movie-like camera shot."
        >>> validation_image_start = load_image(
        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/astronaut.jpg"
        ... )

        >>> validation_image_end = None
        >>> sample_size = (448, 576)
        >>> num_frames = 49
        >>> input_video, input_video_mask = get_image_to_video_latent(
        ...     [validation_image_start], validation_image_end, num_frames, sample_size
        ... )

        >>> video = pipe(
        ...     prompt,
        ...     num_frames=num_frames,
        ...     negative_prompt="Twisted body, limb deformities, text subtitles, comics, stillness, ugliness, errors, garbled text.",
        ...     height=sample_size[0],
        ...     width=sample_size[1],
        ...     video=input_video,
        ...     mask_video=input_video_mask,
        ... )
        >>> export_to_video(video.frames[0], "output.mp4", fps=8)
        ```
c                 C   s   t | tjrtjjj| d|dddd} n5t | tjr/| 	|d |d f} t
| } nt | t
jrIt| 	|d |d f} t
| } ntdt | tjsat| ddd d } | S )	zd
    Preprocess a single image (PIL.Image, numpy.ndarray, or torch.Tensor) to a resized tensor.
    r   bilinearFsizemodealign_cornersr   zKUnsupported input type. Expected PIL.Image, numpy.ndarray, or torch.Tensor.   g     o@)
isinstancetorchTensornn
functionalinterpolate	unsqueezesqueezer   resizenparrayndarray	fromarray
ValueError
from_numpypermutefloat)imagesample_size r0   r/home/ubuntu/vllm_env/lib/python3.10/site-packages/diffusers/pipelines/easyanimate/pipeline_easyanimate_inpaint.pypreprocess_imageZ   s    r2   c           
         sJ  d}d}| durt | tr fdd| D }nt|  }t |trStjdd |D dd}t|ddddddf dd|ddg}||dddddt|f< nt|dddd|ddg}t|ddddf }t |trd	|ddddt|df< nd	|ddddddf< |durt |trՇ fd
d|D }tjdd |D dd}	|	|ddddt|	 df< d|ddddt| df< ||fS t| }|dd|ddddddf< d|ddddddf< ||fS | du r!t	dd| d  d g}t
dd| d  d gd	 }||fS )z
    Generate latent representations for video from start and end images. Inputs can be PIL.Image, numpy.ndarray, or
    torch.Tensor.
    Nc                       g | ]}t | qS r0   r2   .0imgr/   r0   r1   
<listcomp>       z-get_image_to_video_latent.<locals>.<listcomp>c                 S      g | ]
}| d  dqS r   r   r#   r5   r0   r0   r1   r9          r   dimr   r      c                    r3   r0   r4   r5   r8   r0   r1   r9      r:   c                 S   r;   r<   r=   r5   r0   r0   r1   r9      r>   r   )r   listr2   r   cattilelenr#   
zeros_likezerosones)
validation_image_startvalidation_image_end
num_framesr/   input_videoinput_video_maskimage_startstart_video	image_end	end_videor0   r8   r1   get_image_to_video_latentu   sN   


, 
 
  
&
 rS   c                 C   s   |}|}| \}}|| }||| kr|}t t|| | }	n|}	t t|| | }t t|| d }
t t||	 d }|
|f|
| ||	 ffS )Ng       @)intround)src	tgt_width
tgt_heighttwthhwrresize_heightresize_widthcrop_top	crop_leftr0   r0   r1   get_resize_crop_region_for_grid   s   rb           c                 C   sX   |j ttd|jdd}| j ttd| jdd}| ||  }|| d| |   } | S )a  
    Rescales `noise_cfg` tensor based on `guidance_rescale` to improve image quality and fix overexposure. Based on
    Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
    Flawed](https://huggingface.co/papers/2305.08891).

    Args:
        noise_cfg (`torch.Tensor`):
            The predicted noise tensor for the guided diffusion process.
        noise_pred_text (`torch.Tensor`):
            The predicted noise tensor for the text-guided diffusion process.
        guidance_rescale (`float`, *optional*, defaults to 0.0):
            A rescale factor applied to the noise predictions.

    Returns:
        noise_cfg (`torch.Tensor`): The rescaled noise prediction tensor.
    r   T)r@   keepdim)stdrC   rangendim)	noise_cfgnoise_pred_textguidance_rescalestd_textstd_cfgnoise_pred_rescaledr0   r0   r1   rescale_noise_cfg   s
   rn   c                 C   s   |  }|rkt|dd  }d|d< tj| d d d d ddd d d d f |ddd}t|dd  }|d d |d< |d dkrgtj| d d d d dd d d d d f |ddd}tj||gdd}|S |}|S t|dd  }tj| |ddd}|S )Nr   r   r   	trilinearFr   r?   )r   rC   Fr"   r   rD   )masklatentprocess_first_frame_onlylatent_sizetarget_sizefirst_frame_resizedremaining_frames_resizedresized_maskr0   r0   r1   resize_mask   s(   **ry   c                 C   s   |d u rt jdd| jd fd| j}t || j}nt | jd f| j| j| }|d urLt j| 	 || j| jd|d d d d d d f  }nt 
| |d d d d d d f  }t | dkt | |}| | } | S )Ng            ?r   )meanre   r   )	generatordtypedevicerB   )r   normalshapetor~   expr}   rI   randnr   
randn_likewhererG   )r.   ratior|   sigmaimage_noiser0   r0   r1   add_noise_to_reference_video   s    " r   num_inference_stepsr~   	timestepssigmasc                 K   s  |dur|durt d|dur>dtt| jj v }|s(t d| j d| jd||d| | j}t	|}||fS |durpdtt| jj v }|sZt d| j d| jd||d	| | j}t	|}||fS | j|fd
|i| | j}||fS )a  
    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler to get timesteps from.
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
            must be `None`.
        device (`str` or `torch.device`, *optional*):
            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
        timesteps (`list[int]`, *optional*):
            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
            `num_inference_steps` and `sigmas` must be `None`.
        sigmas (`list[float]`, *optional*):
            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
            `num_inference_steps` and `timesteps` must be `None`.

    Returns:
        `tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
        second element is the number of inference steps.
    NzYOnly one of `timesteps` or `sigmas` can be passed. Please choose one to set custom valuesr   zThe current scheduler class zx's `set_timesteps` does not support custom timestep schedules. Please check whether you are using the correct scheduler.)r   r~   r   zv's `set_timesteps` does not support custom sigmas schedules. Please check whether you are using the correct scheduler.)r   r~   r~   r0   )
r*   setinspect	signatureset_timesteps
parameterskeys	__class__r   rF   )	schedulerr   r~   r   r   kwargsaccepts_timestepsaccept_sigmasr0   r0   r1   retrieve_timesteps  s2   r   c                8       s  e Zd ZdZdZg dZdedeeB de	e
B dedef
 fd	d
Z										dNdeee B dededeee B dB dejdB dejdB dejdB dejdB dejdB dejdB defddZdd Z						dOddZd d! Zd"d# Z					$	$dPd%d&Zed'd( Zed)d* Zed+d, Zed-d. Z ed/d0 Z!e" e#e$dd1dddd2d2d3d4ddd5ddddddd6ddd7gd5d8d9dfdeee B d:edB d;ej%d<ej%d=ej%d>edB d?edB d@edB dAe&dB deee B dB dedB dBe&dB dCej'eej' B dB d7ejdB dejdB dejdB dejdB dejdB dDedB dEedFe(eegdf e)B e*B dB dGee dHe&dIe&dJe&dKee dB f4dLdMZ+  Z,S )QEasyAnimateInpaintPipelinea  
    Pipeline for text-to-video generation using EasyAnimate.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

    EasyAnimate uses one text encoder [qwen2 vl](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct) in V5.1.

    Args:
        vae ([`AutoencoderKLMagvit`]):
            Variational Auto-Encoder (VAE) Model to encode and decode video to and from latent representations.
        text_encoder (`~transformers.Qwen2VLForConditionalGeneration`, `~transformers.BertModel` | None):
            EasyAnimate uses [qwen2 vl](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct) in V5.1.
        tokenizer (`~transformers.Qwen2Tokenizer`, `~transformers.BertTokenizer` | None):
            A `Qwen2Tokenizer` or `BertTokenizer` to tokenize text.
        transformer ([`EasyAnimateTransformer3DModel`]):
            The EasyAnimate model designed by EasyAnimate Team.
        scheduler ([`FlowMatchEulerDiscreteScheduler`]):
            A scheduler to be used in combination with EasyAnimate to denoise the encoded image latents.
    ztext_encoder->transformer->vae)latentsprompt_embedsnegative_prompt_embedsvaetext_encoder	tokenizertransformerr   c                    s   t    | j|||||d t| dd d ur| jjjnd| _t| dd d ur+| jjnd| _	t| dd d ur:| jj
nd| _t| j	d| _t| j	dddd	| _t| j	d| _d S )
N)r   r   r   r   r   r   Tr         )vae_scale_factorF)r   do_normalizedo_binarizedo_convert_grayscale)super__init__register_modulesgetattrr   configenable_text_attention_maskr   spatial_compression_ratiovae_spatial_compression_ratiotemporal_compression_ratiovae_temporal_compression_ratior   image_processormask_processorr   video_processor)selfr   r   r   r   r   r   r0   r1   r   c  s0   


z#EasyAnimateInpaintPipeline.__init__r   TN   promptnum_images_per_promptdo_classifier_free_guidancenegative_promptr   r   prompt_attention_masknegative_prompt_attention_maskr~   r}   max_sequence_lengthc              	      sZ  |
p j j}
|	p j j}	|durt|trd}n|dur&t|tr&t|}n|jd }|du rt|tr?dd|dgdg}ndd	 |D } fd
d	|D } j|d|ddddd}|	 j j}|j
}|j} jrw j ||ddjd }ntd||d}|j	|
|	d}|j\}}}|d|d}||| |d}|j	|	d}|r|du r|durt|trdd|dgdg}ndd	 |D } fdd	|D } j|d|ddddd}|	 j j}|j
}|j} jr j ||ddjd }ntd||d}|r'|jd }|j	|
|	d}|d|d}||| |d}|j	|	d}||||fS )a[  
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `list[str]`, *optional*):
                prompt to be encoded
            device: (`torch.device`):
                torch device
            dtype (`torch.dtype`):
                torch dtype
            num_images_per_prompt (`int`):
                number of images that should be generated per prompt
            do_classifier_free_guidance (`bool`):
                whether to use classifier free guidance or not
            negative_prompt (`str` or `list[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            prompt_attention_mask (`torch.Tensor`, *optional*):
                Attention mask for the prompt. Required when `prompt_embeds` is passed directly.
            negative_prompt_attention_mask (`torch.Tensor`, *optional*):
                Attention mask for the negative prompt. Required when `negative_prompt_embeds` is passed directly.
            max_sequence_length (`int`, *optional*): maximum sequence length to use for the prompt.
        Nr   r   usertexttyper   rolecontentc                 S      g | ]}d d|dgdqS r   r   r   r   r0   )r6   _promptr0   r0   r1   r9         
z<EasyAnimateInpaintPipeline.encode_prompt.<locals>.<listcomp>c                        g | ]} j j|gd ddqS FT)tokenizeadd_generation_promptr   apply_chat_templater6   mr   r0   r1   r9         
max_lengthTrightpt)r   paddingr   
truncationreturn_attention_maskpadding_sidereturn_tensors)	input_idsattention_maskoutput_hidden_stateszLLM needs attention_mask)r}   r~   rB   r~   c                 S   r   r   r0   )r6   _negative_promptr0   r0   r1   r9     r   c                    r   r   r   r   r   r0   r1   r9     r   )r   r}   r~   r   strrC   rF   r   r   r   r   r   r   hidden_statesr*   repeatview)r   r   r   r   r   r   r   r   r   r~   r}   r   
batch_sizemessagesr   text_inputstext_input_idsbs_embedseq_len_r0   r   r1   encode_prompt  s   -




	

	
z(EasyAnimateInpaintPipeline.encode_promptc                 C   sX   dt t| jjj v }i }|r||d< dt t| jjj v }|r*||d< |S )Netar|   )r   r   r   r   stepr   r   )r   r|   r   accepts_etaextra_step_kwargsaccepts_generatorr0   r0   r1   prepare_extra_step_kwargs-  s   z4EasyAnimateInpaintPipeline.prepare_extra_step_kwargsc
           
         st  |d dks|d dkrt d| d| d|	d ur8t fdd|	D s8t d j d	 fd
d|	D  |d urK|d urKt d| d| d|d u rW|d u rWt d|d urnt|tsnt|tsnt dt| |d urz|d u rzt d|d ur|d urt d| d| d|d ur|d u rt d|d ur|d ur|j|jkrt d|j d|j dd S d S d S )N   r   z8`height` and `width` have to be divisible by 16 but are z and .c                 3   s    | ]}| j v V  qd S N_callback_tensor_inputsr6   kr   r0   r1   	<genexpr>M  s    

z:EasyAnimateInpaintPipeline.check_inputs.<locals>.<genexpr>z2`callback_on_step_end_tensor_inputs` has to be in z, but found c                    s   g | ]	}| j vr|qS r0   r   r   r   r0   r1   r9   Q  s    z;EasyAnimateInpaintPipeline.check_inputs.<locals>.<listcomp>zCannot forward both `prompt`: z and `prompt_embeds`: z2. Please make sure to only forward one of the two.zeProvide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined.z2`prompt` has to be of type `str` or `list` but is zEMust provide `prompt_attention_mask` when specifying `prompt_embeds`.z'Cannot forward both `negative_prompt`: z and `negative_prompt_embeds`: zWMust provide `negative_prompt_attention_mask` when specifying `negative_prompt_embeds`.zu`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but got: `prompt_embeds` z != `negative_prompt_embeds` )r*   allr   r   r   rC   r   r   )
r   r   heightwidthr   r   r   r   r   "callback_on_step_end_tensor_inputsr0   r   r1   check_inputs>  sN   z'EasyAnimateInpaintPipeline.check_inputsc                 C   sd   t t|| |}t|| d}| jj|| jj d  }t| jdr,| j|| jj  ||| fS )Nr   set_begin_index)minrT   maxr   r   orderhasattrr   )r   r   strengthr~   init_timestept_startr   r0   r0   r1   get_timestepsu  s   z(EasyAnimateInpaintPipeline.get_timestepsc                 C   sB  |d urB|j ||d}g }d}td|jd |D ]}||||  }| j|d }| }|| qtj|dd}|| jj	j
 }|d ur|j ||d}| jj	jrYt||
|d}g }d}td|jd |D ]}||||  }| j|d }| }|| qftj|dd}|| jj	j
 }|j ||d}||fS d }||fS )Nr~   r}   r   r   r?   )r   r|   )r   rf   r   r   encoder   appendr   rD   r   scaling_factorr   add_noise_in_inpaint_modelr   )r   rq   masked_imager   r   r   r}   r~   r|   r   noise_aug_strengthnew_maskbsimask_bsnew_mask_pixel_valuesmask_pixel_values_bsmasked_image_latentsr0   r0   r1   prepare_mask_latents  s<   
z/EasyAnimateInpaintPipeline.prepare_mask_latentsFc                 C   s  |||d | j  d || j || j f}t|tr,t||kr,tdt| d| d|s4|	d u r|s|
j||d}
d}g }td|
jd |D ]}|
|||  }| j	
|d }| }|| qHtj|dd}
|
| j	jj }
|
||
jd  dddd}|j||d}|	d u rt||||d}t| jtr|r|n| j|||}	n|r|n| j|||}	t| jd	r|r|	| jj n|	}	nt| jd	r|	|}|| jj }	n|	|}	|	f}|r||f7 }|r||f7 }|S )
Nr   z/You have passed a list of generators of length z+, but requested an effective batch size of z@. Make sure the batch size matches the length of the generators.r	  r   r?   )r|   r~   r}   init_noise_sigma)r   r   r   rC   rF   r*   r   rf   r   r   r
  sampler  r   rD   r   r  r   r   r   r   scale_noise	add_noiser  r  )r   r   num_channels_latentsr   r   rL   r}   r~   r|   r   videotimestepis_strength_maxreturn_noisereturn_video_latentsr   r  	new_videor  video_bsvideo_latentsnoiseoutputsr0   r0   r1   prepare_latents  sV   



z*EasyAnimateInpaintPipeline.prepare_latentsc                 C      | j S r   _guidance_scaler   r0   r0   r1   guidance_scale     z)EasyAnimateInpaintPipeline.guidance_scalec                 C   r(  r   )_guidance_rescaler   r0   r0   r1   rj     r,  z+EasyAnimateInpaintPipeline.guidance_rescalec                 C   s
   | j dkS )Nr   r)  r   r0   r0   r1   r     s   
z6EasyAnimateInpaintPipeline.do_classifier_free_guidancec                 C   r(  r   )_num_timestepsr   r0   r0   r1   num_timesteps  r,  z(EasyAnimateInpaintPipeline.num_timestepsc                 C   r(  r   )
_interruptr   r0   r0   r1   	interrupt  r,  z$EasyAnimateInpaintPipeline.interrupt1   i   2   g      @rc   pilr         ?gޓZӬ?rL   r  
mask_videomasked_video_latentsr   r   r   r+  r   r|   output_typereturn_dictcallback_on_step_endr   rj   r  r  r   c           F      C   s	  t |ttfr
|j}t|d d }t|d d }| ||||
|||||	 |	| _|| _d| _|dur<t |t	r<d}n|durJt |t
rJt|}n|jd }| j}| jdur\| jj}n| jj}| j||||| j|
||||d
\}}}}tryd}n|}t | jtrt| j|||dd\}}n
t| j|||\}}| j|||d	\}}|dd || }|d
k} |dur|j\}}!}}"}#| jj|ddddd|| |!|"|#||d}$|$jtjd}$|$|||!||ddddd}$nd}$| j j!j"}%| jj!j#}&|&|%k}'| j$|| |%||||||||$|| d|'d}(|'r|(\}})}*n|(\}})|durp|dk% rt&|||}+| jj!j'rLt&|ddddf ||},n	t&|||},t&|||}| jrjt(|,gd n|,}-| jrxt(|gd n|}.tj(|-|.gdd|}/nv|j\}}!}}"}#| j)j|ddddd|| |!|"|#||d}0|0jtjd}0|0|||!||ddddd}0|&|%krOt*|0g d}1|du r|$|1dk  t+|$|1dk d  }2n|}2| jj!j'r| j,d|2||||||| j|d
\}3}t-d|0 || j j!j.},|,||| j j!j/ },n| j,|1|2||||||| j|d
\},}| jr2t(|,gd n|,}-| jr@t(|gd n|}.tj(|-|.gdd|}/nd}/t*|0d|%dddg}+t0j1|+|2 dd ddd||}+n|&|%krt&|||}+| jj!j'rt&|ddddf ||},n	t&|||},t&|||}| jrt(|,gd n|,}-| jrt(|gd n|}.tj(|-|.gdd|}/n-t&|$ddddf }+t*|+d|%dddg}+t0j1|+|2 dd ddd||}+d}/|&|%kr9|,jd }4|jd }5|%|4 |5 | jj!j#kr9t3d| jj! d| jj!j# d|% d|4 d|5 d |%|5 |4  d!| 4||}6| jrQt(||g}t(||g}|j|d"}|j|d"}t||| jj5  }7t|| _6| j7|d#2}8t8|D ]$\}9}:| j9rqx| jrt(|gd n|};t:| jd$r| j;|;|:};tj<|:g|;jd  |d"j|;jd}<| j|;|<||/dd%d }=|=2 d | j j!j"kr|=j=ddd\}=}3| jr|==d\}>}?|>|	|?|>   }=| jr|d&krt>|=|?|d'}=| jj?|=|:|fi |6d(did }|&|%krI|*}@|+}A|9t|d k r?||9d  }Bt | jtr3| j@|@t<|Bg|)}@n| jA|@|)t<|Bg}@d|A |@ |A|  }|durvi }C|D ]
}DtB |D |C|D< qR|| |9|:|C}E|ECd)|}|ECd*|}|ECd+|}|9t|d ks|9d |7kr|9d | jj5 dkr|8D  trtEF  qxW d   n	1 sw   Y  |d,ksd| j j!j/ | }| j jG|dd-d }| jHjI||d.}n|}| J  |s|fS tK|d/S )0a  
        The call function to the pipeline for generation with HunyuanDiT.

        Examples:
            prompt (`str` or `list[str]`, *optional*):
                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
            num_frames (`int`, *optional*):
                Length of the video to be generated in seconds. This parameter influences the number of frames and
                continuity of generated content.
            video (`torch.FloatTensor`, *optional*):
                A tensor representing an input video, which can be modified depending on the prompts provided.
            mask_video (`torch.FloatTensor`, *optional*):
                A tensor to specify areas of the video to be masked (omitted from generation).
            masked_video_latents (`torch.FloatTensor`, *optional*):
                Latents from masked portions of the video, utilized during image generation.
            height (`int`, *optional*):
                The height in pixels of the generated image or video frames.
            width (`int`, *optional*):
                The width in pixels of the generated image or video frames.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image but slower
                inference time. This parameter is modulated by `strength`.
            guidance_scale (`float`, *optional*, defaults to 5.0):
                A higher guidance scale value encourages the model to generate images closely linked to the text
                `prompt` at the expense of lower image quality. Guidance scale is effective when `guidance_scale > 1`.
            negative_prompt (`str` or `list[str]`, *optional*):
                The prompt or prompts to guide what to exclude in image generation. If not defined, you need to provide
                `negative_prompt_embeds`. This parameter is ignored when not using guidance (`guidance_scale < 1`).
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            eta (`float`, *optional*, defaults to 0.0):
                A parameter defined in the [DDIM](https://huggingface.co/papers/2010.02502) paper. Only applies to the
                [`~schedulers.DDIMScheduler`] and is ignored in other schedulers. It adjusts noise level during the
                inference process.
            generator (`torch.Generator` or `list[torch.Generator]`, *optional*):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) for setting
                random seeds which helps in making generation deterministic.
            latents (`torch.Tensor`, *optional*):
                A pre-computed latent representation which can be used to guide the generation process.
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
                provided, embeddings are generated from the `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings, aiding in fine-tuning what should not be represented in the
                outputs. If not provided, embeddings are generated from the `negative_prompt` argument.
            prompt_attention_mask (`torch.Tensor`, *optional*):
                Attention mask guiding the focus of the model on specific parts of the prompt text. Required when using
                `prompt_embeds`.
            negative_prompt_attention_mask (`torch.Tensor`, *optional*):
                Attention mask for the negative prompt, needed when `negative_prompt_embeds` are used.
            output_type (`str`, *optional*, defaults to `"latent"`):
                The output format of the generated image. Choose between `PIL.Image` and `np.array` to define how you
                want the results to be formatted.
            return_dict (`bool`, *optional*, defaults to `True`):
                If set to `True`, a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] will be returned;
                otherwise, a tuple containing the generated images and safety flags will be returned.
            callback_on_step_end (`Callable[[int, int], None]`, `PipelineCallback`, `MultiPipelineCallbacks`,
            *optional*):
                A callback function (or a list of them) that will be executed at the end of each denoising step,
                allowing for custom processing during generation.
            callback_on_step_end_tensor_inputs (`list[str]`, *optional*):
                Specifies which tensor inputs should be included in the callback function. If not defined, all tensor
                inputs will be passed, facilitating enhanced logging or monitoring of the generation process.
            guidance_rescale (`float`, *optional*, defaults to 0.0):
                Rescale parameter for adjusting noise configuration based on guidance rescale. Based on findings from
                [Common Diffusion Noise Schedules and Sample Steps are
                Flawed](https://huggingface.co/papers/2305.08891).
            strength (`float`, *optional*, defaults to 1.0):
                Affects the overall styling or quality of the generated output. Values closer to 1 usually provide
                direct adherence to prompts.

        Examples:
            # Example usage of the function for generating images based on prompts.

        Returns:
            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
                Returns either a structured output containing generated images and their metadata when `return_dict` is
                `True`, or a simpler tuple, where the first element is a list of generated images and the second
                element indicates if any of them contain "not-safe-for-work" (NSFW) content.
        r   FNr   r   )
r   r~   r}   r   r   r   r   r   r   r   cpu)mu)r   r  r~   r5  r   r   r   )r   r   )r}   T)r  r  r  r   r!  rA   r?   )r   r   r   r   r   rz   rB   )r  ro   r   zHIncorrect configuration settings! The config of `pipeline.transformer`: z	 expects z& but received `num_channels_latents`: z + `num_channels_mask`: z  + `num_channels_masked_image`: z = z[. Please verify the config of `pipeline.transformer` or your `mask_image` or `image` input.r   )totalscale_model_input)encoder_hidden_statesinpaint_latentsr9  rc   )rj   r9  r   r   r   rr   )r9  )r  r8  )frames)Lr   r
   r	   tensor_inputsrT   r   r*  r-  r0  r   rC   rF   r   _execution_devicer   r}   r   r   r   XLA_AVAILABLEr   r   r   r  r   r   
preprocessr,   reshaper   r   float32r   r   latent_channelsin_channelsr'  r   rG   resize_inpaint_mask_directlyrD   r   rE   	ones_liker  ry   cache_mag_vaer  rp   r"   r   r*   r   r  r.  progress_bar	enumerater1  r  r?  tensorchunkrn   r   r  r  localspopupdatexm	mark_stepdecoder   postprocess_videomaybe_free_model_hooksr   )Fr   r   rL   r  r6  r7  r   r   r   r+  r   r   r   r|   r   r   r   r   r   r8  r9  r:  r   rj   r  r  r   r   r~   r}   timestep_devicelatent_timestepr  channelsheight_videowidth_video
init_videor  num_channels_transformerreturn_image_latentslatents_outputsr%  image_latentsrq   mask_latents
mask_inputmasked_video_latents_inputrA  mask_conditionmask_condition_tilemasked_videor   num_channels_masknum_channels_masked_imager   num_warmup_stepsrN  r  tlatent_model_inputt_expand
noise_prednoise_pred_uncondri   init_latents_proper	init_masknoise_timestepcallback_kwargsr   callback_outputsr0   r0   r1   __call__  s  p





 "


$






$




	
$

6
F
z#EasyAnimateInpaintPipeline.__call__)
r   TNNNNNNNr   )NNNNNN)NNNTFF)-__name__
__module____qualname____doc__model_cpu_offload_seqr   r   r   r   r   r   r   r   r   r   rC   rT   boolr   r   r~   r}   r   r   r   r  r  r'  propertyr+  rj   r   r/  r1  no_gradr   EXAMPLE_DOC_STRINGFloatTensorr-   	Generatorr   r
   r	   rw  __classcell__r0   r0   r   r1   r   J  s:   *
	

 $
7=
J





	

r   )rc   )T)NN)NNNN)=r   typingr   numpyr&   r   torch.nn.functionalr    r!   rp   PILr   transformersr   r   r   r   	callbacksr	   r
   r   r   modelsr   r   pipelines.pipeline_utilsr   
schedulersr   utilsr   r   r   utils.torch_utilsr   r   r   pipeline_outputr   torch_xla.core.xla_modelcore	xla_modelrU  rE  
get_loggerrx  loggerr  r2   rS   rb   rn   ry   r   rT   r   r~   rC   r-   r   r   r0   r0   r0   r1   <module>   sT   
'>





;