o
    Gix                 
   @   s  d dl Z d dlmZmZ d dlZd dlmZmZmZm	Z	m
Z
 ddlmZmZ ddlmZmZmZmZ ddlmZmZmZ ddlmZ dd	lmZ dd
lmZmZmZmZm Z m!Z!m"Z"m#Z# ddl$m%Z% ddl&m'Z'm(Z( ddl)m*Z* ddl+m,Z, e rddl-m.Z. e rd dl/m0  m1Z2 dZ3ndZ3e 4e5Z6dZ7d ddZ8				d!de9dB de:ej;B dB de<e9 dB de<e= dB fddZ>G dd de'e(eeeee,	Z?dS )"    N)AnyCallable)CLIPImageProcessorCLIPTextModelCLIPTextModelWithProjectionCLIPTokenizerCLIPVisionModelWithProjection   )PipelineImageInputVaeImageProcessor)FromSingleFileMixinIPAdapterMixin StableDiffusionXLLoraLoaderMixinTextualInversionLoaderMixin)AutoencoderKLImageProjectionUNet2DConditionModel)adjust_lora_scale_text_encoder)KarrasDiffusionSchedulers)USE_PEFT_BACKEND	deprecate is_invisible_watermark_availableis_torch_xla_availableloggingreplace_example_docstringscale_lora_layersunscale_lora_layers)randn_tensor   )DiffusionPipelineStableDiffusionMixin)StableDiffusionXLPipelineOutput   )PAGMixin)StableDiffusionXLWatermarkerTFa  
    Examples:
        ```py
        >>> import torch
        >>> from diffusers import AutoPipelineForText2Image

        >>> pipe = AutoPipelineForText2Image.from_pretrained(
        ...     "stabilityai/stable-diffusion-xl-base-1.0",
        ...     torch_dtype=torch.float16,
        ...     enable_pag=True,
        ... )
        >>> pipe = pipe.to("cuda")

        >>> prompt = "a photo of an astronaut riding a horse on mars"
        >>> image = pipe(prompt, pag_scale=0.3).images[0]
        ```
        c                 C   sX   |j ttd|jdd}| j ttd| jdd}| ||  }|| d| |   } | S )a  
    Rescales `noise_cfg` tensor based on `guidance_rescale` to improve image quality and fix overexposure. Based on
    Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
    Flawed](https://huggingface.co/papers/2305.08891).

    Args:
        noise_cfg (`torch.Tensor`):
            The predicted noise tensor for the guided diffusion process.
        noise_pred_text (`torch.Tensor`):
            The predicted noise tensor for the text-guided diffusion process.
        guidance_rescale (`float`, *optional*, defaults to 0.0):
            A rescale factor applied to the noise predictions.

    Returns:
        noise_cfg (`torch.Tensor`): The rescaled noise prediction tensor.
    r"   T)dimkeepdim)stdlistrangendim)	noise_cfgnoise_pred_textguidance_rescalestd_textstd_cfgnoise_pred_rescaled r2   ^/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/pipelines/pag/pipeline_pag_sd_xl.pyrescale_noise_cfgV   s
   r4   num_inference_stepsdevice	timestepssigmasc                 K   s  |dur|durt d|dur>dtt| jj v }|s(t d| j d| jd||d| | j}t	|}||fS |durpdtt| jj v }|sZt d| j d| jd||d	| | j}t	|}||fS | j|fd
|i| | j}||fS )a  
    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler to get timesteps from.
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
            must be `None`.
        device (`str` or `torch.device`, *optional*):
            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
        timesteps (`list[int]`, *optional*):
            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
            `num_inference_steps` and `sigmas` must be `None`.
        sigmas (`list[float]`, *optional*):
            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
            `num_inference_steps` and `timesteps` must be `None`.

    Returns:
        `tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
        second element is the number of inference steps.
    NzYOnly one of `timesteps` or `sigmas` can be passed. Please choose one to set custom valuesr7   zThe current scheduler class zx's `set_timesteps` does not support custom timestep schedules. Please check whether you are using the correct scheduler.)r7   r6   r8   zv's `set_timesteps` does not support custom sigmas schedules. Please check whether you are using the correct scheduler.)r8   r6   r6   r2   )

ValueErrorsetinspect	signatureset_timesteps
parameterskeys	__class__r7   len)	schedulerr5   r6   r7   r8   kwargsaccepts_timestepsaccept_sigmasr2   r2   r3   retrieve_timestepsq   s2   rF   c                K       s  e Zd ZdZdZg dZg dZ					dmded	ed
e	de
de
dededededededB deee B f fddZ												dndededB dejdB dedededB dedB dejdB dejdB d ejdB d!ejdB d"edB d#edB fd$d%Zdod&d'Zd(d) Zd*d+ Z									dpd,d-Zdod.d/Z	dod0d1Zd2d3 Zd4ej fd5ejd6ed7ej!d8ejfd9d:Z"e#d;d< Z$e#d=d> Z%e#d?d@ Z&e#dAdB Z'e#dCdD Z(e#dEdF Z)e#dGdH Z*e#dIdJ Z+e, e-e.dddddKddddLddddMdddddddddNdddMddOdddOddddPgdQdMf$deee B deee B dB dRedB dSedB dTedUee dVee dWedB dXedeee B dB deee B dB dedB dYedZej/eej/ B dB dPejdB dejdB dejdB d ejdB d!ejdB d[e0dB d\eej dB d]edB d^ed_e1ee2f dB d`edae3eef dB dbe3eef dce3eef dB dde3eef dB dee3eef dfe3eef dB d#edB dge4eegdf dB dhee diedjefHdkdlZ5  Z6S )qStableDiffusionXLPAGPipelinea  
    Pipeline for text-to-image generation using Stable Diffusion XL.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

    The pipeline also inherits the following loading methods:
        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
        - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
        - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters

    Args:
        vae ([`AutoencoderKL`]):
            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
        text_encoder ([`CLIPTextModel`]):
            Frozen text-encoder. Stable Diffusion XL uses the text portion of
            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
        text_encoder_2 ([` CLIPTextModelWithProjection`]):
            Second frozen text-encoder. Stable Diffusion XL uses the text and pool portion of
            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
            specifically the
            [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)
            variant.
        tokenizer (`CLIPTokenizer`):
            Tokenizer of class
            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
        tokenizer_2 (`CLIPTokenizer`):
            Second Tokenizer of class
            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
        scheduler ([`SchedulerMixin`]):
            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
        force_zeros_for_empty_prompt (`bool`, *optional*, defaults to `"True"`):
            Whether the negative prompt embeddings shall be forced to always be set to 0. Also see the config of
            `stabilityai/stable-diffusion-xl-base-1-0`.
        add_watermarker (`bool`, *optional*):
            Whether to use the [invisible_watermark library](https://github.com/ShieldMnt/invisible-watermark/) to
            watermark output images. If not defined, it will default to True if the package is installed, otherwise no
            watermarker will be used.
    z6text_encoder->text_encoder_2->image_encoder->unet->vae)	tokenizertokenizer_2text_encodertext_encoder_2image_encoderfeature_extractor)latentsprompt_embedsnegative_prompt_embedsadd_text_embedsadd_time_idsnegative_pooled_prompt_embedsnegative_add_time_idsNTmidvaerJ   rK   rH   rI   unetrB   rL   rM   force_zeros_for_empty_promptadd_watermarkerpag_applied_layersc                    s   t    | j|||||||||	d	 | j|
d t| dd r*dt| jjjd  nd| _	t
| j	d| _t| drJ| jd urJt| jjd	rJ| jjjnd
| _|d urS|nt }|r]t | _nd | _| | d S )N)	rV   rJ   rK   rH   rI   rW   rB   rL   rM   )rX   rV   r   r"      )vae_scale_factorrW   sample_size   )super__init__register_modulesregister_to_configgetattrrA   rV   configblock_out_channelsr\   r   image_processorhasattrrW   r]   default_sample_sizer   r$   	watermarkset_pag_applied_layers)selfrV   rJ   rK   rH   rI   rW   rB   rL   rM   rX   rY   rZ   r@   r2   r3   r`      s4   
(

z%StableDiffusionXLPAGPipeline.__init__r"   promptprompt_2r6   num_images_per_promptdo_classifier_free_guidancenegative_promptnegative_prompt_2rO   rP   pooled_prompt_embedsrS   
lora_scale	clip_skipc           !   
   C   s:  |p| j }|dur9t| tr9|| _| jdur%tst| j| nt| j| | jdur9ts3t| j| nt| j| t|t	rA|gn|}|durLt
|}n|jd }| jdur\| j| jgn| jg}| jdurk| j| jgn| jg}|du r|pw|}t|t	r|gn|}g }||g}t|||D ]\}}}t| tr| ||}||d|jddd}|j}||dddj}|jd	 |jd	 krt||s||dd|jd
 d	f }td|j d|  |||dd}|
du r|d jdkr|d }
|du r|jd }n|j|d   }|| qtj|d	d}|du o| jj}|r6|	du r6|r6t|}	t|
}n|r|	du r|pBd}|pG|}t|t	rS||g n|}t|t	r`||g n|}|durt |t |urt!dt | dt | d|t
|krt"d| dt
| d| d| d	||g}g }t|||D ]E\}}}t| tr| ||}|jd
 }||d|ddd}||j|dd}	|du r|	d jdkr|	d }|	jd }	||	 qtj|d	d}	| jdur|j| jj#|d}n	|j| j$j#|d}|j\}}} |%d
|d
}|&|| |d	}|rR|	jd
 }| jdur9|	j| jj#|d}	n	|	j| j$j#|d}	|	%d
|d
}	|	&|| |d	}	|
%d
|&|| d	}
|rm|%d
|&|| d	}| jdurt| trtrt'| j| | jdurt| trtrt'| j| ||	|
|fS )a\  
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `list[str]`, *optional*):
                prompt to be encoded
            prompt_2 (`str` or `list[str]`, *optional*):
                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
                used in both text-encoders
            device: (`torch.device`):
                torch device
            num_images_per_prompt (`int`):
                number of images that should be generated per prompt
            do_classifier_free_guidance (`bool`):
                whether to use classifier free guidance or not
            negative_prompt (`str` or `list[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            negative_prompt_2 (`str` or `list[str]`, *optional*):
                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            pooled_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
                If not provided, pooled text embeddings will be generated from `prompt` input argument.
            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
                input argument.
            lora_scale (`float`, *optional*):
                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
            clip_skip (`int`, *optional*):
                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                the output of the pre-final layer will be used for computing the prompt embeddings.
        Nr   
max_lengthTpt)paddingrv   
truncationreturn_tensorslongest)rx   rz   r"   z\The following part of your input was truncated because CLIP can only handle sequences up to z	 tokens: output_hidden_statesr   r&    z?`negative_prompt` should be the same type to `prompt`, but got z != .z`negative_prompt`: z has batch size z, but `prompt`: zT. Please make sure that passed `negative_prompt` matches the batch size of `prompt`.)dtyper6   )(_execution_device
isinstancer   _lora_scalerJ   r   r   r   rK   strrA   shaperH   rI   zipr   maybe_convert_promptmodel_max_length	input_idstorchequalbatch_decodeloggerwarningtor+   hidden_statesappendconcatrd   rX   
zeros_liketype	TypeErrorr9   r   rW   repeatviewr   )!rk   rm   rn   r6   ro   rp   rq   rr   rO   rP   rs   rS   rt   ru   
batch_size
tokenizerstext_encodersprompt_embeds_listpromptsrH   rJ   text_inputstext_input_idsuntruncated_idsremoved_textzero_out_negative_promptuncond_tokensnegative_prompt_embeds_listrv   uncond_inputbs_embedseq_len_r2   r2   r3   encode_prompt%  s   
:





 







z*StableDiffusionXLPAGPipeline.encode_promptc           
      C   s   t | j j}t|tjs| j|ddj}|j	||d}|rH| j|ddj
d }|j|dd}| jt|ddj
d }|j|dd}||fS | |j}|j|dd}t|}	||	fS )	Nrw   )rz   r6   r   Tr}   r   r   r   )nextrL   r>   r   r   r   TensorrM   pixel_valuesr   r   repeat_interleaver   image_embeds)
rk   imager6   ro   r~   r   image_enc_hidden_statesuncond_image_enc_hidden_statesr   uncond_image_embedsr2   r2   r3   encode_image  s(   

z)StableDiffusionXLPAGPipeline.encode_imagec                 C   sl  g }|rg }|d u ret |ts|g}t|t| jjjkr/tdt| dt| jjj dt|| jjjD ],\}}	t |	t }
| 	||d|
\}}|
|d d d f  |rc|
|d d d f  q7n|D ]}|rw|d\}}|
| |
| qgg }t|D ]0\}}tj|g| dd}|rtj|| g| dd}tj||gdd}|j|d}|
| q|S )	NzK`ip_adapter_image` must have same length as the number of IP Adapters. Got z images and z IP Adapters.r"   r   r   r   )r6   )r   r)   rA   rW   encoder_hid_projimage_projection_layersr9   r   r   r   r   chunk	enumerater   catr   )rk   ip_adapter_imageip_adapter_image_embedsr6   ro   rp   r   negative_image_embedssingle_ip_adapter_imageimage_proj_layeroutput_hidden_statesingle_image_embedssingle_negative_image_embedsir2   r2   r3   prepare_ip_adapter_image_embeds-  sH   


z<StableDiffusionXLPAGPipeline.prepare_ip_adapter_image_embedsc                 C   sX   dt t| jjj v }i }|r||d< dt t| jjj v }|r*||d< |S )Neta	generator)r:   r;   r<   rB   stepr>   r?   )rk   r   r   accepts_etaextra_step_kwargsaccepts_generatorr2   r2   r3   prepare_extra_step_kwargs[  s   z6StableDiffusionXLPAGPipeline.prepare_extra_step_kwargsc                    s  |d dks|d dkrt d| d| d|d ur1t|tr$|dkr1t d| dt| d|d urRt fdd	|D sRt d
 j d fdd|D  |d ure|d uret d| d| d|d urx|d urxt d| d| d|d u r|d u rt d|d urt|tst|tst dt| |d urt|tst|tst dt| |d ur|	d urt d| d|	 d|d ur|	d urt d| d|	 d|d ur|	d ur|j|	jkrt d|j d|	j d|d ur|
d u rt d|	d ur|d u rt d|d ur|d urt d|d urBt|ts/t dt| |d j	dvrDt d|d j	 d d S d S )!Nr[   r   z7`height` and `width` have to be divisible by 8 but are z and r   z5`callback_steps` has to be a positive integer but is z	 of type c                 3   s    | ]}| j v V  qd S N_callback_tensor_inputs.0krk   r2   r3   	<genexpr>  s    

z<StableDiffusionXLPAGPipeline.check_inputs.<locals>.<genexpr>z2`callback_on_step_end_tensor_inputs` has to be in z, but found c                    s   g | ]	}| j vr|qS r2   r   r   r   r2   r3   
<listcomp>  s    z=StableDiffusionXLPAGPipeline.check_inputs.<locals>.<listcomp>zCannot forward both `prompt`: z and `prompt_embeds`: z2. Please make sure to only forward one of the two.z Cannot forward both `prompt_2`: zeProvide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined.z2`prompt` has to be of type `str` or `list` but is z4`prompt_2` has to be of type `str` or `list` but is z'Cannot forward both `negative_prompt`: z and `negative_prompt_embeds`: z)Cannot forward both `negative_prompt_2`: zu`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but got: `prompt_embeds` z != `negative_prompt_embeds` zIf `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`.zIf `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`.zProvide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined.z:`ip_adapter_image_embeds` has to be of type `list` but is )r	      zF`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is D)
r9   r   intr   allr   r   r)   r   r+   )rk   rm   rn   heightwidthcallback_stepsrq   rr   rO   rP   rs   rS   r   r   "callback_on_step_end_tensor_inputsr2   r   r3   check_inputsm  s   
z)StableDiffusionXLPAGPipeline.check_inputsc	           
      C   s   ||t || j t || j f}	t|tr(t||kr(tdt| d| d|d u r5t|	|||d}n||}|| jj	 }|S )Nz/You have passed a list of generators of length z+, but requested an effective batch size of z@. Make sure the batch size matches the length of the generators.)r   r6   r   )
r   r\   r   r)   rA   r9   r   r   rB   init_noise_sigma)
rk   r   num_channels_latentsr   r   r   r6   r   rN   r   r2   r2   r3   prepare_latents  s    
z,StableDiffusionXLPAGPipeline.prepare_latentsc           	      C   sd   t || | }| jjjt| | }| jjjj}||kr(td| d| dt	j
|g|d}|S )Nz7Model expects an added time embedding vector of length z, but a vector of z was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`.r   )r)   rW   rd   addition_time_embed_dimrA   add_embeddinglinear_1in_featuresr9   r   tensor)	rk   original_sizecrops_coords_top_lefttarget_sizer   text_encoder_projection_dimrR   passed_add_embed_dimexpected_add_embed_dimr2   r2   r3   _get_add_time_ids  s   z.StableDiffusionXLPAGPipeline._get_add_time_idsc                 C   s    t ddd | jjtjd d S )N
upcast_vaez1.0.0z`upcast_vae` is deprecated. Please use `pipe.vae.to(torch.float32)`. For more details, please refer to: https://github.com/huggingface/diffusers/pull/12619#issue-3606633695.r   )r   rV   r   r   float32r   r2   r2   r3   r     s   z'StableDiffusionXLPAGPipeline.upcast_vaei   wembedding_dimr   returnc                 C   s   t |jdks	J |d }|d }ttd|d  }ttj||d|  }||dddf |dddf  }tjt	|t
|gdd}|d dkrZtjj|d}|j|jd	 |fksfJ |S )
a  
        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298

        Args:
            w (`torch.Tensor`):
                Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
            embedding_dim (`int`, *optional*, defaults to 512):
                Dimension of the embeddings to generate.
            dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
                Data type of the generated embeddings.

        Returns:
            `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
        r"   g     @@r   g     @r   Nr   )r   r"   r   )rA   r   r   logr   exparanger   r   sincosnn
functionalpad)rk   r   r   r   half_dimembr2   r2   r3   get_guidance_scale_embedding  s   &z9StableDiffusionXLPAGPipeline.get_guidance_scale_embeddingc                 C      | j S r   )_guidance_scaler   r2   r2   r3   guidance_scale     z+StableDiffusionXLPAGPipeline.guidance_scalec                 C   r   r   )_guidance_rescaler   r2   r2   r3   r.   #  r  z-StableDiffusionXLPAGPipeline.guidance_rescalec                 C   r   r   )
_clip_skipr   r2   r2   r3   ru   '  r  z&StableDiffusionXLPAGPipeline.clip_skipc                 C   s   | j dko| jjjd u S )Nr"   )r   rW   rd   time_cond_proj_dimr   r2   r2   r3   rp   .  s   z8StableDiffusionXLPAGPipeline.do_classifier_free_guidancec                 C   r   r   )_cross_attention_kwargsr   r2   r2   r3   cross_attention_kwargs2  r  z3StableDiffusionXLPAGPipeline.cross_attention_kwargsc                 C   r   r   )_denoising_endr   r2   r2   r3   denoising_end6  r  z*StableDiffusionXLPAGPipeline.denoising_endc                 C   r   r   )_num_timestepsr   r2   r2   r3   num_timesteps:  r  z*StableDiffusionXLPAGPipeline.num_timestepsc                 C   r   r   )
_interruptr   r2   r2   r3   	interrupt>  r  z&StableDiffusionXLPAGPipeline.interrupt2   g      @r%   pil)r   r   rN   g      @r   r   r5   r7   r8   r	  r  r   r   r   r   output_typereturn_dictr  r.   r   r   r   negative_original_sizenegative_crops_coords_top_leftnegative_target_sizecallback_on_step_endr   	pag_scalepag_adaptive_scalec%           G         sL  |p| j | j }|p| j | j }|p||f}|p||f}| ||||d|
||||||||" |	| _|| _| | _|| _|| _d| _|#| _	|$| _
|durRt|trRd}%n|dur`t|tr`t|}%n|jd }%| j}&| jdurt| jddnd}'| j|||&|| j|
||||||'| jd\}}}}trd}(n|&}(t| j||(||\}}| jjj})| |%| |)|||j|&||}| ||}*|}+| jdu rt |jd },n| jjj!},| j"||||j|,d	}-|dur|dur| j"||||j|,d	}.n|-}.| j#r| $||| j}| $|+|| j}+| $|-|.| j}-n| jr+t%j&||gdd
}t%j&||+gdd
}+t%j&|.|-gdd
}-|'|&}|+'|&}+|-'|&(|%| d}-|dusJ|dur| )|||&|%| | j}t*|D ]5\}/}0d}1| jrk|0+d\}1}0| j#rx| $|0|1| j}0n| jrt%j&|1|0gdd
}0|0'|&}0|0||/< qZt,t||| jj-  d}2| j.durt| j.t/r| j.dkr| j.dk rt t0| jjj1| j.| jjj1   ttt2 fdd|}|d| }d}3| jjj3durt%4| j5d (|%| }4| j6|4| jjj3dj'|&|jd}3| j#r| jj7}5| j8| j9| jd t|| _:| j;|d}6t*|D ]	\}/}7| j<r.q#t%&|g|jd |jd   }8| j=|8|7}8|+|-d}9|durS||9d< | j|8|7||3| j|9ddd }:| j#rt| >|:| j| j5|7d\}:};n| jr|:+d\}<};|<| j5|;|<   }:| jr| j?dkrt@|:|;| j?d}:|j}=| jjA|:|7|fi |*ddid }|j|=krt%jBjCD r|'|=}|!duri }>|"D ]
}?tE |? |>|?< q|!| |/|7|>}@|@Fd|}|@Fd|}|@Fd|}|@Fd|+}+|@Fd|}|@Fd|-}-|@Fd|.}.|/t|d ks!|/d |2kr%|/d | jj- dkr%|6G  tr,tHI  q#W d   n	1 s9w   Y  |d ks| jJjt%jKkoO| jJjjL}A|Arf| M  |'tNtO| jJjPQ j}n|j| jJjkr}t%jBjCD r}| jJ'|j| _JtR| jJjd!o| jJjjSdu}BtR| jJjd"o| jJjjTdu}C|Br|Crt%4| jJjjSUdd#dd'|jV|j}Dt%4| jJjjTUdd#dd'|jV|j}E||E | jJjjW |D }n|| jJjjW }| jJjX|dd$d }F|Ar| jJj't%jKd% n|}F|d ks| jYdur| jYZ|F}F| j[j\|F|d&}F| ]  | j#r| j^|5 |s!|FfS t_|Fd'S )(u3  
        Function invoked when calling the pipeline for generation.

        Args:
            prompt (`str` or `list[str]`, *optional*):
                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
                instead.
            prompt_2 (`str` or `list[str]`, *optional*):
                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
                used in both text-encoders
            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                The height in pixels of the generated image. This is set to 1024 by default for the best results.
                Anything below 512 pixels won't work well for
                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
                and checkpoints that are not specifically fine-tuned on low resolutions.
            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                The width in pixels of the generated image. This is set to 1024 by default for the best results.
                Anything below 512 pixels won't work well for
                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
                and checkpoints that are not specifically fine-tuned on low resolutions.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            timesteps (`list[int]`, *optional*):
                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
                passed will be used. Must be in descending order.
            sigmas (`list[float]`, *optional*):
                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
                will be used.
            denoising_end (`float`, *optional*):
                When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
                completed before it is intentionally prematurely terminated. As a result, the returned sample will
                still retain a substantial amount of noise as determined by the discrete timesteps selected by the
                scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a
                "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
                Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output)
            guidance_scale (`float`, *optional*, defaults to 5.0):
                Guidance scale as defined in [Classifier-Free Diffusion
                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
                `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
                the text `prompt`, usually at the expense of lower image quality.
            negative_prompt (`str` or `list[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            negative_prompt_2 (`str` or `list[str]`, *optional*):
                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) in the DDIM paper: https://huggingface.co/papers/2010.02502. Only
                applies to [`schedulers.DDIMScheduler`], will be ignored for others.
            generator (`torch.Generator` or `list[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will be generated by sampling using the supplied random `generator`.
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            pooled_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
                If not provided, pooled text embeddings will be generated from `prompt` input argument.
            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
                input argument.
            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
            ip_adapter_image_embeds (`list[torch.Tensor]`, *optional*):
                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
                provided, embeddings are computed from the `ip_adapter_image` input argument.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generate image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
                of a plain tuple.
            cross_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            guidance_rescale (`float`, *optional*, defaults to 0.0):
                Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
                Flawed](https://huggingface.co/papers/2305.08891) `guidance_scale` is defined as `φ` in equation 16. of
                [Common Diffusion Noise Schedules and Sample Steps are
                Flawed](https://huggingface.co/papers/2305.08891). Guidance rescale factor should fix overexposure when
                using zero terminal SNR.
            original_size (`tuple[int]`, *optional*, defaults to (1024, 1024)):
                If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
                `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
                explained in section 2.2 of
                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
            crops_coords_top_left (`tuple[int]`, *optional*, defaults to (0, 0)):
                `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
                `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
                `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
            target_size (`tuple[int]`, *optional*, defaults to (1024, 1024)):
                For most cases, `target_size` should be set to the desired height and width of the generated image. If
                not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
                section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
            negative_original_size (`tuple[int]`, *optional*, defaults to (1024, 1024)):
                To negatively condition the generation process based on a specific image resolution. Part of SDXL's
                micro-conditioning as explained in section 2.2 of
                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
            negative_crops_coords_top_left (`tuple[int]`, *optional*, defaults to (0, 0)):
                To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's
                micro-conditioning as explained in section 2.2 of
                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
            negative_target_size (`tuple[int]`, *optional*, defaults to (1024, 1024)):
                To negatively condition the generation process based on a target image resolution. It should be as same
                as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
            callback_on_step_end (`Callable`, *optional*):
                A function that calls at the end of each denoising steps during the inference. The function is called
                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
                `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`list`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                `._callback_tensor_inputs` attribute of your pipeline class.
            pag_scale (`float`, *optional*, defaults to 3.0):
                The scale factor for the perturbed attention guidance. If it is set to 0.0, the perturbed attention
                guidance will not be used.
            pag_adaptive_scale (`float`, *optional*, defaults to 0.0):
                The adaptive scale factor for the perturbed attention guidance. If it is set to 0.0, `pag_scale` is
                used.

        Examples:

        Returns:
            [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] or `tuple`:
            [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a
            `tuple`. When returning a tuple, the first element is a list with the generated images.
        NFr"   r   scale)rm   rn   r6   ro   rp   rq   rr   rO   rP   rs   rS   rt   ru   cpur|   )r   r   r   r   c                    s   |  kS r   r2   )tsdiscrete_timestep_cutoffr2   r3   <lambda>  s    z7StableDiffusionXLPAGPipeline.__call__.<locals>.<lambda>)r   r   )rZ   rp   )total)text_embedstime_idsr   )encoder_hidden_statestimestep_condr  added_cond_kwargsr  Tr%   )r.   r  rN   rO   rP   rQ   rS   rR   rT   latentlatents_meanlatents_stdr   )r  r   )r  )images)`rh   r\   r   r   r  r  r  r  r  
_pag_scale_pag_adaptive_scaler   r   r)   rA   r   r   r  getr   rp   ru   XLA_AVAILABLErF   rB   rW   rd   in_channelsr   r   r   rK   r   projection_dimr   do_perturbed_attention_guidance%_prepare_perturbed_attention_guidancer   r   r   r   r   r   r   maxorderr	  floatroundnum_train_timestepsfilterr  r   r  r   attn_processors_set_pag_attn_processorrZ   r
  progress_barr  scale_model_input#_apply_perturbed_attention_guidancer.   r4   r   backendsmpsis_availablelocalspopupdatexm	mark_steprV   float16force_upcastr   r   iterpost_quant_convr>   rg   r%  r&  r   r6   scaling_factordecoderi   apply_watermarkrf   postprocessmaybe_free_model_hooksset_attn_processorr!   )Grk   rm   rn   r   r   r5   r7   r8   r	  r  rq   rr   ro   r   r   rN   rO   rP   rs   rS   r   r   r  r  r  r.   r   r   r   r  r  r  ru   r  r   r  r  r   r6   rt   timestep_devicer   r   rQ   r   rR   rT   r   r   r   num_warmup_stepsr"  guidance_scale_tensororiginal_attn_procr8  tlatent_model_inputr#  
noise_predr-   noise_pred_uncondlatents_dtypecallback_kwargsr   callback_outputsneeds_upcastinghas_latents_meanhas_latents_stdr%  r&  r   r2   r  r3   __call__B  s   C

	








 


$

6
D&&

z%StableDiffusionXLPAGPipeline.__call__)NNTNrU   )NNr"   TNNNNNNNNr   )	NNNNNNNNN)7__name__
__module____qualname____doc__model_cpu_offload_seq_optional_componentsr   r   r   r   r   r   r   r   r   boolr   r)   r`   r   r6   r   r   r2  r   r   r   r   r   r   r   r   r   r   r   propertyr  r.   ru   rp   r  r	  r  r  no_gradr   EXAMPLE_DOC_STRING	Generatorr
   dictr   tupler   r[  __classcell__r2   r2   rl   r3   rG      s   	-	

3	

 
p.

a











	


 !"#$%rG   )r%   )NNNN)@r;   typingr   r   r   transformersr   r   r   r   r   rf   r
   r   loadersr   r   r   r   modelsr   r   r   models.lorar   
schedulersr   utilsr   r   r   r   r   r   r   r   utils.torch_utilsr   pipeline_utilsr   r    #stable_diffusion_xl.pipeline_outputr!   	pag_utilsr#   stable_diffusion_xl.watermarkr$   torch_xla.core.xla_modelcore	xla_modelrA  r+  
get_loggerr\  r   re  r4   r   r   r6   r)   r2  rF   rG   r2   r2   r2   r3   <module>   sZ   (






;