o
    Gi                  
   @   sd  d dl Z d dlmZmZ d dlZd dlmZmZmZm	Z	 ddl
mZ ddlmZmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZmZmZmZ ddlm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z& e r{d dl'm(  m)Z* dZ+ndZ+e,e-Z.dZ/				dde0dB de1ej2B dB de3e0 dB de3e4 dB fddZ5G dd de"eee&Z6dS )    N)AnyCallable)CLIPTextModelWithProjectionCLIPTokenizerT5EncoderModelT5TokenizerFast   )VaeImageProcessor)FromSingleFileMixinSD3LoraLoaderMixin)PAGCFGJointAttnProcessor2_0PAGJointAttnProcessor2_0)AutoencoderKL)SD3Transformer2DModel)FlowMatchEulerDiscreteScheduler)USE_PEFT_BACKENDis_torch_xla_availableloggingreplace_example_docstringscale_lora_layersunscale_lora_layers)randn_tensor   )DiffusionPipeline)StableDiffusion3PipelineOutput   )PAGMixinTFa^  
    Examples:
        ```py
        >>> import torch
        >>> from diffusers import AutoPipelineForText2Image

        >>> pipe = AutoPipelineForText2Image.from_pretrained(
        ...     "stabilityai/stable-diffusion-3-medium-diffusers",
        ...     torch_dtype=torch.float16,
        ...     enable_pag=True,
        ...     pag_applied_layers=["blocks.13"],
        ... )
        >>> pipe.to("cuda")
        >>> prompt = "A cat holding a sign that says hello world"
        >>> image = pipe(prompt, guidance_scale=5.0, pag_scale=0.7).images[0]
        >>> image.save("sd3_pag.png")
        ```
num_inference_stepsdevice	timestepssigmasc                 K   s  |dur|durt d|dur>dtt| jj v }|s(t d| j d| jd||d| | j}t	|}||fS |durpdtt| jj v }|sZt d| j d| jd||d	| | j}t	|}||fS | j|fd
|i| | j}||fS )a  
    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler to get timesteps from.
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
            must be `None`.
        device (`str` or `torch.device`, *optional*):
            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
        timesteps (`list[int]`, *optional*):
            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
            `num_inference_steps` and `sigmas` must be `None`.
        sigmas (`list[float]`, *optional*):
            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
            `num_inference_steps` and `timesteps` must be `None`.

    Returns:
        `tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
        second element is the number of inference steps.
    NzYOnly one of `timesteps` or `sigmas` can be passed. Please choose one to set custom valuesr   zThe current scheduler class zx's `set_timesteps` does not support custom timestep schedules. Please check whether you are using the correct scheduler.)r   r   r    zv's `set_timesteps` does not support custom sigmas schedules. Please check whether you are using the correct scheduler.)r    r   r    )

ValueErrorsetinspect	signatureset_timesteps
parameterskeys	__class__r   len)	schedulerr   r   r   r    kwargsaccepts_timestepsaccept_sigmasr!   r!   ]/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/pipelines/pag/pipeline_pag_sd_3.pyretrieve_timestepsM   s2   r0   c                9       s  e Zd ZdZdZg Zg dZ	dSdedede	de
d	ed
e
dedededeee B f fddZ					dTdeee B dededejdB dejdB f
ddZ				dUdeee B dedejdB dedB def
ddZ			 										dVdeee B d!eee B d"eee B dejdB ded#ed$eee B dB d%eee B dB d&eee B dB d'ejdB d(ejdB d)ejdB d*ejdB dedB ded+edB f d,d-Z									dWd.d/Z	dXd0d1Zed2d3 Zed4d5 Zed6d7 Z ed8d9 Z!ed:d; Z"ed<d= Z#e$ e%e&dddddd>dd?ddddddddddd@d ddddAgddBdCfdeee B d!eee B dB d"eee B dB dDedB dEedB dFedGee dB dHed$eee B dB d%eee B dB d&eee B dB dedB dIej'eej' B dB dAejdB d'ejdB d(ejdB d)ejdB d*ejdB dJedB dKedLe(ee)f dB dedB dMe*eegdf dB dNee dedOedPef6dQdRZ+  Z,S )YStableDiffusion3PAGPipelinea  
    [PAG pipeline](https://huggingface.co/docs/diffusers/main/en/using-diffusers/pag) for text-to-image generation
    using Stable Diffusion 3.

    Args:
        transformer ([`SD3Transformer2DModel`]):
            Conditional Transformer (MMDiT) architecture to denoise the encoded image latents.
        scheduler ([`FlowMatchEulerDiscreteScheduler`]):
            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
        vae ([`AutoencoderKL`]):
            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
        text_encoder ([`CLIPTextModelWithProjection`]):
            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
            specifically the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant,
            with an additional added projection layer that is initialized with a diagonal matrix with the `hidden_size`
            as its dimension.
        text_encoder_2 ([`CLIPTextModelWithProjection`]):
            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModelWithProjection),
            specifically the
            [laion/CLIP-ViT-bigG-14-laion2B-39B-b160k](https://huggingface.co/laion/CLIP-ViT-bigG-14-laion2B-39B-b160k)
            variant.
        text_encoder_3 ([`T5EncoderModel`]):
            Frozen text-encoder. Stable Diffusion 3 uses
            [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel), specifically the
            [t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl) variant.
        tokenizer (`CLIPTokenizer`):
            Tokenizer of class
            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
        tokenizer_2 (`CLIPTokenizer`):
            Second Tokenizer of class
            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
        tokenizer_3 (`T5TokenizerFast`):
            Tokenizer of class
            [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
    z>text_encoder->text_encoder_2->text_encoder_3->transformer->vae)latentsprompt_embedsnegative_prompt_embedsnegative_pooled_prompt_embedsblocks.1transformerr+   vaetext_encoder	tokenizertext_encoder_2tokenizer_2text_encoder_3tokenizer_3pag_applied_layersc                    s   t    | j|||||||	||d	 t| dd r$dt| jjjd  nd| _t	| jd| _
t| dr<| jd ur<| jjnd| _t| d	rN| jd urN| jjjnd
| _t| d	r`| jd ur`| jjjnd| _| j|
t t fd d S )N)	r8   r9   r;   r=   r:   r<   r>   r7   r+   r8   r   r      )vae_scale_factorr:   M   r7      )pag_attn_processors)super__init__register_modulesgetattrr*   r8   configblock_out_channelsrA   r	   image_processorhasattrr:   model_max_lengthtokenizer_max_lengthr7   sample_sizedefault_sample_size
patch_sizeset_pag_applied_layersr   r   )selfr7   r+   r8   r9   r:   r;   r<   r=   r>   r?   r)   r!   r/   rF      s6   
(

 
z$StableDiffusion3PAGPipeline.__init__Nr      promptnum_images_per_promptmax_sequence_lengthr   dtypec                 C   s<  |p| j }|p
| jj}t|tr|gn|}t|}| jd u r.tj|| || j	j
jf||dS | j|d|dddd}|j}| j|dddj}	|	jd |jd krqt||	sq| j|	d d | jd	 df }
td
| d|
  | ||d }| jj}|j||d}|j\}}}|d	|d	}||| |d}|S )Nr   rY   
max_lengthTpt)paddingr[   
truncationadd_special_tokensreturn_tensorslongestr]   r`   r   zXThe following part of your input was truncated because `max_sequence_length` is set to  	 tokens: r   rY   r   )_execution_devicer9   rY   
isinstancestrr*   r=   torchzerosr7   rI   joint_attention_dimr>   	input_idsshapeequalbatch_decoderN   loggerwarningtorepeatview)rS   rV   rW   rX   r   rY   
batch_sizetext_inputstext_input_idsuntruncated_idsremoved_textr3   _seq_lenr!   r!   r/   _get_t5_prompt_embeds   sN   


 "z1StableDiffusion3PAGPipeline._get_t5_prompt_embedsr   	clip_skipclip_model_indexc                 C   sp  |p| j }| j| jg}| j| jg}|| }|| }	t|tr!|gn|}t|}
||d| jddd}|j	}||dddj	}|j
d |j
d krht||sh||d d | jd df }td	| j d
|  |	||dd}|d }|d u r|jd }n|j|d   }|j| jj|d}|j
\}}}|d|d}||
| |d}|d|}||
| d}||fS )Nr[   Tr\   )r]   r[   r^   r`   ra   rb   rc   r   z\The following part of your input was truncated because CLIP can only handle sequences up to rd   )output_hidden_statesr   r   re   )rf   r:   r<   r9   r;   rg   rh   r*   rN   rl   rm   ri   rn   ro   rp   rq   rr   hidden_statesrY   rs   rt   )rS   rV   rW   r   r}   r~   clip_tokenizersclip_text_encodersr:   r9   ru   rv   rw   rx   ry   r3   pooled_prompt_embedsrz   r{   r!   r!   r/   _get_clip_prompt_embeds  sJ   
  z3StableDiffusion3PAGPipeline._get_clip_prompt_embedsTprompt_2prompt_3do_classifier_free_guidancenegative_promptnegative_prompt_2negative_prompt_3r3   r4   r   r5   
lora_scalec              
   C   s<  |p| j }|dur+t| tr+|| _| jdurtrt| j| | jdur+tr+t| j| t|tr3|gn|}|dur>t	|}n|
j
d }|
du r|pJ|}t|trS|gn|}|pX|}t|tra|gn|}| j||||dd\}}| j||||dd\}}tj||gdd}| j||||d}tjj|d|j
d |j
d  f}tj||gdd}
tj||gdd}|rn|du rn|pd	}|p|}|	p|}	t|tr||g n|}t|tr||g n|}t|	tr||	g n|	}	|durt|t|urtd
t| dt| d|t	|kr td| dt	| d| d| d	| j|||ddd\}}| j|||ddd\}}tj||gdd}| j|	|||d}tjj|d|j
d |j
d  f}tj||gdd}tj||gdd}| jdurt| trtrt| j| | jdurt| trtrt| j| |
|||fS )a"  

        Args:
            prompt (`str` or `list[str]`, *optional*):
                prompt to be encoded
            prompt_2 (`str` or `list[str]`, *optional*):
                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
                used in all text-encoders
            prompt_3 (`str` or `list[str]`, *optional*):
                The prompt or prompts to be sent to the `tokenizer_3` and `text_encoder_3`. If not defined, `prompt` is
                used in all text-encoders
            device: (`torch.device`):
                torch device
            num_images_per_prompt (`int`):
                number of images that should be generated per prompt
            do_classifier_free_guidance (`bool`):
                whether to use classifier free guidance or not
            negative_prompt (`str` or `list[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            negative_prompt_2 (`str` or `list[str]`, *optional*):
                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
                `text_encoder_2`. If not defined, `negative_prompt` is used in all the text-encoders.
            negative_prompt_3 (`str` or `list[str]`, *optional*):
                The prompt or prompts not to guide the image generation to be sent to `tokenizer_3` and
                `text_encoder_3`. If not defined, `negative_prompt` is used in all the text-encoders.
            prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
                If not provided, pooled text embeddings will be generated from `prompt` input argument.
            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
                input argument.
            clip_skip (`int`, *optional*):
                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                the output of the pre-final layer will be used for computing the prompt embeddings.
            lora_scale (`float`, *optional*):
                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
        Nr   )rV   r   rW   r}   r~   r   rc   dim)rV   rW   rX   r   r    z?`negative_prompt` should be the same type to `prompt`, but got z != .z`negative_prompt`: z has batch size z, but `prompt`: zT. Please make sure that passed `negative_prompt` matches the batch size of `prompt`.)r   rW   r}   r~   )rf   rg   r   _lora_scaler9   r   r   r;   rh   r*   rm   r   ri   catr|   nn
functionalpadtype	TypeErrorr"   r   )rS   rV   r   r   r   rW   r   r   r   r   r3   r4   r   r5   r}   rX   r   ru   prompt_embedpooled_prompt_embedprompt_2_embedpooled_prompt_2_embedclip_prompt_embedst5_prompt_embednegative_prompt_embednegative_pooled_prompt_embednegative_prompt_2_embednegative_pooled_prompt_2_embednegative_clip_prompt_embedst5_negative_prompt_embedr!   r!   r/   encode_promptO  s   
B





z)StableDiffusion3PAGPipeline.encode_promptc                    s  | j  j  dks| j  j  dkr<td j  j  d| d| d|| j  j    d|| j  j    d|d ur]t fdd	|D s]td
 j d fdd|D  |d urp|	d urptd| d|	 d|d ur|	d urtd| d|	 d|d ur|	d urtd| d|	 d|d u r|	d u rtd|d urt|tst|tstdt| |d urt|tst|tstdt| |d urt|tst|tstdt| |d ur|
d urtd| d|
 d|d ur|
d urtd| d|
 d|d ur$|
d ur$td| d|
 d|	d urB|
d urB|	j	|
j	krBtd|	j	 d|
j	 d|	d urP|d u rPtd|
d ur^|d u r^td|d uro|dkrqtd | d S d S )!Nr   z-`height` and `width` have to be divisible by z	 but are z and z.You can use height z and width r   c                 3   s    | ]}| j v V  qd S N_callback_tensor_inputs.0krS   r!   r/   	<genexpr>+  s    

z;StableDiffusion3PAGPipeline.check_inputs.<locals>.<genexpr>z2`callback_on_step_end_tensor_inputs` has to be in z, but found c                    s   g | ]	}| j vr|qS r!   r   r   r   r!   r/   
<listcomp>/  s    z<StableDiffusion3PAGPipeline.check_inputs.<locals>.<listcomp>zCannot forward both `prompt`: z and `prompt_embeds`: z2. Please make sure to only forward one of the two.z Cannot forward both `prompt_2`: z Cannot forward both `prompt_3`: zeProvide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined.z2`prompt` has to be of type `str` or `list` but is z4`prompt_2` has to be of type `str` or `list` but is z4`prompt_3` has to be of type `str` or `list` but is z'Cannot forward both `negative_prompt`: z and `negative_prompt_embeds`: z)Cannot forward both `negative_prompt_2`: z)Cannot forward both `negative_prompt_3`: zu`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but got: `prompt_embeds` z != `negative_prompt_embeds` zIf `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`.zIf `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`.i   z8`max_sequence_length` cannot be greater than 512 but is )
rA   rQ   r"   allr   rg   rh   listr   rm   )rS   rV   r   r   heightwidthr   r   r   r3   r4   r   r5   "callback_on_step_end_tensor_inputsrX   r!   r   r/   check_inputs  s   z(StableDiffusion3PAGPipeline.check_inputsc	           
      C   sz   |d ur|j ||dS ||t|| j t|| j f}	t|tr3t||kr3tdt| d| dt|	|||d}|S )NrZ   z/You have passed a list of generators of length z+, but requested an effective batch size of z@. Make sure the batch size matches the length of the generators.)	generatorr   rY   )rr   intrA   rg   r   r*   r"   r   )
rS   ru   num_channels_latentsr   r   rY   r   r   r2   rm   r!   r!   r/   prepare_latentsr  s   z+StableDiffusion3PAGPipeline.prepare_latentsc                 C      | j S r   _guidance_scaler   r!   r!   r/   guidance_scale     z*StableDiffusion3PAGPipeline.guidance_scalec                 C   r   r   )
_clip_skipr   r!   r!   r/   r}     r   z%StableDiffusion3PAGPipeline.clip_skipc                 C   s
   | j dkS )Nr   r   r   r!   r!   r/   r     s   
z7StableDiffusion3PAGPipeline.do_classifier_free_guidancec                 C   r   r   )_joint_attention_kwargsr   r!   r!   r/   joint_attention_kwargs  r   z2StableDiffusion3PAGPipeline.joint_attention_kwargsc                 C   r   r   )_num_timestepsr   r!   r!   r/   num_timesteps  r   z)StableDiffusion3PAGPipeline.num_timestepsc                 C   r   r   )
_interruptr   r!   r!   r/   	interrupt  r   z%StableDiffusion3PAGPipeline.interrupt   g      @pilr2   g      @g        r   r   r   r    r   r   output_typereturn_dictr   callback_on_step_endr   	pag_scalepag_adaptive_scalec           1      C   sx  |p| j | j }|p| j | j }| j||||||	|
|||||||d || _|| _|| _d| _|| _|| _|durAt	|t
rAd}n|durOt	|trOt|}n|jd }| j}| jdurc| jddnd}| jd#i d|d|d	|d
|	d|
d|d| jd|d|d|d|d|d| jd|d|d|\}}}}| jr| ||| j}| ||| j}n| jrtj||gdd}tj||gdd}trd}n|}t| j|||d\} }tt| || jj  d}!t| | _| jjj }"| !|| |"|||j"|||}| jr| jj#}#| j$| j%| jd | j&|d}$t'| D ]\}%}&| j(r(qt|g|jd |jd   }'|&)|'jd }(| j|'|(||| jddd })| jr]| *|)| j| j+|&})n| jrq|),d\}*}+|*| j+|+|*   })|j"},| jj-|)|&|ddd }|j"|,krtj.j/0 r|1|,}|duri }-|D ]
}.t2 |. |-|.< q|| |%|&|-}/|/3d|}|/3d|}|/3d|}|/3d|}|%t| d ks|%d |!kr|%d | jj dkr|$4  trt56  qW d   n	1 sw   Y  |d kr|}0n|| j7jj8 | j7jj9 }| j7j:|ddd }0| j;j<|0|d!}0| =  | jr1| j>|# |s7|0fS t?|0d"S )$a  
        Function invoked when calling the pipeline for generation.

        Args:
            prompt (`str` or `list[str]`, *optional*):
                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
                instead.
            prompt_2 (`str` or `list[str]`, *optional*):
                The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
                will be used instead
            prompt_3 (`str` or `list[str]`, *optional*):
                The prompt or prompts to be sent to `tokenizer_3` and `text_encoder_3`. If not defined, `prompt` is
                will be used instead
            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                The height in pixels of the generated image. This is set to 1024 by default for the best results.
            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                The width in pixels of the generated image. This is set to 1024 by default for the best results.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            sigmas (`list[float]`, *optional*):
                Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
                their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
                will be used.
            guidance_scale (`float`, *optional*, defaults to 7.0):
                Guidance scale as defined in [Classifier-Free Diffusion
                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
                `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
                the text `prompt`, usually at the expense of lower image quality.
            negative_prompt (`str` or `list[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            negative_prompt_2 (`str` or `list[str]`, *optional*):
                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
                `text_encoder_2`. If not defined, `negative_prompt` is used instead
            negative_prompt_3 (`str` or `list[str]`, *optional*):
                The prompt or prompts not to guide the image generation to be sent to `tokenizer_3` and
                `text_encoder_3`. If not defined, `negative_prompt` is used instead
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            generator (`torch.Generator` or `list[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            latents (`torch.FloatTensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will be generated by sampling using the supplied random `generator`.
            prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
                If not provided, pooled text embeddings will be generated from `prompt` input argument.
            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
                input argument.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generate image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
                of a plain tuple.
            joint_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            callback_on_step_end (`Callable`, *optional*):
                A function that calls at the end of each denoising steps during the inference. The function is called
                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
                `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`list`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                `._callback_tensor_inputs` attribute of your pipeline class.
            max_sequence_length (`int` defaults to 256): Maximum sequence length to use with the `prompt`.
            pag_scale (`float`, *optional*, defaults to 3.0):
                The scale factor for the perturbed attention guidance. If it is set to 0.0, the perturbed attention
                guidance will not be used.
            pag_adaptive_scale (`float`, *optional*, defaults to 0.0):
                The adaptive scale factor for the perturbed attention guidance. If it is set to 0.0, `pag_scale` is
                used.

        Examples:

        Returns:
            [`~pipelines.stable_diffusion_3.StableDiffusion3PipelineOutput`] or `tuple`:
            [`~pipelines.stable_diffusion_3.StableDiffusion3PipelineOutput`] if `return_dict` is True, otherwise a
            `tuple`. When returning a tuple, the first element is a list with the generated images.
        )	r   r   r   r3   r4   r   r5   r   rX   FNr   r   scalerV   r   r   r   r   r   r   r3   r4   r   r5   r   r}   rW   rX   r   r   cpu)r    )r?   r   )total)r   timestepencoder_hidden_statespooled_projectionsr   r   r   )r   r2   latent)r   )imagesr!   )@rP   rA   r   r   r   r   r   
_pag_scale_pag_adaptive_scalerg   rh   r   r*   rm   rf   r   getr   r   r}   do_perturbed_attention_guidance%_prepare_perturbed_attention_guidanceri   r   XLA_AVAILABLEr0   r+   maxorderr   r7   rI   in_channelsr   rY   attn_processors_set_pag_attn_processorr?   progress_bar	enumerater   expand#_apply_perturbed_attention_guidancer   chunkstepbackendsmpsis_availablerr   localspopupdatexm	mark_stepr8   scaling_factorshift_factordecoderK   postprocessmaybe_free_model_hooksset_attn_processorr   )1rS   rV   r   r   r   r   r   r    r   r   r   r   rW   r   r2   r3   r4   r   r5   r   r   r   r}   r   r   rX   r   r   ru   r   r   timestep_devicer   num_warmup_stepsr   original_attn_procr   itlatent_model_inputr   
noise_prednoise_pred_uncondnoise_pred_textlatents_dtypecallback_kwargsr   callback_outputsimager!   r!   r/   __call__  sF   


	




 


6
:
z$StableDiffusion3PAGPipeline.__call__)r6   )Nr   rU   NN)r   NNr   )Nr   TNNNNNNNNrU   N)	NNNNNNNNNr   )-__name__
__module____qualname____doc__model_cpu_offload_seq_optional_componentsr   r   r   r   r   r   r   r   rh   r   rF   r   ri   r   rY   r|   r   boolFloatTensorfloatr   r   r   propertyr   r}   r   r   r   r   no_gradr   EXAMPLE_DOC_STRING	Generatordictr   r   r   __classcell__r!   r!   rT   r/   r1      s   $	

/

<

=


	

 J
j








	
r1   )NNNN)7r$   typingr   r   ri   transformersr   r   r   r   rK   r	   loadersr
   r   models.attention_processorr   r   models.autoencodersr   models.transformersr   
schedulersr   utilsr   r   r   r   r   r   utils.torch_utilsr   pipeline_utilsr   "stable_diffusion_3.pipeline_outputr   	pag_utilsr   torch_xla.core.xla_modelcore	xla_modelr   r   
get_loggerr   rp   r  r   rh   r   r   r  r0   r1   r!   r!   r!   r/   <module>   sF    



;