o
    Gi                     @   s   d dl mZ d dlZd dlZd dlmZmZmZm	Z	m
Z
 ddlmZmZmZ ddlmZmZmZ ddlmZ dd	lmZ d
dlmZ d
dlmZ d
dlmZ d
dlmZ d
dlm Z  dZ!dZ"dZ#G dd deZ$G dd deZ%G dd deZ&dS )    )CallableN)CLIPImageProcessorCLIPTextModelWithProjectionCLIPTokenizerCLIPVisionModelWithProjectionXLMRobertaTokenizer   )PriorTransformerUNet2DConditionModelVQModel)DDIMSchedulerDDPMSchedulerUnCLIPScheduler)replace_example_docstring   )DiffusionPipeline   )KandinskyPipeline)KandinskyImg2ImgPipeline)KandinskyInpaintPipeline)KandinskyPriorPipeline)MultilingualCLIPa  
    Examples:
        ```py
        from diffusers import AutoPipelineForText2Image
        import torch

        pipe = AutoPipelineForText2Image.from_pretrained(
            "kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16
        )
        pipe.enable_model_cpu_offload()

        prompt = "A lion in galaxies, spirals, nebulae, stars, smoke, iridescent, intricate detail, octane render, 8k"

        image = pipe(prompt=prompt, num_inference_steps=25).images[0]
        ```
a~  
    Examples:
        ```py
        from diffusers import AutoPipelineForImage2Image
        import torch
        import requests
        from io import BytesIO
        from PIL import Image
        import os

        pipe = AutoPipelineForImage2Image.from_pretrained(
            "kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16
        )
        pipe.enable_model_cpu_offload()

        prompt = "A fantasy landscape, Cinematic lighting"
        negative_prompt = "low quality, bad quality"

        url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"

        response = requests.get(url)
        image = Image.open(BytesIO(response.content)).convert("RGB")
        image.thumbnail((768, 768))

        image = pipe(prompt=prompt, image=original_image, num_inference_steps=25).images[0]
        ```
a  
    Examples:
        ```py
        from diffusers import AutoPipelineForInpainting
        from diffusers.utils import load_image
        import torch
        import numpy as np

        pipe = AutoPipelineForInpainting.from_pretrained(
            "kandinsky-community/kandinsky-2-1-inpaint", torch_dtype=torch.float16
        )
        pipe.enable_model_cpu_offload()

        prompt = "A fantasy landscape, Cinematic lighting"
        negative_prompt = "low quality, bad quality"

        original_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/kandinsky/cat.png"
        )

        mask = np.zeros((768, 768), dtype=np.float32)
        # Let's mask out an area above the cat's head
        mask[:250, 250:-250] = 1

        image = pipe(prompt=prompt, image=original_image, mask_image=mask, num_inference_steps=25).images[0]
        ```
c                !       sZ  e Zd ZdZdZdZdgZdedede	de
eB d	eded
ededededef fddZd4dedB fddZd5dedB dejeB fddZd5ddZdd Ze ee					 	 		!			"			d6d#ee e B d$ee e B dB d%ed&e!d'ed(ed)ed*e!d+ed,ej"e ej" B dB d-ej#dB d.edB d/eeeej#gdf dB d0ed1e$fd2d3Z%  Z&S )7KandinskyCombinedPipelinea  
    Combined Pipeline for text-to-image generation using Kandinsky

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

    Args:
        text_encoder ([`MultilingualCLIP`]):
            Frozen text-encoder.
        tokenizer ([`XLMRobertaTokenizer`]):
            Tokenizer of class
        scheduler (`DDIMScheduler` | `DDPMScheduler`):
            A scheduler to be used in combination with `unet` to generate image latents.
        unet ([`UNet2DConditionModel`]):
            Conditional U-Net architecture to denoise the image embedding.
        movq ([`VQModel`]):
            MoVQ Decoder to generate the image from the latents.
        prior_prior ([`PriorTransformer`]):
            The canonical unCLIP prior to approximate the image embedding from the text embedding.
        prior_image_encoder ([`CLIPVisionModelWithProjection`]):
            Frozen image-encoder.
        prior_text_encoder ([`CLIPTextModelWithProjection`]):
            Frozen text-encoder.
        prior_tokenizer (`CLIPTokenizer`):
             Tokenizer of class
             [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
        prior_scheduler ([`UnCLIPScheduler`]):
            A scheduler to be used in combination with `prior` to generate image embedding.
    TzNtext_encoder->unet->movq->prior_prior->prior_image_encoder->prior_text_encoderprior_priortext_encoder	tokenizerunet	schedulermovqprior_image_encoderprior_text_encoderprior_tokenizerprior_schedulerprior_image_processorc                    X   t    | j|||||||||	|
|d t||||	|
|d| _t|||||d| _d S N)r   r   r   r   r   r   r   r    r!   r"   r#   )priorimage_encoderr   r   r   image_processor)r   r   r   r   r   )super__init__register_modulesr   
prior_piper   decoder_pipeselfr   r   r   r   r   r   r   r    r!   r"   r#   	__class__ m/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.pyr*      :   
z"KandinskyCombinedPipeline.__init__Nattention_opc                 C      | j | d S Nr-   *enable_xformers_memory_efficient_attentionr/   r5   r2   r2   r3   r9         zDKandinskyCombinedPipeline.enable_xformers_memory_efficient_attentiongpu_iddevicec                 C   $   | j j||d | jj||d dS )u  
        Offloads all models (`unet`, `text_encoder`, `vae`, and `safety checker` state dicts) to CPU using 🤗
        Accelerate, significantly reducing memory usage. Models are moved to a `torch.device('meta')` and loaded on a
        GPU only when their specific submodule's `forward` method is called. Offloading happens on a submodule basis.
        Memory savings are higher than using `enable_model_cpu_offload`, but performance is lower.
        r<   r=   Nr,   enable_sequential_cpu_offloadr-   r/   r<   r=   r2   r2   r3   rA      s   z7KandinskyCombinedPipeline.enable_sequential_cpu_offloadc                 C   .   | j j||d | jj||d | j  d S N)iterabletotalr,   progress_barr-   enable_model_cpu_offloadr/   rE   rF   r2   r2   r3   rH         z&KandinskyCombinedPipeline.progress_barc                 K   (   | j jdi | | jjdi | d S Nr2   r,   set_progress_bar_configr-   r/   kwargsr2   r2   r3   rO         z1KandinskyCombinedPipeline.set_progress_bar_configd         @r         pilpromptnegative_promptnum_inference_stepsguidance_scalenum_images_per_promptheightwidthprior_guidance_scaleprior_num_inference_steps	generatorlatentsoutput_typecallbackcallback_stepsreturn_dictc                 C   s   | j ||||	|
||ddd	}|d }|d }t|ttfs |gn|}t||jd k rA|jd t| dkrA|jd t| | }| j|||||||
|||||d}|   |S )a  
        Function invoked when calling the pipeline for generation.

        Args:
            prompt (`str` or `list[str]`):
                The prompt or prompts to guide the image generation.
            negative_prompt (`str` or `list[str]`, *optional*):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `guidance_scale` is less than `1`).
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            num_inference_steps (`int`, *optional*, defaults to 100):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            height (`int`, *optional*, defaults to 512):
                The height in pixels of the generated image.
            width (`int`, *optional*, defaults to 512):
                The width in pixels of the generated image.
            prior_guidance_scale (`float`, *optional*, defaults to 4.0):
                Guidance scale as defined in [Classifier-Free Diffusion
                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
                `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
                the text `prompt`, usually at the expense of lower image quality.
            prior_num_inference_steps (`int`, *optional*, defaults to 100):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            guidance_scale (`float`, *optional*, defaults to 4.0):
                Guidance scale as defined in [Classifier-Free Diffusion
                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
                `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
                the text `prompt`, usually at the expense of lower image quality.
            generator (`torch.Generator` or `list[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will be generated by sampling using the supplied random `generator`.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
                (`np.array`) or `"pt"` (`torch.Tensor`).
            callback (`Callable`, *optional*):
                A function that calls every `callback_steps` steps during inference. The function is called with the
                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function is called. If not specified, the callback is called at
                every step.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.

        Examples:

        Returns:
            [`~pipelines.ImagePipelineOutput`] or `tuple`
        ptF	rX   rY   r\   rZ   ra   rb   r[   rc   rf   r   r   )rX   image_embedsnegative_image_embedsr^   r]   rZ   ra   r[   rc   rd   re   rf   )r,   
isinstancelisttuplelenshaper-   maybe_free_model_hooks)r/   rX   rY   rZ   r[   r\   r]   r^   r_   r`   ra   rb   rc   rd   re   rf   prior_outputsri   rj   outputsr2   r2   r3   __call__   s@   M(z"KandinskyCombinedPipeline.__call__r7   NNNrS   rT   r   rU   rU   rT   rV   NNrW   Nr   T)'__name__
__module____qualname____doc___load_connected_pipesmodel_cpu_offload_seq_exclude_from_cpu_offloadr   r   r
   r   r   r   r	   r   r   r   r   r   r*   r   r9   inttorchr=   strrA   rH   rO   no_gradr   TEXT2IMAGE_EXAMPLE_DOC_STRINGrl   float	GeneratorTensorboolrs   __classcell__r2   r2   r0   r3   r   q   s    	
-


	
r   c                %       s  e Zd ZdZdZdZdgZdedede	de
eB d	eded
ededededef fddZd7dedB fddZd8dedB dejeB fddZd8ddZdd Ze ee					 	!	!		"			#			d9d$ee e B d%ej!e"j#j#B e ej! B e e"j#j# B d&ee e B dB d'ed(e$d)ed*e$d+ed,ed-e$d.ed/ej%e ej% B dB d0ej!dB d1edB d2eeeej!gdf dB d3ed4e&f"d5d6Z'  Z(S ): KandinskyImg2ImgCombinedPipelinea  
    Combined Pipeline for image-to-image generation using Kandinsky

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

    Args:
        text_encoder ([`MultilingualCLIP`]):
            Frozen text-encoder.
        tokenizer ([`XLMRobertaTokenizer`]):
            Tokenizer of class
        scheduler (`DDIMScheduler` | `DDPMScheduler`):
            A scheduler to be used in combination with `unet` to generate image latents.
        unet ([`UNet2DConditionModel`]):
            Conditional U-Net architecture to denoise the image embedding.
        movq ([`VQModel`]):
            MoVQ Decoder to generate the image from the latents.
        prior_prior ([`PriorTransformer`]):
            The canonical unCLIP prior to approximate the image embedding from the text embedding.
        prior_image_encoder ([`CLIPVisionModelWithProjection`]):
            Frozen image-encoder.
        prior_text_encoder ([`CLIPTextModelWithProjection`]):
            Frozen text-encoder.
        prior_tokenizer (`CLIPTokenizer`):
             Tokenizer of class
             [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
        prior_scheduler ([`UnCLIPScheduler`]):
            A scheduler to be used in combination with `prior` to generate image embedding.
    TNprior_text_encoder->prior_image_encoder->prior_prior->text_encoder->unet->movqr   r   r   r   r   r   r   r    r!   r"   r#   c                    r$   r%   )r)   r*   r+   r   r,   r   r-   r.   r0   r2   r3   r*   n  r4   z)KandinskyImg2ImgCombinedPipeline.__init__Nr5   c                 C   r6   r7   r8   r:   r2   r2   r3   r9     r;   zKKandinskyImg2ImgCombinedPipeline.enable_xformers_memory_efficient_attentionr<   r=   c                 C   r>   a  
        Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
        text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
        `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called.
        Note that offloading happens on a submodule basis. Memory savings are higher than with
        `enable_model_cpu_offload`, but performance is lower.
        r?   Nr@   rB   r2   r2   r3   rA        z>KandinskyImg2ImgCombinedPipeline.enable_sequential_cpu_offloadc                 C   rC   rD   rG   rJ   r2   r2   r3   rH     rK   z-KandinskyImg2ImgCombinedPipeline.progress_barc                 K   rL   rM   rN   rP   r2   r2   r3   rO     rR   z8KandinskyImg2ImgCombinedPipeline.set_progress_bar_configrS   rT   r   333333?rU   rV   rW   rX   imagerY   rZ   r[   r\   strengthr]   r^   r_   r`   ra   rb   rc   rd   re   rf   c                 C   s  | j |||||||
ddd	}|d }|d }t|ttfs |gn|}t|tjjr,|gn|}t||jd k rM|jd t| dkrM|jd t| | }t|ttfrst||jd k rs|jd t| dkrs|jd t| | }| j||||||	||||||||d}| 	  |S )a  
        Function invoked when calling the pipeline for generation.

        Args:
            prompt (`str` or `list[str]`):
                The prompt or prompts to guide the image generation.
            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `list[torch.Tensor]`, `list[PIL.Image.Image]`, or `list[np.ndarray]`):
                `Image`, or tensor representing an image batch, that will be used as the starting point for the
                process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded
                again.
            negative_prompt (`str` or `list[str]`, *optional*):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `guidance_scale` is less than `1`).
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            num_inference_steps (`int`, *optional*, defaults to 100):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            height (`int`, *optional*, defaults to 512):
                The height in pixels of the generated image.
            width (`int`, *optional*, defaults to 512):
                The width in pixels of the generated image.
            strength (`float`, *optional*, defaults to 0.3):
                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
                will be used as a starting point, adding more noise to it the larger the `strength`. The number of
                denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
                be maximum and the denoising process will run for the full number of iterations specified in
                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
            prior_guidance_scale (`float`, *optional*, defaults to 4.0):
                Guidance scale as defined in [Classifier-Free Diffusion
                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
                `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
                the text `prompt`, usually at the expense of lower image quality.
            prior_num_inference_steps (`int`, *optional*, defaults to 100):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            guidance_scale (`float`, *optional*, defaults to 4.0):
                Guidance scale as defined in [Classifier-Free Diffusion
                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
                `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
                the text `prompt`, usually at the expense of lower image quality.
            generator (`torch.Generator` or `list[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will be generated by sampling using the supplied random `generator`.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
                (`np.array`) or `"pt"` (`torch.Tensor`).
            callback (`Callable`, *optional*):
                A function that calls every `callback_steps` steps during inference. The function is called with the
                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function is called. If not specified, the callback is called at
                every step.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.

        Examples:

        Returns:
            [`~pipelines.ImagePipelineOutput`] or `tuple`
        rg   Frh   r   r   )rX   r   ri   rj   r   r^   r]   rZ   ra   r[   rc   rd   re   rf   
r,   rk   rl   rm   PILImagern   ro   r-   rp   )r/   rX   r   rY   rZ   r[   r\   r   r]   r^   r_   r`   ra   rb   rc   rd   re   rf   rq   ri   rj   rr   r2   r2   r3   rs     sP   Y(z)KandinskyImg2ImgCombinedPipeline.__call__r7   rt   )NrS   rT   r   r   rU   rU   rT   rV   NNrW   Nr   T))rv   rw   rx   ry   rz   r{   r|   r   r   r
   r   r   r   r	   r   r   r   r   r   r*   r   r9   r}   r~   r=   r   rA   rH   rO   r   r   IMAGE2IMAGE_EXAMPLE_DOC_STRINGrl   r   r   r   r   r   r   rs   r   r2   r2   r0   r3   r   K  s    	
-

"	
r   c                %       s  e Zd ZdZdZdZdgZdedede	de
eB d	eded
ededededef fddZd6dedB fddZd7dedB dejeB fddZd7ddZdd Ze ee					 	 		!			"			d8d#ee e B d$ej!e"j#j#B e ej! B e e"j#j# B d%ej!e"j#j#B e ej! B e e"j#j# B d&ee e B dB d'ed(e$d)ed*ed+ed,e$d-ed.ej%e ej% B dB d/ej!dB d0edB d1eeeej!gdf dB d2ed3e&f"d4d5Z'  Z(S )9 KandinskyInpaintCombinedPipelinea  
    Combined Pipeline for generation using Kandinsky

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

    Args:
        text_encoder ([`MultilingualCLIP`]):
            Frozen text-encoder.
        tokenizer ([`XLMRobertaTokenizer`]):
            Tokenizer of class
        scheduler (`DDIMScheduler` | `DDPMScheduler`):
            A scheduler to be used in combination with `unet` to generate image latents.
        unet ([`UNet2DConditionModel`]):
            Conditional U-Net architecture to denoise the image embedding.
        movq ([`VQModel`]):
            MoVQ Decoder to generate the image from the latents.
        prior_prior ([`PriorTransformer`]):
            The canonical unCLIP prior to approximate the image embedding from the text embedding.
        prior_image_encoder ([`CLIPVisionModelWithProjection`]):
            Frozen image-encoder.
        prior_text_encoder ([`CLIPTextModelWithProjection`]):
            Frozen text-encoder.
        prior_tokenizer (`CLIPTokenizer`):
             Tokenizer of class
             [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
        prior_scheduler ([`UnCLIPScheduler`]):
            A scheduler to be used in combination with `prior` to generate image embedding.
    Tr   r   r   r   r   r   r   r   r    r!   r"   r#   c                    r$   r%   )r)   r*   r+   r   r,   r   r-   r.   r0   r2   r3   r*   _  r4   z)KandinskyInpaintCombinedPipeline.__init__Nr5   c                 C   r6   r7   r8   r:   r2   r2   r3   r9     r;   zKKandinskyInpaintCombinedPipeline.enable_xformers_memory_efficient_attentionr<   r=   c                 C   r>   r   r@   rB   r2   r2   r3   rA     r   z>KandinskyInpaintCombinedPipeline.enable_sequential_cpu_offloadc                 C   rC   rD   rG   rJ   r2   r2   r3   rH     rK   z-KandinskyInpaintCombinedPipeline.progress_barc                 K   rL   rM   rN   rP   r2   r2   r3   rO     rR   z8KandinskyInpaintCombinedPipeline.set_progress_bar_configrS   rT   r   rU   rV   rW   rX   r   
mask_imagerY   rZ   r[   r\   r]   r^   r_   r`   ra   rb   rc   rd   re   rf   c                 C   s|  | j |||||||
ddd	}|d }|d }t|ttfs |gn|}t|tjjr,|gn|}t|tjjr8|gn|}t||jd k rY|jd t| dkrY|jd t| | }t|ttfrt||jd k r|jd t| dkr|jd t| | }t|ttfrt||jd k r|jd t| dkr|jd t| | }| j||||||	||||||||d}| 	  |S )a  
        Function invoked when calling the pipeline for generation.

        Args:
            prompt (`str` or `list[str]`):
                The prompt or prompts to guide the image generation.
            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `list[torch.Tensor]`, `list[PIL.Image.Image]`, or `list[np.ndarray]`):
                `Image`, or tensor representing an image batch, that will be used as the starting point for the
                process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded
                again.
            mask_image (`np.array`):
                Tensor representing an image batch, to mask `image`. White pixels in the mask will be repainted, while
                black pixels will be preserved. If `mask_image` is a PIL image, it will be converted to a single
                channel (luminance) before use. If it's a tensor, it should contain one color channel (L) instead of 3,
                so the expected shape would be `(B, H, W, 1)`.
            negative_prompt (`str` or `list[str]`, *optional*):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `guidance_scale` is less than `1`).
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            num_inference_steps (`int`, *optional*, defaults to 100):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            height (`int`, *optional*, defaults to 512):
                The height in pixels of the generated image.
            width (`int`, *optional*, defaults to 512):
                The width in pixels of the generated image.
            prior_guidance_scale (`float`, *optional*, defaults to 4.0):
                Guidance scale as defined in [Classifier-Free Diffusion
                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
                `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
                the text `prompt`, usually at the expense of lower image quality.
            prior_num_inference_steps (`int`, *optional*, defaults to 100):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            guidance_scale (`float`, *optional*, defaults to 4.0):
                Guidance scale as defined in [Classifier-Free Diffusion
                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
                `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
                the text `prompt`, usually at the expense of lower image quality.
            generator (`torch.Generator` or `list[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will be generated by sampling using the supplied random `generator`.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
                (`np.array`) or `"pt"` (`torch.Tensor`).
            callback (`Callable`, *optional*):
                A function that calls every `callback_steps` steps during inference. The function is called with the
                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function is called. If not specified, the callback is called at
                every step.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.

        Examples:

        Returns:
            [`~pipelines.ImagePipelineOutput`] or `tuple`
        rg   Frh   r   r   )rX   r   r   ri   rj   r^   r]   rZ   ra   r[   rc   rd   re   rf   r   )r/   rX   r   r   rY   rZ   r[   r\   r]   r^   r_   r`   ra   rb   rc   rd   re   rf   rq   ri   rj   rr   r2   r2   r3   rs     s\   X(z)KandinskyInpaintCombinedPipeline.__call__r7   rt   ru   ))rv   rw   rx   ry   rz   r{   r|   r   r   r
   r   r   r   r	   r   r   r   r   r   r*   r   r9   r}   r~   r=   r   rA   rH   rO   r   r   INPAINT_EXAMPLE_DOC_STRINGrl   r   r   r   r   r   r   rs   r   r2   r2   r0   r3   r   <  s    	
-

""	
r   )'typingr   	PIL.Imager   r~   transformersr   r   r   r   r   modelsr	   r
   r   
schedulersr   r   r   utilsr   pipeline_utilsr   pipeline_kandinskyr   pipeline_kandinsky_img2imgr   pipeline_kandinsky_inpaintr   pipeline_kandinsky_priorr   r   r   r   r   r   r   r   r   r2   r2   r2   r3   <module>   s*    [ r