o
    pid                     @   s   d dl mZmZmZmZ d dlZd dlZd dlm	Z	m
Z
mZmZmZ ddlmZmZmZ ddlmZmZmZ ddlmZ dd	lmZ d
dlmZ d
dlmZ d
dlmZ d
dl m!Z! d
dl"m#Z# dZ$dZ%dZ&G dd deZ'G dd deZ(G dd deZ)dS )    )CallableListOptionalUnionN)CLIPImageProcessorCLIPTextModelWithProjectionCLIPTokenizerCLIPVisionModelWithProjectionXLMRobertaTokenizer   )PriorTransformerUNet2DConditionModelVQModel)DDIMSchedulerDDPMSchedulerUnCLIPScheduler)replace_example_docstring   )DiffusionPipeline   )KandinskyPipeline)KandinskyImg2ImgPipeline)KandinskyInpaintPipeline)KandinskyPriorPipeline)MultilingualCLIPa  
    Examples:
        ```py
        from diffusers import AutoPipelineForText2Image
        import torch

        pipe = AutoPipelineForText2Image.from_pretrained(
            "kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16
        )
        pipe.enable_model_cpu_offload()

        prompt = "A lion in galaxies, spirals, nebulae, stars, smoke, iridescent, intricate detail, octane render, 8k"

        image = pipe(prompt=prompt, num_inference_steps=25).images[0]
        ```
a~  
    Examples:
        ```py
        from diffusers import AutoPipelineForImage2Image
        import torch
        import requests
        from io import BytesIO
        from PIL import Image
        import os

        pipe = AutoPipelineForImage2Image.from_pretrained(
            "kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16
        )
        pipe.enable_model_cpu_offload()

        prompt = "A fantasy landscape, Cinematic lighting"
        negative_prompt = "low quality, bad quality"

        url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"

        response = requests.get(url)
        image = Image.open(BytesIO(response.content)).convert("RGB")
        image.thumbnail((768, 768))

        image = pipe(prompt=prompt, image=original_image, num_inference_steps=25).images[0]
        ```
a  
    Examples:
        ```py
        from diffusers import AutoPipelineForInpainting
        from diffusers.utils import load_image
        import torch
        import numpy as np

        pipe = AutoPipelineForInpainting.from_pretrained(
            "kandinsky-community/kandinsky-2-1-inpaint", torch_dtype=torch.float16
        )
        pipe.enable_model_cpu_offload()

        prompt = "A fantasy landscape, Cinematic lighting"
        negative_prompt = "low quality, bad quality"

        original_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/kandinsky/cat.png"
        )

        mask = np.zeros((768, 768), dtype=np.float32)
        # Let's mask out an area above the cat's head
        mask[:250, 250:-250] = 1

        image = pipe(prompt=prompt, image=original_image, mask_image=mask, num_inference_steps=25).images[0]
        ```
c                !       sV  e Zd ZdZdZdZdgZdedede	de
eef d	eded
ededededef fddZd3dee fddZd4ddZd5ddZdd Ze ee								 			!			d6d"e
ee e f d#ee
ee e f  d$e!d%e"d&e!d'e!d(e!d)e"d*e!d+ee
ej#e ej# f  d,eej$ d-ee d.eee!e!ej$gdf  d/e!d0e%fd1d2Z&  Z'S )7KandinskyCombinedPipelinea  
    Combined Pipeline for text-to-image generation using Kandinsky

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

    Args:
        text_encoder ([`MultilingualCLIP`]):
            Frozen text-encoder.
        tokenizer ([`XLMRobertaTokenizer`]):
            Tokenizer of class
        scheduler (Union[`DDIMScheduler`,`DDPMScheduler`]):
            A scheduler to be used in combination with `unet` to generate image latents.
        unet ([`UNet2DConditionModel`]):
            Conditional U-Net architecture to denoise the image embedding.
        movq ([`VQModel`]):
            MoVQ Decoder to generate the image from the latents.
        prior_prior ([`PriorTransformer`]):
            The canonical unCLIP prior to approximate the image embedding from the text embedding.
        prior_image_encoder ([`CLIPVisionModelWithProjection`]):
            Frozen image-encoder.
        prior_text_encoder ([`CLIPTextModelWithProjection`]):
            Frozen text-encoder.
        prior_tokenizer (`CLIPTokenizer`):
             Tokenizer of class
             [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
        prior_scheduler ([`UnCLIPScheduler`]):
            A scheduler to be used in combination with `prior` to generate image embedding.
    TzNtext_encoder->unet->movq->prior_prior->prior_image_encoder->prior_text_encoderprior_priortext_encoder	tokenizerunet	schedulermovqprior_image_encoderprior_text_encoderprior_tokenizerprior_schedulerprior_image_processorc                    X   t    | j|||||||||	|
|d t||||	|
|d| _t|||||d| _d S N)r   r   r   r    r!   r   r"   r#   r$   r%   r&   )priorimage_encoderr   r   r    image_processor)r   r   r   r    r!   )super__init__register_modulesr   
prior_piper   decoder_pipeselfr   r   r   r    r!   r   r"   r#   r$   r%   r&   	__class__ w/home/ubuntu/SoloSpeech/.venv/lib/python3.10/site-packages/diffusers/pipelines/kandinsky/pipeline_kandinsky_combined.pyr-      :   
z"KandinskyCombinedPipeline.__init__Nattention_opc                 C      | j | d S Nr0   *enable_xformers_memory_efficient_attentionr2   r8   r5   r5   r6   r<         zDKandinskyCombinedPipeline.enable_xformers_memory_efficient_attentionr   c                 C       | j j|d | jj|d dS )u  
        Offloads all models (`unet`, `text_encoder`, `vae`, and `safety checker` state dicts) to CPU using 🤗
        Accelerate, significantly reducing memory usage. Models are moved to a `torch.device('meta')` and loaded on a
        GPU only when their specific submodule's `forward` method is called. Offloading happens on a submodule basis.
        Memory savings are higher than using `enable_model_cpu_offload`, but performance is lower.
        gpu_idNr/   enable_sequential_cpu_offloadr0   r2   rA   r5   r5   r6   rC      s   z7KandinskyCombinedPipeline.enable_sequential_cpu_offloadc                 C   .   | j j||d | jj||d | j  d S N)iterabletotalr/   progress_barr0   enable_model_cpu_offloadr2   rG   rH   r5   r5   r6   rJ         z&KandinskyCombinedPipeline.progress_barc                 K   (   | j jdi | | jjdi | d S Nr5   r/   set_progress_bar_configr0   r2   kwargsr5   r5   r6   rQ         z1KandinskyCombinedPipeline.set_progress_bar_configd         @r         pilpromptnegative_promptnum_inference_stepsguidance_scalenum_images_per_promptheightwidthprior_guidance_scaleprior_num_inference_steps	generatorlatentsoutput_typecallbackcallback_stepsreturn_dictc                 C   s   | j ||||	|
||ddd	}|d }|d }t|ttfs |gn|}t||jd k rA|jd t| dkrA|jd t| | }| j|||||||
|||||d}|   |S )a  
        Function invoked when calling the pipeline for generation.

        Args:
            prompt (`str` or `List[str]`):
                The prompt or prompts to guide the image generation.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `guidance_scale` is less than `1`).
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            num_inference_steps (`int`, *optional*, defaults to 100):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            height (`int`, *optional*, defaults to 512):
                The height in pixels of the generated image.
            width (`int`, *optional*, defaults to 512):
                The width in pixels of the generated image.
            prior_guidance_scale (`float`, *optional*, defaults to 4.0):
                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                `guidance_scale` is defined as `w` of equation 2. of [Imagen
                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                usually at the expense of lower image quality.
            prior_num_inference_steps (`int`, *optional*, defaults to 100):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            guidance_scale (`float`, *optional*, defaults to 4.0):
                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                `guidance_scale` is defined as `w` of equation 2. of [Imagen
                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                usually at the expense of lower image quality.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will ge generated by sampling using the supplied random `generator`.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
                (`np.array`) or `"pt"` (`torch.Tensor`).
            callback (`Callable`, *optional*):
                A function that calls every `callback_steps` steps during inference. The function is called with the
                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function is called. If not specified, the callback is called at
                every step.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.

        Examples:

        Returns:
            [`~pipelines.ImagePipelineOutput`] or `tuple`
        ptF	rZ   r[   r^   r\   rc   rd   r]   re   rh   r   r   )rZ   image_embedsnegative_image_embedsr`   r_   r\   rc   r]   re   rf   rg   rh   )r/   
isinstancelisttuplelenshaper0   maybe_free_model_hooks)r2   rZ   r[   r\   r]   r^   r_   r`   ra   rb   rc   rd   re   rf   rg   rh   prior_outputsrk   rl   outputsr5   r5   r6   __call__   s@   M(z"KandinskyCombinedPipeline.__call__r:   r   NNNrU   rV   r   rW   rW   rV   rX   NNrY   Nr   T)(__name__
__module____qualname____doc___load_connected_pipesmodel_cpu_offload_seq_exclude_from_cpu_offloadr   r
   r   r   r   r   r   r   r	   r   r   r   r   r-   r   r   r<   rC   rJ   rQ   torchno_gradr   TEXT2IMAGE_EXAMPLE_DOC_STRINGstrr   intfloat	GeneratorTensorboolru   __classcell__r5   r5   r3   r6   r   q   s    
	
-


	
r   c                %       s  e Zd ZdZdZdZdgZdedede	de
eef d	eded
ededededef fddZd6dee fddZd7ddZd8ddZdd Ze ee						 	 		!			"			d9d#e
ee e f d$e
ej!e"j#j#e ej! e e"j#j# f d%ee
ee e f  d&e$d'e%d(e$d)e%d*e$d+e$d,e%d-e$d.ee
ej&e ej& f  d/eej! d0ee d1eee$e$ej!gdf  d2e$d3e'f"d4d5Z(  Z)S ): KandinskyImg2ImgCombinedPipelinea  
    Combined Pipeline for image-to-image generation using Kandinsky

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

    Args:
        text_encoder ([`MultilingualCLIP`]):
            Frozen text-encoder.
        tokenizer ([`XLMRobertaTokenizer`]):
            Tokenizer of class
        scheduler (Union[`DDIMScheduler`,`DDPMScheduler`]):
            A scheduler to be used in combination with `unet` to generate image latents.
        unet ([`UNet2DConditionModel`]):
            Conditional U-Net architecture to denoise the image embedding.
        movq ([`VQModel`]):
            MoVQ Decoder to generate the image from the latents.
        prior_prior ([`PriorTransformer`]):
            The canonical unCLIP prior to approximate the image embedding from the text embedding.
        prior_image_encoder ([`CLIPVisionModelWithProjection`]):
            Frozen image-encoder.
        prior_text_encoder ([`CLIPTextModelWithProjection`]):
            Frozen text-encoder.
        prior_tokenizer (`CLIPTokenizer`):
             Tokenizer of class
             [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
        prior_scheduler ([`UnCLIPScheduler`]):
            A scheduler to be used in combination with `prior` to generate image embedding.
    TNprior_text_encoder->prior_image_encoder->prior_prior->text_encoder->unet->movqr   r   r   r   r    r!   r"   r#   r$   r%   r&   c                    r'   r(   )r,   r-   r.   r   r/   r   r0   r1   r3   r5   r6   r-   n  r7   z)KandinskyImg2ImgCombinedPipeline.__init__Nr8   c                 C   r9   r:   r;   r=   r5   r5   r6   r<     r>   zKKandinskyImg2ImgCombinedPipeline.enable_xformers_memory_efficient_attentionr   c                 C   r?   a  
        Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
        text_encoder, vae and safety checker have their state dicts saved to CPU and then are moved to a
        `torch.device('meta') and loaded to GPU only when their specific submodule has its `forward` method called.
        Note that offloading happens on a submodule basis. Memory savings are higher than with
        `enable_model_cpu_offload`, but performance is lower.
        r@   NrB   rD   r5   r5   r6   rC        z>KandinskyImg2ImgCombinedPipeline.enable_sequential_cpu_offloadc                 C   rE   rF   rI   rL   r5   r5   r6   rJ     rM   z-KandinskyImg2ImgCombinedPipeline.progress_barc                 K   rN   rO   rP   rR   r5   r5   r6   rQ     rT   z8KandinskyImg2ImgCombinedPipeline.set_progress_bar_configrU   rV   r   333333?rW   rX   rY   rZ   imager[   r\   r]   r^   strengthr_   r`   ra   rb   rc   rd   re   rf   rg   rh   c                 C   s  | j |||||||
ddd	}|d }|d }t|ttfs |gn|}t|tjjr,|gn|}t||jd k rM|jd t| dkrM|jd t| | }t|ttfrst||jd k rs|jd t| dkrs|jd t| | }| j||||||	||||||||d}| 	  |S )a  
        Function invoked when calling the pipeline for generation.

        Args:
            prompt (`str` or `List[str]`):
                The prompt or prompts to guide the image generation.
            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
                `Image`, or tensor representing an image batch, that will be used as the starting point for the
                process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded
                again.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `guidance_scale` is less than `1`).
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            num_inference_steps (`int`, *optional*, defaults to 100):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            height (`int`, *optional*, defaults to 512):
                The height in pixels of the generated image.
            width (`int`, *optional*, defaults to 512):
                The width in pixels of the generated image.
            strength (`float`, *optional*, defaults to 0.3):
                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
                will be used as a starting point, adding more noise to it the larger the `strength`. The number of
                denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
                be maximum and the denoising process will run for the full number of iterations specified in
                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
            prior_guidance_scale (`float`, *optional*, defaults to 4.0):
                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                `guidance_scale` is defined as `w` of equation 2. of [Imagen
                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                usually at the expense of lower image quality.
            prior_num_inference_steps (`int`, *optional*, defaults to 100):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            guidance_scale (`float`, *optional*, defaults to 4.0):
                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                `guidance_scale` is defined as `w` of equation 2. of [Imagen
                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                usually at the expense of lower image quality.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will ge generated by sampling using the supplied random `generator`.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
                (`np.array`) or `"pt"` (`torch.Tensor`).
            callback (`Callable`, *optional*):
                A function that calls every `callback_steps` steps during inference. The function is called with the
                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function is called. If not specified, the callback is called at
                every step.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.

        Examples:

        Returns:
            [`~pipelines.ImagePipelineOutput`] or `tuple`
        ri   Frj   r   r   )rZ   r   rk   rl   r   r`   r_   r\   rc   r]   re   rf   rg   rh   
r/   rm   rn   ro   PILImagerp   rq   r0   rr   )r2   rZ   r   r[   r\   r]   r^   r   r_   r`   ra   rb   rc   rd   re   rf   rg   rh   rs   rk   rl   rt   r5   r5   r6   ru     sP   Y(z)KandinskyImg2ImgCombinedPipeline.__call__r:   rv   rw   )NrU   rV   r   r   rW   rW   rV   rX   NNrY   Nr   T)*ry   rz   r{   r|   r}   r~   r   r   r
   r   r   r   r   r   r   r	   r   r   r   r   r-   r   r   r<   rC   rJ   rQ   r   r   r   IMAGE2IMAGE_EXAMPLE_DOC_STRINGr   r   r   r   r   r   r   r   r   ru   r   r5   r5   r3   r6   r   K  s    
	
-

"	
r   c                %       s  e Zd ZdZdZdZdgZdedede	de
eef d	eded
ededededef fddZd5dee fddZd6ddZd7ddZdd Ze ee								 			!			d8d"e
ee e f d#e
ej!e"j#j#e ej! e e"j#j# f d$e
ej!e"j#j#e ej! e e"j#j# f d%ee
ee e f  d&e$d'e%d(e$d)e$d*e$d+e%d,e$d-ee
ej&e ej& f  d.eej! d/ee d0eee$e$ej!gdf  d1e$d2e'f"d3d4Z(  Z)S )9 KandinskyInpaintCombinedPipelinea  
    Combined Pipeline for generation using Kandinsky

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

    Args:
        text_encoder ([`MultilingualCLIP`]):
            Frozen text-encoder.
        tokenizer ([`XLMRobertaTokenizer`]):
            Tokenizer of class
        scheduler (Union[`DDIMScheduler`,`DDPMScheduler`]):
            A scheduler to be used in combination with `unet` to generate image latents.
        unet ([`UNet2DConditionModel`]):
            Conditional U-Net architecture to denoise the image embedding.
        movq ([`VQModel`]):
            MoVQ Decoder to generate the image from the latents.
        prior_prior ([`PriorTransformer`]):
            The canonical unCLIP prior to approximate the image embedding from the text embedding.
        prior_image_encoder ([`CLIPVisionModelWithProjection`]):
            Frozen image-encoder.
        prior_text_encoder ([`CLIPTextModelWithProjection`]):
            Frozen text-encoder.
        prior_tokenizer (`CLIPTokenizer`):
             Tokenizer of class
             [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
        prior_scheduler ([`UnCLIPScheduler`]):
            A scheduler to be used in combination with `prior` to generate image embedding.
    Tr   r   r   r   r   r    r!   r"   r#   r$   r%   r&   c                    r'   r(   )r,   r-   r.   r   r/   r   r0   r1   r3   r5   r6   r-   _  r7   z)KandinskyInpaintCombinedPipeline.__init__Nr8   c                 C   r9   r:   r;   r=   r5   r5   r6   r<     r>   zKKandinskyInpaintCombinedPipeline.enable_xformers_memory_efficient_attentionr   c                 C   r?   r   rB   rD   r5   r5   r6   rC     r   z>KandinskyInpaintCombinedPipeline.enable_sequential_cpu_offloadc                 C   rE   rF   rI   rL   r5   r5   r6   rJ     rM   z-KandinskyInpaintCombinedPipeline.progress_barc                 K   rN   rO   rP   rR   r5   r5   r6   rQ     rT   z8KandinskyInpaintCombinedPipeline.set_progress_bar_configrU   rV   r   rW   rX   rY   rZ   r   
mask_imager[   r\   r]   r^   r_   r`   ra   rb   rc   rd   re   rf   rg   rh   c                 C   s|  | j |||||||
ddd	}|d }|d }t|ttfs |gn|}t|tjjr,|gn|}t|tjjr8|gn|}t||jd k rY|jd t| dkrY|jd t| | }t|ttfrt||jd k r|jd t| dkr|jd t| | }t|ttfrt||jd k r|jd t| dkr|jd t| | }| j||||||	||||||||d}| 	  |S )a  
        Function invoked when calling the pipeline for generation.

        Args:
            prompt (`str` or `List[str]`):
                The prompt or prompts to guide the image generation.
            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
                `Image`, or tensor representing an image batch, that will be used as the starting point for the
                process. Can also accept image latents as `image`, if passing latents directly, it will not be encoded
                again.
            mask_image (`np.array`):
                Tensor representing an image batch, to mask `image`. White pixels in the mask will be repainted, while
                black pixels will be preserved. If `mask_image` is a PIL image, it will be converted to a single
                channel (luminance) before use. If it's a tensor, it should contain one color channel (L) instead of 3,
                so the expected shape would be `(B, H, W, 1)`.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `guidance_scale` is less than `1`).
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            num_inference_steps (`int`, *optional*, defaults to 100):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            height (`int`, *optional*, defaults to 512):
                The height in pixels of the generated image.
            width (`int`, *optional*, defaults to 512):
                The width in pixels of the generated image.
            prior_guidance_scale (`float`, *optional*, defaults to 4.0):
                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                `guidance_scale` is defined as `w` of equation 2. of [Imagen
                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                usually at the expense of lower image quality.
            prior_num_inference_steps (`int`, *optional*, defaults to 100):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            guidance_scale (`float`, *optional*, defaults to 4.0):
                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                `guidance_scale` is defined as `w` of equation 2. of [Imagen
                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                usually at the expense of lower image quality.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will ge generated by sampling using the supplied random `generator`.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
                (`np.array`) or `"pt"` (`torch.Tensor`).
            callback (`Callable`, *optional*):
                A function that calls every `callback_steps` steps during inference. The function is called with the
                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function is called. If not specified, the callback is called at
                every step.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.

        Examples:

        Returns:
            [`~pipelines.ImagePipelineOutput`] or `tuple`
        ri   Frj   r   r   )rZ   r   r   rk   rl   r`   r_   r\   rc   r]   re   rf   rg   rh   r   )r2   rZ   r   r   r[   r\   r]   r^   r_   r`   ra   rb   rc   rd   re   rf   rg   rh   rs   rk   rl   rt   r5   r5   r6   ru     s\   X(z)KandinskyInpaintCombinedPipeline.__call__r:   rv   rw   rx   )*ry   rz   r{   r|   r}   r~   r   r   r
   r   r   r   r   r   r   r	   r   r   r   r   r-   r   r   r<   rC   rJ   rQ   r   r   r   INPAINT_EXAMPLE_DOC_STRINGr   r   r   r   r   r   r   r   r   ru   r   r5   r5   r3   r6   r   <  s    
	
-

""	
r   )*typingr   r   r   r   	PIL.Imager   r   transformersr   r   r   r	   r
   modelsr   r   r   
schedulersr   r   r   utilsr   pipeline_utilsr   pipeline_kandinskyr   pipeline_kandinsky_img2imgr   pipeline_kandinsky_inpaintr   pipeline_kandinsky_priorr   r   r   r   r   r   r   r   r   r5   r5   r5   r6   <module>   s*    [ r