o
    ۷iVU                     @   s   d dl mZ d dlZd dlZd dlmZ ddlmZ ddl	m
Z
mZ ddlmZ ddlmZmZmZ dd	lmZ d
dlmZmZ ddlmZ e rUd dlm  mZ dZndZeeZ dZ!dddZ"G dd deZ#dS )    )CallableN)XLMRobertaTokenizer   )VaeImageProcessor)UNet2DConditionModelVQModel)DDIMScheduler)is_torch_xla_availableloggingreplace_example_docstring)randn_tensor   )DiffusionPipelineImagePipelineOutput   )MultilingualCLIPTFa  
    Examples:
        ```py
        >>> from diffusers import KandinskyImg2ImgPipeline, KandinskyPriorPipeline
        >>> from diffusers.utils import load_image
        >>> import torch

        >>> pipe_prior = KandinskyPriorPipeline.from_pretrained(
        ...     "kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16
        ... )
        >>> pipe_prior.to("cuda")

        >>> prompt = "A red cartoon frog, 4k"
        >>> image_emb, zero_image_emb = pipe_prior(prompt, return_dict=False)

        >>> pipe = KandinskyImg2ImgPipeline.from_pretrained(
        ...     "kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16
        ... )
        >>> pipe.to("cuda")

        >>> init_image = load_image(
        ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
        ...     "/kandinsky/frog.png"
        ... )

        >>> image = pipe(
        ...     prompt,
        ...     image=init_image,
        ...     image_embeds=image_emb,
        ...     negative_image_embeds=zero_image_emb,
        ...     height=768,
        ...     width=768,
        ...     num_inference_steps=100,
        ...     strength=0.2,
        ... ).images

        >>> image[0].save("red_frog.png")
        ```
   c                 C   sX   | |d  }| |d  dkr|d7 }||d  }||d  dkr$|d7 }|| || fS )Nr   r   r    )hwscale_factornew_hnew_wr   r   n/home/ubuntu/vllm_env/lib/python3.10/site-packages/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.pyget_new_h_wS   s   r   c                #       sL  e Zd ZdZdZdedededede	f
 fdd	Z
d
d Zdd Z	d0ddZdejdejdejdejfddZe ee												d1deee B dejejjB eej B eejj B d ejd!ejd"eee B dB d#ed$ed%ed&ed'ed(ed)ejeej B dB d*edB d+eeeejgdf dB d,ed-ef d.d/Z  ZS )2KandinskyImg2ImgPipelinea  
    Pipeline for image-to-image generation using Kandinsky

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

    Args:
        text_encoder ([`MultilingualCLIP`]):
            Frozen text-encoder.
        tokenizer ([`XLMRobertaTokenizer`]):
            Tokenizer of class
        scheduler ([`DDIMScheduler`]):
            A scheduler to be used in combination with `unet` to generate image latents.
        unet ([`UNet2DConditionModel`]):
            Conditional U-Net architecture to denoise the image embedding.
        movq ([`VQModel`]):
            MoVQ image encoder and decoder
    ztext_encoder->unet->movqtext_encodermovq	tokenizerunet	schedulerc                    sx   t    | j|||||d t| dd r dt| jjjd  nd| _t| dd r.| jjj	nd}t
| j|ddd| _d S )	N)r   r   r   r    r   r   r   r   r      bicubic)vae_scale_factorvae_latent_channelsresamplereducing_gap)super__init__register_modulesgetattrlenr   configblock_out_channelsmovq_scale_factorlatent_channelsr   image_processor)selfr   r   r   r   r    movq_latent_channels	__class__r   r   r(   s   s"   
$z!KandinskyImg2ImgPipeline.__init__c                 C   s<   t t|| |}t|| d}| jj|d  }||| fS )Nr   )minintmaxr    	timesteps)r1   num_inference_stepsstrengthdeviceinit_timestept_startr8   r   r   r   get_timesteps   s   z&KandinskyImg2ImgPipeline.get_timestepsc           	      C   sv   |d u rt ||||d}n|j|krtd|j d| ||}||j }|j}t ||||d}| |||}|S )N)	generatorr;   dtypezUnexpected latents shape, got z, expected )r   shape
ValueErrortoinit_noise_sigma	add_noise)	r1   latentslatent_timesteprA   r@   r;   r?   r    noiser   r   r   prepare_latents   s   


z(KandinskyImg2ImgPipeline.prepare_latentsNc              
   C   sZ  t |tr	t|nd}| j|ddddddd}|j}| j|dddj}	|	jd	 |jd	 krRt||	sR| j|	d d | jj	d d	f }
t
d
| jj	 d|
  ||}|j|}| j||d\}}|j|dd}|j|dd}|j|dd}|r(|d u rdg| }n;t|t|urtdt| dt| dt |tr|g}n|t|krtd| dt| d| d| d	|}| j|ddddddd}|j|}|j|}| j||d\}}|jd }|d|}||| |}|jd }|d|d}||| |d	}|j|dd}t||g}t||g}t||g}|||fS )Nr   
max_lengthM   Tpt)paddingrJ   
truncationreturn_attention_maskadd_special_tokensreturn_tensorslongest)rM   rQ   z\The following part of your input was truncated because CLIP can only handle sequences up to z	 tokens: )	input_idsattention_maskr   dim z?`negative_prompt` should be the same type to `prompt`, but got z != .z`negative_prompt`: z has batch size z, but `prompt`: zT. Please make sure that passed `negative_prompt` matches the batch size of `prompt`.)
isinstancelistr+   r   rT   rA   torchequalbatch_decodemodel_max_lengthloggerwarningrC   rU   r   repeat_interleavetype	TypeErrorstrrB   repeatviewcat)r1   promptr;   num_images_per_promptdo_classifier_free_guidancenegative_prompt
batch_sizetext_inputstext_input_idsuntruncated_idsremoved_text	text_maskprompt_embedstext_encoder_hidden_statesuncond_tokensuncond_inputuncond_text_input_idsuncond_text_masknegative_prompt_embeds!uncond_text_encoder_hidden_statesseq_lenr   r   r   _encode_prompt   s   
 $


	




z'KandinskyImg2ImgPipeline._encode_promptoriginal_samplesrH   r8   returnc           
      C   s   t jdddt jd}d| }t j|dd}|j|j|jd}||j}|| d	 }| }t|j	t|j	k rG|
d
}t|j	t|j	k s8d||  d	 }| }t|j	t|j	k rl|
d
}t|j	t|j	k s]|| ||  }	|	S )Ng-C6?g{Gz?i  )r@         ?r   rV   )r;   r@   g      ?rS   r   )r\   linspacefloat32cumprodrC   r;   r@   flattenr+   rA   	unsqueeze)
r1   r}   rH   r8   betasalphasalphas_cumprodsqrt_alpha_prodsqrt_one_minus_alpha_prodnoisy_samplesr   r   r   rE     s"   

z"KandinskyImg2ImgPipeline.add_noise   d   333333?      @r   pilTri   imageimage_embedsnegative_image_embedsrl   heightwidthr9   r:   guidance_scalerj   r?   output_typecallbackcallback_stepsreturn_dictc           %   	      s  t |trd}nt |trt|}n	tdt| j}|| }|
dk}|||||\}}}t |tr>tj	|dd}t |trJtj	|dd}|ri|j
|dd}|j
|dd}tj	||gddj|j|d}t |tsq|g}tdd |D std	d
d |D  dtj	 fdd|D dd}|j|j|d}j|d }|j
|dd}jj||d ||	|\}}tjjj|	 d }tj|g| |j|d}jjj}t j\ |||| f|j||j}t|D ]\}}|rt	|gd n|}||d}j||||ddd }|rL|j|jd dd\}} | d\}!}"|  d\}}#|!|
|"|!   }tj	||#gdd}t!jjdr\jjj"dv sh|j|jd dd\}}jj#||||dj$}|dur|| dkr|t%jdd }$||$|| t&rt'(  qjj)|ddd }*  |dvrtd| j+,||}|s|fS t-|dS )a  
        Function invoked when calling the pipeline for generation.

        Args:
            prompt (`str` or `list[str]`):
                The prompt or prompts to guide the image generation.
            image (`torch.Tensor`, `PIL.Image.Image`):
                `Image`, or tensor representing an image batch, that will be used as the starting point for the
                process.
            image_embeds (`torch.Tensor` or `list[torch.Tensor]`):
                The clip image embeddings for text prompt, that will be used to condition the image generation.
            negative_image_embeds (`torch.Tensor` or `list[torch.Tensor]`):
                The clip image embeddings for negative text prompt, will be used to condition the image generation.
            negative_prompt (`str` or `list[str]`, *optional*):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `guidance_scale` is less than `1`).
            height (`int`, *optional*, defaults to 512):
                The height in pixels of the generated image.
            width (`int`, *optional*, defaults to 512):
                The width in pixels of the generated image.
            num_inference_steps (`int`, *optional*, defaults to 100):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            strength (`float`, *optional*, defaults to 0.3):
                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
                will be used as a starting point, adding more noise to it the larger the `strength`. The number of
                denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
                be maximum and the denoising process will run for the full number of iterations specified in
                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
            guidance_scale (`float`, *optional*, defaults to 4.0):
                Guidance scale as defined in [Classifier-Free Diffusion
                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
                `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
                the text `prompt`, usually at the expense of lower image quality.
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            generator (`torch.Generator` or `list[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
                (`np.array`) or `"pt"` (`torch.Tensor`).
            callback (`Callable`, *optional*):
                A function that calls every `callback_steps` steps during inference. The function is called with the
                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function is called. If not specified, the callback is called at
                every step.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.

        Examples:

        Returns:
            [`~pipelines.ImagePipelineOutput`] or `tuple`
        r   z2`prompt` has to be of type `str` or `list` but is r   r   rV   )r@   r;   c                 s   s$    | ]}t |tjjtjfV  qd S N)rZ   PILImager\   Tensor.0ir   r   r   	<genexpr>  s   " z4KandinskyImg2ImgPipeline.__call__.<locals>.<genexpr>zInput is in incorrect format: c                 S   s   g | ]}t |qS r   )rc   r   r   r   r   
<listcomp>  s    z5KandinskyImg2ImgPipeline.__call__.<locals>.<listcomp>z:. Currently, we only support  PIL image and pytorch tensorc                    s   g | ]
}j | qS r   )r0   
preprocessr   r   r1   r   r   r   r     s    rF   )r;   r   )text_embedsr   F)sampletimestepencoder_hidden_statesadded_cond_kwargsr   variance_type)learnedlearned_range)r?   NorderT)force_not_quantizer   )rL   npr   zIOnly the output types `pt`, `pil` and `np` are supported not output_type=)images).rZ   re   r[   r+   rB   rc   _execution_devicer|   r\   rh   rb   rC   r@   allr   encoder    set_timestepsr>   r6   r,   num_train_timestepstensorr   in_channelsr   r.   rI   	enumerateprogress_barsplitrA   chunkhasattrr   stepprev_sampler*   XLA_AVAILABLExm	mark_stepdecodemaybe_free_model_hooksr0   postprocessr   )%r1   ri   r   r   r   rl   r   r   r9   r:   r   rj   r?   r   r   r   r   rm   r;   rk   rs   rt   _rF   timesteps_tensorrG   num_channels_latentsr   tlatent_model_inputr   
noise_predvariance_prednoise_pred_uncondnoise_pred_textvariance_pred_textstep_idxr   r   r   __call__)  s   
O






 




z!KandinskyImg2ImgPipeline.__call__r   )Nr   r   r   r   r   r   Nr   Nr   T) __name__
__module____qualname____doc__model_cpu_offload_seqr   r   r   r   r   r(   r>   rI   r|   r\   r   	IntTensorrE   no_gradr   EXAMPLE_DOC_STRINGre   r[   r   r   r6   float	Generatorr   boolr   __classcell__r   r   r3   r   r   ]   s    	
g

"	
r   )r   )$typingr   	PIL.Imager   r\   transformersr   r0   r   modelsr   r   
schedulersr   utilsr	   r
   r   utils.torch_utilsr   pipeline_utilsr   r   r   r   torch_xla.core.xla_modelcore	xla_modelr   r   
get_loggerr   r`   r   r   r   r   r   r   r   <module>   s&   

)
