o
    piU                     @   s   d dl mZmZmZmZ d dlZd dlZd dl	Z	d dlm
Z
 d dlmZ ddlmZmZ ddlmZ ddlmZmZ dd	lmZ d
dlmZmZ ddlmZ eeZdZdddZ dddZ!G dd deZ"dS )    )CallableListOptionalUnionN)Image)XLMRobertaTokenizer   )UNet2DConditionModelVQModel)DDIMScheduler)loggingreplace_example_docstring)randn_tensor   )DiffusionPipelineImagePipelineOutput   )MultilingualCLIPa  
    Examples:
        ```py
        >>> from diffusers import KandinskyImg2ImgPipeline, KandinskyPriorPipeline
        >>> from diffusers.utils import load_image
        >>> import torch

        >>> pipe_prior = KandinskyPriorPipeline.from_pretrained(
        ...     "kandinsky-community/kandinsky-2-1-prior", torch_dtype=torch.float16
        ... )
        >>> pipe_prior.to("cuda")

        >>> prompt = "A red cartoon frog, 4k"
        >>> image_emb, zero_image_emb = pipe_prior(prompt, return_dict=False)

        >>> pipe = KandinskyImg2ImgPipeline.from_pretrained(
        ...     "kandinsky-community/kandinsky-2-1", torch_dtype=torch.float16
        ... )
        >>> pipe.to("cuda")

        >>> init_image = load_image(
        ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
        ...     "/kandinsky/frog.png"
        ... )

        >>> image = pipe(
        ...     prompt,
        ...     image=init_image,
        ...     image_embeds=image_emb,
        ...     negative_image_embeds=zero_image_emb,
        ...     height=768,
        ...     width=768,
        ...     num_inference_steps=100,
        ...     strength=0.2,
        ... ).images

        >>> image[0].save("red_frog.png")
        ```
   c                 C   sX   | |d  }| |d  dkr|d7 }||d  }||d  dkr$|d7 }|| || fS )Nr   r   r    )hwscale_factornew_hnew_wr   r   v/home/ubuntu/SoloSpeech/.venv/lib/python3.10/site-packages/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.pyget_new_h_wO   s   r      c                 C   s^   | j ||ftjdd} t| d}|tjd d }t|g d}t	
|d}|S )Nr   )resamplereducing_gapRGBg     _@)r   r   r   r   )resizer   BICUBICnparrayconvertastypefloat32	transposetorch
from_numpy	unsqueeze)	pil_imager   r   arrimager   r   r   prepare_imageY   s   r/   c                #       sX  e Zd ZdZdZdedededede	f
 fdd	Z
d
d Zdd Z	d0ddZdejdejdejdejfddZe ee												d1deeee f deejejjeej eejj f d ejd!ejd"eeeee f  d#ed$ed%ed&ed'ed(ed)eeejeej f  d*ee d+eeeeejgdf  d,ed-ef d.d/Z   Z!S )2KandinskyImg2ImgPipelinea  
    Pipeline for image-to-image generation using Kandinsky

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

    Args:
        text_encoder ([`MultilingualCLIP`]):
            Frozen text-encoder.
        tokenizer ([`XLMRobertaTokenizer`]):
            Tokenizer of class
        scheduler ([`DDIMScheduler`]):
            A scheduler to be used in combination with `unet` to generate image latents.
        unet ([`UNet2DConditionModel`]):
            Conditional U-Net architecture to denoise the image embedding.
        movq ([`VQModel`]):
            MoVQ image encoder and decoder
    ztext_encoder->unet->movqtext_encodermovq	tokenizerunet	schedulerc                    s:   t    | j|||||d dt| jjjd  | _d S )N)r1   r3   r4   r5   r2   r   r   )super__init__register_moduleslenr2   configblock_out_channelsmovq_scale_factor)selfr1   r2   r3   r4   r5   	__class__r   r   r7   x   s   
z!KandinskyImg2ImgPipeline.__init__c                 C   s<   t t|| |}t|| d}| jj|d  }||| fS )Nr   )minintmaxr5   	timesteps)r=   num_inference_stepsstrengthdeviceinit_timestept_startrC   r   r   r   get_timesteps   s   z&KandinskyImg2ImgPipeline.get_timestepsc           	      C   sv   |d u rt ||||d}n|j|krtd|j d| ||}||j }|j}t ||||d}| |||}|S )N)	generatorrF   dtypezUnexpected latents shape, got z, expected )r   shape
ValueErrortoinit_noise_sigma	add_noise)	r=   latentslatent_timesteprL   rK   rF   rJ   r5   noiser   r   r   prepare_latents   s   


z(KandinskyImg2ImgPipeline.prepare_latentsNc              
   C   sZ  t |tr	t|nd}| j|ddddddd}|j}| j|dddj}	|	jd	 |jd	 krRt||	sR| j|	d d | jj	d d	f }
t
d
| jj	 d|
  ||}|j|}| j||d\}}|j|dd}|j|dd}|j|dd}|r(|d u rdg| }n;t|t|urtdt| dt| dt |tr|g}n|t|krtd| dt| d| d| d	|}| j|ddddddd}|j|}|j|}| j||d\}}|jd }|d|}||| |}|jd }|d|d}||| |d	}|j|dd}t||g}t||g}t||g}|||fS )Nr   
max_lengthM   Tpt)paddingrU   
truncationreturn_attention_maskadd_special_tokensreturn_tensorslongest)rX   r\   z\The following part of your input was truncated because CLIP can only handle sequences up to z	 tokens: )	input_idsattention_maskr   dim z?`negative_prompt` should be the same type to `prompt`, but got z != .z`negative_prompt`: z has batch size z, but `prompt`: zT. Please make sure that passed `negative_prompt` matches the batch size of `prompt`.)
isinstancelistr9   r3   r_   rL   r)   equalbatch_decodemodel_max_lengthloggerwarningrN   r`   r1   repeat_interleavetype	TypeErrorstrrM   repeatviewcat)r=   promptrF   num_images_per_promptdo_classifier_free_guidancenegative_prompt
batch_sizetext_inputstext_input_idsuntruncated_idsremoved_text	text_maskprompt_embedstext_encoder_hidden_statesuncond_tokensuncond_inputuncond_text_input_idsuncond_text_masknegative_prompt_embeds!uncond_text_encoder_hidden_statesseq_lenr   r   r   _encode_prompt   s   
 $


	




z'KandinskyImg2ImgPipeline._encode_promptoriginal_samplesrS   rC   returnc           
      C   s   t jdddt jd}d| }t j|dd}|j|j|jd}||j}|| d	 }| }t|j	t|j	k rG|
d
}t|j	t|j	k s8d||  d	 }| }t|j	t|j	k rl|
d
}t|j	t|j	k s]|| ||  }	|	S )Ng-C6?g{Gz?i  )rK         ?r   ra   )rF   rK         ?r^   r   )r)   linspacer'   cumprodrN   rF   rK   flattenr9   rL   r+   )
r=   r   rS   rC   betasalphasalphas_cumprodsqrt_alpha_prodsqrt_one_minus_alpha_prodnoisy_samplesr   r   r   rP     s"   

z"KandinskyImg2ImgPipeline.add_noiser   d   333333?      @r   pilTrs   r.   image_embedsnegative_image_embedsrv   heightwidthrD   rE   guidance_scalert   rJ   output_typecallbackcallback_stepsreturn_dictc           %   	      s  t |trd}nt |trt|}n	tdt| | j}|| }|
dk}| |||||\}}}t |tr>tj	|dd}t |trJtj	|dd}|ri|j
|dd}|j
|dd}tj	||gddj|j|d}t |tsq|g}tdd |D std	d
d |D  dtj	 fdd|D dd}|j|j|d}| j|d }|j
|dd}| jj||d | ||	|\}}t| jjj|	 d }tj|g| |j|d}| jjj}t | j\ | |||| f|j||| j}t| |D ]\}}|r
t	|gd n|}||d}| j||||ddd }|rK|j|jd dd\}} | d\}!}"|  d\}}#|!|
|"|!   }tj	||#gdd}t!| jjdr[| jjj"dv sg|j|jd dd\}}| jj#||||dj$}|dur|| dkr|t%| jdd }$||$|| q| jj&|ddd }| '  |dvrtd| |dv r|d d }|(dd}|) *ddd d+ , }|d!kr| -|}|s|fS t.|d"S )#a  
        Function invoked when calling the pipeline for generation.

        Args:
            prompt (`str` or `List[str]`):
                The prompt or prompts to guide the image generation.
            image (`torch.Tensor`, `PIL.Image.Image`):
                `Image`, or tensor representing an image batch, that will be used as the starting point for the
                process.
            image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
                The clip image embeddings for text prompt, that will be used to condition the image generation.
            negative_image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
                The clip image embeddings for negative text prompt, will be used to condition the image generation.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `guidance_scale` is less than `1`).
            height (`int`, *optional*, defaults to 512):
                The height in pixels of the generated image.
            width (`int`, *optional*, defaults to 512):
                The width in pixels of the generated image.
            num_inference_steps (`int`, *optional*, defaults to 100):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            strength (`float`, *optional*, defaults to 0.3):
                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
                will be used as a starting point, adding more noise to it the larger the `strength`. The number of
                denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
                be maximum and the denoising process will run for the full number of iterations specified in
                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
            guidance_scale (`float`, *optional*, defaults to 4.0):
                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                `guidance_scale` is defined as `w` of equation 2. of [Imagen
                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                usually at the expense of lower image quality.
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
                (`np.array`) or `"pt"` (`torch.Tensor`).
            callback (`Callable`, *optional*):
                A function that calls every `callback_steps` steps during inference. The function is called with the
                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function is called. If not specified, the callback is called at
                every step.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.

        Examples:

        Returns:
            [`~pipelines.ImagePipelineOutput`] or `tuple`
        r   z2`prompt` has to be of type `str` or `list` but is r   r   ra   )rK   rF   c                 s   s$    | ]}t |tjjtjfV  qd S N)re   PILr   r)   Tensor.0ir   r   r   	<genexpr>  s   " z4KandinskyImg2ImgPipeline.__call__.<locals>.<genexpr>zInput is in incorrect format: c                 S   s   g | ]}t |qS r   )rm   r   r   r   r   
<listcomp>  s    z5KandinskyImg2ImgPipeline.__call__.<locals>.<listcomp>z:. Currently, we only support  PIL image and pytorch tensorc                    s   g | ]}t | qS r   )r/   r   r   r   r   r   r     s    rQ   )rF   r   )text_embedsr   F)sampletimestepencoder_hidden_statesadded_cond_kwargsr   variance_type)learnedlearned_range)rJ   NorderT)force_not_quantizer   )rW   r#   r   zIOnly the output types `pt`, `pil` and `np` are supported not output_type=)r#   r   r   r   r   )images)/re   ro   rf   r9   rM   rm   _execution_devicer   r)   rr   rl   rN   rK   allr2   encoder5   set_timestepsrI   rA   r:   num_train_timestepstensorr4   in_channelsr   r<   rT   	enumerateprogress_barsplitrL   chunkhasattrr   stepprev_samplegetattrdecodemaybe_free_model_hooksclampcpupermutefloatnumpynumpy_to_pilr   )%r=   rs   r.   r   r   rv   r   r   rD   rE   r   rt   rJ   r   r   r   r   rw   rF   ru   r}   r~   _rQ   timesteps_tensorrR   num_channels_latentsr   tlatent_model_inputr   
noise_predvariance_prednoise_pred_uncondnoise_pred_textvariance_pred_textstep_idxr   r   r   __call__%  s   
O














z!KandinskyImg2ImgPipeline.__call__r   )Nr   r   r   r   r   r   Nr   Nr   T)"__name__
__module____qualname____doc__model_cpu_offload_seqr   r
   r   r	   r   r7   rI   rT   r   r)   r   	IntTensorrP   no_gradr   EXAMPLE_DOC_STRINGr   ro   r   r   r   r   rA   r   	Generatorr   boolr   __classcell__r   r   r>   r   r0   b   s    	
g
"	
r0   )r   )r   r   )#typingr   r   r   r   r   r#   	PIL.Imager   r)   r   transformersr   modelsr	   r
   
schedulersr   utilsr   r   utils.torch_utilsr   pipeline_utilsr   r   r1   r   
get_loggerr   rj   r   r   r/   r0   r   r   r   r   <module>   s"   

)

	