o
    ۷iE                     @   s   d dl mZ d dlZd dlmZ ddlmZmZ ddlm	Z	m
Z
 ddlmZmZmZ ddlmZ d	d
lmZmZ ddlmZ e rMd dlm  mZ dZndZeeZdZdddZG dd deZ dS )    )CallableN)XLMRobertaTokenizer   )UNet2DConditionModelVQModel)DDIMSchedulerDDPMScheduler)is_torch_xla_availableloggingreplace_example_docstring)randn_tensor   )DiffusionPipelineImagePipelineOutput   )MultilingualCLIPTFav  
    Examples:
        ```py
        >>> from diffusers import KandinskyPipeline, KandinskyPriorPipeline
        >>> import torch

        >>> pipe_prior = KandinskyPriorPipeline.from_pretrained("kandinsky-community/Kandinsky-2-1-prior")
        >>> pipe_prior.to("cuda")

        >>> prompt = "red cat, 4k photo"
        >>> out = pipe_prior(prompt)
        >>> image_emb = out.image_embeds
        >>> negative_image_emb = out.negative_image_embeds

        >>> pipe = KandinskyPipeline.from_pretrained("kandinsky-community/kandinsky-2-1")
        >>> pipe.to("cuda")

        >>> image = pipe(
        ...     prompt,
        ...     image_embeds=image_emb,
        ...     negative_image_embeds=negative_image_emb,
        ...     height=768,
        ...     width=768,
        ...     num_inference_steps=100,
        ... ).images

        >>> image[0].save("cat.png")
        ```
   c                 C   sX   | |d  }| |d  dkr|d7 }||d  }||d  dkr$|d7 }|| || fS )Nr   r   r    )hwscale_factornew_hnew_wr   r   f/home/ubuntu/vllm_env/lib/python3.10/site-packages/diffusers/pipelines/kandinsky/pipeline_kandinsky.pyget_new_h_wG   s   r   c                !       s  e Zd ZdZdZdedededee	B de
f
 fdd	Zd
d Z	d&ddZe ee												d'deee B dejeej B dejeej B deee B dB dedededededejeej B dB dejdB d edB d!eeeejgdf dB d"ed#efd$d%Z  ZS )(KandinskyPipelinea,  
    Pipeline for text-to-image generation using Kandinsky

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

    Args:
        text_encoder ([`MultilingualCLIP`]):
            Frozen text-encoder.
        tokenizer ([`XLMRobertaTokenizer`]):
            Tokenizer of class
        scheduler (`DDIMScheduler` | `DDPMScheduler`):
            A scheduler to be used in combination with `unet` to generate image latents.
        unet ([`UNet2DConditionModel`]):
            Conditional U-Net architecture to denoise the image embedding.
        movq ([`VQModel`]):
            MoVQ Decoder to generate the image from the latents.
    ztext_encoder->unet->movqtext_encoder	tokenizerunet	schedulermovqc                    s:   t    | j|||||d dt| jjjd  | _d S )N)r   r   r   r   r    r   r   )super__init__register_moduleslenr    configblock_out_channelsmovq_scale_factor)selfr   r   r   r   r    	__class__r   r   r"   g   s   
zKandinskyPipeline.__init__c                 C   sR   |d u rt ||||d}n|j|krtd|j d| ||}||j }|S )N)	generatordevicedtypezUnexpected latents shape, got z, expected )r   shape
ValueErrortoinit_noise_sigma)r(   r.   r-   r,   r+   latentsr   r   r   r   prepare_latents{   s   


z!KandinskyPipeline.prepare_latentsNc              
   C   sZ  t |tr	t|nd}| j|ddddddd}|j}| j|dddj}	|	jd	 |jd	 krRt||	sR| j|	d d | jj	d d	f }
t
d
| jj	 d|
  ||}|j|}| j||d\}}|j|dd}|j|dd}|j|dd}|r(|d u rdg| }n;t|t|urtdt| dt| dt |tr|g}n|t|krtd| dt| d| d| d	|}| j|ddddddd}|j|}|j|}| j||d\}}|jd }|d|}||| |}|jd }|d|d}||| |d	}|j|dd}t||g}t||g}t||g}|||fS )Nr   
max_lengthTM   pt)padding
truncationr4   return_attention_maskadd_special_tokensreturn_tensorslongest)r7   r;   z\The following part of your input was truncated because CLIP can only handle sequences up to z	 tokens: )	input_idsattention_maskr   dim z?`negative_prompt` should be the same type to `prompt`, but got z != .z`negative_prompt`: z has batch size z, but `prompt`: zT. Please make sure that passed `negative_prompt` matches the batch size of `prompt`.)r7   r4   r8   r9   r:   r;   )
isinstancelistr$   r   r>   r.   torchequalbatch_decodemodel_max_lengthloggerwarningr0   r?   r   repeat_interleavetype	TypeErrorstrr/   repeatviewcat)r(   promptr,   num_images_per_promptdo_classifier_free_guidancenegative_prompt
batch_sizetext_inputstext_input_idsuntruncated_idsremoved_text	text_maskprompt_embedstext_encoder_hidden_statesuncond_tokensuncond_inputuncond_text_input_idsuncond_text_masknegative_prompt_embeds!uncond_text_encoder_hidden_statesseq_lenr   r   r   _encode_prompt   s   
 $


	




z KandinskyPipeline._encode_prompt   d         @r   pilTrS   image_embedsnegative_image_embedsrV   heightwidthnum_inference_stepsguidance_scalerT   r+   r2   output_typecallbackcallback_stepsreturn_dictc           #      C   s  t |trd}nt |trt|}n	tdt| | j}||	 }|dk}| |||	||\}}}t |tr>tj	|dd}t |trJtj	|dd}|ri|j
|	dd}|j
|	dd}tj	||gddj|j|d}| jj||d | jj}| jjj}t||| j\}}| ||||f|j||
|| j}t| |D ]\}}|rt	|gd n|}||d	}| j||||d
dd }|r|j|jd dd\}}|d\}}|d\}} ||||   }tj	|| gdd}t| jjdr| jjjdv s|j|jd dd\}}| jj||||
dj}|dur)|| dkr)|t| jdd }!||!|| t r0t!"  q| j#j$|ddd }"| %  |dvrKtd| |dv rj|"d d }"|"&dd}"|"' (dddd) * }"|dkrt| +|"}"|sz|"fS t,|"dS )a  
        Function invoked when calling the pipeline for generation.

        Args:
            prompt (`str` or `list[str]`):
                The prompt or prompts to guide the image generation.
            image_embeds (`torch.Tensor` or `list[torch.Tensor]`):
                The clip image embeddings for text prompt, that will be used to condition the image generation.
            negative_image_embeds (`torch.Tensor` or `list[torch.Tensor]`):
                The clip image embeddings for negative text prompt, will be used to condition the image generation.
            negative_prompt (`str` or `list[str]`, *optional*):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `guidance_scale` is less than `1`).
            height (`int`, *optional*, defaults to 512):
                The height in pixels of the generated image.
            width (`int`, *optional*, defaults to 512):
                The width in pixels of the generated image.
            num_inference_steps (`int`, *optional*, defaults to 100):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            guidance_scale (`float`, *optional*, defaults to 4.0):
                Guidance scale as defined in [Classifier-Free Diffusion
                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
                `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
                the text `prompt`, usually at the expense of lower image quality.
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            generator (`torch.Generator` or `list[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will be generated by sampling using the supplied random `generator`.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
                (`np.array`) or `"pt"` (`torch.Tensor`).
            callback (`Callable`, *optional*):
                A function that calls every `callback_steps` steps during inference. The function is called with the
                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function is called. If not specified, the callback is called at
                every step.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.

        Examples:

        Returns:
            [`~pipelines.ImagePipelineOutput`] or `tuple`
        r   z2`prompt` has to be of type `str` or `list` but is g      ?r   r@   )r-   r,   )r,   r   )text_embedsrk   F)sampletimestepencoder_hidden_statesadded_cond_kwargsrt   variance_type)learnedlearned_range)r+   NorderT)force_not_quantizerv   )r6   nprj   zIOnly the output types `pt`, `pil` and `np` are supported not output_type=)r   rj   g      ?r   rj   )images)-rD   rO   rE   r$   r/   rM   _execution_devicerf   rF   rR   rL   r0   r-   r   set_timesteps	timestepsr   r%   in_channelsr   r'   r3   	enumerateprogress_barsplitr.   chunkhasattrrz   stepprev_samplegetattrXLA_AVAILABLExm	mark_stepr    decodemaybe_free_model_hooksclampcpupermutefloatnumpynumpy_to_pilr   )#r(   rS   rk   rl   rV   rm   rn   ro   rp   rT   r+   r2   rq   rr   rs   rt   rW   r,   rU   r]   r^   _timesteps_tensornum_channels_latentsitlatent_model_inputry   
noise_predvariance_prednoise_pred_uncondnoise_pred_textvariance_pred_textstep_idximager   r   r   __call__   s   
I







	





zKandinskyPipeline.__call__)N)Nrg   rg   rh   ri   r   NNrj   Nr   T)__name__
__module____qualname____doc__model_cpu_offload_seqr   r   r   r   r   r   r"   r3   rf   rF   no_gradr   EXAMPLE_DOC_STRINGrO   rE   Tensorintr   	Generatorr   boolr   __classcell__r   r   r)   r   r   Q   s|    
f
	
r   )r   )!typingr   rF   transformersr   modelsr   r   
schedulersr   r   utilsr	   r
   r   utils.torch_utilsr   pipeline_utilsr   r   r   r   torch_xla.core.xla_modelcore	xla_modelr   r   
get_loggerr   rJ   r   r   r   r   r   r   r   <module>   s"   


