o
    piE                     @   s   d dl mZmZmZmZ d dlZd dlmZ ddlm	Z	m
Z
 ddlmZmZ ddlmZmZ ddlmZ d	d
lmZmZ ddlmZ eeZdZdddZG dd deZdS )    )CallableListOptionalUnionN)XLMRobertaTokenizer   )UNet2DConditionModelVQModel)DDIMSchedulerDDPMScheduler)loggingreplace_example_docstring)randn_tensor   )DiffusionPipelineImagePipelineOutput   )MultilingualCLIPav  
    Examples:
        ```py
        >>> from diffusers import KandinskyPipeline, KandinskyPriorPipeline
        >>> import torch

        >>> pipe_prior = KandinskyPriorPipeline.from_pretrained("kandinsky-community/Kandinsky-2-1-prior")
        >>> pipe_prior.to("cuda")

        >>> prompt = "red cat, 4k photo"
        >>> out = pipe_prior(prompt)
        >>> image_emb = out.image_embeds
        >>> negative_image_emb = out.negative_image_embeds

        >>> pipe = KandinskyPipeline.from_pretrained("kandinsky-community/kandinsky-2-1")
        >>> pipe.to("cuda")

        >>> image = pipe(
        ...     prompt,
        ...     image_embeds=image_emb,
        ...     negative_image_embeds=negative_image_emb,
        ...     height=768,
        ...     width=768,
        ...     num_inference_steps=100,
        ... ).images

        >>> image[0].save("cat.png")
        ```
   c                 C   sX   | |d  }| |d  dkr|d7 }||d  }||d  dkr$|d7 }|| || fS )Nr   r   r    )hwscale_factornew_hnew_wr   r   n/home/ubuntu/SoloSpeech/.venv/lib/python3.10/site-packages/diffusers/pipelines/kandinsky/pipeline_kandinsky.pyget_new_h_wB   s   r   c                !       s4  e Zd ZdZdZdedededee	e
f def
 fdd	Zd
d Z	d&ddZe ee												d'deeee f deejeej f deejeej f deeeee f  dedededededeeejeej f  deej d ee d!eeeeejgdf  d"ed#efd$d%Z  ZS )(KandinskyPipelinea1  
    Pipeline for text-to-image generation using Kandinsky

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

    Args:
        text_encoder ([`MultilingualCLIP`]):
            Frozen text-encoder.
        tokenizer ([`XLMRobertaTokenizer`]):
            Tokenizer of class
        scheduler (Union[`DDIMScheduler`,`DDPMScheduler`]):
            A scheduler to be used in combination with `unet` to generate image latents.
        unet ([`UNet2DConditionModel`]):
            Conditional U-Net architecture to denoise the image embedding.
        movq ([`VQModel`]):
            MoVQ Decoder to generate the image from the latents.
    ztext_encoder->unet->movqtext_encoder	tokenizerunet	schedulermovqc                    s:   t    | j|||||d dt| jjjd  | _d S )N)r   r   r    r!   r"   r   r   )super__init__register_moduleslenr"   configblock_out_channelsmovq_scale_factor)selfr   r   r    r!   r"   	__class__r   r   r$   b   s   
zKandinskyPipeline.__init__c                 C   sR   |d u rt ||||d}n|j|krtd|j d| ||}||j }|S )N)	generatordevicedtypezUnexpected latents shape, got z, expected )r   shape
ValueErrortoinit_noise_sigma)r*   r0   r/   r.   r-   latentsr!   r   r   r   prepare_latentsv   s   


z!KandinskyPipeline.prepare_latentsNc              
   C   sZ  t |tr	t|nd}| j|ddddddd}|j}| j|dddj}	|	jd	 |jd	 krRt||	sR| j|	d d | jj	d d	f }
t
d
| jj	 d|
  ||}|j|}| j||d\}}|j|dd}|j|dd}|j|dd}|r(|d u rdg| }n;t|t|urtdt| dt| dt |tr|g}n|t|krtd| dt| d| d| d	|}| j|ddddddd}|j|}|j|}| j||d\}}|jd }|d|}||| |}|jd }|d|d}||| |d	}|j|dd}t||g}t||g}t||g}|||fS )Nr   
max_lengthTM   pt)padding
truncationr6   return_attention_maskadd_special_tokensreturn_tensorslongest)r9   r=   z\The following part of your input was truncated because CLIP can only handle sequences up to z	 tokens: )	input_idsattention_maskr   dim z?`negative_prompt` should be the same type to `prompt`, but got z != .z`negative_prompt`: z has batch size z, but `prompt`: zT. Please make sure that passed `negative_prompt` matches the batch size of `prompt`.)r9   r6   r:   r;   r<   r=   )
isinstancelistr&   r   r@   r0   torchequalbatch_decodemodel_max_lengthloggerwarningr2   rA   r   repeat_interleavetype	TypeErrorstrr1   repeatviewcat)r*   promptr.   num_images_per_promptdo_classifier_free_guidancenegative_prompt
batch_sizetext_inputstext_input_idsuntruncated_idsremoved_text	text_maskprompt_embedstext_encoder_hidden_statesuncond_tokensuncond_inputuncond_text_input_idsuncond_text_masknegative_prompt_embeds!uncond_text_encoder_hidden_statesseq_lenr   r   r   _encode_prompt   s   
 $


	




z KandinskyPipeline._encode_prompt   d         @r   pilTrU   image_embedsnegative_image_embedsrX   heightwidthnum_inference_stepsguidance_scalerV   r-   r4   output_typecallbackcallback_stepsreturn_dictc           #      C   s  t |trd}nt |trt|}n	tdt| | j}||	 }|dk}| |||	||\}}}t |tr>tj	|dd}t |trJtj	|dd}|ri|j
|	dd}|j
|	dd}tj	||gddj|j|d}| jj||d | jj}| jjj}t||| j\}}| ||||f|j||
|| j}t| |D ]\}}|rt	|gd n|}||d	}| j||||d
dd }|r|j|jd dd\}}|d\}}|d\}} ||||   }tj	|| gdd}t| jjdr| jjjdv s|j|jd dd\}}| jj||||
dj}|dur)|| dkr)|t| jdd }!||!|| q| j j!|ddd }"| "  |dvrDtd| |dv rc|"d d }"|"#dd}"|"$ %dddd& ' }"|dkrm| (|"}"|ss|"fS t)|"dS )a  
        Function invoked when calling the pipeline for generation.

        Args:
            prompt (`str` or `List[str]`):
                The prompt or prompts to guide the image generation.
            image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
                The clip image embeddings for text prompt, that will be used to condition the image generation.
            negative_image_embeds (`torch.Tensor` or `List[torch.Tensor]`):
                The clip image embeddings for negative text prompt, will be used to condition the image generation.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `guidance_scale` is less than `1`).
            height (`int`, *optional*, defaults to 512):
                The height in pixels of the generated image.
            width (`int`, *optional*, defaults to 512):
                The width in pixels of the generated image.
            num_inference_steps (`int`, *optional*, defaults to 100):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            guidance_scale (`float`, *optional*, defaults to 4.0):
                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                `guidance_scale` is defined as `w` of equation 2. of [Imagen
                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                usually at the expense of lower image quality.
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will ge generated by sampling using the supplied random `generator`.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
                (`np.array`) or `"pt"` (`torch.Tensor`).
            callback (`Callable`, *optional*):
                A function that calls every `callback_steps` steps during inference. The function is called with the
                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function is called. If not specified, the callback is called at
                every step.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.

        Examples:

        Returns:
            [`~pipelines.ImagePipelineOutput`] or `tuple`
        r   z2`prompt` has to be of type `str` or `list` but is g      ?r   rB   )r/   r.   )r.   r   )text_embedsrm   F)sampletimestepencoder_hidden_statesadded_cond_kwargsrv   variance_type)learnedlearned_range)r-   NorderT)force_not_quantizerx   )r8   nprl   zIOnly the output types `pt`, `pil` and `np` are supported not output_type=)r   rl   g      ?r   rl   )images)*rF   rQ   rG   r&   r1   rO   _execution_devicerh   rH   rT   rN   r2   r/   r!   set_timesteps	timestepsr    r'   in_channelsr   r)   r5   	enumerateprogress_barsplitr0   chunkhasattrr|   stepprev_samplegetattrr"   decodemaybe_free_model_hooksclampcpupermutefloatnumpynumpy_to_pilr   )#r*   rU   rm   rn   rX   ro   rp   rq   rr   rV   r-   r4   rs   rt   ru   rv   rY   r.   rW   r_   r`   _timesteps_tensornum_channels_latentsitlatent_model_inputr{   
noise_predvariance_prednoise_pred_uncondnoise_pred_textvariance_pred_textstep_idximager   r   r   __call__   s   
I







	





zKandinskyPipeline.__call__)N)Nri   ri   rj   rk   r   NNrl   Nr   T)__name__
__module____qualname____doc__model_cpu_offload_seqr   r   r   r   r
   r   r	   r$   r5   rh   rH   no_gradr   EXAMPLE_DOC_STRINGrQ   r   Tensorr   intr   	Generatorr   boolr   __classcell__r   r   r+   r   r   L   s|    

f	
r   )r   )typingr   r   r   r   rH   transformersr   modelsr   r	   
schedulersr
   r   utilsr   r   utils.torch_utilsr   pipeline_utilsr   r   r   r   
get_loggerr   rL   r   r   r   r   r   r   r   <module>   s   


