from typing import Callable, Dict, List, Optional, Union

import torch
from transformers import CLIPTextModel, CLIPTokenizer

from ...schedulers import DDPMWuerstchenScheduler
from ...utils import deprecate, replace_example_docstring
from ..pipeline_utils import DiffusionPipeline
from .modeling_paella_vq_model import PaellaVQModel
from .modeling_wuerstchen_diffnext import WuerstchenDiffNeXt
from .modeling_wuerstchen_prior import WuerstchenPrior
from .pipeline_wuerstchen import WuerstchenDecoderPipeline
from .pipeline_wuerstchen_prior import WuerstchenPriorPipeline


TEXT2IMAGE_EXAMPLE_DOC_STRING = """
    Examples:
        ```py
        >>> import torch
        >>> from diffusers import WuerstchenCombinedPipeline

        >>> pipe = WuerstchenCombinedPipeline.from_pretrained("warp-ai/Wuerstchen", torch_dtype=torch.float16).to(
        ...     "cuda"
        ... )
        >>> prompt = "an image of a shiba inu, donning a spacesuit and helmet"
        >>> images = pipe(prompt=prompt)
        ```
"""


class WuerstchenCombinedPipeline(DiffusionPipeline):
    """
    Combined Pipeline for text-to-image generation using Wuerstchen

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
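
    A minimal sketch of those generic methods in the context of this pipeline (the checkpoint name follows the
    module-level example; the target device and output directory are placeholders):

    ```py
    >>> from diffusers import WuerstchenCombinedPipeline

    >>> pipe = WuerstchenCombinedPipeline.from_pretrained("warp-ai/Wuerstchen")  # download / load
    >>> pipe = pipe.to("cuda")  # run on a particular device
    >>> pipe.save_pretrained("./wuerstchen-combined")  # save all sub-models and configs locally
    ```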

    Args:
        tokenizer (`CLIPTokenizer`):
            The decoder tokenizer to be used for text inputs.
        text_encoder (`CLIPTextModel`):
            The decoder text encoder to be used for text inputs.
        decoder (`WuerstchenDiffNeXt`):
            The decoder model to be used for decoder image generation pipeline.
        scheduler (`DDPMWuerstchenScheduler`):
            The scheduler to be used for decoder image generation pipeline.
        vqgan (`PaellaVQModel`):
            The VQGAN model to be used for decoder image generation pipeline.
        prior_tokenizer (`CLIPTokenizer`):
            The prior tokenizer to be used for text inputs.
        prior_text_encoder (`CLIPTextModel`):
            The prior text encoder to be used for text inputs.
        prior_prior (`WuerstchenPrior`):
            The prior model to be used for prior pipeline.
        prior_scheduler (`DDPMWuerstchenScheduler`):
            The scheduler to be used for prior pipeline.
    """

    _load_connected_pipes = True

    def __init__(
        self,
        tokenizer: CLIPTokenizer,
        text_encoder: CLIPTextModel,
        decoder: WuerstchenDiffNeXt,
        scheduler: DDPMWuerstchenScheduler,
        vqgan: PaellaVQModel,
        prior_tokenizer: CLIPTokenizer,
        prior_text_encoder: CLIPTextModel,
        prior_prior: WuerstchenPrior,
        prior_scheduler: DDPMWuerstchenScheduler,
    ):
        super().__init__()

        # Register every sub-model so that `save_pretrained`/`from_pretrained` round-trip the combined pipeline.
        self.register_modules(
            text_encoder=text_encoder,
            tokenizer=tokenizer,
            decoder=decoder,
            scheduler=scheduler,
            vqgan=vqgan,
            prior_prior=prior_prior,
            prior_text_encoder=prior_text_encoder,
            prior_tokenizer=prior_tokenizer,
            prior_scheduler=prior_scheduler,
        )
        # The combined pipeline wraps a prior pipeline (prompt -> image embeddings) and a decoder pipeline
        # (image embeddings -> pixels), both built from the registered components.
        self.prior_pipe = WuerstchenPriorPipeline(
            prior=prior_prior,
            text_encoder=prior_text_encoder,
            tokenizer=prior_tokenizer,
            scheduler=prior_scheduler,
        )
        self.decoder_pipe = WuerstchenDecoderPipeline(
            text_encoder=text_encoder,
            tokenizer=tokenizer,
            decoder=decoder,
            scheduler=scheduler,
            vqgan=vqgan,
        )

    def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None):
        self.decoder_pipe.enable_xformers_memory_efficient_attention(attention_op)

    def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
        r"""
        Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
        to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
        method is called, and the model remains on the GPU until the next model runs. Memory savings are lower than
        with `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the sub-models.
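
        A minimal usage sketch (assuming `accelerate` is installed and a CUDA device is available; the
        checkpoint name follows the module-level example):

        ```py
        >>> import torch
        >>> from diffusers import WuerstchenCombinedPipeline

        >>> pipe = WuerstchenCombinedPipeline.from_pretrained("warp-ai/Wuerstchen", torch_dtype=torch.float16)
        >>> pipe.enable_model_cpu_offload()  # offloads both the prior and the decoder sub-pipelines
        >>> images = pipe(prompt="an image of a shiba inu, donning a spacesuit and helmet").images
        ```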
        """
        self.prior_pipe.enable_model_cpu_offload(gpu_id=gpu_id, device=device)
        self.decoder_pipe.enable_model_cpu_offload(gpu_id=gpu_id, device=device)

    def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
        r"""
        Offloads all models (the prior, the decoder, both text encoders, and the VQGAN state dicts) to CPU using 🤗
        Accelerate, significantly reducing memory usage. Models are moved to a `torch.device('meta')` and loaded on a
        GPU only when their specific submodule's `forward` method is called. Offloading happens on a submodule basis.
        Memory savings are higher than using `enable_model_cpu_offload`, but performance is lower.
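
        A minimal sketch of checking peak GPU memory with sequential offloading enabled (same assumptions as
        the `enable_model_cpu_offload` example above):

        ```py
        >>> import torch
        >>> from diffusers import WuerstchenCombinedPipeline

        >>> pipe = WuerstchenCombinedPipeline.from_pretrained("warp-ai/Wuerstchen", torch_dtype=torch.float16)
        >>> pipe.enable_sequential_cpu_offload()
        >>> _ = pipe(prompt="an image of a shiba inu, donning a spacesuit and helmet")
        >>> print(f"{torch.cuda.max_memory_allocated() / 1024**3:.1f} GiB")  # peak GPU memory used
        ```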
        """
        self.prior_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)
        self.decoder_pipe.enable_sequential_cpu_offload(gpu_id=gpu_id, device=device)

    def progress_bar(self, iterable=None, total=None):
        self.prior_pipe.progress_bar(iterable=iterable, total=total)
        self.decoder_pipe.progress_bar(iterable=iterable, total=total)

    def set_progress_bar_config(self, **kwargs):
        self.prior_pipe.set_progress_bar_config(**kwargs)
        self.decoder_pipe.set_progress_bar_config(**kwargs)

    @torch.no_grad()
    @replace_example_docstring(TEXT2IMAGE_EXAMPLE_DOC_STRING)
    def __call__(
        self,
        prompt: Optional[Union[str, List[str]]] = None,
        height: int = 512,
        width: int = 512,
        prior_num_inference_steps: int = 60,
        prior_timesteps: Optional[List[float]] = None,
        prior_guidance_scale: float = 4.0,
        num_inference_steps: int = 12,
        decoder_timesteps: Optional[List[float]] = None,
        decoder_guidance_scale: float = 0.0,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        prompt_embeds: Optional[torch.Tensor] = None,
        negative_prompt_embeds: Optional[torch.Tensor] = None,
        num_images_per_prompt: int = 1,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.Tensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        prior_callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
        prior_callback_on_step_end_tensor_inputs: List[str] = ["latents"],
        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
        **kwargs,
    ):
        """
        Function invoked when calling the pipeline for generation.

        Args:
            prompt (`str` or `List[str]`):
                The prompt or prompts to guide the image generation for the prior and decoder.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `guidance_scale` is less than `1`).
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings for the prior. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings for the prior. Can be used to easily tweak text inputs, *e.g.*
                prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt`
                input argument.
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            height (`int`, *optional*, defaults to 512):
                The height in pixels of the generated image.
            width (`int`, *optional*, defaults to 512):
                The width in pixels of the generated image.
            prior_guidance_scale (`float`, *optional*, defaults to 4.0):
                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                `prior_guidance_scale` is defined as `w` of equation 2. of [Imagen
                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting
                `prior_guidance_scale > 1`. A higher guidance scale encourages the model to generate images closely linked
                to the text `prompt`, usually at the expense of lower image quality.
            prior_num_inference_steps (`Union[int, Dict[float, int]]`, *optional*, defaults to 60):
                The number of prior denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference. For more specific timestep spacing, you can pass customized
                `prior_timesteps`
            num_inference_steps (`int`, *optional*, defaults to 12):
                The number of decoder denoising steps. More denoising steps usually lead to a higher quality image at
                the expense of slower inference. For more specific timestep spacing, you can pass customized
                `decoder_timesteps`
            prior_timesteps (`List[float]`, *optional*):
                Custom timesteps to use for the denoising process for the prior. If not defined, equally spaced
                `prior_num_inference_steps` timesteps are used. Must be in descending order.
            decoder_timesteps (`List[float]`, *optional*):
                Custom timesteps to use for the denoising process for the decoder. If not defined, equally spaced
                `num_inference_steps` timesteps are used. Must be in descending order.
            decoder_guidance_scale (`float`, *optional*, defaults to 0.0):
                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                `guidance_scale` is defined as `w` of equation 2. of [Imagen
                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
                1`. A higher guidance scale encourages the model to generate images closely linked to the text `prompt`,
                usually at the expense of lower image quality.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will be generated by sampling using the supplied random `generator`.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
                (`np.array`) or `"pt"` (`torch.Tensor`).
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
            prior_callback_on_step_end (`Callable`, *optional*):
                A function that is called at the end of each denoising step of the prior during inference. The function is called
                with the following arguments: `prior_callback_on_step_end(self: DiffusionPipeline, step: int, timestep:
                int, callback_kwargs: Dict)`.
            prior_callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `prior_callback_on_step_end` function. The tensors specified in the
                list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in
                the `._callback_tensor_inputs` attribute of your pipeline class.
            callback_on_step_end (`Callable`, *optional*):
                A function that is called at the end of each denoising step of the decoder during inference. The function is called
                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
                `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                `._callback_tensor_inputs` attribute of your pipeline class.

        Examples:

        Returns:
            [`~pipelines.ImagePipelineOutput`] or `tuple`: [`~pipelines.ImagePipelineOutput`] if `return_dict` is True,
            otherwise a `tuple`. When returning a tuple, the first element is a list with the generated images.
        """
        prior_kwargs = {}
        if kwargs.get("prior_callback", None) is not None:
            prior_kwargs["callback"] = kwargs.pop("prior_callback")
            deprecate(
                "prior_callback",
                "1.0.0",
                "Passing `prior_callback` as an input argument to `__call__` is deprecated, consider use `prior_callback_on_step_end`",
            )
        if kwargs.get("prior_callback_steps", None) is not None:
            deprecate(
                "prior_callback_steps",
                "1.0.0",
                "Passing `prior_callback_steps` as an input argument to `__call__` is deprecated, consider use `prior_callback_on_step_end`",
            )
            prior_kwargs["callback_steps"] = kwargs.pop("prior_callback_steps")

        # 1. Run the prior pipeline (Stage C) to turn the prompt into image embeddings.
        prior_outputs = self.prior_pipe(
            prompt=prompt if prompt_embeds is None else None,
            height=height,
            width=width,
            num_inference_steps=prior_num_inference_steps,
            timesteps=prior_timesteps,
            guidance_scale=prior_guidance_scale,
            negative_prompt=negative_prompt if negative_prompt_embeds is None else None,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
            num_images_per_prompt=num_images_per_prompt,
            generator=generator,
            latents=latents,
            output_type="pt",
            return_dict=False,
            callback_on_step_end=prior_callback_on_step_end,
            callback_on_step_end_tensor_inputs=prior_callback_on_step_end_tensor_inputs,
            **prior_kwargs,
        )
        image_embeddings = prior_outputs[0]

        # 2. Run the decoder pipeline (Stages B and A) to turn the image embeddings into pixel-space images.
        outputs = self.decoder_pipe(
            image_embeddings=image_embeddings,
            prompt=prompt if prompt is not None else "",
            num_inference_steps=num_inference_steps,
            timesteps=decoder_timesteps,
            guidance_scale=decoder_guidance_scale,
            negative_prompt=negative_prompt,
            generator=generator,
            output_type=output_type,
            return_dict=return_dict,
            callback_on_step_end=callback_on_step_end,
            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
        )

        return outputs