o
    Gi@                     @   s   d dl mZ d dlZd dlmZmZ ddlmZ ddlm	Z	m
Z
 ddlmZmZ d	d
lmZ d	dlmZ d	dlmZ d	dlmZ d	dlmZ dZG dd deeZdS )    )CallableN)CLIPTextModelCLIPTokenizer   )DDPMWuerstchenScheduler)	deprecatereplace_example_docstring   )DeprecatedPipelineMixinDiffusionPipeline   )PaellaVQModel)WuerstchenDiffNeXt)WuerstchenPrior)WuerstchenDecoderPipeline)WuerstchenPriorPipelineax  
    Examples:
        ```py
        >>> from diffusions import WuerstchenCombinedPipeline

        >>> pipe = WuerstchenCombinedPipeline.from_pretrained("warp-ai/Wuerstchen", torch_dtype=torch.float16).to(
        ...     "cuda"
        ... )
        >>> prompt = "an image of a shiba inu, donning a spacesuit and helmet"
        >>> images = pipe(prompt=prompt)
        ```
c                .       s  e Zd ZdZdZdZdededede	de
d	ed
edede	f fddZd;dedB fddZd<dedB dejeB fddZd<dedB dejeB fddZd<ddZdd Ze eeddddddd dd!dddd"ddd#ddd$gdd$gfd%eee B dB d&ed'ed(ed)ee dB d*ed+ed,ee dB d-ed.eee B dB d/ejdB d0ejdB d1ed2ejeej B dB d$ejdB d3edB d4ed5eeegdf dB d6ee d7eeegdf dB d8ee f*d9d:Z  Z S )=WuerstchenCombinedPipelinea  
    Combined Pipeline for text-to-image generation using Wuerstchen

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

    Args:
        tokenizer (`CLIPTokenizer`):
            The decoder tokenizer to be used for text inputs.
        text_encoder (`CLIPTextModel`):
            The decoder text encoder to be used for text inputs.
        decoder (`WuerstchenDiffNeXt`):
            The decoder model to be used for decoder image generation pipeline.
        scheduler (`DDPMWuerstchenScheduler`):
            The scheduler to be used for decoder image generation pipeline.
        vqgan (`PaellaVQModel`):
            The VQGAN model to be used for decoder image generation pipeline.
        prior_tokenizer (`CLIPTokenizer`):
            The prior tokenizer to be used for text inputs.
        prior_text_encoder (`CLIPTextModel`):
            The prior text encoder to be used for text inputs.
        prior_prior (`WuerstchenPrior`):
            The prior model to be used for prior pipeline.
        prior_scheduler (`DDPMWuerstchenScheduler`):
            The scheduler to be used for prior pipeline.
    z0.33.1T	tokenizertext_encoderdecoder	schedulervqganprior_tokenizerprior_text_encoderprior_priorprior_schedulerc
           
         sP   t    | j|||||||||	d	 t||||	d| _t|||||d| _d S )N)	r   r   r   r   r   r   r   r   r   )priorr   r   r   )r   r   r   r   r   )super__init__register_modulesr   
prior_piper   decoder_pipe)
selfr   r   r   r   r   r   r   r   r   	__class__ o/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_combined.pyr   J   s2   
z#WuerstchenCombinedPipeline.__init__Nattention_opc                 C   s   | j | d S N)r!   *enable_xformers_memory_efficient_attention)r"   r'   r%   r%   r&   r)   q   s   zEWuerstchenCombinedPipeline.enable_xformers_memory_efficient_attentiongpu_iddevicec                 C   $   | j j||d | jj||d dS )a  
        Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
        to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
        method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
        `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
        r*   r+   N)r    enable_model_cpu_offloadr!   r"   r*   r+   r%   r%   r&   r.   t      z3WuerstchenCombinedPipeline.enable_model_cpu_offloadc                 C   r,   )u  
        Offloads all models (`unet`, `text_encoder`, `vae`, and `safety checker` state dicts) to CPU using 🤗
        Accelerate, significantly reducing memory usage. Models are moved to a `torch.device('meta')` and loaded on a
        GPU only when their specific submodule's `forward` method is called. Offloading happens on a submodule basis.
        Memory savings are higher than using `enable_model_cpu_offload`, but performance is lower.
        r-   N)r    enable_sequential_cpu_offloadr!   r/   r%   r%   r&   r1   ~   r0   z8WuerstchenCombinedPipeline.enable_sequential_cpu_offloadc                 C   s$   | j j||d | jj||d d S )N)iterabletotal)r    progress_barr!   )r"   r2   r3   r%   r%   r&   r4      s   z'WuerstchenCombinedPipeline.progress_barc                 K   s(   | j jdi | | jjdi | d S )Nr%   )r    set_progress_bar_configr!   )r"   kwargsr%   r%   r&   r5      s   z2WuerstchenCombinedPipeline.set_progress_bar_configi   <   g      @   g        r   pillatentspromptheightwidthprior_num_inference_stepsprior_timestepsprior_guidance_scalenum_inference_stepsdecoder_timestepsdecoder_guidance_scalenegative_promptprompt_embedsnegative_prompt_embedsnum_images_per_prompt	generatoroutput_typereturn_dictprior_callback_on_step_end(prior_callback_on_step_end_tensor_inputscallback_on_step_end"callback_on_step_end_tensor_inputsc                 K   s   i }| dddur|d|d< tddd | dddur,tddd |d|d< | jdi d	|du r7|ndd
|d|d|d|d|d|du rO|
ndd|d|d|d|d|ddddd|d||}|d }| jd||dur~|nd|||	|
|||||d|}|S )a  
        Function invoked when calling the pipeline for generation.

        Args:
            prompt (`str` or `list[str]`):
                The prompt or prompts to guide the image generation for the prior and decoder.
            negative_prompt (`str` or `list[str]`, *optional*):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `guidance_scale` is less than `1`).
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings for the prior. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings for the prior. Can be used to easily tweak text inputs, *e.g.*
                prompt weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt`
                input argument.
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            height (`int`, *optional*, defaults to 512):
                The height in pixels of the generated image.
            width (`int`, *optional*, defaults to 512):
                The width in pixels of the generated image.
            prior_guidance_scale (`float`, *optional*, defaults to 4.0):
                Guidance scale as defined in [Classifier-Free Diffusion
                Guidance](https://huggingface.co/papers/2207.12598). `prior_guidance_scale` is defined as `w` of
                equation 2. of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by
                setting `prior_guidance_scale > 1`. Higher guidance scale encourages to generate images that are
                closely linked to the text `prompt`, usually at the expense of lower image quality.
            prior_num_inference_steps (`int | dict[float, int]`, *optional*, defaults to 60):
                The number of prior denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference. For more specific timestep spacing, you can pass customized
                `prior_timesteps`
            num_inference_steps (`int`, *optional*, defaults to 12):
                The number of decoder denoising steps. More denoising steps usually lead to a higher quality image at
                the expense of slower inference. For more specific timestep spacing, you can pass customized
                `timesteps`
            prior_timesteps (`list[float]`, *optional*):
                Custom timesteps to use for the denoising process for the prior. If not defined, equal spaced
                `prior_num_inference_steps` timesteps are used. Must be in descending order.
            decoder_timesteps (`list[float]`, *optional*):
                Custom timesteps to use for the denoising process for the decoder. If not defined, equal spaced
                `num_inference_steps` timesteps are used. Must be in descending order.
            decoder_guidance_scale (`float`, *optional*, defaults to 0.0):
                Guidance scale as defined in [Classifier-Free Diffusion
                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
                `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
                the text `prompt`, usually at the expense of lower image quality.
            generator (`torch.Generator` or `list[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will be generated by sampling using the supplied random `generator`.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
                (`np.array`) or `"pt"` (`torch.Tensor`).
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
            prior_callback_on_step_end (`Callable`, *optional*):
                A function that calls at the end of each denoising steps during the inference. The function is called
                with the following arguments: `prior_callback_on_step_end(self: DiffusionPipeline, step: int, timestep:
                int, callback_kwargs: Dict)`.
            prior_callback_on_step_end_tensor_inputs (`list`, *optional*):
                The list of tensor inputs for the `prior_callback_on_step_end` function. The tensors specified in the
                list will be passed as `callback_kwargs` argument. You will only be able to include variables listed in
                the `._callback_tensor_inputs` attribute of your pipeline class.
            callback_on_step_end (`Callable`, *optional*):
                A function that calls at the end of each denoising steps during the inference. The function is called
                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
                `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`list`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                `._callback_tensor_inputs` attribute of your pipeline class.

        Examples:

        Returns:
            [`~pipelines.ImagePipelineOutput`] or `tuple` [`~pipelines.ImagePipelineOutput`] if `return_dict` is True,
            otherwise a `tuple`. When returning a tuple, the first element is a list with the generated images.
        prior_callbackNcallbackz1.0.0ztPassing `prior_callback` as an input argument to `__call__` is deprecated, consider use `prior_callback_on_step_end`prior_callback_stepszzPassing `prior_callback_steps` as an input argument to `__call__` is deprecated, consider use `prior_callback_on_step_end`callback_stepsr;   r<   r=   rA   	timestepsguidance_scalerD   rE   rF   rG   rH   r:   rI   ptrJ   FrM   rN   r    )image_embeddingsr;   rA   rS   rT   rD   rH   rI   rJ   rM   rN   r%   )getpopr   r    r!   )r"   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   r:   rI   rJ   rK   rL   rM   rN   r6   prior_kwargsprior_outputsrW   outputsr%   r%   r&   __call__   s   o
	
z#WuerstchenCombinedPipeline.__call__r(   )NN)!__name__
__module____qualname____doc___last_supported_version_load_connected_pipesr   r   r   r   r   r   r   r   r)   inttorchr+   strr.   r1   r4   r5   no_gradr   TEXT2IMAGE_EXAMPLE_DOC_STRINGlistfloatTensor	Generatorboolr]   __classcell__r%   r%   r#   r&   r   +   s    	
'




	
r   )typingr   re   transformersr   r   
schedulersr   utilsr   r   pipeline_utilsr
   r   modeling_paella_vq_modelr   modeling_wuerstchen_diffnextr   modeling_wuerstchen_priorr   pipeline_wuerstchenr   pipeline_wuerstchen_priorr   rh   r   r%   r%   r%   r&   <module>   s   