from typing import Callable

import numpy as np
import torch
from transformers import CLIPTextModel, CLIPTokenizer

from ...schedulers import DDPMWuerstchenScheduler
from ...utils import deprecate, is_torch_xla_available, logging, replace_example_docstring
from ...utils.torch_utils import randn_tensor
from ..pipeline_utils import DeprecatedPipelineMixin, DiffusionPipeline, ImagePipelineOutput
from .modeling_paella_vq_model import PaellaVQModel
from .modeling_wuerstchen_diffnext import WuerstchenDiffNeXt


if is_torch_xla_available():
    import torch_xla.core.xla_model as xm

    XLA_AVAILABLE = True
else:
    XLA_AVAILABLE = False

logger = logging.get_logger(__name__)

EXAMPLE_DOC_STRING = """
    Examples:
        ```py
        >>> import torch
        >>> from diffusers import WuerstchenPriorPipeline, WuerstchenDecoderPipeline

        >>> prior_pipe = WuerstchenPriorPipeline.from_pretrained(
        ...     "warp-ai/wuerstchen-prior", torch_dtype=torch.float16
        ... ).to("cuda")
        >>> gen_pipe = WuerstchenDecoderPipeline.from_pretrained("warp-ai/wuerstchen", torch_dtype=torch.float16).to(
        ...     "cuda"
        ... )

        >>> prompt = "an image of a shiba inu, donning a spacesuit and helmet"
        >>> prior_output = prior_pipe(prompt)
        >>> images = gen_pipe(prior_output.image_embeddings, prompt=prompt)
        ```
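
    The prior pipeline produces the `image_embeddings` consumed here, so both pipelines are
    needed for end-to-end text-to-image generation, as in the example above.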
"""


class WuerstchenDecoderPipeline(DeprecatedPipelineMixin, DiffusionPipeline):
    r"""
    Pipeline for generating images from the Wuerstchen model.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

    Args:
        tokenizer (`CLIPTokenizer`):
            The CLIP tokenizer.
        text_encoder (`CLIPTextModel`):
            The CLIP text encoder.
        decoder ([`WuerstchenDiffNeXt`]):
            The WuerstchenDiffNeXt unet decoder.
        vqgan ([`PaellaVQModel`]):
            The VQGAN model.
        scheduler ([`DDPMWuerstchenScheduler`]):
            A scheduler to be used in combination with `decoder` to generate the image latents.
        latent_dim_scale (float, `optional`, defaults to 10.67):
            Multiplier to determine the VQ latent space size from the image embeddings. If the image embeddings are
            height=24 and width=24, the VQ latent shape needs to be height=int(24*10.67)=256 and
            width=int(24*10.67)=256 in order to match the training conditions.
    ztext_encoder->decoder->vqgan)latentstext_encoder_hidden_statesnegative_prompt_embedsimage_embeddingsףp=
W%@	tokenizertext_encoderdecoder	schedulervqganlatent_dim_scalereturnNc                    s.   t    | j|||||d | j|d d S )N)r   r   r   r   r   )r   )super__init__register_modulesregister_to_config)selfr   r   r   r   r   r   	__class__ h/home/ubuntu/vllm_env/lib/python3.10/site-packages/diffusers/pipelines/wuerstchen/pipeline_wuerstchen.pyr!   [   s   
	z"WuerstchenDecoderPipeline.__init__c                 C   sR   |d u rt ||||d}n|j|krtd|j d| ||}||j }|S )N)	generatordevicedtypezUnexpected latents shape, got z, expected )r   shape
ValueErrortoinit_noise_sigma)r$   r,   r+   r*   r)   r   r   r'   r'   r(   prepare_latentso   s   


z)WuerstchenDecoderPipeline.prepare_latentsc              
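
    # Shape sketch (illustrative numbers, not enforced here): prior image embeddings with a
    # 24x24 spatial grid yield decoder latents of shape
    # (batch * num_images_per_prompt, 4, int(24 * 10.67), int(24 * 10.67)) == (batch, 4, 256, 256)
    # with the default `latent_dim_scale`.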
    def encode_prompt(
        self,
        prompt,
        device,
        num_images_per_prompt,
        do_classifier_free_guidance,
        negative_prompt=None,
    ):
        batch_size = len(prompt) if isinstance(prompt, list) else 1

        # get prompt text embeddings
        text_inputs = self.tokenizer(
            prompt,
            padding="max_length",
            max_length=self.tokenizer.model_max_length,
            truncation=True,
            return_tensors="pt",
        )
        text_input_ids = text_inputs.input_ids
        attention_mask = text_inputs.attention_mask

        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids

        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
            logger.warning(
                "The following part of your input was truncated because CLIP can only handle sequences up to"
                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
            )
            text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
            attention_mask = attention_mask[:, : self.tokenizer.model_max_length]

        text_encoder_output = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask.to(device))
        text_encoder_hidden_states = text_encoder_output.last_hidden_state
        text_encoder_hidden_states = text_encoder_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)

        uncond_text_encoder_hidden_states = None
        if do_classifier_free_guidance:
            if negative_prompt is None:
                uncond_tokens = [""] * batch_size
            elif type(prompt) is not type(negative_prompt):
                raise TypeError(
                    f"`negative_prompt` should be the same type as `prompt`, but got {type(negative_prompt)} !="
                    f" {type(prompt)}."
                )
            elif isinstance(negative_prompt, str):
                uncond_tokens = [negative_prompt]
            elif batch_size != len(negative_prompt):
                raise ValueError(
                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
                    " the batch size of `prompt`."
                )
            else:
                uncond_tokens = negative_prompt

            uncond_input = self.tokenizer(
                uncond_tokens,
                padding="max_length",
                max_length=self.tokenizer.model_max_length,
                truncation=True,
                return_tensors="pt",
            )
            negative_prompt_embeds_text_encoder_output = self.text_encoder(
                uncond_input.input_ids.to(device), attention_mask=uncond_input.attention_mask.to(device)
            )
            uncond_text_encoder_hidden_states = negative_prompt_embeds_text_encoder_output.last_hidden_state

            # duplicate unconditional embeddings for each generation per prompt, using an mps-friendly method
            seq_len = uncond_text_encoder_hidden_states.shape[1]
            uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.repeat(1, num_images_per_prompt, 1)
            uncond_text_encoder_hidden_states = uncond_text_encoder_hidden_states.view(
                batch_size * num_images_per_prompt, seq_len, -1
            )

        return text_encoder_hidden_states, uncond_text_encoder_hidden_states
    @property
    def guidance_scale(self):
        return self._guidance_scale

    @property
    def do_classifier_free_guidance(self):
        return self._guidance_scale > 1
    @property
    def num_timesteps(self):
        return self._num_timesteps

    @torch.no_grad()
    @replace_example_docstring(EXAMPLE_DOC_STRING)
    def __call__(
        self,
        image_embeddings: torch.Tensor | list[torch.Tensor],
        prompt: str | list[str] = None,
        num_inference_steps: int = 12,
        timesteps: list[float] | None = None,
        guidance_scale: float = 0.0,
        negative_prompt: str | list[str] | None = None,
        num_images_per_prompt: int = 1,
        generator: torch.Generator | list[torch.Generator] | None = None,
        latents: torch.Tensor | None = None,
        output_type: str | None = "pil",
        return_dict: bool = True,
        callback_on_step_end: Callable[[int, int, dict], None] | None = None,
        callback_on_step_end_tensor_inputs: list[str] = ["latents"],
        **kwargs,
    ):
        r"""
        Function invoked when calling the pipeline for generation.

        Args:
            image_embeddings (`torch.Tensor` or `list[torch.Tensor]`):
                Image embeddings either extracted from an image or generated by a prior model.
            prompt (`str` or `list[str]`):
                The prompt or prompts to guide the image generation.
            num_inference_steps (`int`, *optional*, defaults to 12):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            timesteps (`list[int]`, *optional*):
                Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps`
                timesteps are used. Must be in descending order.
            guidance_scale (`float`, *optional*, defaults to 0.0):
                Guidance scale as defined in [Classifier-Free Diffusion
                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of
                equation 2 of the [Imagen paper](https://huggingface.co/papers/2205.11487). Guidance is enabled by
                setting `guidance_scale > 1`. A higher guidance scale encourages the model to generate images that are
                closely linked to the text `prompt`, usually at the expense of lower image quality.
            negative_prompt (`str` or `list[str]`, *optional*):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `guidance_scale` is less than `1`).
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            generator (`torch.Generator` or `list[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will be generated by sampling using the supplied random `generator`.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
                (`np.array`) or `"pt"` (`torch.Tensor`).
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
            callback_on_step_end (`Callable`, *optional*):
                A function that is called at the end of each denoising step during inference. The function is called
                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
                `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`list`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                `._callback_tensor_inputs` attribute of your pipeline class.
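
                A minimal sketch of a `callback_on_step_end` function (illustrative only; returned
                entries overwrite the pipeline's tensors of the same name):

                ```py
                >>> def on_step_end(pipe, step, timestep, callback_kwargs):
                ...     latents = callback_kwargs["latents"]  # inspect or modify as needed
                ...     return {"latents": latents}
                ```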

        Examples:

        Returns:
            [`~pipelines.ImagePipelineOutput`] if `return_dict` is True, otherwise a `tuple`. When returning a
            tuple, the first element is a list with the generated images.
        """
        callback = kwargs.pop("callback", None)
        callback_steps = kwargs.pop("callback_steps", None)

        if callback is not None:
            deprecate(
                "callback",
                "1.0.0",
                "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
            )
        if callback_steps is not None:
            deprecate(
                "callback_steps",
                "1.0.0",
                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
            )

        if callback_on_step_end_tensor_inputs is not None and not all(
            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
        ):
            raise ValueError(
                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found"
                f" {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
            )

        # 0. Define commonly used variables
        device = self._execution_device
        dtype = self.decoder.dtype
        self._guidance_scale = guidance_scale

        # 1. Check inputs. Raise error if not correct
        if not isinstance(prompt, list):
            if isinstance(prompt, str):
                prompt = [prompt]
            else:
                raise TypeError(f"'prompt' must be of type 'list' or 'str', but got {type(prompt)}.")
        if self.do_classifier_free_guidance:
            if negative_prompt is not None and not isinstance(negative_prompt, list):
                if isinstance(negative_prompt, str):
                    negative_prompt = [negative_prompt]
                else:
                    raise TypeError(
                        f"'negative_prompt' must be of type 'list' or 'str', but got {type(negative_prompt)}."
                    )

        if isinstance(image_embeddings, list):
            image_embeddings = torch.cat(image_embeddings, dim=0)
        if isinstance(image_embeddings, np.ndarray):
            image_embeddings = torch.tensor(image_embeddings, device=device, dtype=dtype)
        if not isinstance(image_embeddings, torch.Tensor):
            raise TypeError(
                f"'image_embeddings' must be of type 'torch.Tensor' or 'np.array', but got {type(image_embeddings)}."
            )

        if not isinstance(num_inference_steps, int):
            raise TypeError(
                f"'num_inference_steps' must be of type 'int', but got {type(num_inference_steps)}. In case you want"
                " to provide explicit timesteps, please use the 'timesteps' argument."
            )

        # 2. Encode caption
        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
            prompt,
            device,
            image_embeddings.size(0) * num_images_per_prompt,
            self.do_classifier_free_guidance,
            negative_prompt,
        )
        text_encoder_hidden_states = (
            torch.cat([prompt_embeds, negative_prompt_embeds]) if negative_prompt_embeds is not None else prompt_embeds
        )
        effnet = (
            torch.cat([image_embeddings, torch.zeros_like(image_embeddings)])
            if self.do_classifier_free_guidance
            else image_embeddings
        )

        # 3. Determine the latent shape from the image embeddings
        latent_height = int(image_embeddings.size(2) * self.config.latent_dim_scale)
        latent_width = int(image_embeddings.size(3) * self.config.latent_dim_scale)
        latent_features_shape = (image_embeddings.size(0) * num_images_per_prompt, 4, latent_height, latent_width)

        # 4. Prepare and set timesteps
        if timesteps is not None:
            self.scheduler.set_timesteps(timesteps=timesteps, device=device)
            timesteps = self.scheduler.timesteps
            num_inference_steps = len(timesteps)
        else:
            self.scheduler.set_timesteps(num_inference_steps, device=device)
            timesteps = self.scheduler.timesteps

        # 5. Prepare latents
        latents = self.prepare_latents(latent_features_shape, dtype, device, generator, latents, self.scheduler)

        # 6. Run denoising loop
        self._num_timesteps = len(timesteps[:-1])
        for i, t in enumerate(self.progress_bar(timesteps[:-1])):
            ratio = t.expand(latents.size(0)).to(dtype)

            # 7. Denoise latents
            predicted_latents = self.decoder(
                torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents,
                r=torch.cat([ratio] * 2) if self.do_classifier_free_guidance else ratio,
                effnet=effnet,
                clip=text_encoder_hidden_states,
            )

            # 8. Check for classifier-free guidance and apply it
            if self.do_classifier_free_guidance:
                predicted_latents_text, predicted_latents_uncond = predicted_latents.chunk(2)
                predicted_latents = torch.lerp(predicted_latents_uncond, predicted_latents_text, self.guidance_scale)

            # 9. Renoise latents to the next timestep
            latents = self.scheduler.step(
                model_output=predicted_latents,
                timestep=ratio,
                sample=latents,
                generator=generator,
            ).prev_sample

            if callback_on_step_end is not None:
                callback_kwargs = {}
                for k in callback_on_step_end_tensor_inputs:
                    callback_kwargs[k] = locals()[k]
                callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)

                latents = callback_outputs.pop("latents", latents)
                image_embeddings = callback_outputs.pop("image_embeddings", image_embeddings)
                text_encoder_hidden_states = callback_outputs.pop(
                    "text_encoder_hidden_states", text_encoder_hidden_states
                )

            if callback is not None and i % callback_steps == 0:
                step_idx = i // getattr(self.scheduler, "order", 1)
                callback(step_idx, t, latents)

            if XLA_AVAILABLE:
                xm.mark_step()

        if output_type not in ["pt", "np", "pil", "latent"]:
            raise ValueError(
                f"Only the output types `pt`, `np`, `pil` and `latent` are supported not output_type={output_type}"
            )

        if output_type != "latent":
            # 10. Scale and decode the image latents with the vq-vae
            latents = self.vqgan.config.scale_factor * latents
            images = self.vqgan.decode(latents).sample.clamp(0, 1)
            if output_type == "np":
                images = images.permute(0, 2, 3, 1).cpu().float().numpy()
            elif output_type == "pil":
                images = images.permute(0, 2, 3, 1).cpu().float().numpy()
                images = self.numpy_to_pil(images)
        else:
            images = latents

        # Offload all models
        self.maybe_free_model_hooks()

        if not return_dict:
            return images
        return ImagePipelineOutput(images)