o
    Gi]                     @   s"  d dl mZ d dlmZ d dlmZ d dlZd dlZd dl	m
Z
mZ ddlmZ ddlmZ dd	lmZmZmZmZmZ dd
lmZ ddlmZ ddlmZ e r]d dlm  mZ dZ ndZ e!e"Z#e$e%ddde$e%ddddd  Z&dZ'eG dd deZ(G dd deeZ)dS )    )	dataclass)ceil)CallableN)CLIPTextModelCLIPTokenizer   )StableDiffusionLoraLoaderMixin)DDPMWuerstchenScheduler)
BaseOutput	deprecateis_torch_xla_availableloggingreplace_example_docstring)randn_tensor   )DiffusionPipeline   )WuerstchenPriorTF      ?gUUUUUU?   g           a  
    Examples:
        ```py
        >>> import torch
        >>> from diffusers import WuerstchenPriorPipeline

        >>> prior_pipe = WuerstchenPriorPipeline.from_pretrained(
        ...     "warp-ai/wuerstchen-prior", torch_dtype=torch.float16
        ... ).to("cuda")

        >>> prompt = "an image of a shiba inu, donning a spacesuit and helmet"
        >>> prior_output = pipe(prompt)
        ```
c                   @   s"   e Zd ZU dZejejB ed< dS )WuerstchenPriorPipelineOutputz
    Output class for WuerstchenPriorPipeline.

    Args:
        image_embeddings (`torch.Tensor` or `np.ndarray`)
            Prior image embeddings for text prompt

    image_embeddingsN)	__name__
__module____qualname____doc__torchTensornpndarray__annotations__ r"   r"   l/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/pipelines/wuerstchen/pipeline_wuerstchen_prior.pyr   ;   s   
 	r   c                $       s  e Zd ZdZdZdZdZg dZddgZ			d6d	e	de
ded
ededededdf fddZdd Z				d7dejdB dejdB fddZ		d8ddZedd Zedd Zedd Ze eedd d d!dd"dddd#ddd$d%dd&gfd'eee B dB d(ed)ed*ed+ee d,ed-eee B dB dejdB dejdB d.edB d/ejeej B dB d&ejdB d0edB d1ed2e eegdf dB d3ee f d4d5Z!  Z"S )9WuerstchenPriorPipelinea  
    Pipeline for generating image prior for Wuerstchen.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

    The pipeline also inherits the following loading methods:
        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights

    Args:
        prior ([`Prior`]):
            The canonical unCLIP prior to approximate the image embedding from the text embedding.
        text_encoder ([`CLIPTextModelWithProjection`]):
            Frozen text-encoder.
        tokenizer (`CLIPTokenizer`):
            Tokenizer of class
            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
        scheduler ([`DDPMWuerstchenScheduler`]):
            A scheduler to be used in combination with `prior` to generate image embedding.
        latent_mean ('float', *optional*, defaults to 42.0):
            Mean value for latent diffusers.
        latent_std ('float', *optional*, defaults to 1.0):
            Standard value for latent diffusers.
        resolution_multiple ('float', *optional*, defaults to 42.67):
            Default resolution for multiple images generated.
    priortext_encoderztext_encoder->prior)latentstext_encoder_hidden_statesnegative_prompt_embeds      E@r   (\UE@	tokenizer	schedulerlatent_mean
latent_stdresolution_multiplereturnNc                    s0   t    | j||||d | j|||d d S )N)r,   r&   r%   r-   )r.   r/   r0   )super__init__register_modulesregister_to_config)selfr,   r&   r%   r-   r.   r/   r0   	__class__r"   r#   r3   l   s   


z WuerstchenPriorPipeline.__init__c                 C   sR   |d u rt ||||d}n|j|krtd|j d| ||}||j }|S )N)	generatordevicedtypezUnexpected latents shape, got z, expected )r   shape
ValueErrortoinit_noise_sigma)r6   r<   r;   r:   r9   r'   r-   r"   r"   r#   prepare_latents   s   


z'WuerstchenPriorPipeline.prepare_latentsprompt_embedsr)   c              
   C   sj  |d urt |trd}n|d urt |trt|}n|jd }|d u r| j|d| jjddd}	|	j}
|	j}| j|dddj}|jd	 |
jd	 krt	
|
|s| j|d d | jjd d	f }td
| jj d|  |
d d d | jjf }
|d d d | jjf }| j|
|||d}|j}|j| jj|d}|j|dd}|d u r|r|d u rdg| }n;t|t|urtdt| dt| dt |tr|g}n|t|krtd| dt| d| d| d	|}| j|d| jjddd}| j|j||j|d}|j}|r1|jd }|j| jj|d}|d|d}||| |d	}||fS )Nr   r   
max_lengthTpt)paddingrB   
truncationreturn_tensorslongest)rD   rF   z\The following part of your input was truncated because CLIP can only handle sequences up to z	 tokens: )attention_mask)r;   r:   )dim z?`negative_prompt` should be the same type to `prompt`, but got z != .z`negative_prompt`: z has batch size z, but `prompt`: zT. Please make sure that passed `negative_prompt` matches the batch size of `prompt`.)
isinstancestrlistlenr<   r,   model_max_length	input_idsrI   r   equalbatch_decodeloggerwarningr&   r>   last_hidden_stater;   repeat_interleavetype	TypeErrorr=   repeatview)r6   r:   num_images_per_promptdo_classifier_free_guidancepromptnegative_promptrA   r)   
batch_sizetext_inputstext_input_idsrI   untruncated_idsremoved_texttext_encoder_outputuncond_tokensuncond_input*negative_prompt_embeds_text_encoder_outputseq_lenr"   r"   r#   encode_prompt   s   




z%WuerstchenPriorPipeline.encode_promptc                 C   s   |d ur|d urt d| d| d|d u r|d u rt d|d ur6t|ts6t|ts6t dt| |d urI|d urIt d| d| d|d urd|d urd|j|jkrdt d|j d	|j d
t|tsstdt| dd S )NzCannot forward both `prompt`: z and `prompt_embeds`: z2. Please make sure to only forward one of the two.zeProvide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined.z2`prompt` has to be of type `str` or `list` but is z'Cannot forward both `negative_prompt`: z and `negative_prompt_embeds`: zu`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but got: `prompt_embeds` z != `negative_prompt_embeds` rL   z5'num_inference_steps' must be of type 'int', but got zo                           In Case you want to provide explicit timesteps, please use the 'timesteps' argument.)r=   rM   rN   rO   rY   r<   intrZ   )r6   r_   r`   num_inference_stepsr^   rA   r)   r"   r"   r#   check_inputs   s<   	
z$WuerstchenPriorPipeline.check_inputsc                 C      | j S N_guidance_scaler6   r"   r"   r#   guidance_scale     z&WuerstchenPriorPipeline.guidance_scalec                 C   s
   | j dkS )Nr   rq   rs   r"   r"   r#   r^     s   
z3WuerstchenPriorPipeline.do_classifier_free_guidancec                 C   ro   rp   )_num_timestepsrs   r"   r"   r#   num_timesteps  ru   z%WuerstchenPriorPipeline.num_timestepsi   <   g       @r   rC   Tr'   r_   heightwidthrm   	timestepsrt   r`   r]   r9   output_typereturn_dictcallback_on_step_end"callback_on_step_end_tensor_inputsc           &   	      s  | dd}| dd}|durtddd |dur tddd |durAt fdd|D sAtd	 j d
 fdd|D   j}| _|durSt|trSd}n|durat|t	rat
|}n|jd }|durt|t	st|trx|g}n
tdt| d jr|durt|t	st|tr|g}n
tdt| d j||| j||	d  j|||
 j|||	d\}}	|	durt||	gn|}|j}t| jj }t| jj } jjj}|
| |||f}|dur jj||d  jj}t
|}n jj||d  jj} ||||| j}t
|dd  _t |dd D ]\}}| |!d"|} j jrFt|gd n| jrSt|gd n||d} jrk|#d\} }!t$|!|  j%} jj&||||dj'}|duri }"|D ]
}#t( |# |"|#< q| |||"}$|$ d|}|$ d|}|$ d|	}	|dur|| dkr|t) jdd }%||%|| t*rt+,  q)| jj-  jj. } /  |dkr|0 1 2 }|s|fS t3|S )a  
        Function invoked when calling the pipeline for generation.

        Args:
            prompt (`str` or `list[str]`):
                The prompt or prompts to guide the image generation.
            height (`int`, *optional*, defaults to 1024):
                The height in pixels of the generated image.
            width (`int`, *optional*, defaults to 1024):
                The width in pixels of the generated image.
            num_inference_steps (`int`, *optional*, defaults to 60):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            timesteps (`list[int]`, *optional*):
                Custom timesteps to use for the denoising process. If not defined, equal spaced `num_inference_steps`
                timesteps are used. Must be in descending order.
            guidance_scale (`float`, *optional*, defaults to 8.0):
                Guidance scale as defined in [Classifier-Free Diffusion
                Guidance](https://huggingface.co/papers/2207.12598). `decoder_guidance_scale` is defined as `w` of
                equation 2. of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by
                setting `decoder_guidance_scale > 1`. Higher guidance scale encourages to generate images that are
                closely linked to the text `prompt`, usually at the expense of lower image quality.
            negative_prompt (`str` or `list[str]`, *optional*):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `decoder_guidance_scale` is less than `1`).
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            generator (`torch.Generator` or `list[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will be generated by sampling using the supplied random `generator`.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
                (`np.array`) or `"pt"` (`torch.Tensor`).
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
            callback_on_step_end (`Callable`, *optional*):
                A function that calls at the end of each denoising steps during the inference. The function is called
                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
                `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`list`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                `._callback_tensor_inputs` attribute of your pipeline class.

        Examples:

        Returns:
            [`~pipelines.WuerstchenPriorPipelineOutput`] or `tuple` [`~pipelines.WuerstchenPriorPipelineOutput`] if
            `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the
            generated image embeddings.
        callbackNcallback_stepsz1.0.0zhPassing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`znPassing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`c                 3   s    | ]}| j v V  qd S rp   _callback_tensor_inputs.0krs   r"   r#   	<genexpr>  s    

z3WuerstchenPriorPipeline.__call__.<locals>.<genexpr>z2`callback_on_step_end_tensor_inputs` has to be in z, but found c                    s   g | ]	}| j vr|qS r"   r   r   rs   r"   r#   
<listcomp>  s    z4WuerstchenPriorPipeline.__call__.<locals>.<listcomp>r   r   z2'prompt' must be of type 'list' or 'str', but got rL   z;'negative_prompt' must be of type 'list' or 'str', but got )rA   r)   )r_   r:   r]   r^   r`   rA   r)   )r{   r:   )r:   rH   r   )rc)model_outputtimestepsampler9   r'   r(   r)   orderr   )4popr   allr=   r   _execution_devicerr   rM   rN   rO   rP   r<   rZ   rY   r^   rn   rk   r   catr;   r   configr0   r%   c_inr-   set_timestepsr{   r@   rv   	enumerateprogress_barexpandsizer>   chunklerprt   stepprev_samplelocalsgetattrXLA_AVAILABLExm	mark_stepr.   r/   maybe_free_model_hookscpufloatnumpyr   )&r6   r_   ry   rz   rm   r{   rt   r`   rA   r)   r]   r9   r'   r|   r}   r~   r   kwargsr   r   r:   ra   r(   r;   latent_heightlatent_widthnum_channelseffnet_features_shapeitratiopredicted_image_embeddingpredicted_image_embedding_text predicted_image_embedding_uncondcallback_kwargsr   callback_outputsstep_idxr"   rs   r#   __call__!  s   V









z WuerstchenPriorPipeline.__call__)r*   r   r+   )NNNN)NN)#r   r   r   r   	unet_nametext_encoder_namemodel_cpu_offload_seqr   _lora_loadable_modulesr   r   r   r	   r   r3   r@   r   r   rk   rn   propertyrt   r^   rw   no_gradr   EXAMPLE_DOC_STRINGrN   rO   rl   	Generatorboolr   r   __classcell__r"   r"   r7   r#   r$   I   s    	
e
)


	
r$   )*dataclassesr   mathr   typingr   r   r   r   transformersr   r   loadersr   
schedulersr	   utilsr
   r   r   r   r   utils.torch_utilsr   pipeline_utilsr   modeling_wuerstchen_priorr   torch_xla.core.xla_modelcore	xla_modelr   r   
get_loggerr   rU   rO   linspaceDEFAULT_STAGE_C_TIMESTEPSr   r   r$   r"   r"   r"   r#   <module>   s,   
,