o
    Gif                     @   s   d dl mZ d dlZd dlmZmZ ddlmZ ddlm	Z	 ddl
mZmZmZmZ ddlmZ d	d
lmZmZmZ d	dlmZ e rOd dlm  mZ dZndZeeZdZG dd deeZ dS )    )CallableN)CLIPTextModelWithProjectionCLIPTokenizer   )StableCascadeUNet)DDPMWuerstchenScheduler)is_torch_versionis_torch_xla_availableloggingreplace_example_docstring)randn_tensor   )DeprecatedPipelineMixinDiffusionPipelineImagePipelineOutput)PaellaVQModelTFa  
    Examples:
        ```py
        >>> import torch
        >>> from diffusers import StableCascadePriorPipeline, StableCascadeDecoderPipeline

        >>> prior_pipe = StableCascadePriorPipeline.from_pretrained(
        ...     "stabilityai/stable-cascade-prior", torch_dtype=torch.bfloat16
        ... ).to("cuda")
        >>> gen_pipe = StableCascadeDecoderPipeline.from_pretrain(
        ...     "stabilityai/stable-cascade", torch_dtype=torch.float16
        ... ).to("cuda")

        >>> prompt = "an image of a shiba inu, donning a spacesuit and helmet"
        >>> prior_output = pipe(prompt)
        >>> images = gen_pipe(prior_output.image_embeddings, prompt=prompt)
        ```
c                $       s  e Zd ZdZdZdZdZdZg dZ	d5de	de
ded	ed
ededdf fddZdd Z						d6dejdB dejdB dejdB dejdB fddZ				d7ddZedd Zedd Zedd Zd d! Ze eedd"d#dddddd$ddd%d&dd'gfd(ejeej B d)eee B d*ed+ed,eee B dB dejdB dejdB dejdB dejdB d-ed.ej eej  B dB d'ejdB d/edB d0e!d1e"eegdf dB d2ee f d3d4Z#  Z$S )8StableCascadeDecoderPipelinea_  
    Pipeline for generating images from the Stable Cascade model.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

    Args:
        tokenizer (`CLIPTokenizer`):
            The CLIP tokenizer.
        text_encoder (`CLIPTextModelWithProjection`):
            The CLIP text encoder.
        decoder ([`StableCascadeUNet`]):
            The Stable Cascade decoder unet.
        vqgan ([`PaellaVQModel`]):
            The VQGAN model.
        scheduler ([`DDPMWuerstchenScheduler`]):
            A scheduler to be used in combination with `prior` to generate image embedding.
        latent_dim_scale (float, `optional`, defaults to 10.67):
            Multiplier to determine the VQ latent space size from the image embeddings. If the image embeddings are
            height=24 and width=24, the VQ latent shape needs to be height=int(24*10.67)=256 and
            width=int(24*10.67)=256 in order to match the training conditions.
    z0.35.2decodertext_encoderztext_encoder->decoder->vqgan)latentsprompt_embeds_poolednegative_prompt_embedsimage_embeddingsףp=
W%@	tokenizer	schedulervqganlatent_dim_scalereturnNc                    s.   t    | j|||||d | j|d d S )N)r   r   r   r   r   )r   )super__init__register_modulesregister_to_config)selfr   r   r   r   r   r   	__class__ n/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/pipelines/stable_cascade/pipeline_stable_cascade.pyr    ^   s   
	z%StableCascadeDecoderPipeline.__init__c	                 C   s   |j \}	}
}}|| dt|| jj t|| jj f}|d u r(t||||d}n|j |kr8td|j  d| ||}||j }|S )N   )	generatordevicedtypezUnexpected latents shape, got z, expected )shapeintconfigr   r   
ValueErrortoinit_noise_sigma)r#   
batch_sizer   num_images_per_promptr+   r*   r)   r   r   _channelsheightwidthlatents_shaper&   r&   r'   prepare_latentsq   s   


z,StableCascadeDecoderPipeline.prepare_latentsprompt_embedsr   r   negative_prompt_embeds_pooledc              
   C   s  |d u r| j |d| j jddd}|j}|j}| j |dddj}|jd |jd krft||sf| j |d d | j jd df }t	d	| j j d
|  |d d d | j jf }|d d d | j jf }| j
||||dd}|jd }|d u r|jd}|j| j
j|d}|j| j
j|d}|j|dd}|j|dd}|	d u r|r|d u rdg| }n;t|t|urtdt| dt| dt|tr|g}n|t|krtd| dt| d| d| d	|}| j |d| j jddd}| j
|j||j|dd}|jd }	|jd}
|rV|	jd }|	j| j
j|d}	|	d|d}	|	|| |d}	|
jd }|
j| j
j|d}
|
d|d}
|
|| |d}
|||	|
fS )N
max_lengthTpt)paddingr<   
truncationreturn_tensorslongest)r>   r@      z\The following part of your input was truncated because CLIP can only handle sequences up to z	 tokens: )attention_maskoutput_hidden_states)r+   r*   r   dim z?`negative_prompt` should be the same type to `prompt`, but got z != .z`negative_prompt`: z has batch size z, but `prompt`: zT. Please make sure that passed `negative_prompt` matches the batch size of `prompt`.)r   model_max_length	input_idsrD   r,   torchequalbatch_decodeloggerwarningr   r0   hidden_statestext_embeds	unsqueezer+   repeat_interleavetype	TypeError
isinstancestrlenr/   repeatview)r#   r*   r2   r3   do_classifier_free_guidancepromptnegative_promptr:   r   r   r;   text_inputstext_input_idsrD   untruncated_idsremoved_texttext_encoder_outputuncond_tokensuncond_input*negative_prompt_embeds_text_encoder_outputseq_lenr&   r&   r'   encode_prompt   s   







z*StableCascadeDecoderPipeline.encode_promptc                    s  |d ur!t  fdd|D s!td j d fdd|D  |d ur4|d ur4td| d| d	|d u r@|d u r@td
|d urWt|tsWt|tsWtdt| |d urj|d urjtd| d| d	|d ur|d ur|j|jkrtd|j d|j dd S d S d S )Nc                 3   s    | ]}| j v V  qd S N_callback_tensor_inputs.0kr#   r&   r'   	<genexpr>   s    

z<StableCascadeDecoderPipeline.check_inputs.<locals>.<genexpr>z2`callback_on_step_end_tensor_inputs` has to be in z, but found c                    s   g | ]	}| j vr|qS r&   rj   rl   ro   r&   r'   
<listcomp>   s    z=StableCascadeDecoderPipeline.check_inputs.<locals>.<listcomp>zCannot forward both `prompt`: z and `prompt_embeds`: z2. Please make sure to only forward one of the two.zeProvide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined.z2`prompt` has to be of type `str` or `list` but is z'Cannot forward both `negative_prompt`: z and `negative_prompt_embeds`: zu`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but got: `prompt_embeds` z != `negative_prompt_embeds` rI   )allr/   rk   rW   rX   listrU   r,   )r#   r]   r^   r:   r   "callback_on_step_end_tensor_inputsr&   ro   r'   check_inputs   sB   z)StableCascadeDecoderPipeline.check_inputsc                 C      | j S ri   _guidance_scalero   r&   r&   r'   guidance_scale     z+StableCascadeDecoderPipeline.guidance_scalec                 C   s
   | j dkS )NrC   rw   ro   r&   r&   r'   r\     s   
z8StableCascadeDecoderPipeline.do_classifier_free_guidancec                 C   rv   ri   )_num_timestepsro   r&   r&   r'   num_timesteps"  rz   z*StableCascadeDecoderPipeline.num_timestepsc                 C   s   t dg}ddg}t |d|  t j d d }|| }|j| }||j||j}}|| d  t jd  d|  | }|S )NgMb?r   rC   g      ?r   )rL   tensorcospiclampr0   r*   acos)r#   talphas_cumprodsclamp_rangemin_varvarratior&   r&   r'   get_timestep_ratio_conditioning&  s    
&z<StableCascadeDecoderPipeline.get_timestep_ratio_conditioning
   g        rC   pilTr   r   r]   num_inference_stepsry   r^   r3   r)   output_typereturn_dictcallback_on_step_endrt   c           #      C   s*  | j }| jj}|| _tddr|tjkrtd| j|||||d t	|t
r.tj|dd}|dur:t	|tr:d}n|durHt	|t
rHt|}n|jd }|
|jd |  }
|du rr|du rr| j||||
| j|||||	d	
\}}}}	| jr|t||	gn|}| jrt|t|gn|}| jj||d
 | jj}| |||
||||| j}t	| jtr|dd }nt| jjdr| jjjrd| jj_td t| jdrd| jj }tj|dd}ng }t|| _t|  |D ]\}}t	| jts+t|dkr| !|" # |}|$|%d&|&|}n |' (| jjd $|%d&|}n|$|%d&|}| j| jrDt|gd n|| jrQt|gd n|||ddd }| jrm|)d\}}t*||| j+}t	| jtsv|}| jj,||||dj-}|duri }|D ]
} t. |  || < q|| |||}!|!/d|}|!/d|}|!/d|}t0rt12  q|dvrtd| |dks| j3jj4| }| j35|j67dd}"|dkr|"8dddd# ' 9 }"n|dkr|"8dddd# ' 9 }"| :|"}"n|}"| ;  |s|"S t<|"S )a?  
        Function invoked when calling the pipeline for generation.

        Args:
            image_embedding (`torch.Tensor` or `list[torch.Tensor]`):
                Image Embeddings either extracted from an image or generated by a Prior Model.
            prompt (`str` or `list[str]`):
                The prompt or prompts to guide the image generation.
            num_inference_steps (`int`, *optional*, defaults to 12):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            guidance_scale (`float`, *optional*, defaults to 0.0):
                Guidance scale as defined in [Classifier-Free Diffusion
                Guidance](https://huggingface.co/papers/2207.12598). `decoder_guidance_scale` is defined as `w` of
                equation 2. of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by
                setting `decoder_guidance_scale > 1`. Higher guidance scale encourages to generate images that are
                closely linked to the text `prompt`, usually at the expense of lower image quality.
            negative_prompt (`str` or `list[str]`, *optional*):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `decoder_guidance_scale` is less than `1`).
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            prompt_embeds_pooled (`torch.Tensor`, *optional*):
                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
                If not provided, pooled text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            negative_prompt_embeds_pooled (`torch.Tensor`, *optional*):
                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds_pooled will be generated from `negative_prompt`
                input argument.
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            generator (`torch.Generator` or `list[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will be generated by sampling using the supplied random `generator`.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
                (`np.array`) or `"pt"` (`torch.Tensor`).
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
            callback_on_step_end (`Callable`, *optional*):
                A function that calls at the end of each denoising steps during the inference. The function is called
                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
                `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`list`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                `._callback_tensor_inputs` attribute of your pipeline class.

        Examples:

        Returns:
            [`~pipelines.ImagePipelineOutput`] or `tuple` [`~pipelines.ImagePipelineOutput`] if `return_dict` is True,
            otherwise a `tuple`. When returning a tuple, the first element is a list with the generated image
            embeddings.
        <z2.2.0zW`StableCascadeDecoderPipeline` requires torch>=2.2.0 when using `torch.bfloat16` dtype.)r^   r:   r   rt   r   rF   NrC   )
r]   r*   r2   r3   r\   r^   r:   r   r   r;   )r*   rB   clip_sampleFz set `clip_sample` to be Falsebetasg      ?r   )sampletimestep_ratioclip_text_pooledeffnetr   )model_outputtimestepr   r)   r   r:   r   )r=   npr   latentzSOnly the output types `pt`, `np`, `pil` and `latent` are supported not output_type=r   r   r   r   )=_execution_devicer   r+   rx   r   rL   bfloat16r/   ru   rW   rs   catrX   rY   r,   rh   r\   
zeros_liker   set_timesteps	timestepsr9   r   hasattrr.   r   rO   rP   r   cumprodr{   	enumerateprogress_barr   longcpuexpandsizer0   floatdivchunklerpry   stepprev_samplelocalspopXLA_AVAILABLExm	mark_stepr   scale_factordecoder   r   permutenumpynumpy_to_pilmaybe_free_model_hooksr   )#r#   r   r]   r   ry   r^   r:   r   r   r;   r3   r)   r   r   r   r   rt   r*   r+   r2   r4   r   r   alphasr   ir   r   predicted_latentspredicted_latents_textpredicted_latents_uncondcallback_kwargsrn   callback_outputsimagesr&   r&   r'   __call__0  s   X





*	





z%StableCascadeDecoderPipeline.__call__)r   )NNNNNN)NNNN)%__name__
__module____qualname____doc___last_supported_version	unet_nametext_encoder_namemodel_cpu_offload_seqrk   r   r   r   r   r   r   r    r9   rL   Tensorrh   ru   propertyry   r\   r|   r   no_gradr   EXAMPLE_DOC_STRINGrs   rX   r-   	Generatorboolr   r   __classcell__r&   r&   r$   r'   r   :   s    	

n
)




	
r   )!typingr   rL   transformersr   r   modelsr   
schedulersr   utilsr   r	   r
   r   utils.torch_utilsr   pipeline_utilsr   r   r   #wuerstchen.modeling_paella_vq_modelr   torch_xla.core.xla_modelcore	xla_modelr   r   
get_loggerr   rO   r   r   r&   r&   r&   r'   <module>   s    
