o
    ۷iGW                     @   s   d dl Z d dlZd dlmZ d dlmZmZ d dlm	Z	 ddl
mZmZmZ ddlmZ ddlmZmZ dd	lmZ d
dlmZmZmZ ddlmZ e rYd dlm  mZ dZndZe e!Z"G dd deeZ#dS )    N)
functional)CLIPTextModelWithProjectionCLIPTokenizer)CLIPTextModelOutput   )PriorTransformerUNet2DConditionModelUNet2DModel)UnCLIPScheduler)is_torch_xla_availablelogging)randn_tensor   )DeprecatedPipelineMixinDiffusionPipelineImagePipelineOutput   )UnCLIPTextProjModelTFc                        s  e Zd ZU dZdZdgZeed< eed< e	ed< e
ed< eed< eed< eed	< eed
< eed< eed< dZdedede
dede	ded	ed
ededef fddZdd Z		d-deeB dB dejdB fddZe 															d.deee B dB ded ed!ed"ed#ejeej B dB d$ejdB d%ejdB d&ejdB deeB dB dejdB d'ed(ed)edB d*efd+d,Z  ZS )/UnCLIPPipelineaE  
    Pipeline for text-to-image generation using unCLIP.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
    implemented for all pipelines (downloading, saving, running on a particular device, etc.).

    Args:
        text_encoder ([`~transformers.CLIPTextModelWithProjection`]):
            Frozen text-encoder.
        tokenizer ([`~transformers.CLIPTokenizer`]):
            A `CLIPTokenizer` to tokenize text.
        prior ([`PriorTransformer`]):
            The canonical unCLIP prior to approximate the image embedding from the text embedding.
        text_proj ([`UnCLIPTextProjModel`]):
            Utility class to prepare and combine the embeddings before they are passed to the decoder.
        decoder ([`UNet2DConditionModel`]):
            The decoder to invert the image embedding into an image.
        super_res_first ([`UNet2DModel`]):
            Super resolution UNet. Used in all but the last step of the super resolution diffusion process.
        super_res_last ([`UNet2DModel`]):
            Super resolution UNet. Used in the last step of the super resolution diffusion process.
        prior_scheduler ([`UnCLIPScheduler`]):
            Scheduler used in the prior denoising process (a modified [`DDPMScheduler`]).
        decoder_scheduler ([`UnCLIPScheduler`]):
            Scheduler used in the decoder denoising process (a modified [`DDPMScheduler`]).
        super_res_scheduler ([`UnCLIPScheduler`]):
            Scheduler used in the super resolution denoising process (a modified [`DDPMScheduler`]).

    z0.33.1priordecoder	text_projtext_encoder	tokenizersuper_res_firstsuper_res_lastprior_schedulerdecoder_schedulersuper_res_schedulerzAtext_encoder->text_proj->decoder->super_res_first->super_res_lastc                    s,   t    | j|||||||||	|
d
 d S )N)
r   r   r   r   r   r   r   r   r   r   )super__init__register_modules)selfr   r   r   r   r   r   r   r   r   r   	__class__ `/home/ubuntu/vllm_env/lib/python3.10/site-packages/diffusers/pipelines/unclip/pipeline_unclip.pyr    X   s   

zUnCLIPPipeline.__init__c                 C   sR   |d u rt ||||d}n|j|krtd|j d| ||}||j }|S )N)	generatordevicedtypezUnexpected latents shape, got z, expected )r   shape
ValueErrortoinit_noise_sigma)r"   r*   r)   r(   r'   latents	schedulerr%   r%   r&   prepare_latentst   s   


zUnCLIPPipeline.prepare_latentsNtext_model_outputtext_attention_maskc                 C   s(  |d u ryt |trt|nd}| j|d| jjddd}|j}	|j |}
| j|dddj}|j	d |	j	d krjt
|	|sj| j|d d | jjd df }td	| jj d
|  |	d d d | jjf }	| |	|}|j}|j}n|d j	d }|d |d }}|}
|j|dd}|j|dd}|
j|dd}
|rdg| }| j|d| jjddd}|j |}| |j|}|j}|j}|j	d }|d|}||| |}|j	d }|d|d}||| |d}|j|dd}t
||g}t
||g}t
||
g}
|||
fS )Nr   
max_lengthTpt)paddingr3   
truncationreturn_tensorslongest)r5   r7   z\The following part of your input was truncated because CLIP can only handle sequences up to z	 tokens: r   dim )
isinstancelistlenr   model_max_length	input_idsattention_maskboolr,   r*   torchequalbatch_decodeloggerwarningr   text_embedslast_hidden_staterepeat_interleaverepeatviewcat)r"   promptr(   num_images_per_promptdo_classifier_free_guidancer1   r2   
batch_sizetext_inputstext_input_ids	text_maskuntruncated_idsremoved_texttext_encoder_outputprompt_embedstext_enc_hid_statesuncond_tokensuncond_inputuncond_text_mask*negative_prompt_embeds_text_encoder_outputnegative_prompt_embedsuncond_text_enc_hid_statesseq_lenr%   r%   r&   _encode_prompt   sz   	




zUnCLIPPipeline._encode_promptr               @       @pilTrO   rP   prior_num_inference_stepsdecoder_num_inference_stepssuper_res_num_inference_stepsr'   prior_latentsdecoder_latentssuper_res_latentsprior_guidance_scaledecoder_guidance_scaleoutput_typereturn_dictc           2      C   s  |durt |trd}nt |trt|}ntdt| |
d jd }| j}|| }|dkp4|dk}| |||||
|\}}}| j	j
||d | j	j}| jjj}| ||f|j|||| j	}t| |D ]J\}}|rvt|gd n|}| j|||||dj}|r|d\}}||||   }|d |jd krd}n||d  }| j	j|||||d	j}qh| j|}|}| j||||d
\}} |jdkr|tj}tj|| jjdfdd}!|!tj}!ntj|| jjdfdd}!| j j
||d | j j}"| j!jj"}#| j!jj#}$| j!jj#}%| ||#|$|%f|j|||| j }t| |"D ]o\}}|r1t|gd n|}| j!|||| |!dj$}&|rq|&d\}'}(|'j%|jd dd\}'})|(j%|jd dd\}(}*|'||(|'   }&tj|&|*gdd}&|d |"jd kr~d}n|"|d  }| j j|&||||dj}q"|&dd}|}+| j'j
||d | j'j},| j(jj"d }-| j(jj#}$| j(jj#}%| ||-|$|%f|+j|||	| j'}	|jdkrtj)|+|$|%gd}.ni }/dt*+tj)j,v rd|/d< tj)|+f|$|%gddd|/}.t| |,D ]L\}}||,jd d kr| j-}0n| j(}0tj|	|.gdd}|0||dj$}&|d |,jd kr/d}n|,|d  }| j'j|&||	||dj}	t.rHt/0  q|	}1| 1  |1d d }1|1&dd}1|12 3dddd4 5 }1|dkrt| 6|1}1|sz|1fS t7|1dS )a  
        The call function to the pipeline for generation.

        Args:
            prompt (`str` or `list[str]`):
                The prompt or prompts to guide image generation. This can only be left undefined if `text_model_output`
                and `text_attention_mask` is passed.
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            prior_num_inference_steps (`int`, *optional*, defaults to 25):
                The number of denoising steps for the prior. More denoising steps usually lead to a higher quality
                image at the expense of slower inference.
            decoder_num_inference_steps (`int`, *optional*, defaults to 25):
                The number of denoising steps for the decoder. More denoising steps usually lead to a higher quality
                image at the expense of slower inference.
            super_res_num_inference_steps (`int`, *optional*, defaults to 7):
                The number of denoising steps for super resolution. More denoising steps usually lead to a higher
                quality image at the expense of slower inference.
            generator (`torch.Generator` or `list[torch.Generator]`, *optional*):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                generation deterministic.
            prior_latents (`torch.Tensor` of shape (batch size, embeddings dimension), *optional*):
                Pre-generated noisy latents to be used as inputs for the prior.
            decoder_latents (`torch.Tensor` of shape (batch size, channels, height, width), *optional*):
                Pre-generated noisy latents to be used as inputs for the decoder.
            super_res_latents (`torch.Tensor` of shape (batch size, channels, super res height, super res width), *optional*):
                Pre-generated noisy latents to be used as inputs for the decoder.
            prior_guidance_scale (`float`, *optional*, defaults to 4.0):
                A higher guidance scale value encourages the model to generate images closely linked to the text
                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
            decoder_guidance_scale (`float`, *optional*, defaults to 4.0):
                A higher guidance scale value encourages the model to generate images closely linked to the text
                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
            text_model_output (`CLIPTextModelOutput`, *optional*):
                Pre-defined [`CLIPTextModel`] outputs that can be derived from the text encoder. Pre-defined text
                outputs can be passed for tasks like text embedding interpolations. Make sure to also pass
                `text_attention_mask` in this case. `prompt` can the be left `None`.
            text_attention_mask (`torch.Tensor`, *optional*):
                Pre-defined CLIP text attention mask that can be derived from the tokenizer. Pre-defined text attention
                masks are necessary when passing `text_model_output`.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.

        Returns:
            [`~pipelines.ImagePipelineOutput`] or `tuple`:
                If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
                returned where the first element is a list with the generated images.
        Nr   z2`prompt` has to be of type `str` or `list` but is r   g      ?)r(   r   )timestepproj_embeddingencoder_hidden_statesrB   )rr   sampler'   prev_timestep)image_embeddingsrY   text_encoder_hidden_statesrQ   mps)valueT)ru   rr   rt   class_labelsrB   r:   )rv   r'   r9   )size	antialiasbicubicF)r|   modealign_corners)ru   rr   g      ?r   rg   )images)8r=   strr>   r?   r+   typer*   _execution_devicerb   r   set_timesteps	timestepsr   configembedding_dimr0   r)   	enumerateprogress_barrD   rN   predicted_image_embeddingchunkstepprev_samplepost_process_latentsr   intFpadclip_extra_context_tokensrC   r   r   in_channelssample_sizeru   splitclampr   r   interpolateinspect	signature
parametersr   XLA_AVAILABLExm	mark_stepmaybe_free_model_hookscpupermutefloatnumpynumpy_to_pilr   )2r"   rO   rP   rh   ri   rj   r'   rk   rl   rm   r1   r2   rn   ro   rp   rq   rR   r(   rQ   rY   rZ   rU   prior_timesteps_tensorr   itlatent_model_inputr    predicted_image_embedding_uncondpredicted_image_embedding_textrv   rw   additive_clip_time_embeddingsdecoder_text_maskdecoder_timesteps_tensornum_channels_latentsheightwidth
noise_prednoise_pred_uncondnoise_pred_text_predicted_varianceimage_smallsuper_res_timesteps_tensorchannelsimage_upscaledinterpolate_antialiasunetimager%   r%   r&   __call__   sD  E




	





	



	




zUnCLIPPipeline.__call__)NN)Nr   rc   rc   rd   NNNNNNre   rf   rg   T)__name__
__module____qualname____doc___last_supported_version_exclude_from_cpu_offloadr   __annotations__r   r   r   r   r	   r
   model_cpu_offload_seqr    r0   r   tuplerD   Tensorrb   no_gradr   r>   r   	Generatorr   rC   r   __classcell__r%   r%   r#   r&   r   (   s   
 	


[	

r   )$r   rD   torch.nnr   r   transformersr   r   &transformers.models.clip.modeling_clipr   modelsr   r   r	   
schedulersr
   utilsr   r   utils.torch_utilsr   pipeline_utilsr   r   r   r   r   torch_xla.core.xla_modelcore	xla_modelr   r   
get_loggerr   rG   r   r%   r%   r%   r&   <module>   s"   
