o
    pi|V                     @   s   d dl Z d dlmZmZmZmZ d dlZd dlmZ	 d dl
mZmZ d dlmZ ddlmZmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ eeZ G dd deZ!dS )    N)ListOptionalTupleUnion)
functional)CLIPTextModelWithProjectionCLIPTokenizer)CLIPTextModelOutput   )PriorTransformerUNet2DConditionModelUNet2DModel)UnCLIPScheduler)logging)randn_tensor   )DiffusionPipelineImagePipelineOutput   )UnCLIPTextProjModelc                        s  e Zd ZU dZdgZeed< eed< eed< e	ed< e
ed< eed< eed< eed	< eed
< eed< dZdedede	de
dededed	ed
edef fddZdd Z		d,deeeef  deej fddZe 															d-deeeee f  deded ed!ed"eeejeej f  d#eej d$eej d%eej deeeef  deej d&ed'ed(ee d)efd*d+Z  ZS ).UnCLIPPipelineaE  
    Pipeline for text-to-image generation using unCLIP.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
    implemented for all pipelines (downloading, saving, running on a particular device, etc.).

    Args:
        text_encoder ([`~transformers.CLIPTextModelWithProjection`]):
            Frozen text-encoder.
        tokenizer ([`~transformers.CLIPTokenizer`]):
            A `CLIPTokenizer` to tokenize text.
        prior ([`PriorTransformer`]):
            The canonical unCLIP prior to approximate the image embedding from the text embedding.
        text_proj ([`UnCLIPTextProjModel`]):
            Utility class to prepare and combine the embeddings before they are passed to the decoder.
        decoder ([`UNet2DConditionModel`]):
            The decoder to invert the image embedding into an image.
        super_res_first ([`UNet2DModel`]):
            Super resolution UNet. Used in all but the last step of the super resolution diffusion process.
        super_res_last ([`UNet2DModel`]):
            Super resolution UNet. Used in the last step of the super resolution diffusion process.
        prior_scheduler ([`UnCLIPScheduler`]):
            Scheduler used in the prior denoising process (a modified [`DDPMScheduler`]).
        decoder_scheduler ([`UnCLIPScheduler`]):
            Scheduler used in the decoder denoising process (a modified [`DDPMScheduler`]).
        super_res_scheduler ([`UnCLIPScheduler`]):
            Scheduler used in the super resolution denoising process (a modified [`DDPMScheduler`]).

    priordecoder	text_projtext_encoder	tokenizersuper_res_firstsuper_res_lastprior_schedulerdecoder_schedulersuper_res_schedulerzAtext_encoder->text_proj->decoder->super_res_first->super_res_lastc                    s,   t    | j|||||||||	|
d
 d S )N)
r   r   r   r   r   r   r   r   r   r    )super__init__register_modules)selfr   r   r   r   r   r   r   r   r   r    	__class__ h/home/ubuntu/SoloSpeech/.venv/lib/python3.10/site-packages/diffusers/pipelines/unclip/pipeline_unclip.pyr"   Q   s   

zUnCLIPPipeline.__init__c                 C   sR   |d u rt ||||d}n|j|krtd|j d| ||}||j }|S )N)	generatordevicedtypezUnexpected latents shape, got z, expected )r   shape
ValueErrortoinit_noise_sigma)r$   r,   r+   r*   r)   latents	schedulerr'   r'   r(   prepare_latentsm   s   


zUnCLIPPipeline.prepare_latentsNtext_model_outputtext_attention_maskc                 C   s(  |d u ryt |trt|nd}| j|d| jjddd}|j}	|j |}
| j|dddj}|j	d |	j	d krjt
|	|sj| j|d d | jjd df }td	| jj d
|  |	d d d | jjf }	| |	|}|j}|j}n|d j	d }|d |d }}|}
|j|dd}|j|dd}|
j|dd}
|rdg| }| j|d| jjddd}|j |}| |j|}|j}|j}|j	d }|d|}||| |}|j	d }|d|d}||| |d}|j|dd}t
||g}t
||g}t
||
g}
|||
fS )Nr   
max_lengthTpt)paddingr5   
truncationreturn_tensorslongest)r7   r9   z\The following part of your input was truncated because CLIP can only handle sequences up to z	 tokens: r   dim )
isinstancelistlenr   model_max_length	input_idsattention_maskboolr.   r,   torchequalbatch_decodeloggerwarningr   text_embedslast_hidden_staterepeat_interleaverepeatviewcat)r$   promptr*   num_images_per_promptdo_classifier_free_guidancer3   r4   
batch_sizetext_inputstext_input_ids	text_maskuntruncated_idsremoved_texttext_encoder_outputprompt_embedstext_enc_hid_statesuncond_tokensuncond_inputuncond_text_mask*negative_prompt_embeds_text_encoder_outputnegative_prompt_embedsuncond_text_enc_hid_statesseq_lenr'   r'   r(   _encode_promptx   sz   	




zUnCLIPPipeline._encode_promptr               @       @pilTrQ   rR   prior_num_inference_stepsdecoder_num_inference_stepssuper_res_num_inference_stepsr)   prior_latentsdecoder_latentssuper_res_latentsprior_guidance_scaledecoder_guidance_scaleoutput_typereturn_dictc           2      C   s  |durt |trd}nt |trt|}ntdt| |
d jd }| j}|| }|dkp4|dk}| |||||
|\}}}| j	j
||d | j	j}| jjj}| ||f|j|||| j	}t| |D ]J\}}|rvt|gd n|}| j|||||dj}|r|d\}}||||   }|d |jd krd}n||d  }| j	j|||||d	j}qh| j|}|}| j||||d
\}} |jdkr|tj}tj|| jjdfdd}!|!tj}!ntj|| jjdfdd}!| j j
||d | j j}"| j!jj"}#| j!jj#}$| j!jj#}%| ||#|$|%f|j|||| j }t| |"D ]o\}}|r1t|gd n|}| j!|||| |!dj$}&|rq|&d\}'}(|'j%|jd dd\}'})|(j%|jd dd\}(}*|'||(|'   }&tj|&|*gdd}&|d |"jd kr~d}n|"|d  }| j j|&||||dj}q"|&dd}|}+| j'j
||d | j'j},| j(jj"d }-| j(jj#}$| j(jj#}%| ||-|$|%f|+j|||	| j'}	|jdkrtj)|+|$|%gd}.ni }/dt*+tj)j,v rd|/d< tj)|+f|$|%gddd|/}.t| |,D ]E\}}||,jd d kr| j-}0n| j(}0tj|	|.gdd}|0||dj$}&|d |,jd kr/d}n|,|d  }| j'j|&||	||dj}	q|	}1| .  |1d d }1|1&dd}1|1/ 0dddd1 2 }1|dkrm| 3|1}1|ss|1fS t4|1dS )a  
        The call function to the pipeline for generation.

        Args:
            prompt (`str` or `List[str]`):
                The prompt or prompts to guide image generation. This can only be left undefined if `text_model_output`
                and `text_attention_mask` is passed.
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            prior_num_inference_steps (`int`, *optional*, defaults to 25):
                The number of denoising steps for the prior. More denoising steps usually lead to a higher quality
                image at the expense of slower inference.
            decoder_num_inference_steps (`int`, *optional*, defaults to 25):
                The number of denoising steps for the decoder. More denoising steps usually lead to a higher quality
                image at the expense of slower inference.
            super_res_num_inference_steps (`int`, *optional*, defaults to 7):
                The number of denoising steps for super resolution. More denoising steps usually lead to a higher
                quality image at the expense of slower inference.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                generation deterministic.
            prior_latents (`torch.Tensor` of shape (batch size, embeddings dimension), *optional*):
                Pre-generated noisy latents to be used as inputs for the prior.
            decoder_latents (`torch.Tensor` of shape (batch size, channels, height, width), *optional*):
                Pre-generated noisy latents to be used as inputs for the decoder.
            super_res_latents (`torch.Tensor` of shape (batch size, channels, super res height, super res width), *optional*):
                Pre-generated noisy latents to be used as inputs for the decoder.
            prior_guidance_scale (`float`, *optional*, defaults to 4.0):
                A higher guidance scale value encourages the model to generate images closely linked to the text
                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
            decoder_guidance_scale (`float`, *optional*, defaults to 4.0):
                A higher guidance scale value encourages the model to generate images closely linked to the text
                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
            text_model_output (`CLIPTextModelOutput`, *optional*):
                Pre-defined [`CLIPTextModel`] outputs that can be derived from the text encoder. Pre-defined text
                outputs can be passed for tasks like text embedding interpolations. Make sure to also pass
                `text_attention_mask` in this case. `prompt` can the be left `None`.
            text_attention_mask (`torch.Tensor`, *optional*):
                Pre-defined CLIP text attention mask that can be derived from the tokenizer. Pre-defined text attention
                masks are necessary when passing `text_model_output`.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.

        Returns:
            [`~pipelines.ImagePipelineOutput`] or `tuple`:
                If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
                returned where the first element is a list with the generated images.
        Nr   z2`prompt` has to be of type `str` or `list` but is r   g      ?)r*   r   )timestepproj_embeddingencoder_hidden_statesrD   )rt   sampler)   prev_timestep)image_embeddingsr[   text_encoder_hidden_statesrS   mps)valueT)rw   rt   rv   class_labelsrD   r<   )rx   r)   r;   )size	antialiasbicubicF)r~   modealign_corners)rw   rt   g      ?r
   ri   )images)5r?   strr@   rA   r-   typer,   _execution_devicerd   r   set_timesteps	timestepsr   configembedding_dimr2   r+   	enumerateprogress_barrF   rP   predicted_image_embeddingchunkstepprev_samplepost_process_latentsr   intFpadclip_extra_context_tokensrE   r   r   in_channelssample_sizerw   splitclampr    r   interpolateinspect	signature
parametersr   maybe_free_model_hookscpupermutefloatnumpynumpy_to_pilr   )2r$   rQ   rR   rj   rk   rl   r)   rm   rn   ro   r3   r4   rp   rq   rr   rs   rT   r*   rS   r[   r\   rW   prior_timesteps_tensorr   itlatent_model_inputr    predicted_image_embedding_uncondpredicted_image_embedding_textrx   ry   additive_clip_time_embeddingsdecoder_text_maskdecoder_timesteps_tensornum_channels_latentsheightwidth
noise_prednoise_pred_uncondnoise_pred_text_predicted_varianceimage_smallsuper_res_timesteps_tensorchannelsimage_upscaledinterpolate_antialiasunetimager'   r'   r(   __call__   s>  E




	





	



	




zUnCLIPPipeline.__call__)NN)Nr   re   re   rf   NNNNNNrg   rh   ri   T) __name__
__module____qualname____doc___exclude_from_cpu_offloadr   __annotations__r   r   r   r   r   r   model_cpu_offload_seqr"   r2   r   r   r	   r   rF   Tensorrd   no_gradr   r   r   	Generatorr   rE   r   __classcell__r'   r'   r%   r(   r   "   s   
 	

[	
r   )"r   typingr   r   r   r   rF   torch.nnr   r   transformersr   r   &transformers.models.clip.modeling_clipr	   modelsr   r   r   
schedulersr   utilsr   utils.torch_utilsr   pipeline_utilsr   r   r   r   
get_loggerr   rI   r   r'   r'   r'   r(   <module>   s   
