import inspect

import PIL.Image
import torch
from torch.nn import functional as F
from transformers import (
    CLIPImageProcessor,
    CLIPTextModelWithProjection,
    CLIPTokenizer,
    CLIPVisionModelWithProjection,
)

from ...models import UNet2DConditionModel, UNet2DModel
from ...schedulers import UnCLIPScheduler
from ...utils import is_torch_xla_available, logging
from ...utils.torch_utils import randn_tensor
from ..pipeline_utils import DeprecatedPipelineMixin, DiffusionPipeline, ImagePipelineOutput
from .text_proj import UnCLIPTextProjModel


if is_torch_xla_available():
    import torch_xla.core.xla_model as xm

    XLA_AVAILABLE = True
else:
    XLA_AVAILABLE = False

logger = logging.get_logger(__name__)


class UnCLIPImageVariationPipeline(DeprecatedPipelineMixin, DiffusionPipeline):
    """
    Pipeline to generate image variations from an input image using UnCLIP.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
    implemented for all pipelines (downloading, saving, running on a particular device, etc.).

    Args:
        text_encoder ([`~transformers.CLIPTextModelWithProjection`]):
            Frozen text-encoder.
        tokenizer ([`~transformers.CLIPTokenizer`]):
            A `CLIPTokenizer` to tokenize text.
        feature_extractor ([`~transformers.CLIPImageProcessor`]):
            Model that extracts features from generated images to be used as inputs for the `image_encoder`.
        image_encoder ([`~transformers.CLIPVisionModelWithProjection`]):
            Frozen CLIP image-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
        text_proj ([`UnCLIPTextProjModel`]):
            Utility class to prepare and combine the embeddings before they are passed to the decoder.
        decoder ([`UNet2DConditionModel`]):
            The decoder to invert the image embedding into an image.
        super_res_first ([`UNet2DModel`]):
            Super resolution UNet. Used in all but the last step of the super resolution diffusion process.
        super_res_last ([`UNet2DModel`]):
            Super resolution UNet. Used in the last step of the super resolution diffusion process.
        decoder_scheduler ([`UnCLIPScheduler`]):
            Scheduler used in the decoder denoising process (a modified [`DDPMScheduler`]).
        super_res_scheduler ([`UnCLIPScheduler`]):
            Scheduler used in the super resolution denoising process (a modified [`DDPMScheduler`]).
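
    Examples:
        A minimal loading sketch. The checkpoint id mirrors the repository already referenced in the
        `__call__` docstring; treat it as illustrative, as any checkpoint with this component layout works:

        ```py
        >>> from diffusers import UnCLIPImageVariationPipeline

        >>> pipe = UnCLIPImageVariationPipeline.from_pretrained("fusing/karlo-image-variations-diffusers")
        >>> pipe = pipe.to("cuda")
        ```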
    z0.33.1decoder	text_projtext_encoder	tokenizerfeature_extractorimage_encodersuper_res_firstsuper_res_lastdecoder_schedulersuper_res_schedulerzPtext_encoder->image_encoder->text_proj->decoder->super_res_first->super_res_lastc                    s,   t    | j|||||||||	|
d
 d S )N)
r   r   r   r   r   r   r   r   r   r   )super__init__register_modules)selfr   r   r   r   r   r   r   r   r   r   	__class__ n/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/pipelines/unclip/pipeline_unclip_image_variation.pyr    Y   s   

z%UnCLIPImageVariationPipeline.__init__c                 C   sR   |d u rt ||||d}n|j|krtd|j d| ||}||j }|S )N)	generatordevicedtypezUnexpected latents shape, got z, expected )r   shape
ValueErrortoinit_noise_sigma)r"   r*   r)   r(   r'   latents	schedulerr%   r%   r&   prepare_latentsv   s   


z,UnCLIPImageVariationPipeline.prepare_latentsc                 C   st  t |tr	t|nd}| j|d| jjdd}|j}|j |}| 	||}	|	j
}
|	j}|
j|dd}
|j|dd}|j|dd}|rdg| }|jd }| j|d|d	dd
}|j |}| 	|j|}|j
}|j}|jd }|d|}||| |}|jd }|d|d}||| |d}|j|dd}t||
g}
t||g}t||g}|
||fS )Nr   
max_lengthpt)paddingr1   return_tensorsr   dim T)r3   r1   
truncationr4   )
isinstancelistlenr   model_max_length	input_idsattention_maskboolr,   r   text_embedslast_hidden_staterepeat_interleaver*   repeatviewtorchcat)r"   promptr(   num_images_per_promptdo_classifier_free_guidance
batch_sizetext_inputstext_input_ids	text_masktext_encoder_outputprompt_embedstext_encoder_hidden_statesuncond_tokensr1   uncond_inputuncond_text_mask*negative_prompt_embeds_text_encoder_outputnegative_prompt_embeds!uncond_text_encoder_hidden_statesseq_lenr%   r%   r&   _encode_prompt   sT   





z+UnCLIPImageVariationPipeline._encode_promptNimage_embeddingsc                 C   s`   t | j j}|d u r't|tjs| j|ddj}|j	||d}| |j
}|j|dd}|S )Nr2   )imagesr4   )r(   r)   r   r5   )nextr   
parametersr)   r:   rF   Tensorr   pixel_valuesr,   image_embedsrC   )r"   imager(   rI   rZ   r)   r%   r%   r&   _encode_image   s   z*UnCLIPImageVariationPipeline._encode_imager                @pilTra   rI   decoder_num_inference_stepssuper_res_num_inference_stepsr'   decoder_latentssuper_res_latentsdecoder_guidance_scaleoutput_typereturn_dictc           (      C   s   |durt |tjjrd}nt |trt|}n|jd }n|jd }dg| }| j}|| }|	dk}| ||||\}}}| ||||}| j	||||d\}}|j
dkro|
tj}tj|| j	jdfdd}|
tj}ntj|| j	jdfd	d}| jj||d
 | jj}| jjj}| jjj}| jjj}|du r| ||||f|j|||| j}t| |D ]l\}}|rt|gd n|}| j|||||dj}|r|d\}}|j|jd dd\}}|j|jd dd\}} ||	||   }tj|| gdd}|d |jd krd}!n||d  }!| jj ||||!|dj!}q|"dd}|}"| j#j||d
 | j#j}#| j$jjd }$| j$jj}| j$jj}|du rX| ||$||f|"j|||| j#}|j
dkrhtj%|"||gd}%ni }&dt&'tj%j(v rxd	|&d< tj%|"f||gddd|&}%t| |#D ]L\}}||#jd d kr| j)}'n| j$}'tj||%gdd}|'||dj}|d |#jd krd}!n|#|d  }!| j#j ||||!|dj!}t*rt+,  q|}| -  |d d }|"dd}|. /dddd0 1 }|
dkr| 2|}|s|fS t3|dS )a  
        The call function to the pipeline for generation.

        Args:
            image (`PIL.Image.Image` or `list[PIL.Image.Image]` or `torch.Tensor`):
                `Image` or tensor representing an image batch to be used as the starting point. If you provide a
                tensor, it needs to be compatible with the [`CLIPImageProcessor`]
                [configuration](https://huggingface.co/fusing/karlo-image-variations-diffusers/blob/main/feature_extractor/preprocessor_config.json).
                Can be left as `None` only when `image_embeddings` are passed.
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            decoder_num_inference_steps (`int`, *optional*, defaults to 25):
                The number of denoising steps for the decoder. More denoising steps usually lead to a higher quality
                image at the expense of slower inference.
            super_res_num_inference_steps (`int`, *optional*, defaults to 7):
                The number of denoising steps for super resolution. More denoising steps usually lead to a higher
                quality image at the expense of slower inference.
            generator (`torch.Generator`, *optional*):
                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
                generation deterministic.
            decoder_latents (`torch.Tensor` of shape (batch size, channels, height, width), *optional*):
                Pre-generated noisy latents to be used as inputs for the decoder.
            super_res_latents (`torch.Tensor` of shape (batch size, channels, super res height, super res width), *optional*):
                Pre-generated noisy latents to be used as inputs for the super resolution.
            decoder_guidance_scale (`float`, *optional*, defaults to 8.0):
                A higher guidance scale value encourages the model to generate images closely linked to the text
                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
            image_embeddings (`torch.Tensor`, *optional*):
                Pre-defined image embeddings that can be derived from the image encoder. Pre-defined image embeddings
                can be passed for tasks like image interpolations. `image` can be left as `None`.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
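
        Examples:
            A minimal call sketch, assuming the pipeline was loaded as in the class docstring example and
            that `variation_input.png` (an illustrative file name) exists on disk:

            ```py
            >>> from PIL import Image

            >>> init_image = Image.open("variation_input.png").convert("RGB")
            >>> images = pipe(image=init_image, num_images_per_prompt=2).images

            >>> # alternatively, drive the decoder from precomputed CLIP image embeddings
            >>> inputs = pipe.feature_extractor(images=init_image, return_tensors="pt")
            >>> image_emb = pipe.image_encoder(inputs.pixel_values.to("cuda")).image_embeds
            >>> images = pipe(image_embeddings=image_emb).images
            ```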

        Returns:
            [`~pipelines.ImagePipelineOutput`] or `tuple`:
                If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
                returned where the first element is a list with the generated images.
        """
        if image is not None:
            if isinstance(image, PIL.Image.Image):
                batch_size = 1
            elif isinstance(image, list):
                batch_size = len(image)
            else:
                batch_size = image.shape[0]
        else:
            batch_size = image_embeddings.shape[0]

        prompt = [""] * batch_size

        device = self._execution_device

        batch_size = batch_size * num_images_per_prompt

        do_classifier_free_guidance = decoder_guidance_scale > 1.0

        prompt_embeds, text_encoder_hidden_states, text_mask = self._encode_prompt(
            prompt, device, num_images_per_prompt, do_classifier_free_guidance
        )

        image_embeddings = self._encode_image(image, device, num_images_per_prompt, image_embeddings)

        # decoder
        text_encoder_hidden_states, additive_clip_time_embeddings = self.text_proj(
            image_embeddings=image_embeddings,
            prompt_embeds=prompt_embeds,
            text_encoder_hidden_states=text_encoder_hidden_states,
            do_classifier_free_guidance=do_classifier_free_guidance,
        )

        if device.type == "mps":
            # HACK: MPS: There is a panic when padding bool tensors,
            # so cast to int tensor for the pad and back to bool afterwards
            text_mask = text_mask.type(torch.int)
            decoder_text_mask = F.pad(text_mask, (self.text_proj.clip_extra_context_tokens, 0), value=1)
            decoder_text_mask = decoder_text_mask.type(torch.bool)
        else:
            decoder_text_mask = F.pad(text_mask, (self.text_proj.clip_extra_context_tokens, 0), value=True)

        self.decoder_scheduler.set_timesteps(decoder_num_inference_steps, device=device)
        decoder_timesteps_tensor = self.decoder_scheduler.timesteps

        num_channels_latents = self.decoder.config.in_channels
        height = self.decoder.config.sample_size
        width = self.decoder.config.sample_size

        if decoder_latents is None:
            decoder_latents = self.prepare_latents(
                (batch_size, num_channels_latents, height, width),
                text_encoder_hidden_states.dtype,
                device,
                generator,
                decoder_latents,
                self.decoder_scheduler,
            )

        for i, t in enumerate(self.progress_bar(decoder_timesteps_tensor)):
            # expand the latents if we are doing classifier free guidance
            latent_model_input = torch.cat([decoder_latents] * 2) if do_classifier_free_guidance else decoder_latents

            noise_pred = self.decoder(
                sample=latent_model_input,
                timestep=t,
                encoder_hidden_states=text_encoder_hidden_states,
                class_labels=additive_clip_time_embeddings,
                attention_mask=decoder_text_mask,
            ).sample

            if do_classifier_free_guidance:
                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                noise_pred_uncond, _ = noise_pred_uncond.split(latent_model_input.shape[1], dim=1)
                noise_pred_text, predicted_variance = noise_pred_text.split(latent_model_input.shape[1], dim=1)
                noise_pred = noise_pred_uncond + decoder_guidance_scale * (noise_pred_text - noise_pred_uncond)
                noise_pred = torch.cat([noise_pred, predicted_variance], dim=1)

            if i + 1 == decoder_timesteps_tensor.shape[0]:
                prev_timestep = None
            else:
                prev_timestep = decoder_timesteps_tensor[i + 1]

            # compute the previous noisy sample x_t -> x_t-1
            decoder_latents = self.decoder_scheduler.step(
                noise_pred, t, decoder_latents, prev_timestep=prev_timestep, generator=generator
            ).prev_sample

        decoder_latents = decoder_latents.clamp(-1, 1)

        image_small = decoder_latents

        # done decoder

        # super res
        self.super_res_scheduler.set_timesteps(super_res_num_inference_steps, device=device)
        super_res_timesteps_tensor = self.super_res_scheduler.timesteps

        channels = self.super_res_first.config.in_channels // 2
        height = self.super_res_first.config.sample_size
        width = self.super_res_first.config.sample_size

        if super_res_latents is None:
            super_res_latents = self.prepare_latents(
                (batch_size, channels, height, width),
                image_small.dtype,
                device,
                generator,
                super_res_latents,
                self.super_res_scheduler,
            )

        if device.type == "mps":
            # MPS does not support many interpolations
            image_upscaled = F.interpolate(image_small, size=[height, width])
        else:
            interpolate_antialias = {}
            if "antialias" in inspect.signature(F.interpolate).parameters:
                interpolate_antialias["antialias"] = True

            image_upscaled = F.interpolate(
                image_small, size=[height, width], mode="bicubic", align_corners=False, **interpolate_antialias
            )

        for i, t in enumerate(self.progress_bar(super_res_timesteps_tensor)):
            # no classifier free guidance
            if i == super_res_timesteps_tensor.shape[0] - 1:
                unet = self.super_res_last
            else:
                unet = self.super_res_first

            latent_model_input = torch.cat([super_res_latents, image_upscaled], dim=1)

            noise_pred = unet(
                sample=latent_model_input,
                timestep=t,
            ).sample

            if i + 1 == super_res_timesteps_tensor.shape[0]:
                prev_timestep = None
            else:
                prev_timestep = super_res_timesteps_tensor[i + 1]

            # compute the previous noisy sample x_t -> x_t-1
            super_res_latents = self.super_res_scheduler.step(
                noise_pred, t, super_res_latents, prev_timestep=prev_timestep, generator=generator
            ).prev_sample

            if XLA_AVAILABLE:
                xm.mark_step()

        image = super_res_latents

        # done super res
        self.maybe_free_model_hooks()

        # post processing
        image = image * 0.5 + 0.5
        image = image.clamp(0, 1)
        image = image.cpu().permute(0, 2, 3, 1).float().numpy()

        if output_type == "pil":
            image = self.numpy_to_pil(image)

        if not return_dict:
            return (image,)

        return ImagePipelineOutput(images=image)