o
    Gi;                     @   s   d dl Zd dlZd dlmZ ddlmZmZ ddlm	Z	 ddl
mZmZmZ ddlmZ dd	lmZmZmZ d
dlmZ d
dlmZ d
dlmZ e rWd dlm  mZ dZndZee Z!dZ"G dd deeZ#dS )    N)CLIPTokenizer   )AutoencoderKLUNet2DConditionModel)PNDMScheduler)is_torch_xla_availableloggingreplace_example_docstring)randn_tensor   )DeprecatedPipelineMixinDiffusionPipelineImagePipelineOutput   )BlipImageProcessor)Blip2QFormerModel)ContextCLIPTextModelTFah  
    Examples:
        ```py
        >>> from diffusers.pipelines import BlipDiffusionPipeline
        >>> from diffusers.utils import load_image
        >>> import torch

        >>> blip_diffusion_pipe = BlipDiffusionPipeline.from_pretrained(
        ...     "Salesforce/blipdiffusion", torch_dtype=torch.float16
        ... ).to("cuda")


        >>> cond_subject = "dog"
        >>> tgt_subject = "dog"
        >>> text_prompt_input = "swimming underwater"

        >>> cond_image = load_image(
        ...     "https://huggingface.co/datasets/ayushtues/blipdiffusion_images/resolve/main/dog.jpg"
        ... )
        >>> guidance_scale = 7.5
        >>> num_inference_steps = 25
        >>> negative_prompt = "over-exposure, under-exposure, saturated, duplicate, out of frame, lowres, cropped, worst quality, low quality, jpeg artifacts, morbid, mutilated, out of frame, ugly, bad anatomy, bad proportions, deformed, blurry, duplicate"


        >>> output = blip_diffusion_pipe(
        ...     text_prompt_input,
        ...     cond_image,
        ...     cond_subject,
        ...     tgt_subject,
        ...     guidance_scale=guidance_scale,
        ...     num_inference_steps=num_inference_steps,
        ...     neg_prompt=negative_prompt,
        ...     height=512,
        ...     width=512,
        ... ).images
        >>> output[0].save("image.png")
        ```
c                !       s$  e Zd ZdZdZdZ			d3dededed	e	d
e
dedededee dee f fddZdd Zd4ddZd5ddZd5ddZe ee										 	!d6d"ee d#ejjd$ee d%ee d&ejdB d'ed(ed)ed*ed+ejeej B dB d,edB d-ed.ed/edB d0efd1d2Z  Z S )7BlipDiffusionPipelinea  
    Pipeline for Zero-Shot Subject Driven Generation using Blip Diffusion.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

    Args:
        tokenizer ([`CLIPTokenizer`]):
            Tokenizer for the text encoder
        text_encoder ([`ContextCLIPTextModel`]):
            Text encoder to encode the text prompt
        vae ([`AutoencoderKL`]):
            VAE model to map the latents to the image
        unet ([`UNet2DConditionModel`]):
            Conditional U-Net architecture to denoise the image embedding.
        scheduler ([`PNDMScheduler`]):
             A scheduler to be used in combination with `unet` to generate image latents.
        qformer ([`Blip2QFormerModel`]):
            QFormer model to get multi-modal embeddings from the text and image.
        image_processor ([`BlipImageProcessor`]):
            Image Processor to preprocess and postprocess the image.
        ctx_begin_pos (int, `optional`, defaults to 2):
            Position of the context token in the text encoder.
    z0.33.1z qformer->text_encoder->unet->vaer   N	tokenizertext_encodervaeunet	schedulerqformerimage_processorctx_begin_posmeanstdc              	      s6   t    | j|||||||d | j||	|
d d S )N)r   r   r   r   r   r   r   )r   r   r   )super__init__register_modulesregister_to_config)selfr   r   r   r   r   r   r   r   r   r   	__class__ n/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/pipelines/blip_diffusion/pipeline_blip_diffusion.pyr   k   s   
	zBlipDiffusionPipeline.__init__c                 C   s   | j ||ddS )NF)image_input
text_inputreturn_dict)r   )r"   input_imagesrc_subjectr%   r%   r&   get_query_embeddings   s   z*BlipDiffusionPipeline.get_query_embeddings      ?   c              	   C   sN   g }t ||D ]\}}d| d|  }|d|gt||   q|S )Nza  z, )zipstripappendjoinint)r"   promptstgt_subjectsprompt_strengthprompt_repsrvprompttgt_subjectr%   r%   r&   _build_prompt   s
    z#BlipDiffusionPipeline._build_promptc	           
      C   st   ||||f}	t |trt||krtdt| d| d|d u r+t|	|||d}n|j||d}|| jj }|S )Nz/You have passed a list of generators of length z+, but requested an effective batch size of z@. Make sure the batch size matches the length of the generators.)	generatordevicedtype)r>   r?   )
isinstancelistlen
ValueErrorr
   tor   init_noise_sigma)
r"   
batch_sizenum_channelsheightwidthr?   r>   r=   latentsshaper%   r%   r&   prepare_latents   s   z%BlipDiffusionPipeline.prepare_latentsc           	      C   sp   |p| j }| jjjj}|| jjj8 }| j|dd|dd|}|j	d }| jj
g| }| j|j||dd }|S )N
max_lengthTpt)padding
truncationrM   return_tensorsr   )	input_idsctx_embeddingsr   )_execution_devicer   
text_modelconfigmax_position_embeddingsr   num_query_tokensr   rD   rK   r   rR   )	r"   query_embedsr:   r>   max_lentokenized_promptrF   r   text_embeddingsr%   r%   r&   encode_prompt   s,   

z#BlipDiffusionPipeline.encode_prompt      @   2    pilTr:   reference_imagesource_subject_categorytarget_subject_categoryrJ   guidance_scalerH   rI   num_inference_stepsr=   
neg_promptr7   r8   output_typer)   c           !   
   C   s(  | j }| jj|| jj| jjddd }||}t|tr |g}t|tr(|g}t|tr0|g}t	|}| j
||||d}| ||}| |||}|dk}|rv| jjjj}| j|g| d|dd}| j|j|dd	d
 }t||g}dt	| jjjd  }| j|| jjj|| || |
|| jj|d}i }| jj|	fi | t| | jjD ]A\}}|dk}|rt|gd n|}| j|||dddd }|r|d\}}||||   }| j|||d }t rt!"  q| j#j$|| j#jj% ddd
 } | jj&| |d} | '  |s| fS t(| dS )a  
        Function invoked when calling the pipeline for generation.

        Args:
            prompt (`list[str]`):
                The prompt or prompts to guide the image generation.
            reference_image (`PIL.Image.Image`):
                The reference image to condition the generation on.
            source_subject_category (`list[str]`):
                The source subject category.
            target_subject_category (`list[str]`):
                The target subject category.
            latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will be generated by random sampling.
            guidance_scale (`float`, *optional*, defaults to 7.5):
                Guidance scale as defined in [Classifier-Free Diffusion
                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
                `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
                the text `prompt`, usually at the expense of lower image quality.
            height (`int`, *optional*, defaults to 512):
                The height of the generated image.
            width (`int`, *optional*, defaults to 512):
                The width of the generated image.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            generator (`torch.Generator` or `list[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            neg_prompt (`str`, *optional*, defaults to ""):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `guidance_scale` is less than `1`).
            prompt_strength (`float`, *optional*, defaults to 1.0):
                The strength of the prompt. Specifies the number of times the prompt is repeated along with prompt_reps
                to amplify the prompt.
            prompt_reps (`int`, *optional*, defaults to 20):
                The number of times the prompt is repeated along with prompt_strength to amplify the prompt.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generate image. Choose between: `"pil"` (`PIL.Image.Image`), `"np"`
                (`np.array`) or `"pt"` (`torch.Tensor`).
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.ImagePipelineOutput`] instead of a plain tuple.
        Examples:

        Returns:
            [`~pipelines.ImagePipelineOutput`] or `tuple`
        rN   )
image_mean	image_stdrQ   pixel_values)r5   r6   r7   r8   r-   rM   )rO   rM   rQ   N)rR   rS   r   r   r   )rF   rG   rH   rI   r=   rJ   r?   r>   )timestepencoder_hidden_statesdown_block_additional_residualsmid_block_additional_residualsampleprev_sampleF)r)   )ri   )images))rT   r   
preprocessrV   r   r   rD   r@   strrB   r<   r,   r]   r   rU   rW   r   rR   torchcatr   block_out_channelsrL   in_channelsr?   r   set_timesteps	enumerateprogress_bar	timestepschunkstepXLA_AVAILABLExm	mark_stepr   decodescaling_factorpostprocessmaybe_free_model_hooksr   )!r"   r:   rc   rd   re   rJ   rf   rH   rI   rg   r=   rh   r7   r8   ri   r)   r>   rF   rY   r\   do_classifier_free_guidancerM   uncond_inputuncond_embeddingsscale_down_factorextra_set_kwargsitlatent_model_input
noise_prednoise_pred_uncondnoise_pred_textimager%   r%   r&   __call__   s   F




	
zBlipDiffusionPipeline.__call__)r   NN)r-   r.   )N)Nr^   r_   r_   r`   Nra   r-   r.   rb   T)!__name__
__module____qualname____doc___last_supported_versionmodel_cpu_offload_seqr   r   r   r   r   r   r   r4   rA   floatr   r,   r<   rL   r]   rv   no_gradr	   EXAMPLE_DOC_STRINGru   PILImageTensor	Generatorboolr   __classcell__r%   r%   r#   r&   r   N   s    	




	
r   )$	PIL.Imager   rv   transformersr   modelsr   r   
schedulersr   utilsr   r   r	   utils.torch_utilsr
   pipeline_utilsr   r   r   blip_image_processingr   modeling_blip2r   modeling_ctx_clipr   torch_xla.core.xla_modelcore	xla_modelr   r   
get_loggerr   loggerr   r   r%   r%   r%   r&   <module>   s$   
(