o
    GiD                     @   s   d dl Zd dlZd dlmZ ddlmZmZmZ ddl	m
Z
 ddlmZmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZ e rYd dlm  mZ dZndZe e!Z"dZ#G dd deeZ$dS )    N)CLIPTokenizer   )AutoencoderKLControlNetModelUNet2DConditionModel)PNDMScheduler)is_torch_xla_availableloggingreplace_example_docstring)randn_tensor   )BlipImageProcessor)Blip2QFormerModel)ContextCLIPTextModel)DeprecatedPipelineMixinDiffusionPipelineImagePipelineOutputTFa  
    Examples:
        ```py
        >>> from diffusers.pipelines import BlipDiffusionControlNetPipeline
        >>> from diffusers.utils import load_image
        >>> from controlnet_aux import CannyDetector
        >>> import torch

        >>> blip_diffusion_pipe = BlipDiffusionControlNetPipeline.from_pretrained(
        ...     "Salesforce/blipdiffusion-controlnet", torch_dtype=torch.float16
        ... ).to("cuda")

        >>> style_subject = "flower"
        >>> tgt_subject = "teapot"
        >>> text_prompt = "on a marble table"

        >>> cldm_cond_image = load_image(
        ...     "https://huggingface.co/datasets/ayushtues/blipdiffusion_images/resolve/main/kettle.jpg"
        ... ).resize((512, 512))
        >>> canny = CannyDetector()
        >>> cldm_cond_image = canny(cldm_cond_image, 30, 70, output_type="pil")
        >>> style_image = load_image(
        ...     "https://huggingface.co/datasets/ayushtues/blipdiffusion_images/resolve/main/flower.jpg"
        ... )
        >>> guidance_scale = 7.5
        >>> num_inference_steps = 50
        >>> negative_prompt = "over-exposure, under-exposure, saturated, duplicate, out of frame, lowres, cropped, worst quality, low quality, jpeg artifacts, morbid, mutilated, out of frame, ugly, bad anatomy, bad proportions, deformed, blurry, duplicate"


        >>> output = blip_diffusion_pipe(
        ...     text_prompt,
        ...     style_image,
        ...     cldm_cond_image,
        ...     style_subject,
        ...     tgt_subject,
        ...     guidance_scale=guidance_scale,
        ...     num_inference_steps=num_inference_steps,
        ...     neg_prompt=negative_prompt,
        ...     height=512,
        ...     width=512,
        ... ).images
        >>> output[0].save("image.png")
        ```
c                #       s<  e Zd ZdZdZdZ			d8dededed	e	d
e
dededededee dee f fddZdd Zd9ddZd:ddZd:ddZ	d;ddZe ee		 	!	!	"		#			$	%d<d&ee d'ejjd(ejjd)ee d*ee d+ejdB d,ed-ed.ed/ed0ejeej B dB d1edB d2ed3ed4edB d5e f d6d7Z!  Z"S )=BlipDiffusionControlNetPipelinea.  
    Pipeline for Canny Edge based Controlled subject-driven generation using Blip Diffusion.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

    Args:
        tokenizer ([`CLIPTokenizer`]):
            Tokenizer for the text encoder
        text_encoder ([`ContextCLIPTextModel`]):
            Text encoder to encode the text prompt
        vae ([`AutoencoderKL`]):
            VAE model to map the latents to the image
        unet ([`UNet2DConditionModel`]):
            Conditional U-Net architecture to denoise the image embedding.
        scheduler ([`PNDMScheduler`]):
             A scheduler to be used in combination with `unet` to generate image latents.
        qformer ([`Blip2QFormerModel`]):
            QFormer model to get multi-modal embeddings from the text and image.
        controlnet ([`ControlNetModel`]):
            ControlNet model to get the conditioning image embedding.
        image_processor ([`BlipImageProcessor`]):
            Image Processor to preprocess and postprocess the image.
        ctx_begin_pos (int, `optional`, defaults to 2):
            Position of the context token in the text encoder.
    z0.33.1z qformer->text_encoder->unet->vaer   N	tokenizertext_encodervaeunet	schedulerqformer
controlnetimage_processorctx_begin_posmeanstdc              
      s8   t    | j||||||||d | j|	|
|d d S )N)r   r   r   r   r   r   r   r   )r   r   r   )super__init__register_modulesregister_to_config)selfr   r   r   r   r   r   r   r   r   r   r   	__class__ u/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/pipelines/controlnet/pipeline_controlnet_blip_diffusion.pyr    t   s   

z(BlipDiffusionControlNetPipeline.__init__c                 C   s   | j ||ddS )NF)image_input
text_inputreturn_dict)r   )r#   input_imagesrc_subjectr&   r&   r'   get_query_embeddings   s   z4BlipDiffusionControlNetPipeline.get_query_embeddings      ?   c              	   C   sN   g }t ||D ]\}}d| d|  }|d|gt||   q|S )Nza  z, )zipstripappendjoinint)r#   promptstgt_subjectsprompt_strengthprompt_repsrvprompttgt_subjectr&   r&   r'   _build_prompt   s
    z-BlipDiffusionControlNetPipeline._build_promptc	           
      C   st   ||||f}	t |trt||krtdt| d| d|d u r+t|	|||d}n|j||d}|| jj }|S )Nz/You have passed a list of generators of length z+, but requested an effective batch size of z@. Make sure the batch size matches the length of the generators.)	generatordevicedtyper?   r@   )
isinstancelistlen
ValueErrorr   tor   init_noise_sigma)
r#   
batch_sizenum_channelsheightwidthr@   r?   r>   latentsshaper&   r&   r'   prepare_latents   s   z/BlipDiffusionControlNetPipeline.prepare_latentsc           	      C   sp   |p| j }| jjjj}|| jjj8 }| j|dd|dd|}|j	d }| jj
g| }| j|j||dd }|S )N
max_lengthTpt)padding
truncationrO   return_tensorsr   )	input_idsctx_embeddingsr   )_execution_devicer   
text_modelconfigmax_position_embeddingsr   num_query_tokensr   rF   rM   r   rT   )	r#   query_embedsr;   r?   max_lentokenized_promptrH   r   text_embeddingsr&   r&   r'   encode_prompt   s,   

z-BlipDiffusionControlNetPipeline.encode_promptFc	                 C   sx   | j j|||ddddddd |}|jd }	|	dkr |}
n|}
|j|
dd	}|j||d
}|r:t|gd }|S )N)rK   rJ   TFrP   )size
do_rescaledo_center_cropdo_normalizerS   pixel_valuesr      )dimrA   r   )r   
preprocessrF   rM   repeat_interleavetorchcat)r#   imagerK   rJ   rH   num_images_per_promptr?   r@   do_classifier_free_guidanceimage_batch_size	repeat_byr&   r&   r'   prepare_control_image   s*   
z5BlipDiffusionControlNetPipeline.prepare_control_image      @   2    pilTr;   reference_imagecondtioning_imagesource_subject_categorytarget_subject_categoryrL   guidance_scalerJ   rK   num_inference_stepsr>   
neg_promptr8   r9   output_typer*   c           %   
   C   s`  | j }| jj|| jj| jjddd }||}t|tr |g}t|tr(|g}t|tr0|g}t	|}| j
||||d}| ||}| |||}|dk}|rv| jjjj}| j|g| d|dd}| j|j|dd	d
 }t||g}dt	| jjjd  }| j|| jjj|| |	| ||| jj|d}i }| jj|
fi | | j||	||d|| jj|d}t| | jjD ]N\}}|dk}|rt|gd n|}| j||||dd\}} | j||||| dd }!|r|! d\}"}#|"||#|"   }!| j!|!||d }t"r	t#$  q| j%j&|| j%jj' ddd
 }$| jj(|$|d}$| )  |s+|$fS t*|$dS )a  
        Function invoked when calling the pipeline for generation.

        Args:
            prompt (`list[str]`):
                The prompt or prompts to guide the image generation.
            reference_image (`PIL.Image.Image`):
                The reference image to condition the generation on.
            condtioning_image (`PIL.Image.Image`):
                The conditioning canny edge image to condition the generation on.
            source_subject_category (`list[str]`):
                The source subject category.
            target_subject_category (`list[str]`):
                The target subject category.
            latents (`torch.Tensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will be generated by random sampling.
            guidance_scale (`float`, *optional*, defaults to 7.5):
                Guidance scale as defined in [Classifier-Free Diffusion
                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
                `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
                the text `prompt`, usually at the expense of lower image quality.
            height (`int`, *optional*, defaults to 512):
                The height of the generated image.
            width (`int`, *optional*, defaults to 512):
                The width of the generated image.
            seed (`int`, *optional*, defaults to 42):
                The seed to use for random generation.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            generator (`torch.Generator` or `list[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            neg_prompt (`str`, *optional*, defaults to ""):
                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                if `guidance_scale` is less than `1`).
            prompt_strength (`float`, *optional*, defaults to 1.0):
                The strength of the prompt. Specifies the number of times the prompt is repeated along with prompt_reps
                to amplify the prompt.
            prompt_reps (`int`, *optional*, defaults to 20):
                The number of times the prompt is repeated along with prompt_strength to amplify the prompt.
        Examples:

        Returns:
            [`~pipelines.ImagePipelineOutput`] or `tuple`
        rP   )
image_mean	image_stdrS   rd   )r6   r7   r8   r9   r.   rO   )rQ   rO   rS   N)rT   rU   r   r   re   )rH   rI   rJ   rK   r>   rL   r@   r?   )rk   rK   rJ   rH   rl   r?   r@   rm   F)encoder_hidden_statescontrolnet_condr*   )timestepr   down_block_additional_residualsmid_block_additional_residualsampleprev_sample)r*   )r}   )images)+rV   r   rg   rX   r   r   rF   rB   strrD   r=   r-   r_   r   rW   rY   r   rT   ri   rj   r   block_out_channelsrN   in_channelsr@   r   set_timestepsrp   r   	enumerateprogress_bar	timestepschunkstepXLA_AVAILABLExm	mark_stepr   decodescaling_factorpostprocessmaybe_free_model_hooksr   )%r#   r;   rv   rw   rx   ry   rL   rz   rJ   rK   r{   r>   r|   r8   r9   r}   r*   r?   rH   r[   r^   rm   rO   uncond_inputuncond_embeddingsscale_down_factorextra_set_kwargs
cond_imageitlatent_model_inputdown_block_res_samplesmid_block_res_sample
noise_prednoise_pred_uncondnoise_pred_textrk   r&   r&   r'   __call__   s   F





	
z(BlipDiffusionControlNetPipeline.__call__)r   NN)r.   r/   )N)F)Nrq   rr   rr   rs   Nrt   r.   r/   ru   T)#__name__
__module____qualname____doc___last_supported_versionmodel_cpu_offload_seqr   r   r   r   r   r   r   r   r5   rC   floatr    r-   r=   rN   r_   rp   ri   no_gradr
   EXAMPLE_DOC_STRINGr   PILImageTensor	Generatorboolr   __classcell__r&   r&   r$   r'   r   U   s    	




$
$	
r   )%	PIL.Imager   ri   transformersr   modelsr   r   r   
schedulersr   utilsr   r	   r
   utils.torch_utilsr   $blip_diffusion.blip_image_processingr   blip_diffusion.modeling_blip2r    blip_diffusion.modeling_ctx_clipr   pipeline_utilsr   r   r   torch_xla.core.xla_modelcore	xla_modelr   r   
get_loggerr   loggerr   r   r&   r&   r&   r'   <module>   s$   
.