o
    Giv                     @   s   d dl Z d dlmZ d dlZd dlZd dlZd dlmZmZ ddl	m
Z
 ddlmZ ddlmZmZ ddlmZ dd	lmZmZmZmZ dd
lmZ ddlmZmZ e rad dlm  mZ dZ ndZ e!e"Z#dZ$G dd deeZ%dS )    N)Callable)T5EncoderModelT5Tokenizer   )VaeImageProcessor)StableDiffusionLoraLoaderMixin)Kandinsky3UNetVQModel)DDPMScheduler)	deprecateis_torch_xla_availableloggingreplace_example_docstring)randn_tensor   )DiffusionPipelineImagePipelineOutputTFa?  
    Examples:
        ```py
        >>> from diffusers import AutoPipelineForImage2Image
        >>> from diffusers.utils import load_image
        >>> import torch

        >>> pipe = AutoPipelineForImage2Image.from_pretrained(
        ...     "kandinsky-community/kandinsky-3", variant="fp16", torch_dtype=torch.float16
        ... )
        >>> pipe.enable_model_cpu_offload()

        >>> prompt = "A painting of the inside of a subway train with tiny raccoons."
        >>> image = load_image(
        ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky3/t2i.png"
        ... )

        >>> generator = torch.Generator(device="cpu").manual_seed(0)
        >>> image = pipe(prompt, image=image, strength=0.75, num_inference_steps=25, generator=generator).images[0]
        ```
c                $       s  e Zd ZdZg dZdedededede	f
 fdd	Z
d
d Zdd Ze 									d6dejdB dejdB dejdB dejdB fddZd7ddZdd Z						d8ddZedd Zedd  Zed!d" Ze eeddd#d$d%dddddddd&ddd'gfd(eee B d)ejejjB eej B eejj B d*ed+ed,ed-eee B dB d.edB d/ej eej  B dB dejdB dejdB dejdB dejdB d0edB d1e!d2e"eegdf dB d3ee f d4d5Z#  Z$S )9Kandinsky3Img2ImgPipelineztext_encoder->movq->unet->movq)latentsprompt_embedsnegative_prompt_embedsnegative_attention_maskattention_mask	tokenizertext_encoderunet	schedulermovqc                    st   t    | j|||||d t| dd r dt| jjjd  nd}t| dd r-| jjjnd}t	||ddd| _
d S )	N)r   r   r   r   r   r   r            bicubic)vae_scale_factorvae_latent_channelsresamplereducing_gap)super__init__register_modulesgetattrlenr   configblock_out_channelslatent_channelsr   image_processor)selfr   r   r   r   r   movq_scale_factormovq_latent_channels	__class__ n/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.pyr'   B   s   

&z"Kandinsky3Img2ImgPipeline.__init__c                 C   s<   t t|| |}t|| d}| jj|d  }||| fS )Nr   )minintmaxr   	timesteps)r/   num_inference_stepsstrengthdeviceinit_timestept_startr9   r4   r4   r5   get_timestepsX   s   z'Kandinsky3Img2ImgPipeline.get_timestepsc                 C   s`   |r,t ||dk ||dk< |d d }|d d d |f }|d d d |f }||fS )Nr   r   )torch
zeros_likesumr8   )r/   
embeddingsr   cut_contextmax_seq_lengthr4   r4   r5   _process_embedsa   s   z)Kandinsky3Img2ImgPipeline._process_embedsTr   Nr   r   r   r   c              
   C   s  |dur|durt |t |urtdt | dt | d|du r&| j}|dur2t|tr2d}n|dur@t|tr@t|}n|jd }d}|du r|| j|d|d	d
d}|j	
|}|j
|}	| j||	d}|d }| ||	|\}}	||	d }| jdur| jj}nd}|j
||d}|j\}}}|d|d}||| |d}|	|d}	|r4|du r4|du rdg| }n$t|tr|g}n|t|krtd| dt| d| d| d	|}|dur*| j|ddd	d	d
d}|j	
|}|j
|}
| j||
d}|d }|ddd|jd f }|
ddd|jd f }
||
d }n
t|}t|	}
|ra|jd }|j
||d}|j|jkr`|d|d}||| |d}|
|d}
nd}d}
|||	|
fS )aY  
        Encodes the prompt into text encoder hidden states.

        Args:
             prompt (`str` or `list[str]`, *optional*):
                prompt to be encoded
            device: (`torch.device`, *optional*):
                torch device to place the resulting embeddings on
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                number of images that should be generated per prompt
            do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
                whether to use classifier free guidance or not
            negative_prompt (`str` or `list[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead.
                Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            attention_mask (`torch.Tensor`, *optional*):
                Pre-generated attention mask. Must provide if passing `prompt_embeds` directly.
            negative_attention_mask (`torch.Tensor`, *optional*):
                Pre-generated negative attention mask. Must provide if passing `negative_prompt_embeds` directly.
        Nz?`negative_prompt` should be the same type to `prompt`, but got z != .r   r      
max_lengthTpt)paddingrJ   
truncationreturn_tensors)r   r   dtyper<   r@    z`negative_prompt`: z has batch size z, but `prompt`: zT. Please make sure that passed `negative_prompt` matches the batch size of `prompt`.)rL   rJ   rM   return_attention_maskrN   )type	TypeError_execution_device
isinstancestrlistr*   shaper   	input_idstor   r   rG   	unsqueezerP   repeatview
ValueErrorrA   rB   )r/   promptdo_classifier_free_guidancenum_images_per_promptr<   negative_promptr   r   _cut_contextr   r   
batch_sizerJ   text_inputstext_input_idsrP   bs_embedseq_len_uncond_tokensuncond_inputr4   r4   r5   encode_promptj   s   *








z'Kandinsky3Img2ImgPipeline.encode_promptc                    s  t tjtjjtfstdt j||d|| }j	d dkr)}nCt  trAt
 |krAtdt
  d| dt  tr[ fdd	t|D }tj|d
d}n
jj }jjj| }tj|gd
d}|j	}	t|	 ||d}
j||
|}|}|S )NzK`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is )r<   rP   r   r    z/You have passed a list of generators of length z+, but requested an effective batch size of z@. Make sure the batch size matches the length of the generators.c                    s0   g | ]}j ||d   j | qS )r   )r   encodelatent_distsample.0i	generatorimager/   r4   r5   
<listcomp>  s    $z=Kandinsky3Img2ImgPipeline.prepare_latents.<locals>.<listcomp>r   dim)ru   r<   rP   )rV   rA   TensorPILImagerX   r_   rS   r[   rY   r*   rangecatr   rn   ro   rp   r+   scaling_factorr   r   	add_noise)r/   rv   timestepre   rb   rP   r<   ru   init_latentsrY   noiser   r4   rt   r5   prepare_latents   s6   
z)Kandinsky3Img2ImgPipeline.prepare_latentsc                 C   sX   dt t| jjj v }i }|r||d< dt t| jjj v }|r*||d< |S )Netaru   )setinspect	signaturer   step
parameterskeys)r/   ru   r   accepts_etaextra_step_kwargsaccepts_generatorr4   r4   r5   prepare_extra_step_kwargs+  s   z3Kandinsky3Img2ImgPipeline.prepare_extra_step_kwargsc	           	         s
  |d urt |tr|dkrtd| dt| d|d ur;t fdd|D s;td j d fd	d
|D  |d urN|d urNtd| d| d|d u rZ|d u rZtd|d urqt |tsqt |tsqtdt| |d ur|d urtd| d| d|d ur|d ur|j|jkrtd|j d|j d|d ur|d u rtd|d ur|d ur|jd d |jkrtd|jd d  d|j d|d ur|d u rtd|d ur|d ur|jd d |jkrtd|jd d  d|j dd S d S d S )Nr   z5`callback_steps` has to be a positive integer but is z	 of type rH   c                 3       | ]}| j v V  qd S N_callback_tensor_inputsrr   kr/   r4   r5   	<genexpr>M      

z9Kandinsky3Img2ImgPipeline.check_inputs.<locals>.<genexpr>2`callback_on_step_end_tensor_inputs` has to be in , but found c                       g | ]	}| j vr|qS r4   r   r   r   r4   r5   rw   Q      z:Kandinsky3Img2ImgPipeline.check_inputs.<locals>.<listcomp>zCannot forward both `prompt`: z and `prompt_embeds`: z2. Please make sure to only forward one of the two.zeProvide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined.z2`prompt` has to be of type `str` or `list` but is z'Cannot forward both `negative_prompt`: z and `negative_prompt_embeds`: zu`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but got: `prompt_embeds` z != `negative_prompt_embeds` zLPlease provide `negative_attention_mask` along with `negative_prompt_embeds`r   z`negative_prompt_embeds` and `negative_attention_mask` must have the same batch_size and token length when passed directly, but got: `negative_prompt_embeds` z != `negative_attention_mask` z:Please provide `attention_mask` along with `prompt_embeds`z`prompt_embeds` and `attention_mask` must have the same batch_size and token length when passed directly, but got: `prompt_embeds` z != `attention_mask` )	rV   r7   r_   rS   allr   rW   rX   rY   )	r/   r`   callback_stepsrc   r   r   "callback_on_step_end_tensor_inputsr   r   r4   r   r5   check_inputs<  sz   z&Kandinsky3Img2ImgPipeline.check_inputsc                 C      | j S r   _guidance_scaler   r4   r4   r5   guidance_scale     z(Kandinsky3Img2ImgPipeline.guidance_scalec                 C   s
   | j dkS )Nr   r   r   r4   r4   r5   ra     s   
z5Kandinsky3Img2ImgPipeline.do_classifier_free_guidancec                 C   r   r   )_num_timestepsr   r4   r4   r5   num_timesteps  r   z'Kandinsky3Img2ImgPipeline.num_timestepsg333333?   g      @pilr   r`   rv   r;   r:   r   rc   rb   ru   output_typereturn_dictcallback_on_step_endr   c           &         sV  | dd}| dd}|durtddd |dur tddd |durAt fdd|D sAtd	 j d
 fdd|D  d} ||||	|
||| | _|dur^t|tr^d}n|durlt|t	rlt
|}n|	jd } j} j| j||||	|
|||d
\}	}
}} jrt|
|	g}	t||g }t|t	s|g}tdd |D stddd |D  dtj fdd|D dd}|j|	j|d} jj||d  |||\}} j|d }|j|dd}|dd || } |||||	j||}t dr jdur j  t
|| jj  }t
| _  j!|d}t"|D ]\}} jrAt|gd n|} j#|||	|dd } jrc|$d\} }!|d |! ||   } jj%||||dj&}|duri }"|D ]
}#t' |# |"|#< qw| |||"}$|$ d|}|$ d |	}	|$ d!|
}
|$ d"|}|$ d#|}|t
|d ks|d |kr|d  jj dkr|(  |dur|| dkr|t) jd$d }%||%|| t*rt+,  q1|d%ks jj-|dd&d' } j./||}n|} 0  |s|fW  d   S t1|d(W  d   S 1 s$w   Y  dS ))a/  
        Function invoked when calling the pipeline for generation.

        Args:
            prompt (`str` or `list[str]`, *optional*):
                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
                instead.
            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `list[torch.Tensor]`, `list[PIL.Image.Image]`, or `list[np.ndarray]`):
                `Image`, or tensor representing an image batch, that will be used as the starting point for the
                process.
            strength (`float`, *optional*, defaults to 0.8):
                Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
                starting point and more noise is added the higher the `strength`. The number of denoising steps depends
                on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising
                process runs for the full number of iterations specified in `num_inference_steps`. A value of 1
                essentially ignores `image`.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            guidance_scale (`float`, *optional*, defaults to 3.0):
                Guidance scale as defined in [Classifier-Free Diffusion
                Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
                of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
                `guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
                the text `prompt`, usually at the expense of lower image quality.
            negative_prompt (`str` or `list[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            generator (`torch.Generator` or `list[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            attention_mask (`torch.Tensor`, *optional*):
                Pre-generated attention mask. Must provide if passing `prompt_embeds` directly.
            negative_attention_mask (`torch.Tensor`, *optional*):
                Pre-generated negative attention mask. Must provide if passing `negative_prompt_embeds` directly.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generate image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple.
            callback_on_step_end (`Callable`, *optional*):
                A function that calls at the end of each denoising steps during the inference. The function is called
                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
                `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`list`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                `._callback_tensor_inputs` attribute of your pipeline class.

        Examples:

        Returns:
            [`~pipelines.ImagePipelineOutput`] or `tuple`

        callbackNr   z1.0.0zhPassing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`znPassing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`c                 3   r   r   r   r   r   r4   r5   r     r   z5Kandinsky3Img2ImgPipeline.__call__.<locals>.<genexpr>r   r   c                    r   r4   r   r   r   r4   r5   rw     r   z6Kandinsky3Img2ImgPipeline.__call__.<locals>.<listcomp>Tr   r   )rb   r<   rc   r   r   rd   r   r   c                 s   s$    | ]}t |tjjtjfV  qd S r   )rV   r{   r|   rA   rz   rq   r4   r4   r5   r   *  s   " zInput is in incorrect format: c                 S   s   g | ]}t |qS r4   )rS   rq   r4   r4   r5   rw   ,  s    z:. Currently, we only support  PIL image and pytorch tensorc                    s   g | ]} j |qS r4   )r.   
preprocessrq   r   r4   r5   rw   /  s    rx   rO   )r<   r   text_encoder_offload_hook)totalr   )encoder_hidden_statesencoder_attention_maskg      ?)ru   r   r   r   r   orderlatent)force_not_quantizerp   )images)2popr   r   r_   r   r   r   rV   rW   rX   r*   rY   rU   rm   ra   rA   r~   boolr[   rP   r   set_timestepsr?   r   rn   repeat_interleaver]   r   hasattrr   offloadr   r   progress_bar	enumerater   chunkr   prev_samplelocalsupdater)   XLA_AVAILABLExm	mark_stepdecoder.   postprocessmaybe_free_model_hooksr   )&r/   r`   rv   r;   r:   r   rc   rb   ru   r   r   r   r   r   r   r   r   kwargsr   r   rE   re   r<   r9   r   latent_timestepnum_warmup_stepsr   rs   tlatent_model_input
noise_prednoise_pred_uncondnoise_pred_textcallback_kwargsr   callback_outputsstep_idxr4   r   r5   __call__  s   X





6
9&z"Kandinsky3Img2ImgPipeline.__call__)	Tr   NNNNTNNr   )NNNNNN)%__name__
__module____qualname__model_cpu_offload_seqr   r   r   r   r
   r	   r'   r?   rG   rA   no_gradrz   rm   r   r   r   propertyr   ra   r   r   EXAMPLE_DOC_STRINGrW   rX   r{   r|   floatr7   	Generatorr   r   r   __classcell__r4   r4   r2   r5   r   8   s    		
 
+
H



"	
r   )&r   typingr   r{   	PIL.ImagerA   transformersr   r   r.   r   loadersr   modelsr   r	   
schedulersr
   utilsr   r   r   r   utils.torch_utilsr   pipeline_utilsr   r   torch_xla.core.xla_modelcore	xla_modelr   r   
get_loggerr   loggerr   r   r4   r4   r4   r5   <module>   s(    
