o
    piOx                     @   s   d dl Z d dlmZmZmZmZmZ d dlZd dl	Z	d dl
Z	d dlZd dlmZmZ ddlmZ ddlmZmZ ddlmZ ddlmZmZmZ dd	lmZ d
dlmZmZ ee Z!dZ"dddZ#dd Z$G dd deeZ%dS )    N)CallableDictListOptionalUnion)T5EncoderModelT5Tokenizer   )StableDiffusionLoraLoaderMixin)Kandinsky3UNetVQModel)DDPMScheduler)	deprecateloggingreplace_example_docstring)randn_tensor   )DiffusionPipelineImagePipelineOutputa?  
    Examples:
        ```py
        >>> from diffusers import AutoPipelineForImage2Image
        >>> from diffusers.utils import load_image
        >>> import torch

        >>> pipe = AutoPipelineForImage2Image.from_pretrained(
        ...     "kandinsky-community/kandinsky-3", variant="fp16", torch_dtype=torch.float16
        ... )
        >>> pipe.enable_model_cpu_offload()

        >>> prompt = "A painting of the inside of a subway train with tiny raccoons."
        >>> image = load_image(
        ...     "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/kandinsky3/t2i.png"
        ... )

        >>> generator = torch.Generator(device="cpu").manual_seed(0)
        >>> image = pipe(prompt, image=image, strength=0.75, num_inference_steps=25, generator=generator).images[0]
        ```
   c                 C   sX   | |d  }| |d  dkr|d7 }||d  }||d  dkr$|d7 }|| || fS )Nr   r       )heightwidthscale_factor
new_height	new_widthr   r   x/home/ubuntu/SoloSpeech/.venv/lib/python3.10/site-packages/diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.pydownscale_height_and_width/   s   r   c                 C   sH   t | d}|t jd d }t |g d}t|d}|S )NRGBg     _@r   )r   r   r   r   )	nparrayconvertastypefloat32	transposetorch
from_numpy	unsqueeze)	pil_imagearrimager   r   r   prepare_image9   s
   r,   c                %       s  e Zd ZdZg dZdedededede	f
 fdd	Z
d
d Zdd Ze 									d7deej deej deej deej fddZd8ddZdd Z						d9ddZedd Zed d! Zed"d# Ze eeddd$d%d&dddddddd'ddd(gfd)eeee f d*eejejjeej eejj f d+e d,e!d-e d.eeeee f  d/ee! d0eeej"eej" f  deej deej deej deej d1ee d2e#d3ee$e!e!e%gdf  d4ee f d5d6Z&  Z'S ):Kandinsky3Img2ImgPipelineztext_encoder->movq->unet->movq)latentsprompt_embedsnegative_prompt_embedsnegative_attention_maskattention_mask	tokenizertext_encoderunet	schedulermovqc                    s"   t    | j|||||d d S )N)r3   r4   r5   r6   r7   )super__init__register_modules)selfr3   r4   r5   r6   r7   	__class__r   r   r9   K   s   


z"Kandinsky3Img2ImgPipeline.__init__c                 C   s<   t t|| |}t|| d}| jj|d  }||| fS )Nr   )minintmaxr6   	timesteps)r;   num_inference_stepsstrengthdeviceinit_timestept_startrA   r   r   r   get_timestepsY   s   z'Kandinsky3Img2ImgPipeline.get_timestepsc                 C   s`   |r,t ||dk ||dk< |d d }|d d d |f }|d d d |f }||fS )Nr   r   )r&   
zeros_likesumr@   )r;   
embeddingsr2   cut_contextmax_seq_lengthr   r   r   _process_embedsb   s   z)Kandinsky3Img2ImgPipeline._process_embedsTr   NFr/   r0   r2   r1   c              
   C   s  |dur|durt |t |urtdt | dt | d|du r&| j}|dur2t|tr2d}n|dur@t|tr@t|}n|jd }d}|du r|| j|d|d	d
d}|j	
|}|j
|}	| j||	d}|d }| ||	|\}}	||	d }| jdur| jj}nd}|j
||d}|j\}}}|d|d}||| |d}|	|d}	|r4|du r4|du rdg| }n$t|tr|g}n|t|krtd| dt| d| d| d	|}|dur*| j|ddd	d	d
d}|j	
|}|j
|}
| j||
d}|d }|ddd|jd f }|
ddd|jd f }
||
d }n
t|}t|	}
|ra|jd }|j
||d}|j|jkr`|d|d}||| |d}|
|d}
nd}d}
|||	|
fS )aY  
        Encodes the prompt into text encoder hidden states.

        Args:
             prompt (`str` or `List[str]`, *optional*):
                prompt to be encoded
            device: (`torch.device`, *optional*):
                torch device to place the resulting embeddings on
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                number of images that should be generated per prompt
            do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
                whether to use classifier free guidance or not
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds`. instead. If not defined, one has to pass `negative_prompt_embeds`. instead.
                Ignored when not using guidance (i.e., ignored if `guidance_scale` is less than `1`).
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            attention_mask (`torch.Tensor`, *optional*):
                Pre-generated attention mask. Must provide if passing `prompt_embeds` directly.
            negative_attention_mask (`torch.Tensor`, *optional*):
                Pre-generated negative attention mask. Must provide if passing `negative_prompt_embeds` directly.
        Nz?`negative_prompt` should be the same type to `prompt`, but got z != .r   r      
max_lengthTpt)paddingrQ   
truncationreturn_tensors)r2   r   dtyperD   rH    z`negative_prompt`: z has batch size z, but `prompt`: zT. Please make sure that passed `negative_prompt` matches the batch size of `prompt`.)rS   rQ   rT   return_attention_maskrU   )type	TypeError_execution_device
isinstancestrlistlenshaper3   	input_idstor2   r4   rN   r(   rW   repeatview
ValueErrorr&   rI   )r;   promptdo_classifier_free_guidancenum_images_per_promptrD   negative_promptr/   r0   _cut_contextr2   r1   
batch_sizerQ   text_inputstext_input_idsrW   bs_embedseq_len_uncond_tokensuncond_inputr   r   r   encode_promptk   s   *








z'Kandinsky3Img2ImgPipeline.encode_promptc                    s  t tjtjjtfstdt j||d|| }j	d dkr)}nCt  trAt
 |krAtdt
  d| dt  tr[ fdd	t|D }tj|d
d}n
jj }jjj| }tj|gd
d}|j	}	t|	 ||d}
j||
|}|}|S )NzK`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is )rD   rW   r      z/You have passed a list of generators of length z+, but requested an effective batch size of z@. Make sure the batch size matches the length of the generators.c                    s0   g | ]}j ||d   j | qS )r   )r7   encodelatent_distsample.0i	generatorr+   r;   r   r   
<listcomp>  s    $z=Kandinsky3Img2ImgPipeline.prepare_latents.<locals>.<listcomp>r   dim)r}   rD   rW   )r]   r&   TensorPILImager_   rf   rZ   rc   ra   r`   rangecatr7   rv   rw   rx   configscaling_factorr   r6   	add_noise)r;   r+   timesteprl   ri   rW   rD   r}   init_latentsra   noiser.   r   r|   r   prepare_latents  s6   
z)Kandinsky3Img2ImgPipeline.prepare_latentsc                 C   sX   dt t| jjj v }i }|r||d< dt t| jjj v }|r*||d< |S )Netar}   )setinspect	signaturer6   step
parameterskeys)r;   r}   r   accepts_etaextra_step_kwargsaccepts_generatorr   r   r   prepare_extra_step_kwargs,  s   z3Kandinsky3Img2ImgPipeline.prepare_extra_step_kwargsc	           	         s
  |d urt |tr|dkrtd| dt| d|d ur;t fdd|D s;td j d fd	d
|D  |d urN|d urNtd| d| d|d u rZ|d u rZtd|d urqt |tsqt |tsqtdt| |d ur|d urtd| d| d|d ur|d ur|j|jkrtd|j d|j d|d ur|d u rtd|d ur|d ur|jd d |jkrtd|jd d  d|j d|d ur|d u rtd|d ur|d ur|jd d |jkrtd|jd d  d|j dd S d S d S )Nr   z5`callback_steps` has to be a positive integer but is z	 of type rO   c                 3       | ]}| j v V  qd S N_callback_tensor_inputsrz   kr;   r   r   	<genexpr>N      

z9Kandinsky3Img2ImgPipeline.check_inputs.<locals>.<genexpr>2`callback_on_step_end_tensor_inputs` has to be in , but found c                       g | ]	}| j vr|qS r   r   r   r   r   r   r~   R      z:Kandinsky3Img2ImgPipeline.check_inputs.<locals>.<listcomp>zCannot forward both `prompt`: z and `prompt_embeds`: z2. Please make sure to only forward one of the two.zeProvide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined.z2`prompt` has to be of type `str` or `list` but is z'Cannot forward both `negative_prompt`: z and `negative_prompt_embeds`: zu`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but got: `prompt_embeds` z != `negative_prompt_embeds` zLPlease provide `negative_attention_mask` along with `negative_prompt_embeds`r   z`negative_prompt_embeds` and `negative_attention_mask` must have the same batch_size and token length when passed directly, but got: `negative_prompt_embeds` z != `negative_attention_mask` z:Please provide `attention_mask` along with `prompt_embeds`z`prompt_embeds` and `attention_mask` must have the same batch_size and token length when passed directly, but got: `prompt_embeds` z != `attention_mask` )	r]   r?   rf   rZ   allr   r^   r_   ra   )	r;   rg   callback_stepsrj   r/   r0   "callback_on_step_end_tensor_inputsr2   r1   r   r   r   check_inputs=  sz   z&Kandinsky3Img2ImgPipeline.check_inputsc                 C      | j S r   _guidance_scaler   r   r   r   guidance_scale     z(Kandinsky3Img2ImgPipeline.guidance_scalec                 C   s
   | j dkS )Nr   r   r   r   r   r   rh     s   
z5Kandinsky3Img2ImgPipeline.do_classifier_free_guidancec                 C   r   r   )_num_timestepsr   r   r   r   num_timesteps  r   z'Kandinsky3Img2ImgPipeline.num_timestepsg333333?   g      @pilr.   rg   r+   rC   rB   r   rj   ri   r}   output_typereturn_dictcallback_on_step_endr   c           &         s  | dd}| dd}|durtddd |dur tddd |durAt fdd|D sAtd	 j d
 fdd|D  d} ||||	|
||| | _|dur^t|tr^d}n|durlt|t	rlt
|}n|	jd } j} j| j||||	|
|||d
\}	}
}} jrt|
|	g}	t||g }t|t	s|g}tdd |D stddd |D  dtjdd |D dd}|j|	j|d} jj||d  |||\}} j|d }|j|dd}|dd || } |||||	j||}t dr jdur j  t
|| jj  }t
| _  j!|d}t"|D ]\}} jr@t|gd n|} j#|||	|dd } jrb|$d\} }!|d |! ||   } jj%||||dj&}|duri }"|D ]
}#t' |# |"|#< qv| |||"}$|$ d|}|$ d |	}	|$ d!|
}
|$ d"|}|$ d#|}|t
|d ks|d |kr|d  jj dkr|(  |dur|| dkr|t) jd$d }%||%|| q0|d%vrtd&| |d'ks' jj*|dd(d) }|d*v r|d+ d+ }|+dd}|, -ddd,d. / }|d-kr& 0|}n|} 1  |s:|fW  d   S t2|d.W  d   S 1 sJw   Y  dS )/a#  
        Function invoked when calling the pipeline for generation.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
                instead.
            image (`torch.Tensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.Tensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
                `Image`, or tensor representing an image batch, that will be used as the starting point for the
                process.
            strength (`float`, *optional*, defaults to 0.8):
                Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
                starting point and more noise is added the higher the `strength`. The number of denoising steps depends
                on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising
                process runs for the full number of iterations specified in `num_inference_steps`. A value of 1
                essentially ignores `image`.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            guidance_scale (`float`, *optional*, defaults to 3.0):
                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                `guidance_scale` is defined as `w` of equation 2. of [Imagen
                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                usually at the expense of lower image quality.
            negative_prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            num_images_per_prompt (`int`, *optional*, defaults to 1):
                The number of images to generate per prompt.
            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
                to make generation deterministic.
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            attention_mask (`torch.Tensor`, *optional*):
                Pre-generated attention mask. Must provide if passing `prompt_embeds` directly.
            negative_attention_mask (`torch.Tensor`, *optional*):
                Pre-generated negative attention mask. Must provide if passing `negative_prompt_embeds` directly.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generate image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.stable_diffusion.IFPipelineOutput`] instead of a plain tuple.
            callback_on_step_end (`Callable`, *optional*):
                A function that calls at the end of each denoising steps during the inference. The function is called
                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
                `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                `._callback_tensor_inputs` attribute of your pipeline class.

        Examples:

        Returns:
            [`~pipelines.ImagePipelineOutput`] or `tuple`

        callbackNr   z1.0.0zhPassing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`znPassing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`c                 3   r   r   r   r   r   r   r   r     r   z5Kandinsky3Img2ImgPipeline.__call__.<locals>.<genexpr>r   r   c                    r   r   r   r   r   r   r   r~     r   z6Kandinsky3Img2ImgPipeline.__call__.<locals>.<listcomp>Tr   r   )ri   rD   rj   r/   r0   rk   r2   r1   c                 s   s$    | ]}t |tjjtjfV  qd S r   )r]   r   r   r&   r   ry   r   r   r   r   +  s   " zInput is in incorrect format: c                 S      g | ]}t |qS r   )rZ   ry   r   r   r   r~   -      z:. Currently, we only support  PIL image and pytorch tensorc                 S   r   r   )r,   ry   r   r   r   r~   0  r   r   rV   )rD   r.   text_encoder_offload_hook)totalr   )encoder_hidden_statesencoder_attention_maskg      ?)r}   r/   r0   r2   r1   order)rR   r    r   latentzSOnly the output types `pt`, `pil`, `np` and `latent` are supported not output_type=r   )force_not_quantizerx   )r    r   g      ?r	   r   )images)3popr   r   rf   r   r   r   r]   r^   r_   r`   ra   r\   rt   rh   r&   r   boolrc   rW   r6   set_timestepsrG   r7   rv   repeat_interleaverd   r   hasattrr   offloadr   r   progress_bar	enumerater5   chunkr   prev_samplelocalsupdategetattrdecodeclampcpupermutefloatnumpynumpy_to_pilmaybe_free_model_hooksr   )&r;   rg   r+   rC   rB   r   rj   ri   r}   r/   r0   r2   r1   r   r   r   r   kwargsr   r   rL   rl   rD   rA   r.   latent_timestepnum_warmup_stepsr   r{   tlatent_model_input
noise_prednoise_pred_uncondnoise_pred_textcallback_kwargsr   callback_outputsstep_idxr   r   r   __call__  s  X





6




A&z"Kandinsky3Img2ImgPipeline.__call__)	Tr   NNNNFNNr   )NNNNNN)(__name__
__module____qualname__model_cpu_offload_seqr   r   r   r   r   r   r9   rG   rN   r&   no_gradr   r   rt   r   r   r   propertyr   rh   r   r   EXAMPLE_DOC_STRINGr   r^   r   r   r   r   r?   	Generatorr   r   r   r   __classcell__r   r   r<   r   r-   A   s    		
 
+
H


"	
r-   )r   )&r   typingr   r   r   r   r   r   r    r   	PIL.Imager&   transformersr   r   loadersr
   modelsr   r   
schedulersr   utilsr   r   r   utils.torch_utilsr   pipeline_utilsr   r   
get_loggerr   loggerr   r   r,   r-   r   r   r   r   <module>   s$    


