o
    Gi                     @   s8  d dl Z d dlmZmZmZmZmZ ddlmZ ddl	m
Z
 ddlmZmZ ddlmZmZ ddlmZmZmZ dd	lmZ dd
lmZmZmZmZ ddlmZmZ ddlm Z m!Z!m"Z"m#Z# ddlm$Z$ e%e&Z'	dde j(de j)dB de*fddZ+G dd deZ,G dd deZ-G dd deZ.G dd deZ/dS )    N)CLIPImageProcessorCLIPTextModelCLIPTextModelWithProjectionCLIPTokenizerCLIPVisionModelWithProjection   )
FrozenDict)ClassifierFreeGuidance)PipelineImageInputVaeImageProcessor) StableDiffusionXLLoraLoaderMixinTextualInversionLoaderMixin)AutoencoderKLImageProjectionUNet2DConditionModel)adjust_lora_scale_text_encoder)USE_PEFT_BACKENDloggingscale_lora_layersunscale_lora_layers   )ModularPipelineBlocksPipelineState)ComponentSpec
ConfigSpec
InputParamOutputParam   ) StableDiffusionXLModularPipelinesampleencoder_output	generatorsample_modec                 C   sR   t | dr|dkr| j|S t | dr|dkr| j S t | dr%| jS td)Nlatent_distr   argmaxlatentsz3Could not access latents of provided encoder_output)hasattrr#   r   moder%   AttributeError)r    r!   r"    r)   l/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/modular_pipelines/stable_diffusion_xl/encoders.pyretrieve_latents-   s   

r+   c                   @   s   e Zd ZdZedefddZedee fddZ	edee
 fddZedee fd	d
ZedddZdd Ze dededefddZdS )StableDiffusionXLIPAdapterStepstable-diffusion-xlreturnc                 C   s   	 dS )Na  IP Adapter step that prepares ip adapter image embeddings.
Note that this step only prepares the embeddings - in order for it to work correctly, you need to load ip adapter weights into unet via ModularPipeline.load_ip_adapter() and pipeline.set_ip_adapter_scale().
See [ModularIPAdapterMixin](https://huggingface.co/docs/diffusers/api/loaders/ip_adapter#diffusers.loaders.ModularIPAdapterMixin) for more detailsr)   selfr)   r)   r*   description=   s   z*StableDiffusionXLIPAdapterStep.descriptionc              	   C   sB   t dtt dttdddddt dtt dttd	d
iddgS )Nimage_encoderfeature_extractor   )size	crop_sizefrom_configconfigdefault_creation_methodunetguiderguidance_scale      @)r   r   r   r   r   r	   r/   r)   r)   r*   expected_componentsG   s   
z2StableDiffusionXLIPAdapterStep.expected_componentsc                 C   s   t dtdddgS )Nip_adapter_imageTz%The image(s) to be used as ip adapter)requiredr1   )r   r
   r/   r)   r)   r*   inputsZ   s   z%StableDiffusionXLIPAdapterStep.inputsc                 C   s    t dtjddt dtjddgS )Nip_adapter_embedszIP adapter image embeddings	type_hintr1   negative_ip_adapter_embedsz$Negative IP adapter image embeddingsr   torchTensorr/   r)   r)   r*   intermediate_outputse   s   z3StableDiffusionXLIPAdapterStep.intermediate_outputsNc           
      C   s   t | j j}t|tjs| j|ddj}|j	||d}|rH| j|ddj
d }|j|dd}| jt|ddj
d }|j|dd}||fS | |j}|j|dd}t|}	||	fS )	Npt)return_tensorsdevicedtypeToutput_hidden_statesr   dim)nextr2   
parametersrO   
isinstancerH   rI   r3   pixel_valuestohidden_statesrepeat_interleave
zeros_likeimage_embeds)

componentsimagerN   num_images_per_promptrQ   rO   image_enc_hidden_statesuncond_image_enc_hidden_statesr]   uncond_image_embedsr)   r)   r*   encode_imagep   s(   

z+StableDiffusionXLIPAdapterStep.encode_imagec                 C   sn  g }|rg }|d u rft |ts|g}t|t|jjjkr/tdt| dt|jjj dt||jjjD ]-\}	}
t |
t }| 	||	|d|\}}|
|d d d f  |rd|
|d d d f  q7n|D ]}|rx|d\}}|
| |
| qhg }t|D ]0\}}tj|g| dd}|rtj|| g| dd}tj||gdd}|j|d}|
| q|S )	NzK`ip_adapter_image` must have same length as the number of IP Adapters. Got z images and z IP Adapters.r   r   r   rS   )rN   )rW   listlenr;   encoder_hid_projimage_projection_layers
ValueErrorzipr   rd   appendchunk	enumeraterH   catrY   )r0   r^   r@   ip_adapter_image_embedsrN   r`   prepare_unconditional_embedsr]   negative_image_embedssingle_ip_adapter_imageimage_proj_layeroutput_hidden_statesingle_image_embedssingle_negative_image_embedsir)   r)   r*   prepare_ip_adapter_image_embeds   sH   	



z>StableDiffusionXLIPAdapterStep.prepare_ip_adapter_image_embedsr^   statec                 C   s   |  |}|jjdk|_|j|_| j||jd |jd|jd|_|jrAg |_	t
|jD ]\}}|d\}}|j	| ||j|< q*| || ||fS )Nr   )r@   ro   rN   r`   rp   r   )get_block_stater<   num_conditionsrp   _execution_devicerN   rx   r@   rC   rF   rm   rl   rk   set_block_state)r0   r^   ry   block_staterw   r]   rq   r)   r)   r*   __call__   s&   
z'StableDiffusionXLIPAdapterStep.__call__)N)__name__
__module____qualname__
model_namepropertystrr1   re   r   r?   r   rB   r   rJ   staticmethodrd   rx   rH   no_gradr   r   r   r)   r)   r)   r*   r,   :   s    	

3r,   c                   @   s2  e Zd ZdZedefddZedee fddZ	edee
 fddZedee fd	d
Zedee fddZedd Ze												d%dededB dejdB dedededB dedB dejdB dejdB dejdB dejdB dedB dedB fdd Ze d!ed"edefd#d$ZdS )& StableDiffusionXLTextEncoderStepr-   r.   c                 C      dS )NzMText Encoder step that generate text_embeddings to guide the image generationr)   r/   r)   r)   r*   r1         z,StableDiffusionXLTextEncoderStep.descriptionc              
   C   s:   t dtt dtt dtt dtt dttddidd	gS )
Ntext_encodertext_encoder_2	tokenizertokenizer_2r<   r=   r>   r7   r8   )r   r   r   r   r	   r   r/   r)   r)   r*   r?      s   
z4StableDiffusionXLTextEncoderStep.expected_componentsc                 C   s   t ddgS )Nforce_zeros_for_empty_promptT)r   r/   r)   r)   r*   expected_configs   s   z1StableDiffusionXLTextEncoderStep.expected_configsc                 C   s(   t dt dt dt dt dt dgS )Npromptprompt_2negative_promptnegative_prompt_2cross_attention_kwargs	clip_skip)r   r/   r)   r)   r*   rB      s   z'StableDiffusionXLTextEncoderStep.inputsc              	   C   sD   t dtjdddt dtjdddt dtjdddt d	tjdd
dgS )Nprompt_embedsdenoiser_input_fieldsz2text embeddings used to guide the image generation)rE   kwargs_typer1   negative_prompt_embedsz;negative text embeddings used to guide the image generationpooled_prompt_embedsz9pooled text embeddings used to guide the image generationnegative_pooled_prompt_embedszBnegative pooled text embeddings used to guide the image generationrG   r/   r)   r)   r*   rJ      s2   z5StableDiffusionXLTextEncoderStep.intermediate_outputsc                 C   sx   | j d urt| j tst| j tstdt| j  | jd ur6t| jts8t| jts:tdt| j d S d S d S )Nz2`prompt` has to be of type `str` or `list` but is z4`prompt_2` has to be of type `str` or `list` but is )r   rW   r   re   ri   typer   )r~   r)   r)   r*   check_inputs  s   





z-StableDiffusionXLTextEncoderStep.check_inputsNr   Tr   r   rN   r`   rp   r   r   r   r   r   r   
lora_scaler   c           !   
   C   s  |p| j }|dur9t| tr9|| _| jdur%tst| j| nt| j| | jdur9ts3t| j| nt| j| t|t	rA|gn|}|durLt
|}n|jd }| jdur\| j| jgn| jg}| jdurk| j| jgn| jg}|du r|pw|}t|t	r|gn|}g }||g}t|||D ]u\}}}t| tr| ||}||d|jddd}|j}||dddj}|jd	 |jd	 krt||s||dd|jd
 d	f }td|j d|  |||dd}|d }
|du r|jd }n|j|d   }|| qtj|d	d}|du o| jj}|r*|	du r*|r*t|}	t|
}n|r|	du r|p6d}|p;|}t|t	rG||g n|}t|t	rT||g n|}|durst|t|urst dt| dt| d|t
|krt!d| dt
| d| d| d	||g}g }t|||D ]8\}}}t| tr| ||}|jd
 }||d|ddd}||j|dd}	|	d }|	jd }	||	 qtj|d	d}	| jdur|j| jj"|d}n	|j| j#j"|d}|j\}}} |$d
|d
}|%|| |d	}|r9|	jd
 }| jdur |	j| jj"|d}	n	|	j| j#j"|d}	|	$d
|d
}	|	%|| |d	}	|
$d
|%|| d	}
|rT|$d
|%|| d	}| jdurit| tritrit&| j| | jdur~t| tr~tr~t&| j| ||	|
|fS )ae  
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `list[str]`, *optional*):
                prompt to be encoded
            prompt_2 (`str` or `list[str]`, *optional*):
                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
                used in both text-encoders
            device: (`torch.device`):
                torch device
            num_images_per_prompt (`int`):
                number of images that should be generated per prompt
            prepare_unconditional_embeds (`bool`):
                whether to use prepare unconditional embeddings or not
            negative_prompt (`str` or `list[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            negative_prompt_2 (`str` or `list[str]`, *optional*):
                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
            prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
                provided, text embeddings will be generated from `prompt` input argument.
            negative_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
            pooled_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
                If not provided, pooled text embeddings will be generated from `prompt` input argument.
            negative_pooled_prompt_embeds (`torch.Tensor`, *optional*):
                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
                input argument.
            lora_scale (`float`, *optional*):
                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
            clip_skip (`int`, *optional*):
                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                the output of the pre-final layer will be used for computing the prompt embeddings.
        Nr   
max_lengthTrK   )paddingr   
truncationrL   longest)r   rL   r   z\The following part of your input was truncated because CLIP can only handle sequences up to z	 tokens: rP   rR   r   rS    z?`negative_prompt` should be the same type to `prompt`, but got z != .z`negative_prompt`: z has batch size z, but `prompt`: zT. Please make sure that passed `negative_prompt` matches the batch size of `prompt`.)rO   rN   )'r|   rW   r   _lora_scaler   r   r   r   r   r   rf   shaper   r   rj   r   maybe_convert_promptmodel_max_length	input_idsrH   equalbatch_decodeloggerwarningrY   rZ   rk   concatr9   r   r\   r   	TypeErrorri   rO   r;   repeatviewr   )!r^   r   r   rN   r`   rp   r   r   r   r   r   r   r   r   
batch_size
tokenizerstext_encodersprompt_embeds_listpromptsr   r   text_inputstext_input_idsuntruncated_idsremoved_textzero_out_negative_promptuncond_tokensnegative_prompt_embeds_listr   uncond_inputbs_embedseq_len_r)   r)   r*   encode_prompt%  s  
;







 






z.StableDiffusionXLTextEncoderStep.encode_promptr^   ry   c                 C   s   |  |}| | |jjdk|_|j|_|jd ur!|jdd nd |_	| j
||j|j|jd|j|j|jd d d d |j	|jd\|_|_|_|_| || ||fS )Nr   scale)r   r   r   r   r   r   )rz   r   r<   r{   rp   r|   rN   r   gettext_encoder_lora_scaler   r   r   r   r   r   r   r   r   r   r}   )r0   r^   ry   r~   r)   r)   r*   r     s>   



z)StableDiffusionXLTextEncoderStep.__call__)NNr   TNNNNNNNN)r   r   r   r   r   r   r1   re   r   r?   r   r   r   rB   r   rJ   r   r   rH   rN   intboolrI   floatr   r   r   r   r   r)   r)   r)   r*   r      sr    


	
 sr   c                   @   s   e Zd ZdZedefddZedee fddZ	edee
 fddZedee fd	d
ZdejdejfddZe dededefddZdS )StableDiffusionXLVaeEncoderStepr-   r.   c                 C   r   )NzIVae Encoder step that encode the input image into a latent representationr)   r/   r)   r)   r*   r1   D  r   z+StableDiffusionXLVaeEncoderStep.descriptionc                 C   s"   t dtt dttddiddgS )Nvaeimage_processorvae_scale_factor   r7   r8   r   r   r   r   r/   r)   r)   r*   r?   H  s   
z3StableDiffusionXLVaeEncoderStep.expected_componentsc              
   C   s>   t dddt dt dt dt dtjdd	t d
td B dd	gS )Nr_   TrA   heightwidthr!   rO   z Data type of model tensor inputsrD   preprocess_kwargszA kwargs dictionary that if specified is passed along to the `ImageProcessor` as defined under `self.image_processor` in [diffusers.image_processor.VaeImageProcessor])r   rH   rO   dictr/   r)   r)   r*   rB   T  s   
z&StableDiffusionXLVaeEncoderStep.inputsc                 C   s   t dtjddgS )Nimage_latentszUThe latents representing the reference image for image-to-image/inpainting generationrD   rG   r/   r)   r)   r*   rJ   c  s   z4StableDiffusionXLVaeEncoderStep.intermediate_outputsr_   r!   c                    sj  d  }}t  jjdr  jjjd ur t jjjdddd}t  jjdr< jjjd ur<t jjjdddd}j} jjj	rP
  jjtjd ttrm fddtjd D }tj|dd	}n
t jd
} jjj	r j| ||}|d ur|d ur|j|j|d}|j|j|d}||  jjj | }|S  jjj| }|S )Nlatents_meanr      latents_stdrO   c              	      0   g | ]}t  j||d   | dqS r   r!   r+   r   encode.0rw   r^   r!   r_   r)   r*   
<listcomp>|      "zEStableDiffusionXLVaeEncoderStep._encode_vae_image.<locals>.<listcomp>r   rS   r   rM   r&   r   r9   r   rH   tensorr   r   rO   force_upcastr   rY   float32rW   re   ranger   rn   r+   r   rN   scaling_factorr0   r^   r_   r!   r   r   rO   r   r)   r   r*   _encode_vae_imageo  2   



z1StableDiffusionXLVaeEncoderStep._encode_vae_imager^   ry   c                 C   s   |  |}|jp	i |_|j|_|jd ur|jn|jj|_|jj|jf|j	|j
d|j}|j|j|jd}|jd |_t|jtrYt|j|jkrYtdt|j d|j d| j|||jd|_| || ||fS )N)r   r   rM   r   z/You have passed a list of generators of length z+, but requested an effective batch size of z@. Make sure the batch size matches the length of the generators.r_   r!   )rz   r   r|   rN   rO   r   r   
preprocessr_   r   r   rY   r   r   rW   r!   re   rf   ri   r   r   r}   )r0   r^   ry   r~   r_   r)   r)   r*   r     s,   
z(StableDiffusionXLVaeEncoderStep.__call__N)r   r   r   r   r   r   r1   re   r   r?   r   rB   r   rJ   rH   rI   	Generatorr   r   r   r   r   r)   r)   r)   r*   r   A  s    "r   c                   @   s   e Zd ZdZedee fddZedefddZ	edee
 fddZedee fd	d
ZdejdejfddZdd Ze dededefddZdS )&StableDiffusionXLInpaintVaeEncoderStepr-   r.   c                 C   s>   t dtt dttddiddt dttddd	d	d
ddgS )Nr   r   r   r   r7   r8   mask_processorFT)do_normalizer   do_binarizedo_convert_grayscaler   r/   r)   r)   r*   r?     s    
z:StableDiffusionXLInpaintVaeEncoderStep.expected_componentsc                 C   r   )NzLVae encoder step that prepares the image and mask for the inpainting processr)   r/   r)   r)   r*   r1     r   z2StableDiffusionXLInpaintVaeEncoderStep.descriptionc              
   C   s>   t dt dt dddt dddt dt dtjd	d
t dgS )Nr   r   r_   Tr   
mask_imagepadding_mask_croprO   zThe dtype of the model inputsrD   r!   )r   rH   rO   r/   r)   r)   r*   rB     s   

z-StableDiffusionXLInpaintVaeEncoderStep.inputsc                 C   sF   t dtjddt dtjddt dtjddt dtttf d B d	dgS )
Nr   z-The latents representation of the input imagerD   maskz*The mask to use for the inpainting processmasked_image_latentsz^The masked image latents to use for the inpainting process (only for inpainting-specifid unet)crops_coordszPThe crop coordinates to use for the preprocess/postprocess of the image and mask)r   rH   rI   tupler   r/   r)   r)   r*   rJ     s   z;StableDiffusionXLInpaintVaeEncoderStep.intermediate_outputsr_   r!   c                    sj  d  }}t  jjdr  jjjd ur t jjjdddd}t  jjdr< jjjd ur<t jjjdddd}j} jjj	rP
  jjtjd ttrm fddtjd D }tj|dd	}n
t jd
} jjj	r j| ||}|d ur|d ur|j|j|d}|j|j|d}|| | jjj | }|S  jjj| }|S )Nr   r   r   r   r   c              	      r   r   r   r   r   r)   r*   r     r   zLStableDiffusionXLInpaintVaeEncoderStep._encode_vae_image.<locals>.<listcomp>r   rS   r   rM   r   r   r)   r   r*   r     r   z8StableDiffusionXLInpaintVaeEncoderStep._encode_vae_imagec
                 C   s@  t jjj|||j ||j fd}|j||d}|jd |k rC||jd  dks6td| d|jd  d|||jd  ddd}|d urQ|jd dkrQ|}
nd }
|d ur|
d u rj|j||d}| j	|||	d	}
|
jd |k r||
jd  dkstd
| d|
jd  d|
||
jd  ddd}
|
j||d}
||
fS )N)r5   rM   r   zvThe passed mask and the required batch size don't match. Masks are supposed to be duplicated to a total batch size of z, but zo masks were passed. Make sure the number of masks that you pass is divisible by the total requested batch size.r   r   r   zyThe passed images and the required batch size don't match. Images are supposed to be duplicated to a total batch size of zq images were passed. Make sure the number of images that you pass is divisible by the total requested batch size.)
rH   nn
functionalinterpolater   rY   r   ri   r   r   )r0   r^   r   masked_imager   r   r   rO   rN   r!   r   r)   r)   r*   prepare_mask_latents  sF   z;StableDiffusionXLInpaintVaeEncoderStep.prepare_mask_latentsr^   ry   c                 C   s^  |  |}|jd ur|jn|jj|_|j|_|jd u r|j|_|jd u r(|j|_|j	d ur@|j
j|j|j|j|j	d|_d|_nd |_d|_|jj|j|j|j|j|jd}|jtjd}|j
j|j|j|j|j|jd}||dk  |_|jd |_|j|j|jd	}| j|||jd
|_| |||j|j|j|j|j|j|j	\|_|_| || ||fS )N)padfilldefault)r   r   r   resize_moder   )r   r   r  r   g      ?r   rM   r   )rz   rO   r   r|   rN   r   default_heightr   default_widthr   r   get_crop_regionr   r   r  r   r   r_   rY   rH   r   r   r   r   r   r!   r   r   r   r   r}   )r0   r^   ry   r~   r_   r   r)   r)   r*   r   ;  s\   



z/StableDiffusionXLInpaintVaeEncoderStep.__call__N)r   r   r   r   r   re   r   r?   r   r1   r   rB   r   rJ   rH   rI   r   r   r   r   r   r   r   r)   r)   r)   r*   r     s    $/r   )Nr   )0rH   transformersr   r   r   r   r   configuration_utilsr   guidersr	   r   r
   r   loadersr   r   modelsr   r   r   models.lorar   utilsr   r   r   r   modular_pipeliner   r   modular_pipeline_utilsr   r   r   r   r   
get_loggerr   r   rI   r   r   r+   r,   r   r   r   r)   r)   r)   r*   <module>   s8   

   kk