import html

import regex as re
import torch
from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5TokenizerFast

from ...configuration_utils import FrozenDict
from ...image_processor import VaeImageProcessor, is_valid_image, is_valid_image_imagelist
from ...loaders import FluxLoraLoaderMixin, TextualInversionLoaderMixin
from ...models import AutoencoderKL
from ...utils import USE_PEFT_BACKEND, is_ftfy_available, logging, scale_lora_layers, unscale_lora_layers
from ..modular_pipeline import ModularPipelineBlocks, PipelineState
from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam
from .modular_pipeline import FluxModularPipeline


if is_ftfy_available():
    import ftfy

logger = logging.get_logger(__name__)


def basic_clean(text):
    text = ftfy.fix_text(text)
    text = html.unescape(html.unescape(text))
    return text.strip()
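
# Illustrative behaviour of the cleaning helper above (a sketch, assuming ftfy is installed so
# `is_ftfy_available()` returned True and `ftfy` was imported). The double `html.unescape` also
# resolves doubly-escaped entities:
#
#     basic_clean("Caf&amp;eacute;  ")  # -> "Café"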


def whitespace_clean(text):
    text = re.sub(r"\s+", " ", text)
    text = text.strip()
    return text


def prompt_clean(text):
    text = whitespace_clean(basic_clean(text))
    return text


def retrieve_latents(
    encoder_output: torch.Tensor, generator: torch.Generator | None = None, sample_mode: str = "sample"
):
    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
        return encoder_output.latent_dist.sample(generator)
    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
        return encoder_output.latent_dist.mode()
    elif hasattr(encoder_output, "latents"):
        return encoder_output.latents
    else:
        raise AttributeError("Could not access latents of provided encoder_output")


def encode_vae_image(vae: AutoencoderKL, image: torch.Tensor, generator: torch.Generator, sample_mode: str = "sample"):
    if isinstance(generator, list):
        image_latents = [
            retrieve_latents(vae.encode(image[i : i + 1]), generator=generator[i], sample_mode=sample_mode)
            for i in range(image.shape[0])
        ]
        image_latents = torch.cat(image_latents, dim=0)
    else:
        image_latents = retrieve_latents(vae.encode(image), generator=generator, sample_mode=sample_mode)

    image_latents = (image_latents - vae.config.shift_factor) * vae.config.scaling_factor

    return image_latents
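
# Illustrative use of the two helpers above (a sketch, assuming a loaded `AutoencoderKL` named
# `vae` and a preprocessed image tensor of shape (batch, channels, height, width) on the same
# device/dtype as the VAE; the variable names are for illustration only):
#
#     posterior = vae.encode(image)
#     latents = retrieve_latents(posterior, generator=torch.Generator().manual_seed(0))
#     # or, shifted and scaled the way the Flux blocks below expect:
#     image_latents = encode_vae_image(vae=vae, image=image, generator=torch.Generator().manual_seed(0))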


class FluxProcessImagesInputStep(ModularPipelineBlocks):
    model_name = "flux"

    @property
    def description(self) -> str:
        return "Image Preprocess step."

    @property
    def expected_components(self) -> list[ComponentSpec]:
        return [
            ComponentSpec(
                "image_processor",
                VaeImageProcessor,
                config=FrozenDict({"vae_scale_factor": 16, "vae_latent_channels": 16}),
                default_creation_method="from_config",
            ),
        ]

    @property
    def inputs(self) -> list[InputParam]:
        return [
            InputParam("resized_image"),
            InputParam("image"),
            InputParam("height"),
            InputParam("width"),
        ]

    @property
    def intermediate_outputs(self) -> list[OutputParam]:
        return [OutputParam(name="processed_image")]

    @staticmethod
    def check_inputs(height, width, vae_scale_factor):
        if height is not None and height % (vae_scale_factor * 2) != 0:
            raise ValueError(f"Height must be divisible by {vae_scale_factor * 2} but is {height}")

        if width is not None and width % (vae_scale_factor * 2) != 0:
            raise ValueError(f"Width must be divisible by {vae_scale_factor * 2} but is {width}")

    @torch.no_grad()
    def __call__(self, components: FluxModularPipeline, state: PipelineState):
        block_state = self.get_block_state(state)

        if block_state.resized_image is None and block_state.image is None:
            raise ValueError("`resized_image` and `image` cannot be None at the same time")

        if block_state.resized_image is None:
            image = block_state.image
            self.check_inputs(
                height=block_state.height, width=block_state.width, vae_scale_factor=components.vae_scale_factor
            )
            height = block_state.height or components.default_height
            width = block_state.width or components.default_width
        else:
            width, height = block_state.resized_image[0].size
            image = block_state.resized_image

        block_state.processed_image = components.image_processor.preprocess(image=image, height=height, width=width)

        self.set_block_state(state, block_state)
        return components, state


class FluxKontextProcessImagesInputStep(ModularPipelineBlocks):
    model_name = "flux-kontext"

    @property
    def description(self) -> str:
        return (
            "Image preprocess step for Flux Kontext. The preprocessed image goes to the VAE encoder. "
            "Kontext also works as a T2I model when no input image is provided."
        )

    @property
    def expected_components(self) -> list[ComponentSpec]:
        return [
            ComponentSpec(
                "image_processor",
                VaeImageProcessor,
                config=FrozenDict({"vae_scale_factor": 16}),
                default_creation_method="from_config",
            ),
        ]

    @property
    def inputs(self) -> list[InputParam]:
        return [InputParam("image"), InputParam("_auto_resize", type_hint=bool, default=True)]

    @property
    def intermediate_outputs(self) -> list[OutputParam]:
        return [OutputParam(name="processed_image")]

    @torch.no_grad()
    def __call__(self, components: FluxModularPipeline, state: PipelineState):
        from ...pipelines.flux.pipeline_flux_kontext import PREFERRED_KONTEXT_RESOLUTIONS

        block_state = self.get_block_state(state)
        images = block_state.image

        if images is None:
            block_state.processed_image = None
        else:
            multiple_of = components.image_processor.config.vae_scale_factor
            if not is_valid_image_imagelist(images):
                raise ValueError(f"Images must be image or list of images but are {type(images)}")
            if is_valid_image(images):
                images = [images]

            img = images[0]
            image_height, image_width = components.image_processor.get_default_height_width(img)
            aspect_ratio = image_width / image_height
            if block_state._auto_resize:
                # Kontext is trained on specific resolutions; snap to the closest preferred one.
                _, image_width, image_height = min(
                    (abs(aspect_ratio - w / h), w, h) for w, h in PREFERRED_KONTEXT_RESOLUTIONS
                )
            image_width = image_width // multiple_of * multiple_of
            image_height = image_height // multiple_of * multiple_of
            images = components.image_processor.resize(images, image_height, image_width)
            block_state.processed_image = components.image_processor.preprocess(images, image_height, image_width)

        self.set_block_state(state, block_state)
        return components, state


class FluxVaeEncoderStep(ModularPipelineBlocks):
    model_name = "flux"

    def __init__(
        self,
        input_name: str = "processed_image",
        output_name: str = "image_latents",
        sample_mode: str = "sample",
    ):
        """Initialize a VAE encoder step for converting images to latent representations.

        Both the input and output names are configurable, so the same block can be reused for different image
        inputs (e.g., "processed_image" -> "image_latents", "processed_control_image" -> "control_image_latents").

        Args:
            input_name (str, optional): Name of the input image tensor. Defaults to "processed_image".
                Examples: "processed_image" or "processed_control_image"
            output_name (str, optional): Name of the output latent tensor. Defaults to "image_latents".
                Examples: "image_latents" or "control_image_latents"
            sample_mode (str, optional): How latents are drawn from the VAE posterior, either "sample" or
                "argmax". Defaults to "sample".

        Examples:
            # Basic usage with the default input/output names:
            FluxVaeEncoderStep()

            # Custom input/output names for a control image:
            FluxVaeEncoderStep(
                input_name="processed_control_image", output_name="control_image_latents"
            )
        N)_image_input_name_image_latents_output_namer.   super__init__)rO   r   r   r.   	__class__r#   r$   r      s   zFluxVaeEncoderStep.__init__rL   c                 C   s   d| j  d| j dS )Nz'Dynamic VAE Encoder step that converts z into latent representations z.
)r   r   rN   r#   r#   r$   rP      r   zFluxVaeEncoderStep.descriptionc                 C   s   t dtg}|S )Nr6   )r   r   )rO   ri   r#   r#   r$   rZ      s   z&FluxVaeEncoderStep.expected_componentsc                 C   s   t | jt dg}|S )Nr-   )r   r   )rO   r^   r#   r#   r$   r^      s   zFluxVaeEncoderStep.inputsc                 C   s   t | jtjddgS )Nz,The latents representing the reference image)r}   rP   )r   r   rC   TensorrN   r#   r#   r$   rd      s   z'FluxVaeEncoderStep.intermediate_outputsri   rj   c                 C   s   |  |}t|| j}|d u rt|| jd  n |j}|jj}|j||d}t	||j|j
| jd}t|| j| | || ||fS )N)devicedtype)r7   r6   r-   r.   )rk   getattrr   setattrr   _execution_devicer6   r   torI   r-   r.   rp   )rO   ri   rj   rq   r7   r   r   rH   r#   r#   r$   rr     s   
zFluxVaeEncoderStep.__call__)ra   rH   r+   )rs   rt   ru   rv   rx   r   rw   rP   r@   r   rZ   r   r^   r   rd   rC   rz   r   r   rr   __classcell__r#   r#   r   r$   r      s(    	 r   c                   @   s.  e Zd ZdZedefddZedee fddZ	edee
 fddZedee fd	d
Zedd Zedeee B dedejfddZedeee B dejfddZe					d deee B deee B dejdB dejdB dejdB dededB fddZe dededefddZdS )!FluxTextEncoderSteprK   rL   c                 C   rM   )NzMText Encoder step that generate text_embeddings to guide the image generationr#   rN   r#   r#   r$   rP      rQ   zFluxTextEncoderStep.descriptionc                 C   s$   t dtt dtt dtt dtgS )Ntext_encoder	tokenizertext_encoder_2tokenizer_2)r   r   r   r   r   rN   r#   r#   r$   rZ   $  s
   z'FluxTextEncoderStep.expected_componentsc                 C   s$   t dt dt dtdddt dgS )Npromptprompt_2max_sequence_length   F)r}   r~   requiredjoint_attention_kwargs)r   intrN   r#   r#   r$   r^   -  s
   zFluxTextEncoderStep.inputsc                 C   s$   t ddtjddt ddtjddgS )Nprompt_embedsdenoiser_input_fieldsz2text embeddings used to guide the image generation)kwargs_typer}   rP   pooled_prompt_embedsz9pooled text embeddings used to guide the image generation)r   rC   r   rN   r#   r#   r$   rd   6  s   z(FluxTextEncoderStep.intermediate_outputsc                 C   sD   | j | jfD ]}|d urt|tst|tstdt| qd S )Nz@`prompt` or `prompt_2` has to be of type `str` or `list` but is )r   r   r?   rx   r@   rf   r   )rq   r   r#   r#   r$   rh   G  s
   z FluxTextEncoderStep.check_inputsr   r   r   c           
   	   C   s   | j j}t|tr|gn|}t| tr| || j}| j|d|ddddd}|j}| j|dddj}|jd |jd kr]t	
||s]| j|d d |d	 df }td
| d|  | j ||ddd }	|	j||d}	|	S )N
max_lengthTFpt)paddingr   
truncationreturn_lengthreturn_overflowing_tokensreturn_tensorslongestr   r   r   zXThe following part of your input was truncated because `max_sequence_length` is set to  	 tokens: output_hidden_statesr   r   r   )r   r   r?   rx   r   maybe_convert_promptr   	input_idsrB   rC   equalbatch_decodeloggerwarningr   )
ri   r   r   r   r   text_inputstext_input_idsuntruncated_idsremoved_textr   r#   r#   r$   _get_t5_prompt_embedsM  s6   
	  z)FluxTextEncoderStep._get_t5_prompt_embedsc           	   	   C   s   t |tr|gn|}t | tr| || j}| j|d| jjddddd}|j}| jj}| j|dddj}|jd |jd kr_t	||s_| j
|d d |d	 df }td
| d|  | j||dd}|j}|j| jj|d}|S )Nr   TFr   )r   r   r   r   r   r   r   r   r   r   z\The following part of your input was truncated because CLIP can only handle sequences up to r   r   r   )r?   rx   r   r   r   model_max_lengthr   rB   rC   r   r   r   r   r   r   pooler_outputr   )	ri   r   r   r   r   tokenizer_max_lengthr   r   r   r#   r#   r$   _get_clip_prompt_embedsl  s8   

  z+FluxTextEncoderStep._get_clip_prompt_embedsNr   r   r   r   
lora_scalec                 C   s   |p| j }|d ur+t| tr+|| _| jd urtrt| j| | jd ur+tr+t| j| t|tr3|gn|}|d u rX|p<|}t|trE|gn|}t	j
| ||d}t	j| |||d}| jd urjt| trjtrjt| j| | jd ur|t| tr|tr|t| j| ||fS )N)r   r   )r   r   r   )r   r?   r   _lora_scaler   r   r   r   rx   r   r   r   r   )ri   r   r   r   r   r   r   r   r#   r#   r$   encode_prompt  s:   


z!FluxTextEncoderStep.encode_promptri   rj   c              
   C   sx   |  |}| | |j|_|jd ur|jdd nd |_| j||jd d d |j|j	|jd\|_
|_| || ||fS )Nscale)r   r   r   r   r   r   r   )rk   rh   r   r   r   gettext_encoder_lora_scaler   r   r   r   r   rp   )rO   ri   rj   rq   r#   r#   r$   rr     s&   


zFluxTextEncoderStep.__call__)NNNr   N)rs   rt   ru   rv   rw   rx   rP   r@   r   rZ   r   r^   r   rd   ry   rh   r   rC   r   r   r   FloatTensorfloatr   rz   r   r   rr   r#   r#   r#   r$   r     sN    
"!

6r   )Nr+   )r+   )2r   regexr'   rC   transformersr   r   r   r   configuration_utilsr   rR   r   r	   r
   loadersr   r   modelsr   utilsr   r   r   r   r   modular_pipeliner   r   modular_pipeline_utilsr   r   r   r   r   
get_loggerrs   r   r%   r)   r*   r   	Generatorrx   r5   rI   rJ   r{   r   r   r#   r#   r#   r$   <module>   s>   

:BM