o
    ۷iZW                     @   s   d dl Z d dlmZmZmZmZ ddlmZ ddlm	Z	 ddl
mZ ddlmZ dd	lmZmZ dd
lmZmZmZmZ ddlmZmZ eeZd!dee defddZ	d"de jde jdB defddZ G dd deZ!G dd deZ"G dd deZ#G dd deZ$G dd  d eZ%dS )#    N)AutoProcessor Mistral3ForConditionalGenerationQwen2TokenizerFastQwen3ForCausalLM   )
FrozenDict)ClassifierFreeGuidance)AutoencoderKLFlux2)logging   )ModularPipelineBlocksPipelineState)ComponentSpec
ConfigSpec
InputParamOutputParam   )Flux2KleinModularPipelineFlux2ModularPipelinepromptssystem_messagec                    s    dd | D } fdd|D S )z*Format prompts for Mistral3 chat template.c                 S   s   g | ]}| d dqS )z[IMG] )replace.0prompt r   `/home/ubuntu/vllm_env/lib/python3.10/site-packages/diffusers/modular_pipelines/flux2/encoders.py
<listcomp>!   s    z%format_text_input.<locals>.<listcomp>c                    s0   g | ]}d d dgddd|dgdgqS )systemtext)typer    rolecontentuserr   r   r   r   r   r   #   s    
r   )r   r   cleaned_txtr   r&   r   format_text_input   s   
r(   sampleencoder_output	generatorsample_modec                 C   sR   t | dr|dkr| j|S t | dr|dkr| j S t | dr%| jS td)Nlatent_distr)   argmaxlatentsz3Could not access latents of provided encoder_output)hasattrr-   r)   moder/   AttributeError)r*   r+   r,   r   r   r   retrieve_latents0   s   

r3   c                   @   s   e Zd ZdZdZedefddZedee	 fddZ
edee fdd	Zedee fd
dZedd Ze					ddededeee B dejdB dejdB dededee fddZe dededefddZdS ) Flux2TextEncoderStepflux2You are an AI that reasons about image descriptions. You give structured responses focusing on object relationships, object attribution and actions without speculation.returnc                 C      dS )Nz]Text Encoder step that generates text embeddings using Mistral3 to guide the image generationr   selfr   r   r   descriptionD      z Flux2TextEncoderStep.descriptionc                 C      t dtt dtgS Ntext_encoder	tokenizer)r   r   r   r9   r   r   r   expected_componentsH      z(Flux2TextEncoderStep.expected_componentsc                 C   *   t dt dtdddt dtt dddgS )Nr   max_sequence_length   F	type_hintdefaultrequiredtext_encoder_out_layers
         r   inttupler9   r   r   r   inputsO      zFlux2TextEncoderStep.inputsc                 C      t ddtjddgS )Nprompt_embedsdenoiser_input_fieldsz@Text embeddings from Mistral3 used to guide the image generationkwargs_typerG   r;   r   torchTensorr9   r   r   r   intermediate_outputsW      z)Flux2TextEncoderStep.intermediate_outputsc                 C   @   | j }|d urt|tst|tstdt| d S d S d S Nz2`prompt` has to be of type `str` or `list` but is r   
isinstancestrlist
ValueErrorr!   block_stater   r   r   r   check_inputsb   s   z!Flux2TextEncoderStep.check_inputsNrE   rK   r?   r@   r   dtypedevicerD   r   hidden_states_layersc              
      s   |d u r| j n|}|d u r| jn|}t|tr|gn|}t||d}|j|dddddd|d}	|	d |}
|	d |}| |
|ddd	 tj fd
d|D dd}|j||d}|j	\}}}}|
dddd|||| }|S )N)r   r   FTpt
max_length)add_generation_prompttokenizereturn_dictreturn_tensorspadding
truncationrl   	input_idsattention_maskrs   rt   output_hidden_states	use_cachec                       g | ]} j | qS r   hidden_statesr   koutputr   r   r          zEFlux2TextEncoderStep._get_mistral_3_prompt_embeds.<locals>.<listcomp>r   dimrh   ri   r   r   r   )rh   ri   ra   rb   r(   apply_chat_templatetorZ   stackshapepermutereshape)r?   r@   r   rh   ri   rD   r   rj   messages_batchrR   rs   rt   out
batch_sizenum_channelsseq_len
hidden_dimrU   r   r}   r   _get_mistral_3_prompt_embedsh   s6   z1Flux2TextEncoderStep._get_mistral_3_prompt_embeds
componentsstatec              	   C   s|   |  |}| | |j|_|j}|d u rd}t|tr|gn|}| j|j|j	||j|j
| j|jd|_| || ||fS )Nr   )r?   r@   r   ri   rD   r   rj   )get_block_staterg   _execution_deviceri   r   ra   rb   r   r?   r@   rD   DEFAULT_SYSTEM_MESSAGErJ   rU   set_block_state)r:   r   r   rf   r   r   r   r   __call__   s$   


zFlux2TextEncoderStep.__call__)NNrE   r6   rK   )__name__
__module____qualname__
model_namer   propertyrb   r;   rc   r   rA   r   rR   r   r\   staticmethodrg   r   r   rZ   rh   ri   rP   rQ   r   no_gradr   r   r   r   r   r   r   r4   =   sL    



0r4   c                   @   s   e Zd ZdZdZedefddZedee	 fddZ
edee fdd	Zedee fd
dZedd Ze dededefddZdS )Flux2RemoteTextEncoderStepr5   z9https://remote-text-encoder-flux-2.huggingface.co/predictr7   c                 C   r8   )NzLText Encoder step that generates text embeddings using a remote API endpointr   r9   r   r   r   r;      r<   z&Flux2RemoteTextEncoderStep.descriptionc                 C   s   g S Nr   r9   r   r   r   rA      r<   z.Flux2RemoteTextEncoderStep.expected_componentsc                 C   s
   t dgS )Nr   )r   r9   r   r   r   rR      s   z!Flux2RemoteTextEncoderStep.inputsc                 C   rT   )NrU   rV   zBText embeddings from remote API used to guide the image generationrW   rY   r9   r   r   r   r\      r]   z/Flux2RemoteTextEncoderStep.intermediate_outputsc                 C   sB   | j }|d urt|tst|tstdt| j  d S d S d S r_   r`   re   r   r   r   rg      s   z'Flux2RemoteTextEncoderStep.check_inputsr   r   c           	      C   s   dd l }dd l}ddlm} | |}| | |j|_|j}|d u r%d}t	|t
r-|gn|}|j| jd|id|  ddd}|  tj||jd	d
|_|j|j|_| || ||fS )Nr   )	get_tokenr   r   zBearer zapplication/json)AuthorizationzContent-Type)jsonheadersT)weights_only)iorequestshuggingface_hubr   r   rg   r   ri   r   ra   rb   post
REMOTE_URLraise_for_statusrZ   loadBytesIOr$   rU   r   r   )	r:   r   r   r   r   r   rf   r   responser   r   r   r      s,   


z#Flux2RemoteTextEncoderStep.__call__N)r   r   r   r   r   r   rb   r;   rc   r   rA   r   rR   r   r\   r   rg   rZ   r   r   r   r   r   r   r   r   r      s    

r   c                   @      e Zd ZdZedefddZedee fddZ	edee
 fddZedee fd	d
Zedee fddZedd Ze				ddededeee B dejdB dejdB dedee fddZe dededefddZdS ) Flux2KleinTextEncoderStepflux2-kleinr7   c                 C   r8   NzZText Encoder step that generates text embeddings using Qwen3 to guide the image generationr   r9   r   r   r   r;      r<   z%Flux2KleinTextEncoderStep.descriptionc                 C   r=   r>   )r   r   r   r9   r   r   r   rA      rB   z-Flux2KleinTextEncoderStep.expected_componentsc                 C      t dddgS )Nis_distilledTnamerH   r   r9   r   r   r   expected_configs     
z*Flux2KleinTextEncoderStep.expected_configsc                 C   rC   Nr   rD   rE   FrF   rJ   	         rO   r9   r   r   r   rR     rS   z Flux2KleinTextEncoderStep.inputsc                 C   rT   )NrU   rV   =Text embeddings from qwen3 used to guide the image generationrW   rY   r9   r   r   r   r\     r]   z.Flux2KleinTextEncoderStep.intermediate_outputsc                 C   r^   r_   r`   re   r   r   r   rg         z&Flux2KleinTextEncoderStep.check_inputsNrE   r   r?   r@   r   rh   ri   rD   rj   c                    (  |d u r| j n|}|d u r| jn|}t|tr|gn|}g }g }|D ](}	d|	dg}
|j|
dddd}||ddd|d}||d	  ||d
  q"tj|dd|}tj|dd|}| ||ddd tj	 fdd|D dd}|j||d}|j
\}}}}|dddd|||| }|S )Nr%   r"   FTrn   rm   enable_thinkingrk   rl   rp   rq   rr   rl   rs   rt   r   r   ru   c                    rx   r   ry   r{   r}   r   r   r   Y  r   zFFlux2KleinTextEncoderStep._get_qwen3_prompt_embeds.<locals>.<listcomp>r   r   r   r   rh   ri   ra   rb   r   appendrZ   catr   r   r   r   r   r?   r@   r   rh   ri   rD   rj   all_input_idsall_attention_maskssingle_promptmessagesr    rR   rs   rt   r   r   r   r   r   rU   r   r}   r   _get_qwen3_prompt_embeds'  F   z2Flux2KleinTextEncoderStep._get_qwen3_prompt_embedsr   r   c                 C   st   |  |}| | |j}|j}|d u rd}t|tr|gn|}| j|j|j|||j	|j
d|_| || ||fS Nr   )r?   r@   r   ri   rD   rj   )r   rg   r   r   ra   rb   r   r?   r@   rD   rJ   rU   r   )r:   r   r   rf   ri   r   r   r   r   r   a  s"   

	z"Flux2KleinTextEncoderStep.__call__NNrE   r   r   r   r   r   r   rb   r;   rc   r   rA   r   r   r   rR   r   r\   r   rg   r   r   rZ   rh   ri   rP   r   r   r   r   r   r   r   r   r   r      sH    


8r   c                   @   r   ) Flux2KleinBaseTextEncoderStepr   r7   c                 C   r8   r   r   r9   r   r   r   r;   }  r<   z)Flux2KleinBaseTextEncoderStep.descriptionc                 C   s*   t dtt dtt dttddiddgS )Nr?   r@   guiderguidance_scaleg      @from_config)configdefault_creation_method)r   r   r   r   r   r9   r   r   r   rA     s   
z1Flux2KleinBaseTextEncoderStep.expected_componentsc                 C   r   )Nr   Fr   r   r9   r   r   r   r     r   z.Flux2KleinBaseTextEncoderStep.expected_configsc                 C   rC   r   rO   r9   r   r   r   rR     rS   z$Flux2KleinBaseTextEncoderStep.inputsc                 C   s$   t ddtjddt ddtjddgS )NrU   rV   r   rW   negative_prompt_embedszFNegative text embeddings from qwen3 used to guide the image generationrY   r9   r   r   r   r\     s   z2Flux2KleinBaseTextEncoderStep.intermediate_outputsc                 C   r^   r_   r`   re   r   r   r   rg     r   z*Flux2KleinBaseTextEncoderStep.check_inputsNrE   r   r?   r@   r   rh   ri   rD   rj   c                    r   )Nr%   r"   FTr   rk   rl   r   rs   rt   r   r   ru   c                    rx   r   ry   r{   r}   r   r   r     r   zJFlux2KleinBaseTextEncoderStep._get_qwen3_prompt_embeds.<locals>.<listcomp>r   r   r   r   r   r   r   r}   r   r     r   z6Flux2KleinBaseTextEncoderStep._get_qwen3_prompt_embedsr   r   c                 C   s   |  |}| | |j}|j}|d u rd}t|tr|gn|}| j|j|j|||j	|j
d|_|jrKdgt| }| j|j|j|||j	|j
d|_nd |_| || ||fS r   )r   rg   r   r   ra   rb   r   r?   r@   rD   rJ   rU   requires_unconditional_embedslenr   r   )r:   r   r   rf   ri   r   negative_promptr   r   r   r     s8   

	
	z&Flux2KleinBaseTextEncoderStep.__call__r   r   r   r   r   r   r   z  sH    

8r   c                   @   s   e Zd ZdZedefddZedee fddZ	edee
 fddZedee fd	d
Zedd ZdedejdejfddZe dededefddZdS )Flux2VaeEncoderStepr5   r7   c                 C   r8   )NzXVAE Encoder step that encodes preprocessed images into latent representations for Flux2.r   r9   r   r   r   r;     r<   zFlux2VaeEncoderStep.descriptionc                 C   s   t dtgS )Nvae)r   r	   r9   r   r   r   rA     s   z'Flux2VaeEncoderStep.expected_componentsc                 C   s   t dttj dt dgS )Ncondition_images)rG   r+   )r   rc   rZ   r[   r9   r   r   r   rR     s   zFlux2VaeEncoderStep.inputsc                 C   s   t dttj ddgS )Nimage_latentsz7List of latent representations for each reference image)rG   r;   )r   rc   rZ   r[   r9   r   r   r   r\   &  s   z(Flux2VaeEncoderStep.intermediate_outputsc                 C   s^   | j \}}}}| |||d d|d d} | dddddd} | ||d |d |d } | S )z/Convert latents to patchified format for Flux2.r   r   r   r         )r   viewr   r   )r/   r   num_channels_latentsheightwidthr   r   r   _patchify_latents0  s
   z%Flux2VaeEncoderStep._patchify_latentsr   imager+   c                 C   s   |j dkrtd|j  dt|||dd}| |}|jjdddd|j	|j
}t|jjdddd|jj }||j	|j
}|| | }|S )zDEncode a single image using Flux2 VAE with batch norm normalization.r   zExpected image dims 4, got .r.   )r+   r,   r   )ndimrd   r3   encoder   bnrunning_meanr   r   ri   rh   rZ   sqrtrunning_varr   batch_norm_eps)r:   r   r   r+   r   latents_bn_meanlatents_bn_stdr   r   r   _encode_vae_image9  s   

 "z%Flux2VaeEncoderStep._encode_vae_imager   r   c           
      C   s   |  |}|j}|d u r||fS |j}|jj}g }|D ]}|j||d}| j|j||jd}	||	 q||_	| 
|| ||fS )N)ri   rh   )r   r   r+   )r   r   r   r   rh   r   r   r+   r   r   r   )
r:   r   r   rf   r   ri   rh   r   r   latentr   r   r   r   H  s$   
zFlux2VaeEncoderStep.__call__N)r   r   r   r   r   rb   r;   rc   r   rA   r   rR   r   r\   r   r   r	   rZ   r[   	Generatorr   r   r   r   r   r   r   r   r   r     s    	
r   r   )Nr)   )&rZ   transformersr   r   r   r   configuration_utilsr   guidersr   modelsr	   utilsr
   modular_pipeliner   r   modular_pipeline_utilsr   r   r   r   r   r   
get_loggerr   loggerrc   rb   r(   r[   r   r3   r4   r   r   r   r   r   r   r   r   <module>   s4   

vF  