o
    ۷is                     @   s$  d Z ddlZddlZddlmZmZmZ ddlmZ ddl	m
Z
 ddlmZmZmZmZ ddlmZmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZ ddlmZmZm Z  ddlm!Z! ddl"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z*m+Z+ e,e-Z.dej/dej/fddZ0de*e+ddfde1e2e1 B de1de3de3dej4dB f
ddZ5dde&e'dfde1e2e1 B dej/dB de1de3dej4dB f
dd Z6dde$e#e%dfde1e2e1 B dej/e2ej7j7ej7j7f B dB de1d!e1de3dej4dB fd"d#Z8	$dOd%ej/d&ej9dB d'e1fd(d)Z:	*	+dPdej/d,ed&ej9dej4d-ej;d.e3d'e1fd/d0Z<G d1d2 d2eZ=G d3d4 d4eZ>G d5d6 d6eZ?G d7d8 d8eZ@G d9d: d:eZAG d;d< d<eZBG d=d> d>eZCG d?d@ d@eZDG dAdB dBeZEG dCdD dDeZFG dEdF dFeZGG dGdH dHeZHG dIdJ dJeZIG dKdL dLeZJG dMdN dNeZKdS )Qz6
Text and VAE encoder blocks for QwenImage pipelines.
    N)"Qwen2_5_VLForConditionalGenerationQwen2TokenizerQwen2VLProcessor   )
FrozenDict)ClassifierFreeGuidance)InpaintProcessorVaeImageProcessoris_valid_imageis_valid_image_imagelist)AutoencoderKLQwenImageQwenImageControlNetModelQwenImageMultiControlNetModel)calculate_dimensions)logging)unwrap_module   )ModularPipelineBlocksPipelineState)ComponentSpec
InputParamOutputParam   )QwenImageModularPipeline)	 QWENIMAGE_EDIT_PLUS_IMG_TEMPLATE#QWENIMAGE_EDIT_PLUS_PROMPT_TEMPLATE-QWENIMAGE_EDIT_PLUS_PROMPT_TEMPLATE_START_IDXQWENIMAGE_EDIT_PROMPT_TEMPLATE(QWENIMAGE_EDIT_PROMPT_TEMPLATE_START_IDX#QWENIMAGE_LAYERED_CAPTION_PROMPT_CN#QWENIMAGE_LAYERED_CAPTION_PROMPT_ENQWENIMAGE_PROMPT_TEMPLATE#QWENIMAGE_PROMPT_TEMPLATE_START_IDXhidden_statesmaskc                 C   s4   |  }|jdd}| | }tj|| dd}|S )Nr   dimr   )boolsumtorchsplittolist)r#   r$   	bool_maskvalid_lengthsselectedsplit_result r0   d/home/ubuntu/vllm_env/lib/python3.10/site-packages/diffusers/modular_pipelines/qwenimage/encoders.py_extract_masked_hidden1   s
   r2      promptprompt_template_encode prompt_template_encode_start_idxtokenizer_max_lengthdevicec                    s   t |tr|gn|}|| fdd|D }|||  dddd|}| |j|jdd}	|	jd }
t|
|j} fdd|D }d	d |D }td
d |D t	fdd|D }t	fdd|D }|j|d}||fS )Nc                       g | ]}  |qS r0   format.0etemplater0   r1   
<listcomp>F       z*get_qwen_prompt_embeds.<locals>.<listcomp>Tpt)
max_lengthpadding
truncationreturn_tensors)	input_idsattention_maskoutput_hidden_statesc                       g | ]}| d  qS Nr0   r<   drop_idxr0   r1   rA   R       c                 S   &   g | ]}t j|d t j|jdqS r   )dtyper8   r)   onessizelongr8   r<   r0   r0   r1   rA   S      & c                 S      g | ]}| d qS r   rV   r<   r0   r0   r1   rA   T   rB   c                    2   g | ]}t || |d  |dgqS r   r   r)   cat	new_zerosrV   r=   umax_seq_lenr0   r1   rA   V      2 c                    *   g | ]}t || |d  gqS rZ   r^   ra   rc   r0   r1   rA   Y      * r8   )

isinstancestrtorH   rI   r#   r2   maxr)   stack)text_encoder	tokenizerr4   r5   r6   r7   r8   txt
txt_tokensencoder_hidden_statesr#   split_hidden_statesattn_mask_listprompt_embedsencoder_attention_maskr0   rO   rd   r@   r1   get_qwen_prompt_embeds9   s6   	
rx   imagec                    s   t |tr|gn|}|| fdd|D }|||ddd|}| |j|j|j|jdd}	|	jd }
t|
|j} fdd|D }d	d |D }t	d
d |D t
fdd|D }t
fdd|D }|j|d}||fS )Nc                    r9   r0   r:   r<   r?   r0   r1   rA   n   rB   z/get_qwen_prompt_embeds_edit.<locals>.<listcomp>TrC   textimagesrE   rG   rH   rI   pixel_valuesimage_grid_thwrJ   rK   c                    rL   rM   r0   r<   rN   r0   r1   rA      rP   c                 S   rQ   rR   rT   r<   r0   r0   r1   rA      rX   c                 S   rY   rZ   r[   r<   r0   r0   r1   rA      rB   c                    r\   r]   r^   ra   rc   r0   r1   rA      re   c                    rf   rZ   r^   ra   rc   r0   r1   rA      rg   rh   )ri   rj   rk   rH   rI   r~   r   r#   r2   rl   r)   rm   )rn   	processorr4   ry   r5   r6   r8   rp   model_inputsoutputsr#   rs   rt   ru   rv   r0   rw   r1   get_qwen_prompt_embeds_edita   s@   	
r   img_template_encodec                    s8  t |tr|gn|}t |tr$d t|D ]\}}	 ||d 7  qn|d ur.|d nd || fdd|D }
||
|ddd|}| |j|j|j|j	dd}|j
d	 }t||j}fd
d|D }dd |D }tdd |D tfdd|D }tfdd|D }|j|d}||fS )N r   c                    s   g | ]	}  | qS r0   r:   r<   )base_img_promptr@   r0   r1   rA      s    z4get_qwen_prompt_embeds_edit_plus.<locals>.<listcomp>TrC   rz   r}   rK   c                    rL   rM   r0   r<   rN   r0   r1   rA      rP   c                 S   rQ   rR   rT   r<   r0   r0   r1   rA      rX   c                 S   rY   rZ   r[   r<   r0   r0   r1   rA      rB   c                    r\   r]   r^   ra   rc   r0   r1   rA      re   c                    rf   rZ   r^   ra   rc   r0   r1   rA      rg   rh   )ri   rj   list	enumerater;   rk   rH   rI   r~   r   r#   r2   rl   r)   rm   )rn   r   r4   ry   r5   r   r6   r8   iimgrp   r   r   r#   rs   rt   ru   rv   r0   )r   rO   rd   r@   r1    get_qwen_prompt_embeds_edit_plus   sP   


r   sampleencoder_output	generatorsample_modec                 C   sR   t | dr|dkr| j|S t | dr|dkr| j S t | dr%| jS td)Nlatent_distr   argmaxlatentsz3Could not access latents of provided encoder_output)hasattrr   r   moder   AttributeError)r   r   r   r0   r0   r1   retrieve_latents   s   

r      r   vaerS   latent_channelsc           
         s  t tjstdt d dkrdn dkr,td  dj||dt  trQ fdd	t	j
d
 D }tj|d
d}n
t d}tjjd|ddd|j|j}tjjd|ddd|j|j}	|| |	 }|S )Nz#Expected image to be a tensor, got .   r      z Expected image dims 4 or 5, got )r8   rS   c              	      s0   g | ]}t ||d    | dqS )r   r   r   )r   encode)r=   r   r   ry   r   r   r0   r1   rA      s    "z$encode_vae_image.<locals>.<listcomp>r   r%   r   r   )ri   r)   Tensor
ValueErrortyper&   	unsqueezerk   r   rangeshaper_   r   r   tensorconfiglatents_meanviewr8   rS   latents_std)
ry   r   r   r8   rS   r   r   image_latentsr   r   r0   r   r1   encode_vae_image   s.   	
r   c                   @      e Zd ZdZdZedefddZedee	 fddZ
edee fdd	Zedee fd
dZe dedefddZdS )QwenImageEditResizeStepa  
    Image Resize step that resize the image to target area while maintaining the aspect ratio.

      Components:
          image_resize_processor (`VaeImageProcessor`)

      Inputs:
          image (`Image | list`):
              Reference image(s) for denoising. Can be a single image or list of images.

      Outputs:
          resized_image (`list`):
              The resized images
    qwenimage-editreturnc                 C      dS )NzZImage Resize step that resize the image to target area while maintaining the aspect ratio.r0   selfr0   r0   r1   description*     z#QwenImageEditResizeStep.descriptionc                 C      t dttddiddgS Nimage_resize_processorvae_scale_factorr   from_configr   default_creation_methodr   r	   r   r   r0   r0   r1   expected_components.     
z+QwenImageEditResizeStep.expected_componentsc                 C      t dgS Nry   r   r@   r   r0   r0   r1   inputs9     zQwenImageEditResizeStep.inputsc                 C      t dttjj ddgS Nresized_imagezThe resized imagesname	type_hintr   r   r   PILImager   r0   r0   r1   intermediate_outputs=     
z,QwenImageEditResizeStep.intermediate_outputs
componentsstatec           	         s   |  |}|j}t|stdt| t|r|g}|d j\}}td|| \ } fdd|D }||_| 	|| |fS )N/Images must be image or list of images but are r      c                       g | ]}j j| d qS heightwidthr   resizer=   ry   calculated_heightcalculated_widthr   r0   r1   rA   V      z4QwenImageEditResizeStep.__call__.<locals>.<listcomp>)
get_block_statery   r   r   r   r
   rV   r   r   set_block_state)	r   r   r   block_stater|   image_widthimage_height_resized_imagesr0   r   r1   __call__G  s   
z QwenImageEditResizeStep.__call__N__name__
__module____qualname____doc__
model_namepropertyrj   r   r   r   r   r   r   r   r   r)   no_gradr   r   r   r0   r0   r0   r1   r     s    
	r   c                   @   s   e Zd ZdZdZedefddZedee	 fddZ
edee fdd	Zedee fd
dZedefddZe dedefddZdS )QwenImageLayeredResizeStepaE  
    Image Resize step that resize the image to a target area (defined by the resolution parameter from user) while
    maintaining the aspect ratio.

      Components:
          image_resize_processor (`VaeImageProcessor`)

      Inputs:
          image (`Image | list`):
              Reference image(s) for denoising. Can be a single image or list of images.
          resolution (`int`, *optional*, defaults to 640):
              The target area to resize the image to, can be 1024 or 640

      Outputs:
          resized_image (`list`):
              The resized images
    qwenimage-layeredr   c                 C   r   )NzImage Resize step that resize the image to a target area (defined by the resolution parameter from user) while maintaining the aspect ratio.r0   r   r0   r0   r1   r   v  r   z&QwenImageLayeredResizeStep.descriptionc                 C   r   r   r   r   r0   r0   r1   r   z  r   z.QwenImageLayeredResizeStep.expected_componentsc                 C   s   t dt ddtddgS )Nry   
resolution  z:The target area to resize the image to, can be 1024 or 640r   defaultr   r   )r   r@   intr   r0   r0   r1   r     s   z!QwenImageLayeredResizeStep.inputsc                 C   r   r   r   r   r0   r0   r1   r     r   z/QwenImageLayeredResizeStep.intermediate_outputsr   c                 C   s   | dvrt d|  d S )N)r3   r   z&Resolution must be 1024 or 640 but is r   r   r0   r0   r1   check_inputs  s   z'QwenImageLayeredResizeStep.check_inputsr   r   c           
         s   |  |}| j|jd |j}t|stdt| t|r#|g}|d j\}}|j|j }t	||| \ } fdd|D }	|	|_
| || |fS )Nr   r   r   c                    r   r   r   r   r   r0   r1   rA     r   z7QwenImageLayeredResizeStep.__call__.<locals>.<listcomp>)r   r   r   ry   r   r   r   r
   rV   r   r   r   )
r   r   r   r   r|   r   r   target_arear   r   r0   r   r1   r     s    
z#QwenImageLayeredResizeStep.__call__N)r   r   r   r   r   r   rj   r   r   r   r   r   r   r   r   staticmethodr   r   r)   r   r   r   r   r0   r0   r0   r1   r   a  s    
	r   c                   @   r   )QwenImageEditPlusResizeStepa  
    Resize images for QwenImage Edit Plus pipeline.
      Produces two outputs: resized_image (1024x1024) for VAE encoding, resized_cond_image (384x384) for VL text
      encoding. Each image is resized independently based on its own aspect ratio.

      Components:
          image_resize_processor (`VaeImageProcessor`)

      Inputs:
          image (`Image | list`):
              Reference image(s) for denoising. Can be a single image or list of images.

      Outputs:
          resized_image (`list`):
              Images resized to 1024x1024 target area for VAE encoding
          resized_cond_image (`list`):
              Images resized to 384x384 target area for VL text encoding
    qwenimage-edit-plusr   c                 C      	 dS )NzResize images for QwenImage Edit Plus pipeline.
Produces two outputs: resized_image (1024x1024) for VAE encoding, resized_cond_image (384x384) for VL text encoding.
Each image is resized independently based on its own aspect ratio.r0   r   r0   r0   r1   r        z'QwenImageEditPlusResizeStep.descriptionc                 C   r   r   r   r   r0   r0   r1   r     r   z/QwenImageEditPlusResizeStep.expected_componentsc                 C   r   r   r   r   r0   r0   r1   r     s   z"QwenImageEditPlusResizeStep.inputsc                 C   s,   t dttjj ddt dttjj ddgS )Nr   z8Images resized to 1024x1024 target area for VAE encodingr   resized_cond_imagez:Images resized to 384x384 target area for VL text encodingr   r   r0   r0   r1   r     s   

z0QwenImageEditPlusResizeStep.intermediate_outputsr   r   c                 C   s   |  |}|j}t|stdt| t|r|g}g }g }|D ]3}|j\}}	td||	 \}
}}||j	j
|||
d td||	 \}}}||j	j
|||d q"||_||_| || ||fS )Nr   r   r   i @ )r   ry   r   r   r   r
   rV   r   appendr   r   r   r   r   )r   r   r   r   r|   r   resized_cond_imagesry   r   r   	vae_width
vae_heightr   vl_width	vl_heightr0   r0   r1   r     s(   

z$QwenImageEditPlusResizeStep.__call__Nr   r0   r0   r0   r1   r     s    
r   c                       s   e Zd ZdZdZ fddZedefddZede	e
 fdd	Zede	e fd
dZede	e fddZe dededefddZ  ZS )"QwenImageLayeredGetImagePromptStepae  
    Auto-caption step that generates a text prompt from the input image if none is provided.
      Uses the VL model (text_encoder) to generate a description of the image. If prompt is already provided, this step
      passes through unchanged.

      Components:
          text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor (`Qwen2VLProcessor`)

      Inputs:
          prompt (`str`, *optional*):
              The prompt or prompts to guide image generation.
          resized_image (`Image`):
              The image to generate caption from, should be resized use the resize step
          use_en_prompt (`bool`, *optional*, defaults to False):
              Whether to use English prompt template

      Outputs:
          prompt (`str`):
              The prompt or prompts to guide image generation. If not provided, updated using image caption
    r   c                       t | _t| _t   d S rM   )r    image_caption_prompt_enr   image_caption_prompt_cnsuper__init__r   	__class__r0   r1   r  ;     z+QwenImageLayeredGetImagePromptStep.__init__r   c                 C   r   )NzAuto-caption step that generates a text prompt from the input image if none is provided.
Uses the VL model (text_encoder) to generate a description of the image.
If prompt is already provided, this step passes through unchanged.r0   r   r0   r0   r1   r   @  r   z.QwenImageLayeredGetImagePromptStep.descriptionc                 C   s   t dtt dtgS )Nrn   r   )r   r   r   r   r0   r0   r1   r   H  s   z6QwenImageLayeredGetImagePromptStep.expected_componentsc                 C   s0   t jdddt ddtjjddt ddtd	d
gS )Nr4   F)requiredr   TzIThe image to generate caption from, should be resized use the resize stepr   r  r   r   use_en_promptz&Whether to use English prompt templater   )r   r@   r   r   r'   r   r0   r0   r1   r   O  s    z)QwenImageLayeredGetImagePromptStep.inputsc                 C   s   t dtddgS )Nr4   z]The prompt or prompts to guide image generation. If not provided, updated using image captionr   )r   rj   r   r0   r0   r1   r   c  s   z7QwenImageLayeredGetImagePromptStep.intermediate_outputsr   r   c           
      C   s   |  |}|j}|jd u s|jdks|jdkrV|jr| j}n| j}|j||jddd|}|j	j
di |ddi}dd	 t|j|D }|jj|dd
dd }	|	 |_| || ||fS )Nr    TrC   rz   max_new_tokensi   c                 S   s    g | ]\}}|t |d  qS rM   )len)r=   in_idsout_idsr0   r0   r1   rA     s    z?QwenImageLayeredGetImagePromptStep.__call__.<locals>.<listcomp>F)skip_special_tokensclean_up_tokenization_spacesr   r0   )r   _execution_devicer4   r  r	  r
  r   r   rk   rn   generateziprH   batch_decodestripr   )
r   r   r   r   r8   caption_promptr   generated_idsgenerated_ids_trimmedoutput_textr0   r0   r1   r   m  s4   


z+QwenImageLayeredGetImagePromptStep.__call__)r   r   r   r   r   r  r   rj   r   r   r   r   r   r   r   r   r)   r   r   r   r   __classcell__r0   r0   r  r1   r  #  s    	 r  c                          e Zd ZdZdZ fddZedefddZede	e
 fdd	Zede	e fd
dZede	e fddZedd Ze dedefddZ  ZS )QwenImageTextEncoderStepa  
    Text Encoder step that generates text embeddings to guide the image generation.

      Components:
          text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use tokenizer (`Qwen2Tokenizer`):
          The tokenizer to use guider (`ClassifierFreeGuidance`)

      Inputs:
          prompt (`str`):
              The prompt or prompts to guide image generation.
          negative_prompt (`str`, *optional*):
              The prompt or prompts not to guide the image generation.
          max_sequence_length (`int`, *optional*, defaults to 1024):
              Maximum sequence length for prompt encoding.

      Outputs:
          prompt_embeds (`Tensor`):
              The prompt embeddings.
          prompt_embeds_mask (`Tensor`):
              The encoder attention mask.
          negative_prompt_embeds (`Tensor`):
              The negative prompt embeddings.
          negative_prompt_embeds_mask (`Tensor`):
              The negative prompt embeddings mask.
    	qwenimagec                    s    t | _t| _d| _t   d S )Nr3   )r!   r5   r"   r6   r7   r  r  r   r  r0   r1   r       z!QwenImageTextEncoderStep.__init__r   c                 C   r   )NzOText Encoder step that generates text embeddings to guide the image generation.r0   r   r0   r0   r1   r     r   z$QwenImageTextEncoderStep.descriptionc                 C   s2   t dtddt dtddt dttddid	d
gS )Nrn   zThe text encoder to use)r   ro   zThe tokenizer to useguiderguidance_scale      @r   r   )r   r   r   r   r   r   r0   r0   r1   r     s   
z,QwenImageTextEncoderStep.expected_componentsc                 C   s    t dt dt jdddgS )Nr4   negative_promptmax_sequence_lengthr3   )r   r   r   r0   r0   r1   r     s   zQwenImageTextEncoderStep.inputsc                 C   $   t dt dt dt dgS Nru   prompt_embeds_masknegative_prompt_embedsnegative_prompt_embeds_maskr   r@   r   r0   r0   r1   r     
   z-QwenImageTextEncoderStep.intermediate_outputsc                 C   sz   t | tst | tstdt|  |d ur*t |ts*t |ts*tdt| |d ur9|dkr;td| d S d S )N2`prompt` has to be of type `str` or `list` but is ;`negative_prompt` has to be of type `str` or `list` but is r3   z9`max_sequence_length` cannot be greater than 1024 but is ri   rj   r   r   r   )r4   r+  r,  r0   r0   r1   r     s   z%QwenImageTextEncoderStep.check_inputsr   r   c              	   C   s  |  |}|j}| |j|j|j t|j|j|j| j	| j
| j|d\|_|_|jd d d |jf |_|jd d d |jf |_d |_d |_|jr}|jpNd}t|j|j|| j	| j
| j|d\|_|_|jd d d |jf |_|jd d d |jf |_| || ||fS )N)r4   r5   r6   r7   r8   r   )r   r  r   r4   r+  r,  rx   rn   ro   r5   r6   r7   ru   r/  r0  r1  requires_unconditional_embedsr   r   r   r   r   r8   r+  r0   r0   r1   r     sF   


	z!QwenImageTextEncoderStep.__call__r   r   r   r   r   r  r   rj   r   r   r   r   r   r   r   r   r   r   r)   r   r   r   r   r#  r0   r0   r  r1   r%    s     
r%  c                       r$  )QwenImageEditTextEncoderStepa  
    Text Encoder step that processes both prompt and image together to generate text embeddings for guiding image
    generation.

      Components:
          text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor (`Qwen2VLProcessor`) guider
          (`ClassifierFreeGuidance`)

      Inputs:
          prompt (`str`):
              The prompt or prompts to guide image generation.
          negative_prompt (`str`, *optional*):
              The prompt or prompts not to guide the image generation.
          resized_image (`Image`):
              The image prompt to encode, should be resized using resize step

      Outputs:
          prompt_embeds (`Tensor`):
              The prompt embeddings.
          prompt_embeds_mask (`Tensor`):
              The encoder attention mask.
          negative_prompt_embeds (`Tensor`):
              The negative prompt embeddings.
          negative_prompt_embeds_mask (`Tensor`):
              The negative prompt embeddings mask.
    r&  c                    r  rM   )r   r5   r   r6   r  r  r   r  r0   r1   r  4  r  z%QwenImageEditTextEncoderStep.__init__r   c                 C   r   )NzyText Encoder step that processes both prompt and image together to generate text embeddings for guiding image generation.r0   r   r0   r0   r1   r   9  r   z(QwenImageEditTextEncoderStep.descriptionc                 C   *   t dtt dtt dttddiddgS Nrn   r   r(  r)  r*  r   r   r   r   r   r   r   r   r0   r0   r1   r   =     
z0QwenImageEditTextEncoderStep.expected_componentsc                 C   s&   t dt dt ddtjjddgS )Nr4   r+  r   Tz?The image prompt to encode, should be resized using resize stepr  r   r@   r   r   r   r0   r0   r1   r   J  s   z#QwenImageEditTextEncoderStep.inputsc                 C   r-  r.  r2  r   r0   r0   r1   r   W  r3  z1QwenImageEditTextEncoderStep.intermediate_outputsc                 C   `   t | tst | tstdt|  |d ur*t |ts,t |ts.tdt| d S d S d S Nr4  r5  r6  r4   r+  r0   r0   r1   r   `     z)QwenImageEditTextEncoderStep.check_inputsr   r   c              	   C   s   |  |}| |j|j |j}t|j|j|j|j| j	| j
|d\|_|_d |_d |_|jrG|jp2d}t|j|j||j| j	| j
|d\|_|_| || ||fS )N)r4   ry   r5   r6   r8   r  )r   r   r4   r+  r  r   rn   r   r   r5   r6   ru   r/  r0  r1  r7  r   r8  r0   r0   r1   r   l  s6   



z%QwenImageEditTextEncoderStep.__call__r9  r0   r0   r  r1   r:    s     
r:  c                       r$  ) QwenImageEditPlusTextEncoderStepa?  
    Text Encoder step for QwenImage Edit Plus that processes prompt and multiple images together to generate text
    embeddings for guiding image generation.

      Components:
          text_encoder (`Qwen2_5_VLForConditionalGeneration`) processor (`Qwen2VLProcessor`) guider
          (`ClassifierFreeGuidance`)

      Inputs:
          prompt (`str`):
              The prompt or prompts to guide image generation.
          negative_prompt (`str`, *optional*):
              The prompt or prompts not to guide the image generation.
          resized_cond_image (`Tensor`):
              The image(s) to encode, can be a single image or list of images, should be resized to 384x384 using
              resize step

      Outputs:
          prompt_embeds (`Tensor`):
              The prompt embeddings.
          prompt_embeds_mask (`Tensor`):
              The encoder attention mask.
          negative_prompt_embeds (`Tensor`):
              The negative prompt embeddings.
          negative_prompt_embeds_mask (`Tensor`):
              The negative prompt embeddings mask.
    r   c                    s    t | _t| _t| _t   d S rM   )r   r5   r   r   r   r6   r  r  r   r  r0   r1   r    r'  z)QwenImageEditPlusTextEncoderStep.__init__r   c                 C   r   )NzText Encoder step for QwenImage Edit Plus that processes prompt and multiple images together to generate text embeddings for guiding image generation.r0   r   r0   r0   r1   r     r   z,QwenImageEditPlusTextEncoderStep.descriptionc                 C   r;  r<  r=  r   r0   r0   r1   r     r>  z4QwenImageEditPlusTextEncoderStep.expected_componentsc                 C   s$   t dt dt ddtjddgS )Nr4   r+  r   TzoThe image(s) to encode, can be a single image or list of images, should be resized to 384x384 using resize stepr  )r   r@   r)   r   r   r0   r0   r1   r     s   z'QwenImageEditPlusTextEncoderStep.inputsc                 C   r-  r.  r2  r   r0   r0   r1   r     r3  z5QwenImageEditPlusTextEncoderStep.intermediate_outputsc                 C   r@  rA  r6  rB  r0   r0   r1   r     rC  z-QwenImageEditPlusTextEncoderStep.check_inputsr   r   c              
   C   s   |  |}| |j|j |j}t|j|j|j|j| j	| j
| j|d\|_|_d |_d |_|jrK|jp4d}t|j|j||j| j	| j
| j|d\|_|_| || ||fS )N)r4   ry   r5   r   r6   r8   r  )r   r   r4   r+  r  r   rn   r   r   r5   r   r6   ru   r/  r0  r1  r7  r   r8  r0   r0   r1   r     s<   


z)QwenImageEditPlusTextEncoderStep.__call__r9  r0   r0   r  r1   rD    s     
rD  c                   @      e Zd ZdZdZedefddZedee	 fddZ
edee fdd	Zedee fd
dZedd Ze dedefddZdS )&QwenImageInpaintProcessImagesInputStepa  
    Image Preprocess step for inpainting task. This processes the image and mask inputs together. Images will be
    resized to the given height and width.

      Components:
          image_mask_processor (`InpaintProcessor`)

      Inputs:
          mask_image (`Image`):
              Mask image for inpainting.
          image (`Image | list`):
              Reference image(s) for denoising. Can be a single image or list of images.
          height (`int`, *optional*):
              The height in pixels of the generated image.
          width (`int`, *optional*):
              The width in pixels of the generated image.
          padding_mask_crop (`int`, *optional*):
              Padding for mask cropping in inpainting.

      Outputs:
          processed_image (`Tensor`):
              The processed image
          processed_mask_image (`Tensor`):
              The processed mask image
          mask_overlay_kwargs (`dict`):
              The kwargs for the postprocess step to apply the mask overlay
    r&  r   c                 C   r   )NzImage Preprocess step for inpainting task. This processes the image and mask inputs together. Images will be resized to the given height and width.r0   r   r0   r0   r1   r   9  r   z2QwenImageInpaintProcessImagesInputStep.descriptionc                 C   r   Nimage_mask_processorr   r   r   r   r   r   r   r   r0   r0   r1   r   =  r   z:QwenImageInpaintProcessImagesInputStep.expected_componentsc                 C   s,   t dt dt dt dt dgS )N
mask_imagery   r   r   padding_mask_cropr   r   r0   r0   r1   r   H  s   z-QwenImageInpaintProcessImagesInputStep.inputsc                 C   ,   t dtjddt dtjddt dtddgS Nprocessed_imageThe processed imager   processed_mask_imagezThe processed mask imagemask_overlay_kwargsz=The kwargs for the postprocess step to apply the mask overlayr   r)   r   dictr   r0   r0   r1   r   R  s    z;QwenImageInpaintProcessImagesInputStep.intermediate_outputsc                 C   h   | d ur| |d  dkrt d|d  d|  |d ur0||d  dkr2t d|d  d| d S d S Nr   r   zHeight must be divisible by z but is zWidth must be divisible by r   r   r   r   r0   r0   r1   r   f  
   z3QwenImageInpaintProcessImagesInputStep.check_inputsr   r   c                 C   st   |  |}| j|j|j|jd |jp|j}|jp|j}|jj|j	|j
|||jd\|_|_|_| || ||fS )NrV  ry   r$   r   r   rK  )r   r   r   r   r   default_heightdefault_widthrH  
preprocessry   rJ  rK  rN  rP  rQ  r   r   r   r   r   r   r   r0   r0   r1   r   n  s    

z/QwenImageInpaintProcessImagesInputStep.__call__Nr   r   r   r   r   r   rj   r   r   r   r   r   r   r   r   r   r   r)   r   r   r   r   r0   r0   r0   r1   rF    s    
	
rF  c                   @   r   )*QwenImageEditInpaintProcessImagesInputStepa  
    Image Preprocess step for inpainting task. This processes the image and mask inputs together. Images should be
    resized first.

      Components:
          image_mask_processor (`InpaintProcessor`)

      Inputs:
          mask_image (`Image`):
              Mask image for inpainting.
          resized_image (`Image`):
              The resized image. should be generated using a resize step
          padding_mask_crop (`int`, *optional*):
              Padding for mask cropping in inpainting.

      Outputs:
          processed_image (`Tensor`):
              The processed image
          processed_mask_image (`Tensor`):
              The processed mask image
          mask_overlay_kwargs (`dict`):
              The kwargs for the postprocess step to apply the mask overlay
    r   r   c                 C   r   )Nz}Image Preprocess step for inpainting task. This processes the image and mask inputs together. Images should be resized first.r0   r   r0   r0   r1   r     r   z6QwenImageEditInpaintProcessImagesInputStep.descriptionc                 C   r   rG  rI  r   r0   r0   r1   r     r   z>QwenImageEditInpaintProcessImagesInputStep.expected_componentsc                 C   s&   t dt ddtjjddt dgS )NrJ  r   T:The resized image. should be generated using a resize stepr  rK  r?  r   r0   r0   r1   r     s   z1QwenImageEditInpaintProcessImagesInputStep.inputsc                 C   rL  rM  rR  r   r0   r0   r1   r     s   z?QwenImageEditInpaintProcessImagesInputStep.intermediate_outputsr   r   c                 C   sV   |  |}|jd j\}}|jj|j|j|||jd\|_|_|_	| 
|| ||fS )Nr   rX  )r   r   rV   rH  r[  rJ  rK  rN  rP  rQ  r   r   r   r   r   r   r   r0   r0   r1   r     s   

z3QwenImageEditInpaintProcessImagesInputStep.__call__Nr   r0   r0   r0   r1   r^    s    
r^  c                   @   rE  )QwenImageProcessImagesInputStepa:  
    Image Preprocess step. will resize the image to the given height and width.

      Components:
          image_processor (`VaeImageProcessor`)

      Inputs:
          image (`Image | list`):
              Reference image(s) for denoising. Can be a single image or list of images.
          height (`int`, *optional*):
              The height in pixels of the generated image.
          width (`int`, *optional*):
              The width in pixels of the generated image.

      Outputs:
          processed_image (`Tensor`):
              The processed image
    r&  r   c                 C   r   )NzKImage Preprocess step. will resize the image to the given height and width.r0   r   r0   r0   r1   r     r   z+QwenImageProcessImagesInputStep.descriptionc                 C   r   Nimage_processorr   r   r   r   r   r   r0   r0   r1   r     r   z3QwenImageProcessImagesInputStep.expected_componentsc                 C   s   t dt dt dgS )Nry   r   r   r   r   r0   r0   r1   r     s   z&QwenImageProcessImagesInputStep.inputsc                 C      t dtjddgS NrN  rO  r   r   r)   r   r   r0   r0   r1   r        z4QwenImageProcessImagesInputStep.intermediate_outputsc                 C   rT  rU  r   rV  r0   r0   r1   r     rW  z,QwenImageProcessImagesInputStep.check_inputsr   r   c                 C   sb   |  |}| j|j|j|jd |jp|j}|jp|j}|jj|j	||d|_
| || ||fS )NrV  ry   r   r   )r   r   r   r   r   rY  rZ  rc  r[  ry   rN  r   r\  r0   r0   r1   r   "  s   
z(QwenImageProcessImagesInputStep.__call__Nr]  r0   r0   r0   r1   ra    s    
	
ra  c                   @   r   )#QwenImageEditProcessImagesInputStepaW  
    Image Preprocess step. Images needs to be resized first.

      Components:
          image_processor (`VaeImageProcessor`)

      Inputs:
          resized_image (`list`):
              The resized image. should be generated using a resize step

      Outputs:
          processed_image (`Tensor`):
              The processed image
    r   r   c                 C   r   )Nz8Image Preprocess step. Images needs to be resized first.r0   r   r0   r0   r1   r   I  r   z/QwenImageEditProcessImagesInputStep.descriptionc                 C   r   rb  r   r   r0   r0   r1   r   M  r   z7QwenImageEditProcessImagesInputStep.expected_componentsc                 C      t ddttjj ddgS Nr   Tr_  r  r   r   r   r   r   r0   r0   r1   r   X  r   z*QwenImageEditProcessImagesInputStep.inputsc                 C   rd  re  rf  r   r0   r0   r1   r   c  rg  z8QwenImageEditProcessImagesInputStep.intermediate_outputsr   r   c                 C   sD   |  |}|jd j\}}|jj|j||d|_| || ||fS )Nr   rh  )r   r   rV   rc  r[  rN  r   r`  r0   r0   r1   r   m  s   
z,QwenImageEditProcessImagesInputStep.__call__Nr   r0   r0   r0   r1   ri  7  s    

	ri  c                   @   r   )'QwenImageEditPlusProcessImagesInputStepa  
    Image Preprocess step. Images can be resized first. If a list of images is provided, will return a list of
    processed images.

      Components:
          image_processor (`VaeImageProcessor`)

      Inputs:
          resized_image (`list`):
              The resized image. should be generated using a resize step

      Outputs:
          processed_image (`Tensor`):
              The processed image
    r   r   c                 C   r   )Nz|Image Preprocess step. Images can be resized first. If a list of images is provided, will return a list of processed images.r0   r   r0   r0   r1   r     r   z3QwenImageEditPlusProcessImagesInputStep.descriptionc                 C   r   rb  r   r   r0   r0   r1   r     r   z;QwenImageEditPlusProcessImagesInputStep.expected_componentsc                 C   rj  rk  rl  r   r0   r0   r1   r     r   z.QwenImageEditPlusProcessImagesInputStep.inputsc                 C   rd  re  rf  r   r0   r0   r1   r     rg  z<QwenImageEditPlusProcessImagesInputStep.intermediate_outputsr   r   c           
      C   s~   |  |}|j}t|t}|s|g}g }|D ]}|j\}}	||jj||	|d q|r0||_n|d |_| 	|| ||fS )Nrh  r   )
r   r   ri   r   rV   r  rc  r[  rN  r   )
r   r   r   r   ry   is_image_listprocessed_imagesr   	img_width
img_heightr0   r0   r1   r     s    



z0QwenImageEditPlusProcessImagesInputStep.__call__Nr   r0   r0   r0   r1   rm  ~  s    

	rm  c                       s   e Zd ZdZdZddedB dedB f fddZede	fd	d
Z
edee fddZedee fddZedee fddZe dededefddZ  ZS )QwenImageVaeEncoderStepa&  
    VAE Encoder step that converts processed_image into latent representations image_latents.
      Handles both single images and lists of images with varied resolutions.

      Components:
          vae (`AutoencoderKLQwenImage`)

      Inputs:
          processed_image (`Tensor`):
              The image tensor to encode
          generator (`Generator`, *optional*):
              Torch generator for deterministic generation.

      Outputs:
          image_latents (`Tensor`):
              The latent representation of the input image.
    r&  Ninputoutputc                    s   |du rt ddtjdd}|du rtd}t|t s$tdt| t|ts2tdt| || _|| _	|j
| _|j
| _t   dS )	a  Initialize a VAE encoder step for converting images to latent representations.

        Handles both single images and lists of images. When input is a list, outputs a list of latents. When input is
        a single tensor, outputs a single latent tensor.

        Args:
            input (InputParam, optional): Input parameter for the processed image. Defaults to "processed_image".
            output (OutputParam, optional): Output parameter for the image latents. Defaults to "image_latents".
        NrN  TzThe image tensor to encoder  r   z input must be InputParam but is z"output must be OutputParam but is )r   r)   r   r   r@   ri   r   r   _input_outputr   _image_input_name_image_latents_output_namer  r  )r   rs  rt  r  r0   r1   r    s   




z QwenImageVaeEncoderStep.__init__r   c                 C   s   d| j  d| j dS )NzVAE Encoder step that converts z into latent representations zI.
Handles both single images and lists of images with varied resolutions.)rw  rx  r   r0   r0   r1   r     s   z#QwenImageVaeEncoderStep.descriptionc                 C   s   t dtgS )Nr   )r   r   r   r0   r0   r1   r     r   z+QwenImageVaeEncoderStep.expected_componentsc                 C   s   | j tdgS )Nr   )ru  r   r@   r   r0   r0   r1   r     s   zQwenImageVaeEncoderStep.inputsc                 C   s   | j gS rM   )rv  r   r0   r0   r1   r     s   z,QwenImageVaeEncoderStep.intermediate_outputsr   r   c           
      C   s   |  |}|j}|jj}t|| j}t|t}|s|g}g }|D ]}	|t	|	|j|j
|||jd q |s9|d }t|| j| | || ||fS )N)ry   r   r   r8   rS   r   r   )r   r  r   rS   getattrrw  ri   r   r  r   r   num_channels_latentssetattrrx  r   )
r   r   r   r   r8   rS   ry   rn  r   r   r0   r0   r1   r     s0   


z QwenImageVaeEncoderStep.__call__)NN)r   r   r   r   r   r   r   r  r   rj   r   r   r   r   r   r   r)   r   r   r   r   r#  r0   r0   r  r1   rr    s      rr  c                   @   s   e Zd ZdZdZedefddZedee	 fddZ
edee fdd	Zedee fd
dZedd Ze dededefddZdS )!QwenImageControlNetVaeEncoderStepa  
    VAE Encoder step that converts `control_image` into latent representations control_image_latents.

      Components:
          vae (`AutoencoderKLQwenImage`) controlnet (`QwenImageControlNetModel`) control_image_processor
          (`VaeImageProcessor`)

      Inputs:
          control_image (`Image`):
              Control image for ControlNet conditioning.
          height (`int`, *optional*):
              The height in pixels of the generated image.
          width (`int`, *optional*):
              The width in pixels of the generated image.
          generator (`Generator`, *optional*):
              Torch generator for deterministic generation.

      Outputs:
          control_image_latents (`Tensor`):
              The latents representing the control image
    r&  r   c                 C   r   )NzbVAE Encoder step that converts `control_image` into latent representations control_image_latents.
r0   r   r0   r0   r1   r   Z  r   z-QwenImageControlNetVaeEncoderStep.descriptionc                 C   s.   t dtt dtt dttddiddg}|S )Nr   
controlnetcontrol_image_processorr   r   r   r   )r   r   r   r	   r   )r   r   r0   r0   r1   r   ^  s   

z5QwenImageControlNetVaeEncoderStep.expected_componentsc                 C   s(   t dt dt dt dg}|S )Ncontrol_imager   r   r   r   )r   r   r0   r0   r1   r   l  s   z(QwenImageControlNetVaeEncoderStep.inputsc                 C   rd  )Ncontrol_image_latentsz*The latents representing the control image)r   r   rf  r   r0   r0   r1   r   v  rg  z6QwenImageControlNetVaeEncoderStep.intermediate_outputsc                 C   rT  rU  r   rV  r0   r0   r1   r     rW  z.QwenImageControlNetVaeEncoderStep.check_inputsr   r   c              
   C   s*  |  |}| |j|j|j |j}|jj}|jp|j}|jp!|j	}t
|j}t|tr7t|jts7|jg|_t|trcg |_|jD ]}	|jj|	||d}	t|	|j|j|||jdd}
|j|
 qBn(t|tr|jj|j||d}t||j|j|||jdd|_n	tdt| | || ||fS )Nrh  r   )ry   r   r   r8   rS   r   r   z[Expected controlnet to be a QwenImageControlNetModel or QwenImageMultiControlNetModel, got )r   r   r   r   r   r  r   rS   rY  rZ  r   r}  ri   r   r  r   r  r~  r[  r   r   rz  r  r   r   r   r   )r   r   r   r   r8   rS   r   r   r}  control_image_control_image_latents_r  r0   r0   r1   r     s`   




	

z*QwenImageControlNetVaeEncoderStep.__call__Nr]  r0   r0   r0   r1   r|  A  s    		
r|  c                   @   sl   e Zd ZdZdZedefddZedee	 fddZ
edee fdd	Ze d
edefddZdS )"QwenImageLayeredPermuteLatentsStepa  
    Permute image latents from (B, C, 1, H, W) to (B, 1, C, H, W) for Layered packing.

      Inputs:
          image_latents (`Tensor`):
              image latents used to guide the image generation. Can be generated from vae_encoder step.

      Outputs:
          image_latents (`Tensor`):
              The latent representation of the input image. (permuted from [B, C, 1, H, W] to [B, 1, C, H, W])
    r   r   c                 C   r   )NzRPermute image latents from (B, C, 1, H, W) to (B, 1, C, H, W) for Layered packing.r0   r   r0   r0   r1   r     r   z.QwenImageLayeredPermuteLatentsStep.descriptionc                 C   r   )Nr   r   r   r0   r0   r1   r     s   z)QwenImageLayeredPermuteLatentsStep.inputsc                 C   s   t jdddgS )Nr   z0permuted from [B, C, 1, H, W] to [B, 1, C, H, W])noter2  r   r0   r0   r1   r     s   z7QwenImageLayeredPermuteLatentsStep.intermediate_outputsr   c                 C   s8   |  |}|j}|ddddd|_| || ||fS )Nr   r   r   r   r   )r   r   permuter   )r   r   r   r   r   r0   r0   r1   r     s
   
z+QwenImageLayeredPermuteLatentsStep.__call__N)r   r   r   r   r   r   rj   r   r   r   r   r   r   r)   r   r   r   r0   r0   r0   r1   r    s    r  )Nr   )r   r   )Lr   r   r)   transformersr   r   r   configuration_utilsr   guidersr   rc  r   r	   r
   r   modelsr   r   r   +pipelines.qwenimage.pipeline_qwenimage_editr   utilsr   utils.torch_utilsr   modular_pipeliner   r   modular_pipeline_utilsr   r   r   r   prompt_templatesr   r   r   r   r   r   r    r!   r"   
get_loggerr   loggerr   r2   rj   r   r   r8   rx   r   r   r   	Generatorr   rS   r   r   r   r   r  r%  r:  rD  rF  r^  ra  ri  rm  rr  r|  r  r0   r0   r0   r1   <module>   s   ,


+

2

9

BI\fr { 
m\TGWl 