o
    Gi                     @   s.  d dl Z ddlmZ ddlmZmZmZ ddlmZm	Z	m
Z
 ddlmZmZmZmZmZmZmZ dd	lmZmZmZmZ dd
lmZmZmZmZ ddlmZmZm Z m!Z!m"Z" ddl#m$Z$m%Z%m&Z& e'e(Z)G dd deZ*G dd deZ+G dd deZ,G dd deZ-G dd deZ.G dd deZ/G dd deZ0G dd deZ1G dd deZ2G dd  d eZ3G d!d" d"eZ4G d#d$ d$eZ5G d%d& d&eZ6G d'd( d(eZ7G d)d* d*eZ8G d+d, d,eZ9G d-d. d.eZ:G d/d0 d0eZ;e	d1e* fd2e- fd3e. fd4e8 fd5e; fgZ<G d6d7 d7eZ=dS )8    N   )logging   )AutoPipelineBlocksConditionalPipelineBlocksSequentialPipelineBlocks)
InputParamInsertableDictOutputParam   )%QwenImageControlNetBeforeDenoiserStepQwenImageCreateMaskLatentsStepQwenImagePrepareLatentsStep'QwenImagePrepareLatentsWithStrengthStepQwenImageRoPEInputsStepQwenImageSetTimestepsStep%QwenImageSetTimestepsWithStrengthStep)QwenImageAfterDenoiseStepQwenImageDecoderStep'QwenImageInpaintProcessImagesOutputStep QwenImageProcessImagesOutputStep)QwenImageControlNetDenoiseStepQwenImageDenoiseStep%QwenImageInpaintControlNetDenoiseStepQwenImageInpaintDenoiseStep)!QwenImageControlNetVaeEncoderStep&QwenImageInpaintProcessImagesInputStepQwenImageProcessImagesInputStepQwenImageTextEncoderStepQwenImageVaeEncoderStep)QwenImageAdditionalInputsStepQwenImageControlNetInputsStepQwenImageTextInputsStepc                   @   s:   e Zd ZdZdZe gZdgZdgZe	de
fddZdS )	QwenImageAutoTextEncoderStepa  
    Text encoder step that encodes the text prompt into a text embedding. This is an auto pipeline block.

      Components:
          text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use tokenizer (`Qwen2Tokenizer`):
          The tokenizer to use guider (`ClassifierFreeGuidance`)

      Inputs:
          prompt (`str`, *optional*):
              The prompt or prompts to guide image generation.
          negative_prompt (`str`, *optional*):
              The prompt or prompts not to guide the image generation.
          max_sequence_length (`int`, *optional*, defaults to 1024):
              Maximum sequence length for prompt encoding.

      Outputs:
          prompt_embeds (`Tensor`):
              The prompt embeddings.
          prompt_embeds_mask (`Tensor`):
              The encoder attention mask.
          negative_prompt_embeds (`Tensor`):
              The negative prompt embeddings.
          negative_prompt_embeds_mask (`Tensor`):
              The negative prompt embeddings mask.
    	qwenimagetext_encoderpromptreturnc                 C      dS )NzeText encoder step that encodes the text prompt into a text embedding. This is an auto pipeline block. selfr)   r)   r/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/modular_pipelines/qwenimage/modular_blocks_qwenimage.pydescription`      z(QwenImageAutoTextEncoderStep.descriptionN)__name__
__module____qualname____doc__
model_namer   block_classesblock_namesblock_trigger_inputspropertystrr-   r)   r)   r)   r,   r#   @   s    r#   c                   @   :   e Zd ZdZdZe e gZddgZe	de
fddZdS )	QwenImageInpaintVaeEncoderStepaM  
    This step is used for processing image and mask inputs for inpainting tasks. It:
       - Resizes the image to the target size, based on `height` and `width`.
       - Processes and updates `image` and `mask_image`.
       - Creates `image_latents`.

      Components:
          image_mask_processor (`InpaintProcessor`) vae (`AutoencoderKLQwenImage`)

      Inputs:
          mask_image (`Image`):
              Mask image for inpainting.
          image (`Image | list`):
              Reference image(s) for denoising. Can be a single image or list of images.
          height (`int`, *optional*):
              The height in pixels of the generated image.
          width (`int`, *optional*):
              The width in pixels of the generated image.
          padding_mask_crop (`int`, *optional*):
              Padding for mask cropping in inpainting.
          generator (`Generator`, *optional*):
              Torch generator for deterministic generation.

      Outputs:
          processed_image (`Tensor`):
              The processed image
          processed_mask_image (`Tensor`):
              The processed mask image
          mask_overlay_kwargs (`dict`):
              The kwargs for the postprocess step to apply the mask overlay
          image_latents (`Tensor`):
              The latent representation of the input image.
    r$   
preprocessencoder'   c                 C      	 dS )NzThis step is used for processing image and mask inputs for inpainting tasks. It:
 - Resizes the image to the target size, based on `height` and `width`.
 - Processes and updates `image` and `mask_image`.
 - Creates `image_latents`.r)   r*   r)   r)   r,   r-         z*QwenImageInpaintVaeEncoderStep.descriptionN)r/   r0   r1   r2   r3   r   r   r4   r5   r7   r8   r-   r)   r)   r)   r,   r:   m       "r:   c                   @   r9   )	QwenImageImg2ImgVaeEncoderStepa7  
    Vae encoder step that preprocess andencode the image inputs into their latent representations.

      Components:
          image_processor (`VaeImageProcessor`) vae (`AutoencoderKLQwenImage`)

      Inputs:
          image (`Image | list`):
              Reference image(s) for denoising. Can be a single image or list of images.
          height (`int`, *optional*):
              The height in pixels of the generated image.
          width (`int`, *optional*):
              The width in pixels of the generated image.
          generator (`Generator`, *optional*):
              Torch generator for deterministic generation.

      Outputs:
          processed_image (`Tensor`):
              The processed image
          image_latents (`Tensor`):
              The latent representation of the input image.
    r$   r;   r<   r'   c                 C   r(   )Nz^Vae encoder step that preprocess andencode the image inputs into their latent representations.r)   r*   r)   r)   r,   r-      r.   z*QwenImageImg2ImgVaeEncoderStep.descriptionN)r/   r0   r1   r2   r3   r   r   r4   r5   r7   r8   r-   r)   r)   r)   r,   r@      s    r@   c                   @   s0   e Zd ZeegZddgZddgZedd Z	dS )QwenImageAutoVaeEncoderStepinpaintimg2img
mask_imageimagec                 C   r=   )NaZ  Vae encoder step that encode the image inputs into their latent representations.
This is an auto pipeline block.
 - `QwenImageInpaintVaeEncoderStep` (inpaint) is used when `mask_image` is provided.
 - `QwenImageImg2ImgVaeEncoderStep` (img2img) is used when `image` is provided.
 - if `mask_image` or `image` is not provided, step will be skipped.r)   r*   r)   r)   r,   r-      r>   z'QwenImageAutoVaeEncoderStep.descriptionN)
r/   r0   r1   r:   r@   r4   r5   r6   r7   r-   r)   r)   r)   r,   rA          rA   c                   @   s.   e Zd ZdZegZdgZdgZedd Z	dS ))QwenImageOptionalControlNetVaeEncoderStepa  
    Vae encoder step that encode the image inputs into their latent representations.
      This is an auto pipeline block.
       - `QwenImageControlNetVaeEncoderStep` (controlnet) is used when `control_image` is provided.
       - if `control_image` is not provided, step will be skipped.

      Components:
          vae (`AutoencoderKLQwenImage`) controlnet (`QwenImageControlNetModel`) control_image_processor
          (`VaeImageProcessor`)

      Inputs:
          control_image (`Image`, *optional*):
              Control image for ControlNet conditioning.
          height (`int`, *optional*):
              The height in pixels of the generated image.
          width (`int`, *optional*):
              The width in pixels of the generated image.
          generator (`Generator`, *optional*):
              Torch generator for deterministic generation.

      Outputs:
          control_image_latents (`Tensor`):
              The latents representing the control image
    
controlnetcontrol_imagec                 C   r=   )Na  Vae encoder step that encode the image inputs into their latent representations.
This is an auto pipeline block.
 - `QwenImageControlNetVaeEncoderStep` (controlnet) is used when `control_image` is provided.
 - if `control_image` is not provided, step will be skipped.r)   r*   r)   r)   r,   r-      r>   z5QwenImageOptionalControlNetVaeEncoderStep.descriptionN)
r/   r0   r1   r2   r   r4   r5   r6   r7   r-   r)   r)   r)   r,   rG      s    rG   c                   @   4   e Zd ZdZdZe e gZddgZe	dd Z
dS )QwenImageImg2ImgInputStepaA	  
    Input step that prepares the inputs for the img2img denoising step. It:

      Components:
          pachifier (`QwenImagePachifier`)

      Inputs:
          num_images_per_prompt (`int`, *optional*, defaults to 1):
              The number of images to generate per prompt.
          prompt_embeds (`Tensor`):
              text embeddings used to guide the image generation. Can be generated from text_encoder step.
          prompt_embeds_mask (`Tensor`):
              mask for the text embeddings. Can be generated from text_encoder step.
          negative_prompt_embeds (`Tensor`, *optional*):
              negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
          negative_prompt_embeds_mask (`Tensor`, *optional*):
              mask for the negative text embeddings. Can be generated from text_encoder step.
          height (`int`, *optional*):
              The height in pixels of the generated image.
          width (`int`, *optional*):
              The width in pixels of the generated image.
          image_latents (`Tensor`):
              image latents used to guide the image generation. Can be generated from vae_encoder step.

      Outputs:
          batch_size (`int`):
              The batch size of the prompt embeddings
          dtype (`dtype`):
              The data type of the prompt embeddings
          prompt_embeds (`Tensor`):
              The prompt embeddings. (batch-expanded)
          prompt_embeds_mask (`Tensor`):
              The encoder attention mask. (batch-expanded)
          negative_prompt_embeds (`Tensor`):
              The negative prompt embeddings. (batch-expanded)
          negative_prompt_embeds_mask (`Tensor`):
              The negative prompt embeddings mask. (batch-expanded)
          image_height (`int`):
              The image height calculated from the image latents dimension
          image_width (`int`):
              The image width calculated from the image latents dimension
          height (`int`):
              if not provided, updated to image height
          width (`int`):
              if not provided, updated to image width
          image_latents (`Tensor`):
              image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified and
              batch-expanded)
    r$   text_inputsadditional_inputsc                 C   r(   )NzHInput step that prepares the inputs for the img2img denoising step. It:
r)   r*   r)   r)   r,   r-   9  r.   z%QwenImageImg2ImgInputStep.descriptionN)r/   r0   r1   r2   r3   r"   r    r4   r5   r7   r-   r)   r)   r)   r,   rK     s    2rK   c                   @   sF   e Zd ZdZdZe eedej	ddgdgZ
ddgZed	d
 ZdS )QwenImageInpaintInputStepa
  
    Input step that prepares the inputs for the inpainting denoising step. It:

      Components:
          pachifier (`QwenImagePachifier`)

      Inputs:
          num_images_per_prompt (`int`, *optional*, defaults to 1):
              The number of images to generate per prompt.
          prompt_embeds (`Tensor`):
              text embeddings used to guide the image generation. Can be generated from text_encoder step.
          prompt_embeds_mask (`Tensor`):
              mask for the text embeddings. Can be generated from text_encoder step.
          negative_prompt_embeds (`Tensor`, *optional*):
              negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
          negative_prompt_embeds_mask (`Tensor`, *optional*):
              mask for the negative text embeddings. Can be generated from text_encoder step.
          height (`int`, *optional*):
              The height in pixels of the generated image.
          width (`int`, *optional*):
              The width in pixels of the generated image.
          image_latents (`Tensor`, *optional*):
              image latents used to guide the image generation. Can be generated from vae_encoder step.
          processed_mask_image (`Tensor`, *optional*):
              The processed mask image

      Outputs:
          batch_size (`int`):
              The batch size of the prompt embeddings
          dtype (`dtype`):
              The data type of the prompt embeddings
          prompt_embeds (`Tensor`):
              The prompt embeddings. (batch-expanded)
          prompt_embeds_mask (`Tensor`):
              The encoder attention mask. (batch-expanded)
          negative_prompt_embeds (`Tensor`):
              The negative prompt embeddings. (batch-expanded)
          negative_prompt_embeds_mask (`Tensor`):
              The negative prompt embeddings mask. (batch-expanded)
          image_height (`int`):
              The image height calculated from the image latents dimension
          image_width (`int`):
              The image width calculated from the image latents dimension
          height (`int`):
              if not provided, updated to image height
          width (`int`):
              if not provided, updated to image width
          image_latents (`Tensor`):
              image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified and
              batch-expanded)
          processed_mask_image (`Tensor`):
              The processed mask image (batch-expanded)
    r$   processed_mask_imagezThe processed mask image)name	type_hintr-   )additional_batch_inputsrL   rM   c                 C   r(   )NzKInput step that prepares the inputs for the inpainting denoising step. It:
r)   r*   r)   r)   r,   r-     r.   z%QwenImageInpaintInputStep.descriptionN)r/   r0   r1   r2   r3   r"   r    r   torchTensorr4   r5   r7   r-   r)   r)   r)   r,   rN   A  s    6rN   c                   @   r9   )	"QwenImageInpaintPrepareLatentsStepaG  
    This step prepares the latents/image_latents and mask inputs for the inpainting denoising step. It:
       - Add noise to the image latents to create the latents input for the denoiser.
       - Create the pachified latents `mask` based on the processedmask image.

      Components:
          scheduler (`FlowMatchEulerDiscreteScheduler`) pachifier (`QwenImagePachifier`)

      Inputs:
          latents (`Tensor`):
              The initial random noised, can be generated in prepare latent step.
          image_latents (`Tensor`):
              image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be
              generated from vae encoder and updated in input step.)
          timesteps (`Tensor`):
              The timesteps to use for the denoising process. Can be generated in set_timesteps step.
          processed_mask_image (`Tensor`):
              The processed mask to use for the inpainting process.
          height (`int`):
              The height in pixels of the generated image.
          width (`int`):
              The width in pixels of the generated image.
          dtype (`dtype`, *optional*, defaults to torch.float32):
              The dtype of the model inputs, can be generated in input step.

      Outputs:
          initial_noise (`Tensor`):
              The initial random noised used for inpainting denoising.
          latents (`Tensor`):
              The scaled noisy latents to use for inpainting/image-to-image denoising.
          mask (`Tensor`):
              The mask to use for the inpainting process.
    r$   add_noise_to_latentscreate_mask_latentsr'   c                 C   r=   )NzThis step prepares the latents/image_latents and mask inputs for the inpainting denoising step. It:
 - Add noise to the image latents to create the latents input for the denoiser.
 - Create the pachified latents `mask` based on the processedmask image.
r)   r*   r)   r)   r,   r-     r>   z.QwenImageInpaintPrepareLatentsStep.descriptionN)r/   r0   r1   r2   r3   r   r   r4   r5   r7   r8   r-   r)   r)   r)   r,   rU     r?   rU   c                   @   sP   e Zd ZdZdZe e e e e	 e
 gZg dZedd Zedd ZdS )	QwenImageCoreDenoiseStepa  
    step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs
    (timesteps, latents, rope inputs etc.).

      Components:
          pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider
          (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`)

      Inputs:
          num_images_per_prompt (`int`, *optional*, defaults to 1):
              The number of images to generate per prompt.
          prompt_embeds (`Tensor`):
              text embeddings used to guide the image generation. Can be generated from text_encoder step.
          prompt_embeds_mask (`Tensor`):
              mask for the text embeddings. Can be generated from text_encoder step.
          negative_prompt_embeds (`Tensor`, *optional*):
              negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
          negative_prompt_embeds_mask (`Tensor`, *optional*):
              mask for the negative text embeddings. Can be generated from text_encoder step.
          latents (`Tensor`, *optional*):
              Pre-generated noisy latents for image generation.
          height (`int`, *optional*):
              The height in pixels of the generated image.
          width (`int`, *optional*):
              The width in pixels of the generated image.
          generator (`Generator`, *optional*):
              Torch generator for deterministic generation.
          num_inference_steps (`int`, *optional*, defaults to 50):
              The number of denoising steps.
          sigmas (`list`, *optional*):
              Custom sigmas for the denoising process.
          attention_kwargs (`dict`, *optional*):
              Additional kwargs for attention processors.
          **denoiser_input_fields (`None`, *optional*):
              conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.

      Outputs:
          latents (`Tensor`):
              Denoised latents.
    r$   )inputprepare_latentsset_timestepsprepare_rope_inputsdenoiseafter_denoisec                 C   r(   Nzstep that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs (timesteps, latents, rope inputs etc.).r)   r*   r)   r)   r,   r-     r.   z$QwenImageCoreDenoiseStep.descriptionc                 C      t dgS Nlatentsr
   templater*   r)   r)   r,   outputs     z QwenImageCoreDenoiseStep.outputsN)r/   r0   r1   r2   r3   r"   r   r   r   r   r   r4   r5   r7   r-   re   r)   r)   r)   r,   rX     s    )	
rX   c                   @   T   e Zd ZdZdZe e e e e	 e
 e gZg dZedd Zedd ZdS )	QwenImageInpaintCoreDenoiseStepaM	  
    Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint
    task.

      Components:
          pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider
          (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`)

      Inputs:
          num_images_per_prompt (`int`, *optional*, defaults to 1):
              The number of images to generate per prompt.
          prompt_embeds (`Tensor`):
              text embeddings used to guide the image generation. Can be generated from text_encoder step.
          prompt_embeds_mask (`Tensor`):
              mask for the text embeddings. Can be generated from text_encoder step.
          negative_prompt_embeds (`Tensor`, *optional*):
              negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
          negative_prompt_embeds_mask (`Tensor`, *optional*):
              mask for the negative text embeddings. Can be generated from text_encoder step.
          height (`int`, *optional*):
              The height in pixels of the generated image.
          width (`int`, *optional*):
              The width in pixels of the generated image.
          image_latents (`Tensor`, *optional*):
              image latents used to guide the image generation. Can be generated from vae_encoder step.
          processed_mask_image (`Tensor`, *optional*):
              The processed mask image
          latents (`Tensor`, *optional*):
              Pre-generated noisy latents for image generation.
          generator (`Generator`, *optional*):
              Torch generator for deterministic generation.
          num_inference_steps (`int`, *optional*, defaults to 50):
              The number of denoising steps.
          sigmas (`list`, *optional*):
              Custom sigmas for the denoising process.
          strength (`float`, *optional*, defaults to 0.9):
              Strength for img2img/inpainting.
          attention_kwargs (`dict`, *optional*):
              Additional kwargs for attention processors.
          **denoiser_input_fields (`None`, *optional*):
              conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.

      Outputs:
          latents (`Tensor`):
              Denoised latents.
    r$   )rY   rZ   r[   prepare_inpaint_latentsr\   r]   r^   c                 C   r(   NzyBefore denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task.r)   r*   r)   r)   r,   r-   N  r.   z+QwenImageInpaintCoreDenoiseStep.descriptionc                 C   r`   ra   rc   r*   r)   r)   r,   re   R  rf   z'QwenImageInpaintCoreDenoiseStep.outputsN)r/   r0   r1   r2   r3   rN   r   r   rU   r   r   r   r4   r5   r7   r-   re   r)   r)   r)   r,   rh   
  s     /	

rh   c                   @   rg   )	QwenImageImg2ImgCoreDenoiseStepa  
    Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img
    task.

      Components:
          pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) guider
          (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`)

      Inputs:
          num_images_per_prompt (`int`, *optional*, defaults to 1):
              The number of images to generate per prompt.
          prompt_embeds (`Tensor`):
              text embeddings used to guide the image generation. Can be generated from text_encoder step.
          prompt_embeds_mask (`Tensor`):
              mask for the text embeddings. Can be generated from text_encoder step.
          negative_prompt_embeds (`Tensor`, *optional*):
              negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
          negative_prompt_embeds_mask (`Tensor`, *optional*):
              mask for the negative text embeddings. Can be generated from text_encoder step.
          height (`int`, *optional*):
              The height in pixels of the generated image.
          width (`int`, *optional*):
              The width in pixels of the generated image.
          image_latents (`Tensor`):
              image latents used to guide the image generation. Can be generated from vae_encoder step.
          latents (`Tensor`, *optional*):
              Pre-generated noisy latents for image generation.
          generator (`Generator`, *optional*):
              Torch generator for deterministic generation.
          num_inference_steps (`int`, *optional*, defaults to 50):
              The number of denoising steps.
          sigmas (`list`, *optional*):
              Custom sigmas for the denoising process.
          strength (`float`, *optional*, defaults to 0.9):
              Strength for img2img/inpainting.
          attention_kwargs (`dict`, *optional*):
              Additional kwargs for attention processors.
          **denoiser_input_fields (`None`, *optional*):
              conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.

      Outputs:
          latents (`Tensor`):
              Denoised latents.
    r$   )rY   rZ   r[   prepare_img2img_latentsr\   r]   r^   c                 C   r(   NzyBefore denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task.r)   r*   r)   r)   r,   r-     r.   z+QwenImageImg2ImgCoreDenoiseStep.descriptionc                 C   r`   ra   rc   r*   r)   r)   r,   re     rf   z'QwenImageImg2ImgCoreDenoiseStep.outputsN)r/   r0   r1   r2   r3   rK   r   r   r   r   r   r   r4   r5   r7   r-   re   r)   r)   r)   r,   rk   [  s     -	

rk   c                   @   sX   e Zd ZdZdZe e e e e	 e
 e e gZg dZedd Zedd ZdS )	"QwenImageControlNetCoreDenoiseStepa^
  
    step that denoise noise into image for text2image task. It includes the denoise loop, as well as prepare the inputs
    (timesteps, latents, rope inputs etc.).

      Components:
          pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) controlnet
          (`QwenImageControlNetModel`) guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`)

      Inputs:
          num_images_per_prompt (`int`, *optional*, defaults to 1):
              The number of images to generate per prompt.
          prompt_embeds (`Tensor`):
              text embeddings used to guide the image generation. Can be generated from text_encoder step.
          prompt_embeds_mask (`Tensor`):
              mask for the text embeddings. Can be generated from text_encoder step.
          negative_prompt_embeds (`Tensor`, *optional*):
              negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
          negative_prompt_embeds_mask (`Tensor`, *optional*):
              mask for the negative text embeddings. Can be generated from text_encoder step.
          control_image_latents (`Tensor`):
              The control image latents to use for the denoising process. Can be generated in controlnet vae encoder
              step.
          height (`int`, *optional*):
              The height in pixels of the generated image.
          width (`int`, *optional*):
              The width in pixels of the generated image.
          latents (`Tensor`, *optional*):
              Pre-generated noisy latents for image generation.
          generator (`Generator`, *optional*):
              Torch generator for deterministic generation.
          num_inference_steps (`int`, *optional*, defaults to 50):
              The number of denoising steps.
          sigmas (`list`, *optional*):
              Custom sigmas for the denoising process.
          control_guidance_start (`float`, *optional*, defaults to 0.0):
              When to start applying ControlNet.
          control_guidance_end (`float`, *optional*, defaults to 1.0):
              When to stop applying ControlNet.
          controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0):
              Scale for ControlNet conditioning.
          attention_kwargs (`dict`, *optional*):
              Additional kwargs for attention processors.
          **denoiser_input_fields (`None`, *optional*):
              conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.

      Outputs:
          latents (`Tensor`):
              Denoised latents.
    r$   )rY   controlnet_inputrZ   r[   r\   controlnet_before_denoisecontrolnet_denoiser^   c                 C   r(   r_   r)   r*   r)   r)   r,   r-     r.   z.QwenImageControlNetCoreDenoiseStep.descriptionc                 C   r`   ra   rc   r*   r)   r)   r,   re     rf   z*QwenImageControlNetCoreDenoiseStep.outputsN)r/   r0   r1   r2   r3   r"   r!   r   r   r   r   r   r   r4   r5   r7   r-   re   r)   r)   r)   r,   rn     s"    2

rn   c                	   @   \   e Zd ZdZdZe e e e e	 e
 e e e g	Zg dZedd Zedd ZdS )	)QwenImageControlNetInpaintCoreDenoiseStepa  
    Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint
    task.

      Components:
          pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) controlnet
          (`QwenImageControlNetModel`) guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`)

      Inputs:
          num_images_per_prompt (`int`, *optional*, defaults to 1):
              The number of images to generate per prompt.
          prompt_embeds (`Tensor`):
              text embeddings used to guide the image generation. Can be generated from text_encoder step.
          prompt_embeds_mask (`Tensor`):
              mask for the text embeddings. Can be generated from text_encoder step.
          negative_prompt_embeds (`Tensor`, *optional*):
              negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
          negative_prompt_embeds_mask (`Tensor`, *optional*):
              mask for the negative text embeddings. Can be generated from text_encoder step.
          height (`int`, *optional*):
              The height in pixels of the generated image.
          width (`int`, *optional*):
              The width in pixels of the generated image.
          image_latents (`Tensor`, *optional*):
              image latents used to guide the image generation. Can be generated from vae_encoder step.
          processed_mask_image (`Tensor`, *optional*):
              The processed mask image
          control_image_latents (`Tensor`):
              The control image latents to use for the denoising process. Can be generated in controlnet vae encoder
              step.
          latents (`Tensor`, *optional*):
              Pre-generated noisy latents for image generation.
          generator (`Generator`, *optional*):
              Torch generator for deterministic generation.
          num_inference_steps (`int`, *optional*, defaults to 50):
              The number of denoising steps.
          sigmas (`list`, *optional*):
              Custom sigmas for the denoising process.
          strength (`float`, *optional*, defaults to 0.9):
              Strength for img2img/inpainting.
          control_guidance_start (`float`, *optional*, defaults to 0.0):
              When to start applying ControlNet.
          control_guidance_end (`float`, *optional*, defaults to 1.0):
              When to stop applying ControlNet.
          controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0):
              Scale for ControlNet conditioning.
          attention_kwargs (`dict`, *optional*):
              Additional kwargs for attention processors.
          **denoiser_input_fields (`None`, *optional*):
              conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.

      Outputs:
          latents (`Tensor`):
              Denoised latents.
    r$   )	rY   ro   rZ   r[   ri   r\   rp   rq   r^   c                 C   r(   rj   r)   r*   r)   r)   r,   r-   Q  r.   z5QwenImageControlNetInpaintCoreDenoiseStep.descriptionc                 C   r`   ra   rc   r*   r)   r)   r,   re   U  rf   z1QwenImageControlNetInpaintCoreDenoiseStep.outputsN)r/   r0   r1   r2   r3   rN   r!   r   r   rU   r   r   r   r   r4   r5   r7   r-   re   r)   r)   r)   r,   rs      s$    8
rs   c                	   @   rr   )	)QwenImageControlNetImg2ImgCoreDenoiseStepa2  
    Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img
    task.

      Components:
          pachifier (`QwenImagePachifier`) scheduler (`FlowMatchEulerDiscreteScheduler`) controlnet
          (`QwenImageControlNetModel`) guider (`ClassifierFreeGuidance`) transformer (`QwenImageTransformer2DModel`)

      Inputs:
          num_images_per_prompt (`int`, *optional*, defaults to 1):
              The number of images to generate per prompt.
          prompt_embeds (`Tensor`):
              text embeddings used to guide the image generation. Can be generated from text_encoder step.
          prompt_embeds_mask (`Tensor`):
              mask for the text embeddings. Can be generated from text_encoder step.
          negative_prompt_embeds (`Tensor`, *optional*):
              negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
          negative_prompt_embeds_mask (`Tensor`, *optional*):
              mask for the negative text embeddings. Can be generated from text_encoder step.
          height (`int`, *optional*):
              The height in pixels of the generated image.
          width (`int`, *optional*):
              The width in pixels of the generated image.
          image_latents (`Tensor`):
              image latents used to guide the image generation. Can be generated from vae_encoder step.
          control_image_latents (`Tensor`):
              The control image latents to use for the denoising process. Can be generated in controlnet vae encoder
              step.
          latents (`Tensor`, *optional*):
              Pre-generated noisy latents for image generation.
          generator (`Generator`, *optional*):
              Torch generator for deterministic generation.
          num_inference_steps (`int`, *optional*, defaults to 50):
              The number of denoising steps.
          sigmas (`list`, *optional*):
              Custom sigmas for the denoising process.
          strength (`float`, *optional*, defaults to 0.9):
              Strength for img2img/inpainting.
          control_guidance_start (`float`, *optional*, defaults to 0.0):
              When to start applying ControlNet.
          control_guidance_end (`float`, *optional*, defaults to 1.0):
              When to stop applying ControlNet.
          controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0):
              Scale for ControlNet conditioning.
          attention_kwargs (`dict`, *optional*):
              Additional kwargs for attention processors.
          **denoiser_input_fields (`None`, *optional*):
              conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.

      Outputs:
          latents (`Tensor`):
              Denoised latents.
    r$   )	rY   ro   rZ   r[   rl   r\   rp   rq   r^   c                 C   r(   rm   r)   r*   r)   r)   r,   r-     r.   z5QwenImageControlNetImg2ImgCoreDenoiseStep.descriptionc                 C   r`   ra   rc   r*   r)   r)   r,   re     rf   z1QwenImageControlNetImg2ImgCoreDenoiseStep.outputsN)r/   r0   r1   r2   r3   rK   r!   r   r   r   r   r   r   r   r4   r5   r7   r-   re   r)   r)   r)   r,   rt   ^  s$    6
rt   c                   @   sR   e Zd ZeeeeeegZ	g dZ
g dZdZdddZedd Zed	d
 ZdS )QwenImageAutoCoreDenoiseStep)
text2imagerB   rC   controlnet_text2imagecontrolnet_inpaintcontrolnet_img2img)control_image_latentsrO   image_latentsrv   Nc                 C   s@   |d ur|d ur
dS |d urdS dS |d urdS |d urdS dS )Nrx   ry   rw   rB   rC   rv   r)   )r+   rz   rO   r{   r)   r)   r,   select_block  s   z)QwenImageAutoCoreDenoiseStep.select_blockc                 C   r=   )Na  Core step that performs the denoising process. 
 - `QwenImageCoreDenoiseStep` (text2image) for text2image tasks.
 - `QwenImageInpaintCoreDenoiseStep` (inpaint) for inpaint tasks.
 - `QwenImageImg2ImgCoreDenoiseStep` (img2img) for img2img tasks.
 - `QwenImageControlNetCoreDenoiseStep` (controlnet_text2image) for text2image tasks with controlnet.
 - `QwenImageControlNetInpaintCoreDenoiseStep` (controlnet_inpaint) for inpaint tasks with controlnet.
 - `QwenImageControlNetImg2ImgCoreDenoiseStep` (controlnet_img2img) for img2img tasks with controlnet.
This step support text-to-image, image-to-image, inpainting, and controlnet tasks for QwenImage:
 - for image-to-image generation, you need to provide `image_latents`
 - for inpainting, you need to provide `processed_mask_image` and `image_latents`
 - to run the controlnet workflow, you need to provide `control_image_latents`
 - for text-to-image generation, all you need to provide is prompt embeddingsr)   r*   r)   r)   r,   r-     r>   z(QwenImageAutoCoreDenoiseStep.descriptionc                 C   r`   ra   rc   r*   r)   r)   r,   re     rf   z$QwenImageAutoCoreDenoiseStep.outputs)NNN)r/   r0   r1   rX   rh   rk   rn   rs   rt   r4   r5   r6   default_block_namer|   r7   r-   re   r)   r)   r)   r,   ru     s     

ru   c                   @   rJ   )QwenImageDecodeStepaP  
    Decode step that decodes the latents to images and postprocess the generated image.

      Components:
          vae (`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`)

      Inputs:
          latents (`Tensor`):
              The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise
              step.
          output_type (`str`, *optional*, defaults to pil):
              Output format: 'pil', 'np', 'pt'.

      Outputs:
          images (`list`):
              Generated images. (tensor output of the vae decoder.)
    r$   decodepostprocessc                 C   r(   )NzSDecode step that decodes the latents to images and postprocess the generated image.r)   r*   r)   r)   r,   r-     r.   zQwenImageDecodeStep.descriptionN)r/   r0   r1   r2   r3   r   r   r4   r5   r7   r-   r)   r)   r)   r,   r~     s    r~   c                   @   rJ   )QwenImageInpaintDecodeStepaK  
    Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask
    overally to the original image.

      Components:
          vae (`AutoencoderKLQwenImage`) image_mask_processor (`InpaintProcessor`)

      Inputs:
          latents (`Tensor`):
              The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise
              step.
          output_type (`str`, *optional*, defaults to pil):
              Output format: 'pil', 'np', 'pt'.
          mask_overlay_kwargs (`dict`, *optional*):
              The kwargs for the postprocess step to apply the mask overlay. generated in
              InpaintProcessImagesInputStep.

      Outputs:
          images (`list`):
              Generated images. (tensor output of the vae decoder.)
    r$   r   r   c                 C   r(   )NzDecode step that decodes the latents to images and postprocess the generated image, optional apply the mask overally to the original image.r)   r*   r)   r)   r,   r-   5  r.   z&QwenImageInpaintDecodeStep.descriptionN)r/   r0   r1   r2   r3   r   r   r4   r5   r7   r-   r)   r)   r)   r,   r     s    r   c                   @   s0   e Zd ZeegZddgZddgZedd Z	dS )QwenImageAutoDecodeStepinpaint_decoder   maskNc                 C   r=   )NaH  Decode step that decode the latents into images. 
 This is an auto pipeline block that works for inpaint/text2image/img2img tasks, for both QwenImage and QwenImage-Edit.
 - `QwenImageInpaintDecodeStep` (inpaint) is used when `mask` is provided.
 - `QwenImageDecodeStep` (text2image/img2img) is used when `mask` is not provided.
r)   r*   r)   r)   r,   r-   @  r>   z#QwenImageAutoDecodeStep.description)
r/   r0   r1   r   r~   r4   r5   r6   r7   r-   r)   r)   r)   r,   r   ;  rF   r   r%   vae_encodercontrolnet_vae_encoderr]   r   c                
   @   sx   e Zd ZdZdZe Ze Z	ddiddddddddddddddddddd	d
Z
edd Zedd ZdS )QwenImageAutoBlocksa  
    Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage.

      Supported workflows:
        - `text2image`: requires `prompt`
        - `image2image`: requires `prompt`, `image`
        - `inpainting`: requires `prompt`, `mask_image`, `image`
        - `controlnet_text2image`: requires `prompt`, `control_image`
        - `controlnet_image2image`: requires `prompt`, `image`, `control_image`
        - `controlnet_inpainting`: requires `prompt`, `mask_image`, `image`, `control_image`

      Components:
          text_encoder (`Qwen2_5_VLForConditionalGeneration`): The text encoder to use tokenizer (`Qwen2Tokenizer`):
          The tokenizer to use guider (`ClassifierFreeGuidance`) image_mask_processor (`InpaintProcessor`) vae
          (`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`) controlnet (`QwenImageControlNetModel`)
          control_image_processor (`VaeImageProcessor`) pachifier (`QwenImagePachifier`) scheduler
          (`FlowMatchEulerDiscreteScheduler`) transformer (`QwenImageTransformer2DModel`)

      Inputs:
          prompt (`str`, *optional*):
              The prompt or prompts to guide image generation.
          negative_prompt (`str`, *optional*):
              The prompt or prompts not to guide the image generation.
          max_sequence_length (`int`, *optional*, defaults to 1024):
              Maximum sequence length for prompt encoding.
          mask_image (`Image`, *optional*):
              Mask image for inpainting.
          image (`Image | list`, *optional*):
              Reference image(s) for denoising. Can be a single image or list of images.
          height (`int`, *optional*):
              The height in pixels of the generated image.
          width (`int`, *optional*):
              The width in pixels of the generated image.
          padding_mask_crop (`int`, *optional*):
              Padding for mask cropping in inpainting.
          generator (`Generator`, *optional*):
              Torch generator for deterministic generation.
          control_image (`Image`, *optional*):
              Control image for ControlNet conditioning.
          num_images_per_prompt (`int`, *optional*, defaults to 1):
              The number of images to generate per prompt.
          prompt_embeds (`Tensor`):
              text embeddings used to guide the image generation. Can be generated from text_encoder step.
          prompt_embeds_mask (`Tensor`):
              mask for the text embeddings. Can be generated from text_encoder step.
          negative_prompt_embeds (`Tensor`, *optional*):
              negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
          negative_prompt_embeds_mask (`Tensor`, *optional*):
              mask for the negative text embeddings. Can be generated from text_encoder step.
          latents (`Tensor`):
              Pre-generated noisy latents for image generation.
          num_inference_steps (`int`):
              The number of denoising steps.
          sigmas (`list`, *optional*):
              Custom sigmas for the denoising process.
          attention_kwargs (`dict`, *optional*):
              Additional kwargs for attention processors.
          **denoiser_input_fields (`None`, *optional*):
              conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.
          image_latents (`Tensor`, *optional*):
              image latents used to guide the image generation. Can be generated from vae_encoder step.
          processed_mask_image (`Tensor`, *optional*):
              The processed mask image
          strength (`float`, *optional*, defaults to 0.9):
              Strength for img2img/inpainting.
          control_image_latents (`Tensor`, *optional*):
              The control image latents to use for the denoising process. Can be generated in controlnet vae encoder
              step.
          control_guidance_start (`float`, *optional*, defaults to 0.0):
              When to start applying ControlNet.
          control_guidance_end (`float`, *optional*, defaults to 1.0):
              When to stop applying ControlNet.
          controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0):
              Scale for ControlNet conditioning.
          output_type (`str`, *optional*, defaults to pil):
              Output format: 'pil', 'np', 'pt'.
          mask_overlay_kwargs (`dict`, *optional*):
              The kwargs for the postprocess step to apply the mask overlay. generated in
              InpaintProcessImagesInputStep.

      Outputs:
          images (`list`):
              Generated images.
    r$   r&   T)r&   rE   )r&   rD   rE   )r&   rI   )r&   rE   rI   )r&   rD   rE   rI   )rv   image2image
inpaintingrw   controlnet_image2imagecontrolnet_inpaintingc                 C   r(   )NzjAuto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage.r)   r*   r)   r)   r,   r-     r.   zQwenImageAutoBlocks.descriptionc                 C   r`   )Nimagesrc   r*   r)   r)   r,   re     s   zQwenImageAutoBlocks.outputsN)r/   r0   r1   r2   r3   AUTO_BLOCKSvaluesr4   keysr5   _workflow_mapr7   r-   re   r)   r)   r)   r,   r   Y  s     U

	
r   )>rS   utilsr   modular_pipeliner   r   r   modular_pipeline_utilsr   r	   r
   before_denoiser   r   r   r   r   r   r   decodersr   r   r   r   r]   r   r   r   r   encodersr   r   r   r   r   inputsr    r!   r"   
get_loggerr/   loggerr#   r:   r@   rA   rG   rK   rN   rU   rX   rh   rk   rn   rs   rt   ru   r~   r   r   r   r   r)   r)   r)   r,   <module>   sJ   $	
	-2"/?K5IQOV^[C!