o
    GÆÏiX  ã                   @   sJ  d dl mZ ddlmZmZ ddlmZmZ ddlm	Z	m
Z
mZmZmZ ddlmZ ddlmZ dd	lmZmZmZ dd
lmZmZ e e¡ZG dd„ deƒZG dd„ deƒZG dd„ deƒZG dd„ deƒZG dd„ deƒZ G dd„ deƒZ!G dd„ deƒZ"G dd„ deƒZ#edeƒ fdeƒ fde#ƒ fdeƒ fgƒZ$G dd „ d eƒZ%d!S )"é   )Úloggingé   )ÚAutoPipelineBlocksÚSequentialPipelineBlocks)ÚInsertableDictÚOutputParamé   )ÚFluxImg2ImgPrepareLatentsStepÚFluxImg2ImgSetTimestepsStepÚFluxPrepareLatentsStepÚFluxRoPEInputsStepÚFluxSetTimestepsStep)ÚFluxDecodeStep)ÚFluxDenoiseStep)ÚFluxProcessImagesInputStepÚFluxTextEncoderStepÚFluxVaeEncoderStep)ÚFluxAdditionalInputsStepÚFluxTextInputStepc                   @   s:   e Zd ZdZdZeƒ eƒ gZddgZe	de
fdd„ƒZdS )	ÚFluxImg2ImgVaeEncoderStepa  
    Vae encoder step that preprocess andencode the image inputs into their latent representations.

      Components:
          image_processor (`VaeImageProcessor`) vae (`AutoencoderKL`)

      Inputs:
          resized_image (`None`, *optional*):
              TODO: Add description.
          image (`None`, *optional*):
              TODO: Add description.
          height (`None`, *optional*):
              TODO: Add description.
          width (`None`, *optional*):
              TODO: Add description.
          generator (`None`, *optional*):
              TODO: Add description.

      Outputs:
          processed_image (`None`):
              TODO: Add description.
          image_latents (`Tensor`):
              The latents representing the reference image
    ÚfluxÚ
preprocessÚencodeÚreturnc                 C   ó   dS )Nz^Vae encoder step that preprocess andencode the image inputs into their latent representations.© ©Úselfr   r   úh/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/modular_pipelines/flux/modular_blocks_flux.pyÚdescriptionL   ó   z%FluxImg2ImgVaeEncoderStep.descriptionN)Ú__name__Ú
__module__Ú__qualname__Ú__doc__Ú
model_namer   r   Úblock_classesÚblock_namesÚpropertyÚstrr   r   r   r   r   r   -   s    r   c                   @   s2   e Zd ZdZdZegZdgZdgZe	dd„ ƒZ
dS )ÚFluxAutoVaeEncoderStepaÔ  
    Vae encoder step that encode the image inputs into their latent representations.
      This is an auto pipeline block that works for img2img tasks.
       - `FluxImg2ImgVaeEncoderStep` (img2img) is used when only `image` is provided. - if `image` is not provided,
         step will be skipped.

      Components:
          image_processor (`VaeImageProcessor`) vae (`AutoencoderKL`)

      Inputs:
          resized_image (`None`, *optional*):
              TODO: Add description.
          image (`None`, *optional*):
              TODO: Add description.
          height (`None`, *optional*):
              TODO: Add description.
          width (`None`, *optional*):
              TODO: Add description.
          generator (`None`, *optional*):
              TODO: Add description.

      Outputs:
          processed_image (`None`):
              TODO: Add description.
          image_latents (`Tensor`):
              The latents representing the reference image
    r   Úimg2imgÚimagec                 C   ó   	 dS )Na  Vae encoder step that encode the image inputs into their latent representations.
This is an auto pipeline block that works for img2img tasks.
 - `FluxImg2ImgVaeEncoderStep` (img2img) is used when only `image` is provided. - if `image` is not provided, step will be skipped.r   r   r   r   r   r   t   ó   ÿz"FluxAutoVaeEncoderStep.descriptionN)r!   r"   r#   r$   r%   r   r&   r'   Úblock_trigger_inputsr(   r   r   r   r   r   r*   R   s    r*   c                   @   s8   e Zd ZdZdZeƒ eƒ eƒ gZg d¢Z	e
dd„ ƒZdS )ÚFluxBeforeDenoiseStepa—  
    Before denoise step that prepares the inputs for the denoise step in text-to-image generation.

      Components:
          scheduler (`FlowMatchEulerDiscreteScheduler`)

      Inputs:
          height (`int`, *optional*):
              TODO: Add description.
          width (`int`, *optional*):
              TODO: Add description.
          latents (`Tensor | NoneType`, *optional*):
              TODO: Add description.
          num_images_per_prompt (`int`, *optional*, defaults to 1):
              TODO: Add description.
          generator (`None`, *optional*):
              TODO: Add description.
          batch_size (`int`):
              Number of prompts, the final batch size of model inputs should be `batch_size * num_images_per_prompt`.
              Can be generated in input step.
          dtype (`dtype`, *optional*):
              The dtype of the model inputs
          num_inference_steps (`None`, *optional*, defaults to 50):
              TODO: Add description.
          timesteps (`None`, *optional*):
              TODO: Add description.
          sigmas (`None`, *optional*):
              TODO: Add description.
          guidance_scale (`None`, *optional*, defaults to 3.5):
              TODO: Add description.
          prompt_embeds (`None`, *optional*):
              TODO: Add description.

      Outputs:
          latents (`Tensor`):
              The initial latents to use for the denoising process
          timesteps (`Tensor`):
              The timesteps to use for inference
          num_inference_steps (`int`):
              The number of denoising steps to perform at inference time
          guidance (`Tensor`):
              Optional guidance to be used.
          txt_ids (`list`):
              The sequence lengths of the prompt embeds, used for RoPE calculation.
          img_ids (`list`):
              The sequence lengths of the image latents, used for RoPE calculation.
    r   )Úprepare_latentsÚset_timestepsÚprepare_rope_inputsc                 C   r   )Nz^Before denoise step that prepares the inputs for the denoise step in text-to-image generation.r   r   r   r   r   r   µ   r    z!FluxBeforeDenoiseStep.descriptionN)r!   r"   r#   r$   r%   r   r   r   r&   r'   r(   r   r   r   r   r   r0   €   s    0r0   c                   @   s<   e Zd ZdZdZeƒ eƒ eƒ eƒ gZ	g d¢Z
edd„ ƒZdS )ÚFluxImg2ImgBeforeDenoiseStepa	  
    Before denoise step that prepare the inputs for the denoise step for img2img task.

      Components:
          scheduler (`FlowMatchEulerDiscreteScheduler`)

      Inputs:
          height (`int`, *optional*):
              TODO: Add description.
          width (`int`, *optional*):
              TODO: Add description.
          latents (`Tensor | NoneType`, *optional*):
              TODO: Add description.
          num_images_per_prompt (`int`, *optional*, defaults to 1):
              TODO: Add description.
          generator (`None`, *optional*):
              TODO: Add description.
          batch_size (`int`):
              Number of prompts, the final batch size of model inputs should be `batch_size * num_images_per_prompt`.
              Can be generated in input step.
          dtype (`dtype`, *optional*):
              The dtype of the model inputs
          num_inference_steps (`None`, *optional*, defaults to 50):
              TODO: Add description.
          timesteps (`None`, *optional*):
              TODO: Add description.
          sigmas (`None`, *optional*):
              TODO: Add description.
          strength (`None`, *optional*, defaults to 0.6):
              TODO: Add description.
          guidance_scale (`None`, *optional*, defaults to 3.5):
              TODO: Add description.
          image_latents (`Tensor`):
              The image latents to use for the denoising process. Can be generated in vae encoder and packed in input
              step.
          prompt_embeds (`None`, *optional*):
              TODO: Add description.

      Outputs:
          latents (`Tensor`):
              The initial latents to use for the denoising process
          timesteps (`Tensor`):
              The timesteps to use for inference
          num_inference_steps (`int`):
              The number of denoising steps to perform at inference time
          guidance (`Tensor`):
              Optional guidance to be used.
          initial_noise (`Tensor`):
              The initial random noised used for inpainting denoising.
          txt_ids (`list`):
              The sequence lengths of the prompt embeds, used for RoPE calculation.
          img_ids (`list`):
              The sequence lengths of the image latents, used for RoPE calculation.
    r   )r1   r2   Úprepare_img2img_latentsr3   c                 C   r   )NzRBefore denoise step that prepare the inputs for the denoise step for img2img task.r   r   r   r   r   r   ý   r    z(FluxImg2ImgBeforeDenoiseStep.descriptionN)r!   r"   r#   r$   r%   r   r
   r	   r   r&   r'   r(   r   r   r   r   r   r4   ¼   s    7ür4   c                   @   ó8   e Zd ZdZdZeegZddgZddgZ	e
dd„ ƒZdS )	ÚFluxAutoBeforeDenoiseStepa¾	  
    Before denoise step that prepare the inputs for the denoise step.
      This is an auto pipeline block that works for text2image.
       - `FluxBeforeDenoiseStep` (text2image) is used.
       - `FluxImg2ImgBeforeDenoiseStep` (img2img) is used when only `image_latents` is provided.

      Components:
          scheduler (`FlowMatchEulerDiscreteScheduler`)

      Inputs:
          height (`int`):
              TODO: Add description.
          width (`int`):
              TODO: Add description.
          latents (`Tensor | NoneType`, *optional*):
              TODO: Add description.
          num_images_per_prompt (`int`, *optional*, defaults to 1):
              TODO: Add description.
          generator (`None`, *optional*):
              TODO: Add description.
          batch_size (`int`):
              Number of prompts, the final batch size of model inputs should be `batch_size * num_images_per_prompt`.
              Can be generated in input step.
          dtype (`dtype`, *optional*):
              The dtype of the model inputs
          num_inference_steps (`None`, *optional*, defaults to 50):
              TODO: Add description.
          timesteps (`None`, *optional*):
              TODO: Add description.
          sigmas (`None`, *optional*):
              TODO: Add description.
          strength (`None`, *optional*, defaults to 0.6):
              TODO: Add description.
          guidance_scale (`None`, *optional*, defaults to 3.5):
              TODO: Add description.
          image_latents (`Tensor`, *optional*):
              The image latents to use for the denoising process. Can be generated in vae encoder and packed in input
              step.
          prompt_embeds (`None`, *optional*):
              TODO: Add description.

      Outputs:
          latents (`Tensor`):
              The initial latents to use for the denoising process
          timesteps (`Tensor`):
              The timesteps to use for inference
          num_inference_steps (`int`):
              The number of denoising steps to perform at inference time
          guidance (`Tensor`):
              Optional guidance to be used.
          initial_noise (`Tensor`):
              The initial random noised used for inpainting denoising.
          txt_ids (`list`):
              The sequence lengths of the prompt embeds, used for RoPE calculation.
          img_ids (`list`):
              The sequence lengths of the image latents, used for RoPE calculation.
    r   r+   Ú
text2imageÚimage_latentsNc                 C   r-   )Na  Before denoise step that prepare the inputs for the denoise step.
This is an auto pipeline block that works for text2image.
 - `FluxBeforeDenoiseStep` (text2image) is used.
 - `FluxImg2ImgBeforeDenoiseStep` (img2img) is used when only `image_latents` is provided.
r   r   r   r   r   r   D  r.   z%FluxAutoBeforeDenoiseStep.description)r!   r"   r#   r$   r%   r4   r0   r&   r'   r/   r(   r   r   r   r   r   r7     s    :r7   c                   @   s4   e Zd ZdZdZeƒ eƒ gZddgZe	dd„ ƒZ
dS )ÚFluxImg2ImgInputStepa?  
    Input step that prepares the inputs for the img2img denoising step. It:

      Inputs:
          num_images_per_prompt (`None`, *optional*, defaults to 1):
              TODO: Add description.
          prompt_embeds (`Tensor`):
              Pre-generated text embeddings. Can be generated from text_encoder step.
          pooled_prompt_embeds (`Tensor`, *optional*):
              Pre-generated pooled text embeddings. Can be generated from text_encoder step.
          height (`None`, *optional*):
              TODO: Add description.
          width (`None`, *optional*):
              TODO: Add description.
          image_latents (`None`, *optional*):
              TODO: Add description.

      Outputs:
          batch_size (`int`):
              Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt
          dtype (`dtype`):
              Data type of model tensor inputs (determined by `prompt_embeds`)
          prompt_embeds (`Tensor`):
              text embeddings used to guide the image generation
          pooled_prompt_embeds (`Tensor`):
              pooled text embeddings used to guide the image generation
          image_height (`int`):
              The height of the image latents
          image_width (`int`):
              The width of the image latents
    r   Útext_inputsÚadditional_inputsc                 C   r   )NzHInput step that prepares the inputs for the img2img denoising step. It:
r   r   r   r   r   r   w  r    z FluxImg2ImgInputStep.descriptionN)r!   r"   r#   r$   r%   r   r   r&   r'   r(   r   r   r   r   r   r:   R  s     r:   c                   @   r6   )	ÚFluxAutoInputStepax  
    Input step that standardize the inputs for the denoising step, e.g. make sure inputs have consistent batch size,
    and patchified.
       This is an auto pipeline block that works for text2image/img2img tasks.
       - `FluxImg2ImgInputStep` (img2img) is used when `image_latents` is provided.
       - `FluxTextInputStep` (text2image) is used when `image_latents` are not provided.

      Inputs:
          num_images_per_prompt (`None`, *optional*, defaults to 1):
              TODO: Add description.
          prompt_embeds (`Tensor`):
              Pre-generated text embeddings. Can be generated from text_encoder step.
          pooled_prompt_embeds (`Tensor`, *optional*):
              Pre-generated pooled text embeddings. Can be generated from text_encoder step.
          height (`None`, *optional*):
              TODO: Add description.
          width (`None`, *optional*):
              TODO: Add description.
          image_latents (`None`, *optional*):
              TODO: Add description.

      Outputs:
          batch_size (`int`):
              Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt
          dtype (`dtype`):
              Data type of model tensor inputs (determined by `prompt_embeds`)
          prompt_embeds (`Tensor`):
              text embeddings used to guide the image generation
          pooled_prompt_embeds (`Tensor`):
              pooled text embeddings used to guide the image generation
          image_height (`int`):
              The height of the image latents
          image_width (`int`):
              The width of the image latents
    r   r+   r8   r9   Nc                 C   r-   )Nal  Input step that standardize the inputs for the denoising step, e.g. make sure inputs have consistent batch size, and patchified. 
 This is an auto pipeline block that works for text2image/img2img tasks.
 - `FluxImg2ImgInputStep` (img2img) is used when `image_latents` is provided.
 - `FluxTextInputStep` (text2image) is used when `image_latents` are not provided.
r   r   r   r   r   r   ª  r.   zFluxAutoInputStep.description)r!   r"   r#   r$   r%   r:   r   r&   r'   r/   r(   r   r   r   r   r   r=     s    $r=   c                   @   s>   e Zd ZdZdZeeegZg d¢Z	e
dd„ ƒZe
dd„ ƒZdS )	ÚFluxCoreDenoiseStepa*  
    Core step that performs the denoising process for Flux.
      This step supports text-to-image and image-to-image tasks for Flux:
       - for image-to-image generation, you need to provide `image_latents`
       - for text-to-image generation, all you need to provide is prompt embeddings.

      Components:
          scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`FluxTransformer2DModel`)

      Inputs:
          num_images_per_prompt (`None`, *optional*, defaults to 1):
              TODO: Add description.
          prompt_embeds (`Tensor`):
              Pre-generated text embeddings. Can be generated from text_encoder step.
          pooled_prompt_embeds (`Tensor`, *optional*):
              Pre-generated pooled text embeddings. Can be generated from text_encoder step.
          height (`None`, *optional*):
              TODO: Add description.
          width (`None`, *optional*):
              TODO: Add description.
          image_latents (`None`, *optional*):
              TODO: Add description.
          latents (`Tensor | NoneType`, *optional*):
              TODO: Add description.
          generator (`None`, *optional*):
              TODO: Add description.
          num_inference_steps (`None`, *optional*, defaults to 50):
              TODO: Add description.
          timesteps (`None`, *optional*):
              TODO: Add description.
          sigmas (`None`, *optional*):
              TODO: Add description.
          strength (`None`, *optional*, defaults to 0.6):
              TODO: Add description.
          guidance_scale (`None`, *optional*, defaults to 3.5):
              TODO: Add description.
          joint_attention_kwargs (`None`, *optional*):
              TODO: Add description.

      Outputs:
          latents (`Tensor`):
              Denoised latents.
    r   )ÚinputÚbefore_denoiseÚdenoisec                 C   r-   )Na  Core step that performs the denoising process for Flux.
This step supports text-to-image and image-to-image tasks for Flux:
 - for image-to-image generation, you need to provide `image_latents`
 - for text-to-image generation, all you need to provide is prompt embeddings.r   r   r   r   r   r   æ  r.   zFluxCoreDenoiseStep.descriptionc                 C   ó   t  d¡gS )NÚlatents©r   Útemplater   r   r   r   Úoutputsï  s   ÿzFluxCoreDenoiseStep.outputsN)r!   r"   r#   r$   r%   r=   r7   r   r&   r'   r(   r   rF   r   r   r   r   r>   µ  s    ,

r>   Útext_encoderÚvae_encoderrA   Údecodec                   @   sP   e Zd ZdZdZe ¡ Ze ¡ Z	ddidddœdœZ
edd„ ƒZed	d
„ ƒZdS )ÚFluxAutoBlocksa‡  
    Auto Modular pipeline for text-to-image and image-to-image using Flux.

      Supported workflows:
        - `text2image`: requires `prompt`
        - `image2image`: requires `image`, `prompt`

      Components:
          text_encoder (`CLIPTextModel`) tokenizer (`CLIPTokenizer`) text_encoder_2 (`T5EncoderModel`) tokenizer_2
          (`T5TokenizerFast`) image_processor (`VaeImageProcessor`) vae (`AutoencoderKL`) scheduler
          (`FlowMatchEulerDiscreteScheduler`) transformer (`FluxTransformer2DModel`)

      Inputs:
          prompt (`None`, *optional*):
              TODO: Add description.
          prompt_2 (`None`, *optional*):
              TODO: Add description.
          max_sequence_length (`int`, *optional*, defaults to 512):
              TODO: Add description.
          joint_attention_kwargs (`None`, *optional*):
              TODO: Add description.
          resized_image (`None`, *optional*):
              TODO: Add description.
          image (`None`, *optional*):
              TODO: Add description.
          height (`None`, *optional*):
              TODO: Add description.
          width (`None`, *optional*):
              TODO: Add description.
          generator (`None`, *optional*):
              TODO: Add description.
          num_images_per_prompt (`None`, *optional*, defaults to 1):
              TODO: Add description.
          image_latents (`None`, *optional*):
              TODO: Add description.
          latents (`Tensor | NoneType`, *optional*):
              TODO: Add description.
          num_inference_steps (`None`, *optional*, defaults to 50):
              TODO: Add description.
          timesteps (`None`, *optional*):
              TODO: Add description.
          sigmas (`None`, *optional*):
              TODO: Add description.
          strength (`None`, *optional*, defaults to 0.6):
              TODO: Add description.
          guidance_scale (`None`, *optional*, defaults to 3.5):
              TODO: Add description.
          output_type (`None`, *optional*, defaults to pil):
              TODO: Add description.

      Outputs:
          images (`list`):
              Generated images.
    r   ÚpromptT)r,   rK   )r8   Úimage2imagec                 C   r   )NzFAuto Modular pipeline for text-to-image and image-to-image using Flux.r   r   r   r   r   r   D  r    zFluxAutoBlocks.descriptionc                 C   rB   )NÚimagesrD   r   r   r   r   rF   H  s   zFluxAutoBlocks.outputsN)r!   r"   r#   r$   r%   ÚAUTO_BLOCKSÚvaluesr&   Úkeysr'   Ú_workflow_mapr(   r   rF   r   r   r   r   rJ     s    7þ
rJ   N)&Úutilsr   Úmodular_pipeliner   r   Úmodular_pipeline_utilsr   r   r@   r	   r
   r   r   r   Údecodersr   rA   r   Úencodersr   r   r   Úinputsr   r   Ú
get_loggerr!   Úloggerr   r*   r0   r4   r7   r:   r=   r>   rN   rJ   r   r   r   r   Ú<module>   s2   
%.<HN-6Büÿ