o
    ۷i                     @   s   d dl Z ddlmZ ddlmZmZ ddlmZmZm	Z	 ddlm
Z
mZmZ 	dd	ed
e jdedede jf
ddZde jdedeeef fddZG dd deZG dd deZG dd deZG dd deZG dd deZdS )    N   )QwenImageMultiControlNetModel   )ModularPipelineBlocksPipelineState)ComponentSpec
InputParamOutputParam   )QwenImageLayeredPachifierQwenImageModularPipelineQwenImagePachifier
input_nameinput_tensor
batch_sizenum_images_per_promptreturnc                 C   sz   t |tjstd|  d|jd dkr|| }n|jd |kr$|}ntd|  d| d|jd  |j|dd}|S )a(  Repeat tensor elements to match the final batch size.

    This function expands a tensor's batch dimension to match the final batch size (batch_size * num_images_per_prompt)
    by repeating each element along dimension 0.

    The input tensor must have batch size 1 or batch_size. The function will:
    - If batch size is 1: repeat each element (batch_size * num_images_per_prompt) times
    - If batch size equals batch_size: repeat each element num_images_per_prompt times

    Args:
        input_name (str): Name of the input tensor (used for error messages)
        input_tensor (torch.Tensor): The tensor to repeat. Must have batch size 1 or batch_size.
        batch_size (int): The base batch size (number of prompts)
        num_images_per_prompt (int, optional): Number of images to generate per prompt. Defaults to 1.

    Returns:
        torch.Tensor: The repeated tensor with final batch size (batch_size * num_images_per_prompt)

    Raises:
        ValueError: If input_tensor is not a torch.Tensor or has invalid batch size

    Examples:
        tensor = torch.tensor([[1, 2, 3]]) # shape: [1, 3] repeated = repeat_tensor_to_batch_size("image", tensor,
        batch_size=2, num_images_per_prompt=2) repeated # tensor([[1, 2, 3], [1, 2, 3], [1, 2, 3], [1, 2, 3]]) - shape:
        [4, 3]

        tensor = torch.tensor([[1, 2, 3], [4, 5, 6]]) # shape: [2, 3] repeated = repeat_tensor_to_batch_size("image",
        tensor, batch_size=2, num_images_per_prompt=2) repeated # tensor([[1, 2, 3], [1, 2, 3], [4, 5, 6], [4, 5, 6]])
        - shape: [4, 3]
    `z` must be a tensorr   r
   z!` must have have batch size 1 or z
, but got dim)
isinstancetorchTensor
ValueErrorshaperepeat_interleave)r   r   r   r   	repeat_by r   b/home/ubuntu/vllm_env/lib/python3.10/site-packages/diffusers/modular_pipelines/qwenimage/inputs.pyrepeat_tensor_to_batch_size   s   %
r   latentsvae_scale_factorc                 C   sN   | j dkr| j dkrtd| j  | jdd \}}|| }|| }||fS )a  Calculate image dimensions from latent tensor dimensions.

    This function converts latent space dimensions to image space dimensions by multiplying the latent height and width
    by the VAE scale factor.

    Args:
        latents (torch.Tensor): The latent tensor. Must have 4 or 5 dimensions.
            Expected shapes: [batch, channels, height, width] or [batch, channels, frames, height, width]
        vae_scale_factor (int): The scale factor used by the VAE to compress images.
            Typically 8 for most VAEs (image is 8x larger than latents in each dimension)

    Returns:
        tuple[int, int]: The calculated image dimensions as (height, width)

    Raises:
        ValueError: If latents tensor doesn't have 4 or 5 dimensions

          z6unpacked latents must have 4 or 5 dimensions, but got N)ndimr   r   )r    r!   latent_heightlatent_widthheightwidthr   r   r    calculate_dimension_from_latentsP   s   r*   c                   @   st   e Zd ZdZdZedefddZedee	 fddZ
edee fdd	Zed
d ZdededefddZdS )QwenImageTextInputsStepa  
    Text input processing step that standardizes text embeddings for the pipeline.
      This step:
        1. Determines `batch_size` and `dtype` based on `prompt_embeds`
        2. Ensures all text embeddings have consistent batch sizes (batch_size * num_images_per_prompt)

      This block should be placed after all encoder steps to process the text embeddings before they are used in
      subsequent pipeline steps.

      Inputs:
          num_images_per_prompt (`int`, *optional*, defaults to 1):
              The number of images to generate per prompt.
          prompt_embeds (`Tensor`):
              text embeddings used to guide the image generation. Can be generated from text_encoder step.
          prompt_embeds_mask (`Tensor`):
              mask for the text embeddings. Can be generated from text_encoder step.
          negative_prompt_embeds (`Tensor`, *optional*):
              negative text embeddings used to guide the image generation. Can be generated from text_encoder step.
          negative_prompt_embeds_mask (`Tensor`, *optional*):
              mask for the negative text embeddings. Can be generated from text_encoder step.

      Outputs:
          batch_size (`int`):
              The batch size of the prompt embeddings
          dtype (`dtype`):
              The data type of the prompt embeddings
          prompt_embeds (`Tensor`):
              The prompt embeddings. (batch-expanded)
          prompt_embeds_mask (`Tensor`):
              The encoder attention mask. (batch-expanded)
          negative_prompt_embeds (`Tensor`):
              The negative prompt embeddings. (batch-expanded)
          negative_prompt_embeds_mask (`Tensor`):
              The negative prompt embeddings mask. (batch-expanded)
    	qwenimager   c                 C   s   d}d}|| S )NzText input processing step that standardizes text embeddings for the pipeline.
This step:
  1. Determines `batch_size` and `dtype` based on `prompt_embeds`
  2. Ensures all text embeddings have consistent batch sizes (batch_size * num_images_per_prompt)z

This block should be placed after all encoder steps to process the text embeddings before they are used in subsequent pipeline steps.r   )selfsummary_sectionplacement_sectionr   r   r   description   s   z#QwenImageTextInputsStep.descriptionc                 C   s,   t dt dt dt dt dgS )Nr   prompt_embedsprompt_embeds_masknegative_prompt_embedsnegative_prompt_embeds_mask)r   templater-   r   r   r   inputs   s   zQwenImageTextInputsStep.inputsc              	   C   sN   t dtddt dtjddt jdddt jd	ddt jd
ddt jdddgS )Nr   z'The batch size of the prompt embeddingsname	type_hintr0   dtypez&The data type of the prompt embeddingsr1   zbatch-expanded)noter2   r3   r4   )r	   intr   r;   r5   r6   r   r   r   intermediate_outputs   s   z,QwenImageTextInputsStep.intermediate_outputsc                 C   s   |d ur|d u rt d|d u r|d urt d|jd | jd kr&t d|d ur8|jd | jd kr8t d|d urJ|jd | jd krLt dd S d S )NzS`negative_prompt_embeds_mask` is required when `negative_prompt_embeds` is not NonezJcannot pass `negative_prompt_embeds_mask` without `negative_prompt_embeds`r   zE`prompt_embeds_mask` must have the same batch size as `prompt_embeds`zI`negative_prompt_embeds` must have the same batch size as `prompt_embeds`zN`negative_prompt_embeds_mask` must have the same batch size as `prompt_embeds`)r   r   r1   r2   r3   r4   r   r   r   check_inputs   s   z$QwenImageTextInputsStep.check_inputs
componentsstatec                 C   s*  |  |}| j|j|j|j|jd |jjd |_|jj|_|jj\}}}|j	d|j
d|_|j|j|j
 |d|_|j	d|j
d|_|j|j|j
 ||_|jd ur|jj\}}}|j	d|j
d|_|j|j|j
 |d|_|j	d|j
d|_|j|j|j
 ||_| || ||fS )Nr?   r   r
   )get_block_stater@   r1   r2   r3   r4   r   r   r;   repeatr   viewset_block_state)r-   rA   rB   block_state_seq_lenr   r   r   __call__   sD   


z QwenImageTextInputsStep.__call__N)__name__
__module____qualname____doc__
model_namepropertystrr0   listr   r7   r	   r>   staticmethodr@   r   r   rK   r   r   r   r   r+   p   s    $	

r+   c                          e Zd ZdZdZ		ddee dB dee dB f fddZede	fd	d
Z
edee fddZedee fddZedee fddZdededefddZ  ZS )QwenImageAdditionalInputsStepa  
    Input processing step that:
        1. For image latent inputs: Updates height/width if None, patchifies, and expands batch size
        2. For additional batch inputs: Expands batch dimensions to match final batch size

      Configured inputs:
        - Image latent inputs: ['image_latents']

      This block should be placed after the encoder steps and the text input step.

      Components:
          pachifier (`QwenImagePachifier`)

      Inputs:
          num_images_per_prompt (`int`, *optional*, defaults to 1):
              The number of images to generate per prompt.
          batch_size (`int`, *optional*, defaults to 1):
              Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can
              be generated in input step.
          height (`int`, *optional*):
              The height in pixels of the generated image.
          width (`int`, *optional*):
              The width in pixels of the generated image.
          image_latents (`Tensor`):
              image latents used to guide the image generation. Can be generated from vae_encoder step.

      Outputs:
          image_height (`int`):
              The image height calculated from the image latents dimension
          image_width (`int`):
              The image width calculated from the image latents dimension
          height (`int`):
              if not provided, updated to image height
          width (`int`):
              if not provided, updated to image width
          image_latents (`Tensor`):
              image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified and
              batch-expanded)
    r,   Nimage_latent_inputsadditional_batch_inputsc                       |d u r
t dg}|d u rg }t|tstdt| |D ]}t|t s0tdt| q t|ts?tdt| |D ]}t|t sQtdt| qA|| _|| _t 	  d S Nimage_latentsz,image_latent_inputs must be a list, but got z:image_latent_inputs must be a list of InputParam, but got z0additional_batch_inputs must be a list, but got z>additional_batch_inputs must be a list of InputParam, but got 
r   r5   r   rS   r   type_image_latent_inputs_additional_batch_inputssuper__init__r-   rW   rX   input_param	__class__r   r   ra   ,  s*   



z&QwenImageAdditionalInputsStep.__init__r   c                 C   h   d}d}| j s
| jr,d}| j r|ddd | j D  7 }| jr,|ddd | jD  7 }d	}|| | S )
NzInput processing step that:
  1. For image latent inputs: Updates height/width if None, patchifies, and expands batch size
  2. For additional batch inputs: Expands batch dimensions to match final batch size 

Configured inputs:
  - Image latent inputs: c                 S      g | ]}|j qS r   r9   .0pr   r   r   
<listcomp>W      z=QwenImageAdditionalInputsStep.description.<locals>.<listcomp>
  - Additional batch inputs: c                 S   rj   r   rk   rl   r   r   r   ro   Y  rp   N

This block should be placed after the encoder steps and the text input step.r^   r_   r-   r.   inputs_infor/   r   r   r   r0   K     z)QwenImageAdditionalInputsStep.descriptionc                 C      t dtddgS N	pachifierfrom_config)default_creation_methodr   r   r6   r   r   r   expected_components_     z1QwenImageAdditionalInputsStep.expected_componentsc                 C   8   t dt dt dt dg}|| j| j 7 }|S Nr   r   r(   r)   r   r5   r^   r_   r-   r7   r   r   r   r7   e  s   z$QwenImageAdditionalInputsStep.inputsc                 C      t dtddt dtddg}t| jdkr)|t dtdd |t d	td
d | jD ]}|t |j|j|jd d q,| jD ]}|t |j|j|jd d qA|S )Nimage_height<The image height calculated from the image latents dimensionr8   image_width;The image width calculated from the image latents dimensionr   r(   (if not provided, updated to image heightr)   'if not provided, updated to image widthz  (patchified and batch-expanded) (batch-expanded)	r	   r=   lenr^   appendr9   r:   r0   r_   r-   outputsrc   r   r   r   r>   r  sF   

	z2QwenImageAdditionalInputsStep.intermediate_outputsrA   rB   c                 C   s   |  |}| jD ]I}|j}t||}|d u rqt||j\}}|jp#||_|jp)||_t|ds3||_	t|ds;||_
|j|}t|||j|jd}t||| q| jD ]}|j}	t||	}
|
d u rdqUt|	|
|j|jd}
t||	|
 qU| || ||fS )Nr   r   r   r   r   r   )rD   r^   r9   getattrr*   r!   r(   r)   hasattrr   r   ry   pack_latentsr   r   r   setattrr_   rG   r-   rA   rB   rH   rc   image_latent_input_nameimage_latent_tensorr(   r)   r   r   r   r   r   rK     sF   






z&QwenImageAdditionalInputsStep.__call__NNrL   rM   rN   rO   rP   rS   r   ra   rQ   rR   r0   r   r}   r7   r	   r>   r   r   rK   __classcell__r   r   rd   r   rV     s&    (

-rV   c                       rU   )%QwenImageEditPlusAdditionalInputsStepa  
    Input processing step for Edit Plus that:
        1. For image latent inputs (list): Collects heights/widths, patchifies each, concatenates, expands batch
        2. For additional batch inputs: Expands batch dimensions to match final batch size
        Height/width defaults to last image in the list.

      Configured inputs:
        - Image latent inputs: ['image_latents']

      This block should be placed after the encoder steps and the text input step.

      Components:
          pachifier (`QwenImagePachifier`)

      Inputs:
          num_images_per_prompt (`int`, *optional*, defaults to 1):
              The number of images to generate per prompt.
          batch_size (`int`, *optional*, defaults to 1):
              Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can
              be generated in input step.
          height (`int`, *optional*):
              The height in pixels of the generated image.
          width (`int`, *optional*):
              The width in pixels of the generated image.
          image_latents (`Tensor`):
              image latents used to guide the image generation. Can be generated from vae_encoder step.

      Outputs:
          image_height (`list`):
              The image heights calculated from the image latents dimension
          image_width (`list`):
              The image widths calculated from the image latents dimension
          height (`int`):
              if not provided, updated to image height
          width (`int`):
              if not provided, updated to image width
          image_latents (`Tensor`):
              image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified,
              concatenated, and batch-expanded)
    zqwenimage-edit-plusNrW   rX   c                    rY   rZ   r\   rb   rd   r   r   ra     *   



z.QwenImageEditPlusAdditionalInputsStep.__init__r   c                 C   rf   )
Na  Input processing step for Edit Plus that:
  1. For image latent inputs (list): Collects heights/widths, patchifies each, concatenates, expands batch
  2. For additional batch inputs: Expands batch dimensions to match final batch size
  Height/width defaults to last image in the list.rg   rh   ri   c                 S   rj   r   rk   rl   r   r   r   ro   -  rp   zEQwenImageEditPlusAdditionalInputsStep.description.<locals>.<listcomp>rq   c                 S   rj   r   rk   rl   r   r   r   ro   /  rp   rr   rs   rt   r   r   r   r0      s   z1QwenImageEditPlusAdditionalInputsStep.descriptionc                 C   rw   rx   r|   r6   r   r   r   r}   5  r~   z9QwenImageEditPlusAdditionalInputsStep.expected_componentsc                 C   r   r   r   r   r   r   r   r7   ;  s   z,QwenImageEditPlusAdditionalInputsStep.inputsc                 C   s   t dtt ddt dtt ddg}t| jdkr-|t dtdd |t d	td
d | jD ]}|t |j|j|jd d q0| j	D ]}|t |j|j|jd d qE|S )Nr   z=The image heights calculated from the image latents dimensionr8   r   z<The image widths calculated from the image latents dimensionr   r(   r   r)   r   z/ (patchified, concatenated, and batch-expanded)r   )
r	   rS   r=   r   r^   r   r9   r:   r0   r_   r   r   r   r   r>   I  sF   

	z:QwenImageEditPlusAdditionalInputsStep.intermediate_outputsrA   rB   c                 C   s\  |  |}| jD ]x}|j}t||}|d u rqt|t}|s!|g}g }g }	g }
t|D ]1\}}t||j\}}|	| |		| |j
|}t| d| d||j|jd}|
	| q+tj|
dd}
||_|	|_|jpp|d |_|jpx|	d |_t|||
 q| jD ]}|j}t||}|d u rqt|||j|jd}t||| q| || ||fS )N[]r   r
   r   rC   )rD   r^   r9   r   r   rS   	enumerater*   r!   r   ry   r   r   r   r   r   catr   r   r(   r)   r   r_   rG   )r-   rA   rB   rH   rc   r   r   is_listimage_heightsimage_widthspacked_image_latent_tensorsiimg_latent_tensorr(   r)   r   r   r   r   r   rK   w  sX   







z.QwenImageEditPlusAdditionalInputsStep.__call__r   r   r   r   rd   r   r     s&    )

-r   c                       rU   )$QwenImageLayeredAdditionalInputsStepaA  
    Input processing step for Layered that:
        1. For image latent inputs: Updates height/width if None, patchifies with layered pachifier, and expands batch
           size
        2. For additional batch inputs: Expands batch dimensions to match final batch size

      Configured inputs:
        - Image latent inputs: ['image_latents']

      This block should be placed after the encoder steps and the text input step.

      Components:
          pachifier (`QwenImageLayeredPachifier`)

      Inputs:
          num_images_per_prompt (`int`, *optional*, defaults to 1):
              The number of images to generate per prompt.
          batch_size (`int`, *optional*, defaults to 1):
              Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can
              be generated in input step.
          image_latents (`Tensor`):
              image latents used to guide the image generation. Can be generated from vae_encoder step.

      Outputs:
          image_height (`int`):
              The image height calculated from the image latents dimension
          image_width (`int`):
              The image width calculated from the image latents dimension
          height (`int`):
              if not provided, updated to image height
          width (`int`):
              if not provided, updated to image width
          image_latents (`Tensor`):
              image latents used to guide the image generation. Can be generated from vae_encoder step. (patchified
              with layered pachifier and batch-expanded)
    zqwenimage-layeredNrW   rX   c                    rY   rZ   r\   rb   rd   r   r   ra     r   z-QwenImageLayeredAdditionalInputsStep.__init__r   c                 C   rf   )
NzInput processing step for Layered that:
  1. For image latent inputs: Updates height/width if None, patchifies with layered pachifier, and expands batch size
  2. For additional batch inputs: Expands batch dimensions to match final batch sizerg   rh   ri   c                 S   rj   r   rk   rl   r   r   r   ro     rp   zDQwenImageLayeredAdditionalInputsStep.description.<locals>.<listcomp>rq   c                 S   rj   r   rk   rl   r   r   r   ro     rp   rr   rs   rt   r   r   r   r0     rv   z0QwenImageLayeredAdditionalInputsStep.descriptionc                 C   rw   rx   )r   r   r6   r   r   r   r}     r~   z8QwenImageLayeredAdditionalInputsStep.expected_componentsc                 C   s(   t dt dg}|| j| j 7 }|S )Nr   r   r   r   r   r   r   r7      s
   z+QwenImageLayeredAdditionalInputsStep.inputsc                 C   r   )Nr   r   r8   r   r   r   r(   r   r)   r   z7 (patchified with layered pachifier and batch-expanded)r   r   r   r   r   r   r>   ,  sF   

	z9QwenImageLayeredAdditionalInputsStep.intermediate_outputsrA   rB   c                 C   s  |  |}| jD ]K}|j}t||}|d u rq|jd |j }|jd |j }||_||_t|ds5||_	t|ds=||_
|j|}t|||j|jd}t||| q| jD ]}|j}	t||	}
|
d u rfqWt|	|
|j|jd}
t||	|
 qW| || ||fS )Nr   r"   r   r   r   )rD   r^   r9   r   r   r!   r(   r)   r   r   r   ry   r   r   r   r   r   r_   rG   r   r   r   r   rK   Y  sH   






z-QwenImageLayeredAdditionalInputsStep.__call__r   r   r   r   rd   r   r     s&    %

,r   c                   @   sp   e Zd ZdZdZedefddZedee	 fddZ
edee fdd	Ze d
ededefddZdS )QwenImageControlNetInputsStepay  
    prepare the `control_image_latents` for controlnet. Insert after all the other inputs steps.

      Inputs:
          control_image_latents (`Tensor`):
              The control image latents to use for the denoising process. Can be generated in controlnet vae encoder
              step.
          batch_size (`int`, *optional*, defaults to 1):
              Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can
              be generated in input step.
          num_images_per_prompt (`int`, *optional*, defaults to 1):
              The number of images to generate per prompt.
          height (`int`, *optional*):
              The height in pixels of the generated image.
          width (`int`, *optional*):
              The width in pixels of the generated image.

      Outputs:
          control_image_latents (`Tensor`):
              The control image latents (patchified and batch-expanded).
          height (`int`):
              if not provided, updated to control image height
          width (`int`):
              if not provided, updated to control image width
    r,   r   c                 C   s   dS )Nz\prepare the `control_image_latents` for controlnet. Insert after all the other inputs steps.r   r6   r   r   r   r0     s   z)QwenImageControlNetInputsStep.descriptionc                 C   s4   t ddtjddt dt dt dt dgS )	Ncontrol_image_latentsTzlThe control image latents to use for the denoising process. Can be generated in controlnet vae encoder step.)r9   requiredr:   r0   r   r   r(   r)   )r   r   r   r5   r6   r   r   r   r7     s   z$QwenImageControlNetInputsStep.inputsc                 C   s*   t dtjddt dtddt dtddgS )Nr   z:The control image latents (patchified and batch-expanded).r8   r(   z0if not provided, updated to control image heightr)   z/if not provided, updated to control image width)r	   r   r   r=   r6   r   r   r   r>     s   z2QwenImageControlNetInputsStep.intermediate_outputsrA   rB   c           	      C   s   |  |}t|jtrHg }t|jD ]1\}}t||j\}}|jp"||_|j	p(||_	|j
|}td| d||j|jd}|| q||_n-t|j|j\}}|jpU||_|j	p[||_	|j
|j|_td|j|j|jd|_|j|_| || ||fS )Nzcontrol_image_latents[r   r   r   )rD   r   
controlnetr   r   r   r*   r!   r(   r)   ry   r   r   r   r   r   rG   )	r-   rA   rB   rH   r   r   control_image_latents_r(   r)   r   r   r   rK     s>   

z&QwenImageControlNetInputsStep.__call__N)rL   rM   rN   rO   rP   rQ   rR   r0   rS   r   r7   r	   r>   r   no_gradr   r   rK   r   r   r   r   r     s    r   )r
   )r   modelsr   modular_pipeliner   r   modular_pipeline_utilsr   r   r	   r   r   r   rR   r   r=   r   tupler*   r+   rV   r   r   r   r   r   r   r   <module>   s6   
 8   V k R