o
    Gi}                  
   @   s  d dl Z d dlZd dlZddlmZmZ ddlmZ ddl	m
Z
mZ ddlmZmZ ddlmZmZmZ d	d
lmZmZmZ 				d5dedededefddZ				d6dedB deejB dB dee dB dee dB fddZdd ZG dd deZG dd  d eZG d!d" d"eZ G d#d$ d$eZ!G d%d& d&eZ"G d'd( d(eZ#G d)d* d*eZ$G d+d, d,eZ%G d-d. d.eZ&G d/d0 d0eZ'G d1d2 d2eZ(G d3d4 d4eZ)dS )7    N   )QwenImageControlNetModelQwenImageMultiControlNetModel)FlowMatchEulerDiscreteScheduler)randn_tensorunwrap_module   )ModularPipelineBlocksPipelineState)ComponentSpec
InputParamOutputParam   )QwenImageLayeredPachifierQwenImageModularPipelineQwenImagePachifier            ?ffffff?base_seq_lenmax_seq_len
base_shift	max_shiftc                 C   s,   || ||  }|||  }| | | }|S )N )image_seq_lenr   r   r   r   mbmur   r   h/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/modular_pipelines/qwenimage/before_denoise.pycalculate_shift   s   r    num_inference_stepsdevice	timestepssigmasc                 K   s  |dur|durt d|dur>dtt| jj v }|s(t d| j d| jd||d| | j}t	|}||fS |durpdtt| jj v }|sZt d| j d| jd||d	| | j}t	|}||fS | j|fd
|i| | j}||fS )a  
    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler to get timesteps from.
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
            must be `None`.
        device (`str` or `torch.device`, *optional*):
            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
        timesteps (`list[int]`, *optional*):
            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
            `num_inference_steps` and `sigmas` must be `None`.
        sigmas (`list[float]`, *optional*):
            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
            `num_inference_steps` and `timesteps` must be `None`.

    Returns:
        `tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
        second element is the number of inference steps.
    NzYOnly one of `timesteps` or `sigmas` can be passed. Please choose one to set custom valuesr#   zThe current scheduler class zx's `set_timesteps` does not support custom timestep schedules. Please check whether you are using the correct scheduler.)r#   r"   r$   zv's `set_timesteps` does not support custom sigmas schedules. Please check whether you are using the correct scheduler.)r$   r"   r"   r   )

ValueErrorsetinspect	signatureset_timesteps
parameterskeys	__class__r#   len)	schedulerr!   r"   r#   r$   kwargsaccepts_timestepsaccept_sigmasr   r   r   retrieve_timesteps+   s2   r2   c                 C   sZ   t || |}tt|| d}| j|| j d  }t| dr'| || j  ||| fS )Nr   set_begin_index)minintmaxr#   orderhasattrr3   )r.   r!   strengthinit_timestept_startr#   r   r   r   get_timestepsg   s   
r<   c                   @      e Zd ZdZdZedefddZedee	 fddZ
edee fdd	Zedee fd
dZedd Ze dededefddZdS )QwenImagePrepareLatentsStepa  
    Prepare initial random noise for the generation process

      Components:
          pachifier (`QwenImagePachifier`)

      Inputs:
          latents (`Tensor`, *optional*):
              Pre-generated noisy latents for image generation.
          height (`int`, *optional*):
              The height in pixels of the generated image.
          width (`int`, *optional*):
              The width in pixels of the generated image.
          num_images_per_prompt (`int`, *optional*, defaults to 1):
              The number of images to generate per prompt.
          generator (`Generator`, *optional*):
              Torch generator for deterministic generation.
          batch_size (`int`, *optional*, defaults to 1):
              Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can
              be generated in input step.
          dtype (`dtype`, *optional*, defaults to torch.float32):
              The dtype of the model inputs, can be generated in input step.

      Outputs:
          height (`int`):
              if not set, updated to default value
          width (`int`):
              if not set, updated to default value
          latents (`Tensor`):
              The initial latents to use for the denoising process
    	qwenimagereturnc                 C      dS )Nz7Prepare initial random noise for the generation processr   selfr   r   r   description      z'QwenImagePrepareLatentsStep.descriptionc                 C      t dtddgS N	pachifierfrom_config)default_creation_methodr   r   rB   r   r   r   expected_components      z/QwenImagePrepareLatentsStep.expected_componentsc              	   C   s<   t dt dt dt dt dt dt dgS )Nlatentsheightwidthnum_images_per_prompt	generator
batch_sizedtyper   templaterB   r   r   r   inputs   s   z"QwenImagePrepareLatentsStep.inputsc                 C   *   t dtddt dtddt dtjddgS NrO   z$if not set, updated to default valuename	type_hintrD   rP   rN   z4The initial latents to use for the denoising processr   r5   torchTensorrB   r   r   r   intermediate_outputs      z0QwenImagePrepareLatentsStep.intermediate_outputsc                 C   h   | d ur| |d  dkrt d|d  d|  |d ur0||d  dkr2t d|d  d| d S d S Nr   r   zHeight must be divisible by z but is zWidth must be divisible by r%   rO   rP   vae_scale_factorr   r   r   check_inputs   
   z(QwenImagePrepareLatentsStep.check_inputs
componentsstatec           	      C   s  |  |}| j|j|j|jd |j}|j|j }|jp|j|_|jp%|j	|_dt
|j|jd   }dt
|j|jd   }||jd||f}t|jtrbt|j|krbtdt|j d| d|jd u rzt||j||jd|_|j|j|_| || ||fS Nre   r   r   z/You have passed a list of generators of length z+, but requested an effective batch size of z@. Make sure the batch size matches the length of the generators.)rR   r"   rT   )get_block_staterg   rO   rP   rf   _execution_devicerS   rQ   default_heightdefault_widthr5   num_channels_latents
isinstancerR   listr-   r%   rN   r   rT   rH   pack_latentsset_block_state	rC   ri   rj   block_stater"   rS   latent_heightlatent_widthshaper   r   r   __call__   s4   

z$QwenImagePrepareLatentsStep.__call__N__name__
__module____qualname____doc__
model_namepropertystrrD   rr   r   rL   r   rW   r   r`   staticmethodrg   r^   no_gradr   r
   rz   r   r   r   r   r>   y   s     
r>   c                   @   r=   )"QwenImageLayeredPrepareLatentsStepa  
    Prepare initial random noise (B, layers+1, C, H, W) for the generation process

      Components:
          pachifier (`QwenImageLayeredPachifier`)

      Inputs:
          latents (`Tensor`, *optional*):
              Pre-generated noisy latents for image generation.
          height (`int`, *optional*):
              The height in pixels of the generated image.
          width (`int`, *optional*):
              The width in pixels of the generated image.
          layers (`int`, *optional*, defaults to 4):
              Number of layers to extract from the image
          num_images_per_prompt (`int`, *optional*, defaults to 1):
              The number of images to generate per prompt.
          generator (`Generator`, *optional*):
              Torch generator for deterministic generation.
          batch_size (`int`, *optional*, defaults to 1):
              Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can
              be generated in input step.
          dtype (`dtype`, *optional*, defaults to torch.float32):
              The dtype of the model inputs, can be generated in input step.

      Outputs:
          height (`int`):
              if not set, updated to default value
          width (`int`):
              if not set, updated to default value
          latents (`Tensor`):
              The initial latents to use for the denoising process
    qwenimage-layeredr@   c                 C   rA   )NzNPrepare initial random noise (B, layers+1, C, H, W) for the generation processr   rB   r   r   r   rD     rE   z.QwenImageLayeredPrepareLatentsStep.descriptionc                 C   rF   rG   )r   r   rB   r   r   r   rL     rM   z6QwenImageLayeredPrepareLatentsStep.expected_componentsc              
   C   sD   t dt dt dt dt dt dt dt dgS )	NrN   rO   rP   layersrQ   rR   rS   rT   rU   rB   r   r   r   rW     s   z)QwenImageLayeredPrepareLatentsStep.inputsc                 C   rX   rY   r]   rB   r   r   r   r`   )  ra   z7QwenImageLayeredPrepareLatentsStep.intermediate_outputsc                 C   rb   rc   rd   re   r   r   r   rg   5  rh   z/QwenImageLayeredPrepareLatentsStep.check_inputsri   rj   c           	      C   s  |  |}| j|j|j|jd |j}|j|j }|jp|j|_|jp%|j	|_dt
|j|jd   }dt
|j|jd   }||jd |j||f}t|jtret|j|kretdt|j d| d|jd u r}t||j||jd|_|j|j|_| || ||fS rk   )rl   rg   rO   rP   rf   rm   rS   rQ   rn   ro   r5   r   rp   rq   rR   rr   r-   r%   rN   r   rT   rH   rs   rt   ru   r   r   r   rz   =  s4   

z+QwenImageLayeredPrepareLatentsStep.__call__Nr{   r   r   r   r   r      s    "
r   c                   @   r=   )'QwenImagePrepareLatentsWithStrengthStepa  
    Step that adds noise to image latents for image-to-image/inpainting. Should be run after set_timesteps,
    prepare_latents. Both noise and image latents should alreadybe patchified.

      Components:
          scheduler (`FlowMatchEulerDiscreteScheduler`)

      Inputs:
          latents (`Tensor`):
              The initial random noised, can be generated in prepare latent step.
          image_latents (`Tensor`):
              image latents used to guide the image generation. Can be generated from vae_encoder step. (Can be
              generated from vae encoder and updated in input step.)
          timesteps (`Tensor`):
              The timesteps to use for the denoising process. Can be generated in set_timesteps step.

      Outputs:
          initial_noise (`Tensor`):
              The initial random noised used for inpainting denoising.
          latents (`Tensor`):
              The scaled noisy latents to use for inpainting/image-to-image denoising.
    r?   r@   c                 C   rA   )NzStep that adds noise to image latents for image-to-image/inpainting. Should be run after set_timesteps, prepare_latents. Both noise and image latents should alreadybe patchified.r   rB   r   r   r   rD   ~  rE   z3QwenImagePrepareLatentsWithStrengthStep.descriptionc                 C      t dtgS Nr.   r   r   rB   r   r   r   rL        z;QwenImagePrepareLatentsWithStrengthStep.expected_componentsc                 C   s0   t ddtjddt jdddt ddtjd	dgS )
NrN   TzCThe initial random noised, can be generated in prepare latent step.r[   requiredr\   rD   image_latentsz<Can be generated from vae encoder and updated in input step.)noter#   WThe timesteps to use for the denoising process. Can be generated in set_timesteps step.r   r^   r_   rV   rB   r   r   r   rW     s   z.QwenImagePrepareLatentsWithStrengthStep.inputsc                 C   s    t dtjddt dtjddgS )Ninitial_noisez8The initial random noised used for inpainting denoising.rZ   rN   zHThe scaled noisy latents to use for inpainting/image-to-image denoising.r   r^   r_   rB   r   r   r   r`     s   z<QwenImagePrepareLatentsWithStrengthStep.intermediate_outputsc                 C   sR   | j d |j d krtd| j d  d|j d  | jdkr'td| j d S )Nr   zE`image_latents` must have have same batch size as `latents`, but got z and r   z=`image_latents` must have 3 dimensions (patchified), but got )ry   r%   ndimr   rN   r   r   r   rg     s   
z4QwenImagePrepareLatentsWithStrengthStep.check_inputsri   rj   c                 C   sj   |  |}| j|j|jd |jd d |jjd }|j|_|j	|j||j|_| 
|| ||fS )Nr   r   r   )rl   rg   r   rN   r#   repeatry   r   r.   scale_noisert   )rC   ri   rj   rv   latent_timestepr   r   r   rz     s   

z0QwenImagePrepareLatentsWithStrengthStep.__call__Nr{   r   r   r   r   r   d  s    
	r   c                   @   s   e Zd ZdZdZedefddZedee	 fddZ
edee fdd	Zedee fd
dZe dededefddZdS )QwenImageCreateMaskLatentsStepa  
    Step that creates mask latents from preprocessed mask_image by interpolating to latent space.

      Components:
          pachifier (`QwenImagePachifier`)

      Inputs:
          processed_mask_image (`Tensor`):
              The processed mask to use for the inpainting process.
          height (`int`):
              The height in pixels of the generated image.
          width (`int`):
              The width in pixels of the generated image.
          dtype (`dtype`, *optional*, defaults to torch.float32):
              The dtype of the model inputs, can be generated in input step.

      Outputs:
          mask (`Tensor`):
              The mask to use for the inpainting process.
    r?   r@   c                 C   rA   )Nz]Step that creates mask latents from preprocessed mask_image by interpolating to latent space.r   rB   r   r   r   rD     rE   z*QwenImageCreateMaskLatentsStep.descriptionc                 C   rF   rG   rK   rB   r   r   r   rL     rM   z2QwenImageCreateMaskLatentsStep.expected_componentsc                 C   s4   t ddtjddt jdddt jdddt dgS )	Nprocessed_mask_imageTz5The processed mask to use for the inpainting process.r   rO   r   rP   rT   r   rB   r   r   r   rW     s   z%QwenImageCreateMaskLatentsStep.inputsc                 C      t dtjddgS )Nmaskz+The mask to use for the inpainting process.rZ   r   rB   r   r   r   r`        z3QwenImageCreateMaskLatentsStep.intermediate_outputsri   rj   c                 C   s   |  |}|j}dt|j|jd   }dt|j|jd   }tjjj	|j
||fd|_|jd|_|jd|jddd|_|jj||jd|_|j|j|_| || ||fS )Nr   )sizer   r"   rT   )rl   rm   r5   rO   rf   rP   r^   nn
functionalinterpolater   r   	unsqueezer   rp   torT   rH   rs   rt   )rC   ri   rj   rv   r"   height_latentswidth_latentsr   r   r   rz     s   
z'QwenImageCreateMaskLatentsStep.__call__N)r|   r}   r~   r   r   r   r   rD   rr   r   rL   r   rW   r   r`   r^   r   r   r
   rz   r   r   r   r   r     s    r   c                   @   ~   e Zd ZdZdZedefddZedee	 fddZ
edee fdd	Zedee fd
dZdededefddZdS )QwenImageSetTimestepsStepa  
    Step that sets the scheduler's timesteps for text-to-image generation. Should be run after prepare latents step.

      Components:
          scheduler (`FlowMatchEulerDiscreteScheduler`)

      Inputs:
          num_inference_steps (`int`, *optional*, defaults to 50):
              The number of denoising steps.
          sigmas (`list`, *optional*):
              Custom sigmas for the denoising process.
          latents (`Tensor`):
              The initial random noised latents for the denoising process. Can be generated in prepare latents step.

      Outputs:
          timesteps (`Tensor`):
              The timesteps to use for the denoising process
    r?   r@   c                 C   rA   )NzpStep that sets the scheduler's timesteps for text-to-image generation. Should be run after prepare latents step.r   rB   r   r   r   rD   =  rE   z%QwenImageSetTimestepsStep.descriptionc                 C   r   r   r   rB   r   r   r   rL   A  r   z-QwenImageSetTimestepsStep.expected_componentsc                 C   s$   t dt dt ddtjddgS )Nr!   r$   rN   TzfThe initial random noised latents for the denoising process. Can be generated in prepare latents step.r   r   rV   r^   r_   rB   r   r   r   rW   G  s   z QwenImageSetTimestepsStep.inputsc                 C   r   )Nr#   z.The timesteps to use for the denoising processrZ   r   rB   r   r   r   r`   T  r   z.QwenImageSetTimestepsStep.intermediate_outputsri   rj   c              	   C   s   |  |}|j}|jd u rtdd|j |jn|j}t|jjd |j	j
dd|j	j
dd|j	j
dd|j	j
d	d
d}t|j	|j|||d\|_|_|j	d | || ||fS )N      ?r   base_image_seq_lenr   max_image_seq_lenr   r   r   r   r   r   r   r   r   r   r.   r!   r"   r$   r   r   )rl   rm   r$   nplinspacer!   r    rN   ry   r.   configgetr2   r#   r3   rt   rC   ri   rj   rv   r"   r$   r   r   r   r   rz   \  s.   


z"QwenImageSetTimestepsStep.__call__Nr|   r}   r~   r   r   r   r   rD   rr   r   rL   r   rW   r   r`   r   r
   rz   r   r   r   r   r   '  s    r   c                   @   s   e Zd ZdZdZedefddZedee	 fddZ
edee fdd	Zedee fd
dZe dedefddZdS ) QwenImageLayeredSetTimestepsStepa  
    Set timesteps step for QwenImage Layered with custom mu calculation based on image_latents.

      Components:
          scheduler (`FlowMatchEulerDiscreteScheduler`)

      Inputs:
          num_inference_steps (`int`, *optional*, defaults to 50):
              The number of denoising steps.
          sigmas (`list`, *optional*):
              Custom sigmas for the denoising process.
          image_latents (`Tensor`):
              image latents used to guide the image generation. Can be generated from vae_encoder step.

      Outputs:
          timesteps (`Tensor`):
              The timesteps to use for the denoising process.
    r   r@   c                 C   rA   )Nz[Set timesteps step for QwenImage Layered with custom mu calculation based on image_latents.r   rB   r   r   r   rD     rE   z,QwenImageLayeredSetTimestepsStep.descriptionc                 C   r   r   r   rB   r   r   r   rL     r   z4QwenImageLayeredSetTimestepsStep.expected_componentsc                 C   s   t dt dt dgS )Nr!   r$   r   rU   rB   r   r   r   rW     s   z'QwenImageLayeredSetTimestepsStep.inputsc                 C   r   )Nr#   /The timesteps to use for the denoising process.rZ   r   rB   r   r   r   r`     r   z5QwenImageLayeredSetTimestepsStep.intermediate_outputsrj   c                 C   s   |  |}|j}d}|jjd | d }|jd u r$tdd|j |jn|j}t|j	|j|||d\|_
|_|j	d | || ||fS )Ng      p@r   r   r   )r$   r   r   )rl   rm   r   ry   r$   r   r   r!   r2   r.   r#   r3   rt   )rC   ri   rj   rv   r"   base_seqlenr   r$   r   r   r   rz     s$   

z)QwenImageLayeredSetTimestepsStep.__call__N)r|   r}   r~   r   r   r   r   rD   rr   r   rL   r   rW   r   r`   r^   r   r
   rz   r   r   r   r   r   }  s    r   c                   @   r   )%QwenImageSetTimestepsWithStrengthStepa  
    Step that sets the scheduler's timesteps for image-to-image generation, and inpainting. Should be run after prepare
    latents step.

      Components:
          scheduler (`FlowMatchEulerDiscreteScheduler`)

      Inputs:
          num_inference_steps (`int`, *optional*, defaults to 50):
              The number of denoising steps.
          sigmas (`list`, *optional*):
              Custom sigmas for the denoising process.
          latents (`Tensor`):
              The latents to use for the denoising process. Can be generated in prepare latents step.
          strength (`float`, *optional*, defaults to 0.9):
              Strength for img2img/inpainting.

      Outputs:
          timesteps (`Tensor`):
              The timesteps to use for the denoising process.
          num_inference_steps (`int`):
              The number of denoising steps to perform at inference time. Updated based on strength.
    r?   r@   c                 C   rA   )NzStep that sets the scheduler's timesteps for image-to-image generation, and inpainting. Should be run after prepare latents step.r   rB   r   r   r   rD     rE   z1QwenImageSetTimestepsWithStrengthStep.descriptionc                 C   r   r   r   rB   r   r   r   rL     r   z9QwenImageSetTimestepsWithStrengthStep.expected_componentsc                 C   s0   t dt dt ddtjddt jddd	gS )
Nr!   r$   rN   TzWThe latents to use for the denoising process. Can be generated in prepare latents step.)r   r\   rD   r9   g?)defaultr   rB   r   r   r   rW     s   z,QwenImageSetTimestepsWithStrengthStep.inputsc                 C   s   t dtjddt dtddgS )Nr#   r   rZ   r!   zVThe number of denoising steps to perform at inference time. Updated based on strength.)r   r^   r_   r5   rB   r   r   r   r`      s   z:QwenImageSetTimestepsWithStrengthStep.intermediate_outputsri   rj   c              	   C   s   |  |}|j}|jd u rtdd|j |jn|j}t|jjd |j	j
dd|j	j
dd|j	j
dd|j	j
d	d
d}t|j	|j|||d\|_|_t|j	|j|jd\|_|_| || ||fS )Nr   r   r   r   r   r   r   r   r   r   r   r   )r.   r!   r9   )rl   rm   r$   r   r   r!   r    rN   ry   r.   r   r   r2   r#   r<   r9   rt   r   r   r   r   rz     s6   


z.QwenImageSetTimestepsWithStrengthStep.__call__Nr   r   r   r   r   r     s    r   c                   @   h   e Zd ZdZdZedefddZedee	 fddZ
edee fdd	Zd
ededefddZdS )QwenImageRoPEInputsStepa  
    Step that prepares the RoPE inputs for the denoising process. Should be place after prepare_latents step

      Inputs:
          batch_size (`int`, *optional*, defaults to 1):
              Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can
              be generated in input step.
          height (`int`):
              The height in pixels of the generated image.
          width (`int`):
              The width in pixels of the generated image.
          prompt_embeds_mask (`Tensor`):
              mask for the text embeddings. Can be generated from text_encoder step.
          negative_prompt_embeds_mask (`Tensor`, *optional*):
              mask for the negative text embeddings. Can be generated from text_encoder step.

      Outputs:
          img_shapes (`list`):
              The shapes of the images latents, used for RoPE calculation
    r?   r@   c                 C      	 dS NzhStep that prepares the RoPE inputs for the denoising process. Should be place after prepare_latents stepr   rB   r   r   r   rD   S     z#QwenImageRoPEInputsStep.descriptionc                 C   s4   t dt jdddt jdddt dt dgS )NrS   rO   Tr   rP   prompt_embeds_masknegative_prompt_embeds_maskrU   rB   r   r   r   rW   Y  s   zQwenImageRoPEInputsStep.inputsc              	   C   $   t ddttttttf   ddgS N
img_shapesdenoiser_input_fieldsz;The shapes of the images latents, used for RoPE calculationr[   kwargs_typer\   rD   r   rr   tupler5   rB   r   r   r   r`   c     z,QwenImageRoPEInputsStep.intermediate_outputsri   rj   c                 C   sL   |  |}d|j|j d |j|j d fgg|j |_| || ||fS Nr   r   )rl   rO   rf   rP   rS   r   rt   rC   ri   rj   rv   r   r   r   rz   n  s   

z QwenImageRoPEInputsStep.__call__Nr|   r}   r~   r   r   r   r   rD   rr   r   rW   r   r`   r   r
   rz   r   r   r   r   r   ;  s    	
r   c                   @   r   )QwenImageEditRoPEInputsStepa  
    Step that prepares the RoPE inputs for denoising process. This is used in QwenImage Edit. Should be placed after
    prepare_latents step

      Inputs:
          batch_size (`int`, *optional*, defaults to 1):
              Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can
              be generated in input step.
          image_height (`int`):
              The height of the reference image. Can be generated in input step.
          image_width (`int`):
              The width of the reference image. Can be generated in input step.
          height (`int`):
              The height in pixels of the generated image.
          width (`int`):
              The width in pixels of the generated image.
          prompt_embeds_mask (`Tensor`):
              mask for the text embeddings. Can be generated from text_encoder step.
          negative_prompt_embeds_mask (`Tensor`, *optional*):
              mask for the negative text embeddings. Can be generated from text_encoder step.

      Outputs:
          img_shapes (`list`):
              The shapes of the images latents, used for RoPE calculation
    r?   r@   c                 C   rA   )NzStep that prepares the RoPE inputs for denoising process. This is used in QwenImage Edit. Should be placed after prepare_latents stepr   rB   r   r   r   rD     rE   z'QwenImageEditRoPEInputsStep.descriptionc              	   C   sP   t dt ddtddt ddtddt jddd	t jd
dd	t dt dgS )NrS   image_heightTzBThe height of the reference image. Can be generated in input step.r   image_widthzAThe width of the reference image. Can be generated in input step.rO   r   rP   r   r   )r   rV   r5   rB   r   r   r   rW     s$   z"QwenImageEditRoPEInputsStep.inputsc              	   C   r   r   r   rB   r   r   r   r`     r   z0QwenImageEditRoPEInputsStep.intermediate_outputsri   rj   c                 C   sl   |  |}d|j|j d |j|j d fd|j|j d |j|j d fgg|j |_| || ||fS r   )	rl   rO   rf   rP   r   r   rS   r   rt   r   r   r   r   rz     s   
z$QwenImageEditRoPEInputsStep.__call__Nr   r   r   r   r   r     s    
r   c                   @   r   )QwenImageEditPlusRoPEInputsStepa  
    Step that prepares the RoPE inputs for denoising process. This is used in QwenImage Edit Plus.
      Unlike Edit, Edit Plus handles lists of image_height/image_width for multiple reference images. Should be placed
      after prepare_latents step.

      Inputs:
          batch_size (`int`, *optional*, defaults to 1):
              Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can
              be generated in input step.
          image_height (`list`):
              The heights of the reference images. Can be generated in input step.
          image_width (`list`):
              The widths of the reference images. Can be generated in input step.
          height (`int`):
              The height in pixels of the generated image.
          width (`int`):
              The width in pixels of the generated image.
          prompt_embeds_mask (`Tensor`):
              mask for the text embeddings. Can be generated from text_encoder step.
          negative_prompt_embeds_mask (`Tensor`, *optional*):
              mask for the negative text embeddings. Can be generated from text_encoder step.

      Outputs:
          img_shapes (`list`):
              The shapes of the image latents, used for RoPE calculation
          txt_seq_lens (`list`):
              The sequence lengths of the prompt embeds, used for RoPE calculation
          negative_txt_seq_lens (`list`):
              The sequence lengths of the negative prompt embeds, used for RoPE calculation
    zqwenimage-edit-plusr@   c                 C   r   )NzStep that prepares the RoPE inputs for denoising process. This is used in QwenImage Edit Plus.
Unlike Edit, Edit Plus handles lists of image_height/image_width for multiple reference images.
Should be placed after prepare_latents step.r   rB   r   r   r   rD     r   z+QwenImageEditPlusRoPEInputsStep.descriptionc              	   C   sX   t dt ddtt ddt ddtt ddt jddd	t jd
dd	t dt dgS )NrS   r   TzDThe heights of the reference images. Can be generated in input step.r   r   zCThe widths of the reference images. Can be generated in input step.rO   r   rP   r   r   )r   rV   rr   r5   rB   r   r   r   rW     s$   z&QwenImageEditPlusRoPEInputsStep.inputsc              	   C   sH   t ddttttttf   ddt ddtt ddt ddtt ddgS )	Nr   r   :The shapes of the image latents, used for RoPE calculationr   txt_seq_lensDThe sequence lengths of the prompt embeds, used for RoPE calculationnegative_txt_seq_lensMThe sequence lengths of the negative prompt embeds, used for RoPE calculationr   rB   r   r   r   r`     s&   z4QwenImageEditPlusRoPEInputsStep.intermediate_outputsri   rj   c                    s   |  |}|j d|j  d |j  d fg fddt|j|jD g|j |_|j	d ur9|j	j
dd nd |_|jd urJ|jj
dd nd |_| || ||fS )Nr   r   c                    s*   g | ]\}}d |  d |  d fqS )r   r   r   ).0
img_height	img_widthrf   r   r   
<listcomp><  s    z<QwenImageEditPlusRoPEInputsStep.__call__.<locals>.<listcomp>dim)rl   rf   rO   rP   zipr   r   rS   r   r   sumtolistr   r   r   rt   r   r   r   r   rz   3  s&   


z(QwenImageEditPlusRoPEInputsStep.__call__Nr   r   r   r   r   r     s    r   c                   @   sl   e Zd ZdZdZedefddZedee	 fddZ
edee fdd	Ze d
edefddZdS )QwenImageLayeredRoPEInputsStepae  
    Step that prepares the RoPE inputs for the denoising process. Should be place after prepare_latents step

      Inputs:
          batch_size (`int`, *optional*, defaults to 1):
              Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can
              be generated in input step.
          layers (`int`, *optional*, defaults to 4):
              Number of layers to extract from the image
          height (`int`):
              The height in pixels of the generated image.
          width (`int`):
              The width in pixels of the generated image.
          prompt_embeds_mask (`Tensor`):
              mask for the text embeddings. Can be generated from text_encoder step.
          negative_prompt_embeds_mask (`Tensor`, *optional*):
              mask for the negative text embeddings. Can be generated from text_encoder step.

      Outputs:
          img_shapes (`list`):
              The shapes of the image latents, used for RoPE calculation
          txt_seq_lens (`list`):
              The sequence lengths of the prompt embeds, used for RoPE calculation
          negative_txt_seq_lens (`list`):
              The sequence lengths of the negative prompt embeds, used for RoPE calculation
          additional_t_cond (`Tensor`):
              The additional t cond, used for RoPE calculation
    r   r@   c                 C   r   r   r   rB   r   r   r   rD   r  r   z*QwenImageLayeredRoPEInputsStep.descriptionc                 C   s<   t dt dt jdddt jdddt dt dgS )	NrS   r   rO   Tr   rP   r   r   rU   rB   r   r   r   rW   x  s   z%QwenImageLayeredRoPEInputsStep.inputsc              	   C   sX   t dttttttf   dddt dtt dddt dtt dddt d	tjdd
dgS )Nr   r   r   )r[   r\   r   rD   r   r   r   r   additional_t_condz0The additional t cond, used for RoPE calculation)r   rr   r   r5   r^   r_   rB   r   r   r   r`     s2   z3QwenImageLayeredRoPEInputsStep.intermediate_outputsrj   c                 C   s   |  |}|j}d|j|j d |j|j d f}|g|jd  g|j |_|jd ur4|jj	dd
 nd |_|jd urE|jj	dd
 nd |_tdg|j j|tjd|_| || ||fS )Nr   r   r   r   r   )rl   rm   rO   rf   rP   r   rS   r   r   r   r   r   r   r   r^   tensorr   longr   rt   )rC   ri   rj   rv   r"   ry   r   r   r   rz     s    

 z'QwenImageLayeredRoPEInputsStep.__call__N)r|   r}   r~   r   r   r   r   rD   rr   r   rW   r   r`   r^   r   r
   rz   r   r   r   r   r   R  s    
r   c                   @   s   e Zd ZdZdZedee fddZede	fddZ
edee fdd	Zedee fd
dZe dededefddZdS )%QwenImageControlNetBeforeDenoiserStepa  
    step that prepare inputs for controlnet. Insert before the Denoise Step, after set_timesteps step.

      Components:
          controlnet (`QwenImageControlNetModel`)

      Inputs:
          control_guidance_start (`float`, *optional*, defaults to 0.0):
              When to start applying ControlNet.
          control_guidance_end (`float`, *optional*, defaults to 1.0):
              When to stop applying ControlNet.
          controlnet_conditioning_scale (`float`, *optional*, defaults to 1.0):
              Scale for ControlNet conditioning.
          control_image_latents (`Tensor`):
              The control image latents to use for the denoising process. Can be generated in controlnet vae encoder
              step.
          timesteps (`Tensor`):
              The timesteps to use for the denoising process. Can be generated in set_timesteps step.

      Outputs:
          controlnet_keep (`list`):
              The controlnet keep values
    r?   r@   c                 C   r   )N
controlnet)r   r   rB   r   r   r   rL     r   z9QwenImageControlNetBeforeDenoiserStep.expected_componentsc                 C   rA   )Nzbstep that prepare inputs for controlnet. Insert before the Denoise Step, after set_timesteps step.r   rB   r   r   r   rD     rE   z1QwenImageControlNetBeforeDenoiserStep.descriptionc              
   C   s<   t dt dt dt ddtjddt ddtjd	dgS )
Ncontrol_guidance_startcontrol_guidance_endcontrolnet_conditioning_scalecontrol_image_latentsTzlThe control image latents to use for the denoising process. Can be generated in controlnet vae encoder step.r   r#   r   r   rB   r   r   r   rW     s    z,QwenImageControlNetBeforeDenoiserStep.inputsc                 C   s   t dtt ddgS )Ncontrolnet_keepzThe controlnet keep values)r\   rD   )r   rr   floatrB   r   r   r   r`     s   z:QwenImageControlNetBeforeDenoiserStep.intermediate_outputsri   rj   c                    sL  |  | t|j}t jts!t jtr!t j jg  _n>t jts8t jtr8t j jg  _n't jts_t jts_t|trNt j	nd}| jg | jg  _ _t|trqt j
trq j
g|  _
g  _tt jD ]  fddt j jD } jt|tr|d n| q{| |  ||fS )Nr   c                    s@   g | ]\}}d t t j |k pd t j |k qS )r   r   )r   r-   r#   )r   serv   ir   r   r   *  s    .zBQwenImageControlNetBeforeDenoiserStep.__call__.<locals>.<listcomp>r   )rl   r   r   rq   r   rr   r   r-   r   r   r   r   r   ranger#   r   appendr   rt   )rC   ri   rj   r   multkeepsr   r   r   rz     sD   






 z.QwenImageControlNetBeforeDenoiserStep.__call__N)r|   r}   r~   r   r   r   rr   r   rL   r   rD   r   rW   r   r`   r^   r   r   r
   rz   r   r   r   r   r     s    r   )r   r   r   r   )NNNN)*r'   numpyr   r^   modelsr   r   
schedulersr   utils.torch_utilsr   r   modular_pipeliner	   r
   modular_pipeline_utilsr   r   r   r   r   r   r5   r   r    r   r"   rr   r2   r<   r>   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   <module>   s`   



<twiZVPnF[vr