o
    ۷iB                     @   s   d dl mZ d dlZddlmZ ddlmZmZ ddlm	Z	 ddl
mZ dd	lmZmZ dd
lmZmZmZ ddlmZmZmZ eeZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZdS )    )AnyN   )
FrozenDict)InpaintProcessorVaeImageProcessor)AutoencoderKLQwenImage)logging   )ModularPipelineBlocksPipelineState)ComponentSpec
InputParamOutputParam   )QwenImageLayeredPachifierQwenImageModularPipelineQwenImagePachifierc                   @      e Zd ZdZdZedefddZedee	 fddZ
edee fdd	Zedee fd
dZe dededefddZdS )QwenImageAfterDenoiseStepaa  
    Step that unpack the latents from 3D tensor (batch_size, sequence_length, channels) into 5D tensor (batch_size,
    channels, 1, height, width)

      Components:
          pachifier (`QwenImagePachifier`)

      Inputs:
          height (`int`):
              The height in pixels of the generated image.
          width (`int`):
              The width in pixels of the generated image.
          latents (`Tensor`):
              The latents to decode, can be generated in the denoise step.

      Outputs:
          latents (`Tensor`):
              The denoisedlatents unpacked to B, C, 1, H, W
    	qwenimagereturnc                 C      dS )NzStep that unpack the latents from 3D tensor (batch_size, sequence_length, channels) into 5D tensor (batch_size, channels, 1, height, width) selfr   r   d/home/ubuntu/vllm_env/lib/python3.10/site-packages/diffusers/modular_pipelines/qwenimage/decoders.pydescription;      z%QwenImageAfterDenoiseStep.descriptionc                 C   s   t dtddg}|S N	pachifierfrom_config)default_creation_method)r   r   r   
componentsr   r   r   expected_components?   s   z-QwenImageAfterDenoiseStep.expected_componentsc                 C   s,   t jdddt jdddt ddtjddgS )NheightTrequiredwidthlatentsz<The latents to decode, can be generated in the denoise step.namer'   	type_hintr   )r   templatetorchTensorr   r   r   r   inputsG   s   z QwenImageAfterDenoiseStep.inputsc                 C   s   t dtjddgS )Nr)   z-The denoisedlatents unpacked to B, C, 1, H, Wr+   r,   r   )r   r.   r/   r   r   r   r   intermediate_outputsT   s   z.QwenImageAfterDenoiseStep.intermediate_outputsr#   statec                 C   s@   |  |}|j}|jj|j|j|j|d|_| || ||fS )N)vae_scale_factor)get_block_stater4   r   unpack_latentsr)   r%   r(   set_block_state)r   r#   r3   block_stater4   r   r   r   __call__\   s   
z"QwenImageAfterDenoiseStep.__call__N__name__
__module____qualname____doc__
model_namepropertystrr   listr   r$   r   r0   r   r2   r.   no_gradr   r   r9   r   r   r   r   r   $   s    r   c                   @      e Zd ZdZdZedefddZedee	 fddZ
edee fdd	Zedee fd
dZe dedefddZdS ) QwenImageLayeredAfterDenoiseStepa  
    Unpack latents from (B, seq, C*4) to (B, C, layers+1, H, W) after denoising.

      Components:
          pachifier (`QwenImageLayeredPachifier`)

      Inputs:
          latents (`Tensor`):
              The denoised latents to decode, can be generated in the denoise step.
          height (`int`):
              The height in pixels of the generated image.
          width (`int`):
              The width in pixels of the generated image.
          layers (`int`, *optional*, defaults to 4):
              Number of layers to extract from the image

      Outputs:
          latents (`Tensor`):
              Denoised latents. (unpacked to B, C, layers+1, H, W)
    qwenimage-layeredr   c                 C   r   )NzLUnpack latents from (B, seq, C*4) to (B, C, layers+1, H, W) after denoising.r   r   r   r   r   r      r   z,QwenImageLayeredAfterDenoiseStep.descriptionc                 C   s   t dtddgS r   )r   r   r   r   r   r   r$         z4QwenImageLayeredAfterDenoiseStep.expected_componentsc                 C   s4   t ddtjddt jdddt jdddt dgS )	Nr)   TzEThe denoised latents to decode, can be generated in the denoise step.r*   r%   r&   r(   layersr   r.   r/   r-   r   r   r   r   r0      s   z'QwenImageLayeredAfterDenoiseStep.inputsc                 C      t jdddgS )Nr)   z unpacked to B, C, layers+1, H, Wnoter   r-   r   r   r   r   r2      rG   z5QwenImageLayeredAfterDenoiseStep.intermediate_outputsr3   c                 C   s>   |  |}|j|j|j|j|j|j|_| || ||fS )N)	r5   r   r6   r)   r%   r(   rH   r4   r7   r   r#   r3   r8   r   r   r   r9      s   
z)QwenImageLayeredAfterDenoiseStep.__call__Nr;   r<   r=   r>   r?   r@   rA   r   rB   r   r$   r   r0   r   r2   r.   rC   r   r9   r   r   r   r   rE   j   s    rE   c                   @   r   )QwenImageDecoderStepa  
    Step that decodes the latents to images

      Components:
          vae (`AutoencoderKLQwenImage`)

      Inputs:
          latents (`Tensor`):
              The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise
              step.

      Outputs:
          images (`list`):
              Generated images. (tensor output of the vae decoder.)
    r   r   c                 C   r   )Nz'Step that decodes the latents to imagesr   r   r   r   r   r      r   z QwenImageDecoderStep.descriptionc                 C   s   t dtg}|S )Nvae)r   r   r"   r   r   r   r$      s   z(QwenImageDecoderStep.expected_componentsc                 C   s   t ddtjddgS )Nr)   TlThe denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise step.r*   )r   r.   r/   r   r   r   r   r0      s   zQwenImageDecoderStep.inputsc                 C   rJ   )Nimagesz!tensor output of the vae decoder.rK   rM   r   r   r   r   r2      s   z)QwenImageDecoderStep.intermediate_outputsr#   r3   c                 C   s  |  |}|jjdkr|jjdd|_n|jjdkr$td|jj d|j|jj|_t	
|jjjd|jjjddd|jj|jj}dt	
|jjjd|jjjddd|jj|jj }|j| | |_|jj|jdd	d
 d d d d d
f |_| || ||fS )N   r   )dim   z0expect latents to be a 4D or 5D tensor but got: z?. Please make sure the latents are unpacked before decode step.      ?Freturn_dictr   )r5   r)   ndim	unsqueeze
ValueErrorshapetorQ   dtyper.   tensorconfiglatents_meanviewz_dimdevicelatents_stddecoderS   r7   )r   r#   r3   r8   rb   rf   r   r   r   r9      s*   
*zQwenImageDecoderStep.__call__Nr:   r   r   r   r   rP      s    
rP   c                   @   rD   )QwenImageLayeredDecoderStepa  
    Decode unpacked latents (B, C, layers+1, H, W) into layer images.

      Components:
          vae (`AutoencoderKLQwenImage`) image_processor (`VaeImageProcessor`)

      Inputs:
          latents (`Tensor`):
              The denoised latents to decode, can be generated in the denoise step and unpacked in the after denoise
              step.
          output_type (`str`, *optional*, defaults to pil):
              Output format: 'pil', 'np', 'pt'.

      Outputs:
          images (`list`):
              Generated images.
    rF   r   c                 C   r   )NzADecode unpacked latents (B, C, layers+1, H, W) into layer images.r   r   r   r   r   r     r   z'QwenImageLayeredDecoderStep.descriptionc                 C   s"   t dtt dttddiddgS )NrQ   image_processorr4      r    ra   r!   )r   r   r   r   r   r   r   r   r$     s   
z/QwenImageLayeredDecoderStep.expected_componentsc                 C      t ddtjddt dgS )Nr)   TrR   r*   output_typerI   r   r   r   r   r0   %     z"QwenImageLayeredDecoderStep.inputsc                 C      t dgS NrS   rM   r   r   r   r   r2   1     z0QwenImageLayeredDecoderStep.intermediate_outputsr3   c                 C   sR  |  |}|j}||jj}t|jjj	d|jjj
ddd|j|j}dt|jjj	d|jjj
ddd|j|j }|| | }|j\}}}	}
}|d d d d dd f }|dddddd|d|
|}|jj|dd	d }|d}|jj||jd
}g }t|D ]}||||	 |d |	   q||_| || ||fS )Nr   rW   r   r	   r   rT   FrX   rm   )r5   r)   r^   rQ   r_   r.   r`   ra   rb   rc   rd   re   rf   r]   permutereshaperg   squeezeri   postprocessrm   rangeappendrS   r7   )r   r#   r3   r8   r)   rb   rf   bcfhwimagerS   bidxr   r   r   r9   5  s2   
 
 z$QwenImageLayeredDecoderStep.__call__NrO   r   r   r   r   rh      s    rh   c                   @      e Zd ZdZdZedefddZedee	 fddZ
edee fdd	Zedee fd
dZedd Ze dedefddZdS ) QwenImageProcessImagesOutputStepa  
    postprocess the generated image

      Components:
          image_processor (`VaeImageProcessor`)

      Inputs:
          images (`Tensor`):
              the generated image tensor from decoders step
          output_type (`str`, *optional*, defaults to pil):
              Output format: 'pil', 'np', 'pt'.

      Outputs:
          images (`list`):
              Generated images.
    r   r   c                 C   r   )Nzpostprocess the generated imager   r   r   r   r   r   w  r   z,QwenImageProcessImagesOutputStep.descriptionc                 C      t dttddiddgS )Nri   r4   rj   r    rk   )r   r   r   r   r   r   r   r$   {     
z4QwenImageProcessImagesOutputStep.expected_componentsc                 C   rl   )NrS   T-the generated image tensor from decoders stepr*   rm   rI   r   r   r   r   r0     rn   z'QwenImageProcessImagesOutputStep.inputsc                 C   ro   rp   rM   r   r   r   r   r2     rq   z5QwenImageProcessImagesOutputStep.intermediate_outputsc                 C   s   | dvrt d|  d S )NpilnpptInvalid output_type: r\   rs   r   r   r   check_inputs  s   z-QwenImageProcessImagesOutputStep.check_inputsr#   r3   c                 C   s@   |  |}| |j |jj|j|jd|_| || ||fS )N)r   rm   )r5   r   rm   ri   rw   rS   r7   rN   r   r   r   r9     s   
z)QwenImageProcessImagesOutputStep.__call__Nr;   r<   r=   r>   r?   r@   rA   r   rB   r   r$   r   r0   r   r2   staticmethodr   r.   rC   r   r   r9   r   r   r   r   r   c  s    

r   c                   @   r   )'QwenImageInpaintProcessImagesOutputStepa  
    postprocess the generated image, optional apply the mask overally to the original image..

      Components:
          image_mask_processor (`InpaintProcessor`)

      Inputs:
          images (`Tensor`):
              the generated image tensor from decoders step
          output_type (`str`, *optional*, defaults to pil):
              Output format: 'pil', 'np', 'pt'.
          mask_overlay_kwargs (`dict`, *optional*):
              The kwargs for the postprocess step to apply the mask overlay. generated in
              InpaintProcessImagesInputStep.

      Outputs:
          images (`list`):
              Generated images.
    r   r   c                 C   r   )NzYpostprocess the generated image, optional apply the mask overally to the original image..r   r   r   r   r   r     r   z3QwenImageInpaintProcessImagesOutputStep.descriptionc                 C   r   )Nimage_mask_processorr4   rj   r    rk   )r   r   r   r   r   r   r   r$     r   z;QwenImageInpaintProcessImagesOutputStep.expected_componentsc                 C   s0   t ddtjddt dt dtttf ddgS )	NrS   Tr   r*   rm   mask_overlay_kwargszjThe kwargs for the postprocess step to apply the mask overlay. generated in InpaintProcessImagesInputStep.r1   )r   r.   r/   r-   dictrA   r   r   r   r   r   r0     s   
z.QwenImageInpaintProcessImagesOutputStep.inputsc                 C   ro   rp   rM   r   r   r   r   r2     rq   z<QwenImageInpaintProcessImagesOutputStep.intermediate_outputsc                 C   s2   | dvrt d|  |r| dkrt dd S d S )Nr   r   r   z/only support output_type 'pil' for mask overlayr   )rm   r   r   r   r   r     s
   z4QwenImageInpaintProcessImagesOutputStep.check_inputsr#   r3   c                 C   s^   |  |}| |j|j |jd u ri }n|j}|jjdd|ji||_| || ||fS )Nr   r   )r5   r   rm   r   r   rw   rS   r7   )r   r#   r3   r8   r   r   r   r   r9     s   


z0QwenImageInpaintProcessImagesOutputStep.__call__Nr   r   r   r   r   r     s    

r   )typingr   r.   configuration_utilsr   ri   r   r   modelsr   utilsr   modular_pipeliner
   r   modular_pipeline_utilsr   r   r   r   r   r   
get_loggerr;   loggerr   rE   rP   rh   r   r   r   r   r   r   <module>   s    
FKKcH