o
    ۷i                     @   s   d dl mZ d dlZd dlZd dlZddlmZ ddlm	Z	 ddl
mZ ddlmZ dd	lmZmZ dd
lmZmZmZ eeZG dd deZdS )    )AnyN   )
FrozenDict)AutoencoderKLWan)logging)VideoProcessor   )ModularPipelineBlocksPipelineState)ComponentSpec
InputParamOutputParamc                   @   s   e Zd ZdZedee fddZedefddZ	edee
eef  fddZedee fd	d
Ze dedefddZdS )WanVaeDecoderStepwanreturnc                 C   s"   t dtt dttddiddgS )Nvaevideo_processorvae_scale_factor   from_config)configdefault_creation_method)r   r   r   r   self r   ^/home/ubuntu/vllm_env/lib/python3.10/site-packages/diffusers/modular_pipelines/wan/decoders.pyexpected_components#   s   
z%WanVaeDecoderStep.expected_componentsc                 C   s   dS )Nz2Step that decodes the denoised latents into imagesr   r   r   r   r   description/   s   zWanVaeDecoderStep.descriptionc                 C   s"   t ddtjddt ddtddgS )	NlatentsTz,The denoised latents from the denoising step)required	type_hintr   output_typenpz%The output type of the decoded videos)defaultr    r   )r   torchTensorstrr   r   r   r   inputs3   s   zWanVaeDecoderStep.inputsc                 C   s0   t dtttjj  ttj B ttj B ddgS )NvideoszMThe generated videos, can be a PIL.Image.Image, torch.Tensor or a numpy array)r    r   )r   listPILImager$   r%   r"   ndarrayr   r   r   r   intermediate_outputsA   s   "z&WanVaeDecoderStep.intermediate_outputsstatec           	      C   s   |  |}|jj}|j}t|jjjd|jjj	ddd
|j|j}dt|jjjd|jjj	ddd
|j|j }|| | }|
|}|jj|ddd |_t|dd}|jj|j|d|_| || ||fS )	N   g      ?F)return_dictr   r!   r"   )r!   )get_block_stater   dtyper   r$   tensorr   latents_meanviewz_dimtodevicelatents_stddecoder(   getattrr   postprocess_videoset_block_state)	r   
componentsr.   block_state	vae_dtyper   r4   r9   r!   r   r   r   __call__K   s&   

zWanVaeDecoderStep.__call__N)__name__
__module____qualname__
model_namepropertyr)   r   r   r&   r   tupler   r'   r-   r$   no_gradr
   rA   r   r   r   r   r       s    	r   )typingr   numpyr"   r*   r$   configuration_utilsr   modelsr   utilsr   r   r   modular_pipeliner	   r
   modular_pipeline_utilsr   r   r   
get_loggerrB   loggerr   r   r   r   r   <module>   s   
