o
    ۷in                     @   s2  d dl Z d dlZd dlZd dlZd dlZd dlmZm	Z	m
Z
mZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZmZmZ dd	lmZ d
dlmZmZ d
dlmZmZm Z  ddlm!Z! e rgd dl"Z"e rpd dl#m$Z$ e%e&Z'dd Z(dd Z)dd Z*dedede+e,e+ B de-dej.f
ddZ/	dAdede	de
dej.dB fd d!Z0	"dBd#ej1d$ej2dB d%e+fd&d'Z3	(dCd)ej1d*ed$ej2dej.d+ej4d,e-fd-d.Z5G d/d0 d0eZ6G d1d2 d2eZ7G d3d4 d4eZ8G d5d6 d6eZ9G d7d8 d8eZ:G d9d: d:eZ;G d;d< d<eZ<G d=d> d>eZ=G d?d@ d@eZ>dS )D    N)AutoTokenizerCLIPImageProcessorCLIPVisionModelUMT5EncoderModel   )
FrozenDict)ClassifierFreeGuidance)PipelineImageInput)AutoencoderKLWan)is_ftfy_availableis_torchvision_availablelogging)VideoProcessor   )ModularPipelineBlocksPipelineState)ComponentSpec
InputParamOutputParam   )WanModularPipeline)
transformsc                 C   s"   t | } tt| } |  S N)ftfyfix_texthtmlunescapestriptext r    ^/home/ubuntu/vllm_env/lib/python3.10/site-packages/diffusers/modular_pipelines/wan/encoders.pybasic_clean,   s   
r"   c                 C   s   t dd| } |  } | S )Nz\s+ )resubr   r   r    r    r!   whitespace_clean2   s   r&   c                 C   s   t t| } | S r   )r&   r"   r   r    r    r!   prompt_clean8   s   r'   text_encoder	tokenizerpromptmax_sequence_lengthdevicec              	      s   | j }t|tr|gn|}dd |D }||d ddddd}|j|j}}|djdd	 }	| ||||j	}
|
j||d
}
dd t
|
|	D }
tj fdd|
D dd	}
|
S )Nc                 S   s   g | ]}t |qS r    )r'   .0ur    r    r!   
<listcomp>F   s    z(get_t5_prompt_embeds.<locals>.<listcomp>
max_lengthTpt)paddingr1   
truncationadd_special_tokensreturn_attention_maskreturn_tensorsr   r   dim)dtyper,   c                 S   s   g | ]
\}}|d | qS r   r    )r.   r/   vr    r    r!   r0   U   s    c                    s2   g | ]}t || |d  |dgqS )r   r   )torchcat	new_zerossizer-   r+   r    r!   r0   W   s   2 )r:   
isinstancestr	input_idsattention_maskgtsumlongtolast_hidden_statezipr<   stack)r(   r)   r*   r+   r,   r:   text_inputstext_input_idsmaskseq_lensprompt_embedsr    r@   r!   get_t5_prompt_embeds=   s*   	rQ   imageimage_processorimage_encoderc                 C   s2   || dd |} |di | ddi}|jd S )Nr2   )imagesr7   output_hidden_statesTr    )rH   hidden_states)rR   rS   rT   r,   image_embedsr    r    r!   encode_image]   s   
rZ   sampleencoder_output	generatorsample_modec                 C   sR   t | dr|dkr| j|S t | dr|dkr| j S t | dr%| jS td)Nlatent_distr[   argmaxlatentsz3Could not access latents of provided encoder_output)hasattrr_   r[   modera   AttributeError)r\   r]   r^   r    r    r!   retrieve_latentsi   s   

re      video_tensorvaer:   latent_channelsc           	         s  t tjstdt dt  tr.t jd kr.tdt  djd  dj||dt  trR fddt	jd D }tj
|dd	}n	td
d}tjjd|ddd|j|j}dtjjd|ddd|j|j }|| | }|S )Nz*Expected video_tensor to be a tensor, got .r   z/You have passed a list of generators of length z), but it is not same as number of images r,   r:   c              	      s0   g | ]}t ||d    | ddqS )r   r`   )r]   r^   )re   encode)r.   ir]   rh   rg   r    r!   r0      s    "z$encode_vae_image.<locals>.<listcomp>r8   r`   )r^   r   g      ?)rA   r<   Tensor
ValueErrortypelistlenshaperH   ranger=   re   rl   tensorconfiglatents_meanviewr,   r:   latents_std)	rg   rh   r]   r,   r:   ri   video_latentsrx   rz   r    rn   r!   encode_vae_imagev   s,   
r|   c                   @   s   e Zd ZdZedefddZedee fddZ	edee
 fddZedee fd	d
Zedd Ze				ddedejdB dededB def
ddZe dededefddZdS )WanTextEncoderStepwanreturnc                 C      dS )NzMText Encoder step that generate text_embeddings to guide the video generationr    selfr    r    r!   description      zWanTextEncoderStep.descriptionc                 C   s*   t dtt dtt dttddiddgS )Nr(   r)   guiderguidance_scaleg      @from_configrw   default_creation_method)r   r   r   r   r   r   r    r    r!   expected_components   s   
z&WanTextEncoderStep.expected_componentsc                 C   s   t dt dt dddgS )Nr*   negative_promptr+      )default)r   r   r    r    r!   inputs   s   
zWanTextEncoderStep.inputsc                 C   s$   t dtjdddt dtjdddgS )NrP   denoiser_input_fieldsz2text embeddings used to guide the image generation)	type_hintkwargs_typer   negative_prompt_embedsz;negative text embeddings used to guide the image generationr   r<   ro   r   r    r    r!   intermediate_outputs   s   z'WanTextEncoderStep.intermediate_outputsc                 C   sB   | j d urt| j tst| j tstdt| j  d S d S d S )Nz2`prompt` has to be of type `str` or `list` but is )r*   rA   rB   rr   rp   rq   )block_stater    r    r!   check_inputs   s   


zWanTextEncoderStep.check_inputsNTr   r*   r,   prepare_unconditional_embedsr   r+   c           	   
   C   s   |p| j }t|ts|g}t|}t| j| j|||d}|rm|p!d}t|tr,||g n|}|durIt|t|urIt	dt| dt| d|t|krbt
d| dt| d	| d| d
	t| j| j|||d}||fS )aC  
        Encodes the prompt into text encoder hidden states.

        Args:
            prompt (`str` or `list[str]`, *optional*):
                prompt to be encoded
            device: (`torch.device`):
                torch device
            prepare_unconditional_embeds (`bool`):
                whether to use prepare unconditional embeddings or not
            negative_prompt (`str` or `list[str]`, *optional*):
                The prompt or prompts not to guide the image generation. If not defined, one has to pass
                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
                less than `1`).
            max_sequence_length (`int`, defaults to `512`):
                The maximum number of text tokens to be used for the generation process.
        )r(   r)   r*   r+   r,    Nz?`negative_prompt` should be the same type to `prompt`, but got z != rj   z`negative_prompt`: z has batch size z, but `prompt`: zT. Please make sure that passed `negative_prompt` matches the batch size of `prompt`.)_execution_devicerA   rr   rs   rQ   r(   r)   rB   rq   	TypeErrorrp   )	
componentsr*   r,   r   r   r+   
batch_sizerP   r   r    r    r!   encode_prompt   sH   

z WanTextEncoderStep.encode_promptr   statec                 C   sX   |  |}| | |j|_| j||j|j|j|j|jd\|_	|_
| || ||fS )N)r   r*   r,   r   r   r+   )get_block_stater   r   r,   r   r*   requires_unconditional_embedsr   r+   rP   r   set_block_state)r   r   r   r   r    r    r!   __call__  s    

zWanTextEncoderStep.__call__)NTNr   )__name__
__module____qualname__
model_namepropertyrB   r   rr   r   r   r   r   r   r   staticmethodr   r<   r,   boolintr   no_gradr   r   r   r    r    r    r!   r}      s<    
@r}   c                   @   d   e Zd ZdZedefddZedee fddZ	edee
 fddZd	ed
edefddZdS )WanImageResizeStepr~   r   c                 C   r   )NzoImage Resize step that resize the image to the target area (height * width) while maintaining the aspect ratio.r    r   r    r    r!   r   0  r   zWanImageResizeStep.descriptionc                 C   s,   t dtjjddt dtddt dtddgS )	NrR   Tr   requiredheighti  r   r   widthi@  r   PILImager   r   r    r    r!   r   4  s   zWanImageResizeStep.inputsc                 C      t dtjjdgS )Nresized_imager   r   r   r   r   r    r    r!   r   <     z'WanImageResizeStep.intermediate_outputsr   r   c                 C   s   |  |}|j|j }|j}|j|j }|j|j }tt|| | | |_tt|| | | |_|	|j|jf|_
| || ||fS r   )r   r   r   rR   vae_scale_factor_spatialpatch_size_spatialroundnpsqrtresizer   r   )r   r   r   r   max_arearR   aspect_ratio	mod_valuer    r    r!   r   B  s   
zWanImageResizeStep.__call__Nr   r   r   r   r   rB   r   rr   r   r   r   r   r   r   r   r    r    r    r!   r   -  s    r   c                   @   r   )WanImageCropResizeStepr~   r   c                 C   r   )NzdImage Resize step that resize the last_image to the same size of first frame image with center crop.r    r   r    r    r!   r   T  r   z"WanImageCropResizeStep.descriptionc                 C   s(   t dtjjdddt dtjjdddgS )Nr   TzThe resized first frame image)r   r   r   
last_imagezThe last frameimager   r   r   r   r    r    r!   r   X  s
   zWanImageCropResizeStep.inputsc                 C   r   )Nresized_last_imager   r   r   r    r    r!   r   a  r   z+WanImageCropResizeStep.intermediate_outputsr   r   c           
      C   s   |  |}|jj}|jj}|j}t||j ||j }t|j| }t|j| }||g}tj	||}	|	|_
| || ||fS r   )r   r   r   r   r   maxr   r   
functionalcenter_cropr   r   )
r   r   r   r   r   r   rR   resize_ratior?   r   r    r    r!   r   g  s   
zWanImageCropResizeStep.__call__Nr   r    r    r    r!   r   Q  s    r   c                   @   z   e Zd ZdZedefddZedee fddZ	edee
 fddZedee fd	d
ZdededefddZdS )WanImageEncoderStepr~   r   c                 C   r   )NzfImage Encoder step that generate image_embeds based on first frame image to guide the video generationr    r   r    r    r!   r     r   zWanImageEncoderStep.descriptionc                 C      t dtt dtgS NrS   rT   r   r   r   r   r    r    r!   r        z'WanImageEncoderStep.expected_componentsc                 C   s   t dtjjddgS )Nr   Tr   r   r   r    r    r!   r        zWanImageEncoderStep.inputsc                 C      t dtjddgS NrY   zThe image embeddingsr   r   r   r   r    r    r!   r     r   z(WanImageEncoderStep.intermediate_outputsr   r   c                 C   sD   |  |}|j}|j}t|j|j||d}||_| || ||fS N)rS   rT   rR   r,   )r   r   r   rZ   rS   rT   rY   r   )r   r   r   r   r,   rR   rY   r    r    r!   r     s   
zWanImageEncoderStep.__call__Nr   r   r   r   r   rB   r   rr   r   r   r   r   r   r   r   r   r   r    r    r    r!   r   |  s    r   c                   @   r   )!WanFirstLastFrameImageEncoderStepr~   r   c                 C   r   )NzpImage Encoder step that generate image_embeds based on first and last frame images to guide the video generationr    r   r    r    r!   r     r   z-WanFirstLastFrameImageEncoderStep.descriptionc                 C   r   r   r   r   r    r    r!   r     r   z5WanFirstLastFrameImageEncoderStep.expected_componentsc                 C   s$   t dtjjddt dtjjddgS )Nr   Tr   r   r   r   r    r    r!   r     s   z(WanFirstLastFrameImageEncoderStep.inputsc                 C   r   r   r   r   r    r    r!   r     r   z6WanFirstLastFrameImageEncoderStep.intermediate_outputsr   r   c                 C   sN   |  |}|j}|j}|j}t|j|j||g|d}||_| || ||fS r   )	r   r   r   r   rZ   rS   rT   rY   r   )r   r   r   r   r,   first_frame_imagelast_frame_imagerY   r    r    r!   r     s   
z*WanFirstLastFrameImageEncoderStep.__call__Nr   r    r    r    r!   r     s    r   c                   @      e Zd ZdZedefddZedee fddZ	edee
 fddZedee fd	d
Zedd ZdededefddZdS )WanVaeEncoderStepr~   r   c                 C   r   )NzoVae Image Encoder step that generate condition_latents based on first frame image to guide the video generationr    r   r    r    r!   r     r   zWanVaeEncoderStep.descriptionc                 C   "   t dtt dttddiddgS Nrh   video_processorvae_scale_factor   r   r   r   r
   r   r   r   r    r    r!   r        
z%WanVaeEncoderStep.expected_componentsc                 C   s2   t dtjjddt dt dt dtddt d	gS )
Nr   Tr   r   r   
num_framesQ   r   r]   r   r   r    r    r!   r     s   zWanVaeEncoderStep.inputsc                 C   r   )Nfirst_frame_latentsz@video latent representation with the first frame image conditionr   r   r   r    r    r!   r        z&WanVaeEncoderStep.intermediate_outputsc                 C      |j d ur|j | j dks|jd ur+|j| j dkr+td| j d|j  d|j d|jd urL|jdk s?|jd | j dkrNtd| j d|j dd S d S 	Nr   z-`height` and `width` have to be divisible by z	 but are z and rj   r   zQ`num_frames` has to be greater than 0, and (num_frames - 1) must be divisible by z
, but got r   r   r   rp   r   vae_scale_factor_temporalr   r   r    r    r!   r        
zWanVaeEncoderStep.check_inputsr   r   c              	   C   s   |  |}| || |j}|j}tj}|jj}|jp|j	}|j
p#|j}	|jp)|j}
|jj|||	dj||d}| dkrC|d}tj|||jd |jd |
d ||	gddj||d}t||j|j|||jd|_| || ||fS )	Nr   r   rk      r   r   r   r8   rg   rh   r]   r,   r:   ri   )r   r   r   r   r<   float32rh   r:   r   default_heightr   default_widthr   default_num_framesr   
preprocessrH   r9   	unsqueezer=   r>   rt   r|   r]   num_channels_latentsr   r   )r   r   r   r   rR   r,   r:   	vae_dtyper   r   r   image_tensorrg   r    r    r!   r     s@   

 	zWanVaeEncoderStep.__call__Nr   r   r   r   r   rB   r   rr   r   r   r   r   r   r   r   r   r   r   r   r    r    r    r!   r     s    		
r   c                   @   r   )WanPrepareFirstFrameLatentsStepr~   r   c                 C   r   )NzTstep that prepares the masked first frame latents and add it to the latent conditionr    r   r    r    r!   r   9  r   z+WanPrepareFirstFrameLatentsStep.descriptionc                 C   s   t dtjd B dt dddgS )Nr   r   r   T)r   )r   r<   ro   r   r    r    r!   r   =  s   
z&WanPrepareFirstFrameLatentsStep.inputsc                 C      t dtjd B dgS Nimage_condition_latentsr   r   r   r    r    r!   r   D  r   z4WanPrepareFirstFrameLatentsStep.intermediate_outputsr   r   c           
      C   s  |  |}|jj\}}}}}t|d|j||}d|d d d d ttd|jf< |d d d d ddf }	tj|	d|j	d}	tj
|	|d d d d dd d d f gdd}||d|j	||}|dd}||jj}tj
||jgdd|_| || ||fS Nr   r   r   )r9   repeatsr8   )r   r   rt   r<   onesr   rr   ru   repeat_interleaver   concatry   	transposerH   r,   r   r   
r   r   r   r   r   _latent_heightlatent_widthmask_lat_sizefirst_frame_maskr    r    r!   r   J  s"   
".z(WanPrepareFirstFrameLatentsStep.__call__Nr   r    r    r    r!   r   6      r   c                   @   r   )WanFirstLastFrameVaeEncoderStepr~   r   c                 C   r   )NzyVae Image Encoder step that generate condition_latents based on first and last frame images to guide the video generationr    r   r    r    r!   r   e  r   z+WanFirstLastFrameVaeEncoderStep.descriptionc                 C   r   r   r   r   r    r    r!   r   i  r   z3WanFirstLastFrameVaeEncoderStep.expected_componentsc              	   C   sB   t dtjjddt dtjjddt dt dt dtdd	t d
gS )Nr   Tr   r   r   r   r   r   r   r]   r   r   r    r    r!   r   u  s   z&WanFirstLastFrameVaeEncoderStep.inputsc                 C   r   )Nfirst_last_frame_latentszJvideo latent representation with the first and last frame images conditionr   r   r   r    r    r!   r     r   z4WanFirstLastFrameVaeEncoderStep.intermediate_outputsc                 C   r   r   r   r   r    r    r!   r     r   z,WanFirstLastFrameVaeEncoderStep.check_inputsr   r   c              	   C   s  |  |}| || |j}|j}|j}tj}|jj}|j	p |j
}	|jp&|j}
|jp,|j}|jj||	|
dj||d}|d}|jj||	|
dj||d}|d}tj|||jd |jd |d |	|
|gddj||d}t||j|j|||jd|_| || ||fS )Nr   rk   r   r   r   r8   r   )r   r   r   r   r   r<   r   rh   r:   r   r   r   r   r   r   r   r   rH   r   r=   r>   rt   r|   r]   r   r  r   )r   r   r   r   r   r   r,   r:   r   r   r   r   first_image_tensorlast_image_tensorrg   r    r    r!   r     sN   


		z(WanFirstLastFrameVaeEncoderStep.__call__Nr   r    r    r    r!   r  b  s    
	
r  c                   @   r   )#WanPrepareFirstLastFrameLatentsStepr~   r   c                 C   r   )Nzcstep that prepares the masked latents with first and last frames and add it to the latent conditionr    r   r    r    r!   r     r   z/WanPrepareFirstLastFrameLatentsStep.descriptionc                 C   s    t dtjd B dt dtddgS )Nr  r   r   Tr   )r   r<   ro   r   r   r    r    r!   r     s   z*WanPrepareFirstLastFrameLatentsStep.inputsc                 C   r   r   r   r   r    r    r!   r     r   z8WanPrepareFirstLastFrameLatentsStep.intermediate_outputsr   r   c           
   	   C   s  |  |}|jj\}}}}}t|d|j||}d|d d d d ttd|jd f< |d d d d ddf }	tj|	d|j	d}	tj
|	|d d d d dd d d f gdd}||d|j	||}|dd}||jj}tj
||jgdd|_| || ||fS r   )r   r  rt   r<   r  r   rr   ru   r  r   r  ry   r  rH   r,   r   r   r  r    r    r!   r     s&   
&.
z,WanPrepareFirstLastFrameLatentsStep.__call__Nr   r    r    r    r!   r    r  r  r   )Nr[   )rf   )?r   numpyr   r   regexr$   r<   transformersr   r   r   r   configuration_utilsr   guidersr   rS   r	   modelsr
   utilsr   r   r   r   r   modular_pipeliner   r   modular_pipeline_utilsr   r   r   r   r   torchvisionr   
get_loggerr   loggerr"   r&   r'   rB   rr   r   r,   rQ   rZ   ro   	Generatorre   r:   r|   r}   r   r   r   r   r   r   r  r  r    r    r    r!   <module>   s   


$


( $+,.`,i