o
    Giq^                     @   sJ  d dl Z d dlZddlmZ ddlmZ ddlmZ ddlm	Z	 ddl
mZmZ dd	lmZmZmZ d
dl
mZ eeZ	
d&dedejdededejf
ddZdejdededeeef fddZ				d'dedB deejB dB dee dB dee dB fddZG dd deZG d d! d!eZ G d"d# d#eZ!G d$d% d%eZ"dS )(    N   )WanTransformer3DModel)UniPCMultistepScheduler)logging)randn_tensor   )ModularPipelineBlocksPipelineState)ComponentSpec
InputParamOutputParam   )WanModularPipeline
input_nameinput_tensor
batch_sizenum_videos_per_promptreturnc                 C   sz   t |tjstd|  d|jd dkr|| }n|jd |kr$|}ntd|  d| d|jd  |j|dd}|S )a(  Repeat tensor elements to match the final batch size.

    This function expands a tensor's batch dimension to match the final batch size (batch_size * num_videos_per_prompt)
    by repeating each element along dimension 0.

    The input tensor must have batch size 1 or batch_size. The function will:
    - If batch size is 1: repeat each element (batch_size * num_videos_per_prompt) times
    - If batch size equals batch_size: repeat each element num_videos_per_prompt times

    Args:
        input_name (str): Name of the input tensor (used for error messages)
        input_tensor (torch.Tensor): The tensor to repeat. Must have batch size 1 or batch_size.
        batch_size (int): The base batch size (number of prompts)
        num_videos_per_prompt (int, optional): Number of videos to generate per prompt. Defaults to 1.

    Returns:
        torch.Tensor: The repeated tensor with final batch size (batch_size * num_videos_per_prompt)

    Raises:
        ValueError: If input_tensor is not a torch.Tensor or has invalid batch size

    Examples:
        tensor = torch.tensor([[1, 2, 3]]) # shape: [1, 3] repeated = repeat_tensor_to_batch_size("image", tensor,
        batch_size=2, num_videos_per_prompt=2) repeated # tensor([[1, 2, 3], [1, 2, 3], [1, 2, 3], [1, 2, 3]]) - shape:
        [4, 3]

        tensor = torch.tensor([[1, 2, 3], [4, 5, 6]]) # shape: [2, 3] repeated = repeat_tensor_to_batch_size("image",
        tensor, batch_size=2, num_videos_per_prompt=2) repeated # tensor([[1, 2, 3], [1, 2, 3], [4, 5, 6], [4, 5, 6]])
        - shape: [4, 3]
    `z` must be a tensorr   r   z!` must have have batch size 1 or 
, but got )dim)
isinstancetorchTensor
ValueErrorshaperepeat_interleave)r   r   r   r   	repeat_by r   b/home/ubuntu/.local/lib/python3.10/site-packages/diffusers/modular_pipelines/wan/before_denoise.pyrepeat_tensor_to_batch_size%   s   %
r    latentsvae_scale_factor_temporalvae_scale_factor_spatialc           
      C   sT   | j dkrtd| j  | j\}}}}}|d | d }|| }|| }	|||	fS )a  Calculate image dimensions from latent tensor dimensions.

    This function converts latent temporal and spatial dimensions to image temporal and spatial dimensions by
    multiplying the latent num_frames/height/width by the VAE scale factor.

    Args:
        latents (torch.Tensor): The latent tensor. Must have 4 or 5 dimensions.
            Expected shapes: [batch, channels, height, width] or [batch, channels, frames, height, width]
        vae_scale_factor_temporal (int): The scale factor used by the VAE to compress temporal dimension.
            Typically 4 for most VAEs (video is 4x larger than latents in temporal dimension)
        vae_scale_factor_spatial (int): The scale factor used by the VAE to compress spatial dimension.
            Typically 8 for most VAEs (image is 8x larger than latents in each dimension)

    Returns:
        tuple[int, int]: The calculated image dimensions as (height, width)

    Raises:
        ValueError: If latents tensor doesn't have 4 or 5 dimensions

       z(latents must have 5 dimensions, but got r   )ndimr   r   )
r!   r"   r#   _num_latent_frameslatent_heightlatent_width
num_framesheightwidthr   r   r    calculate_dimension_from_latents]   s   

r-   num_inference_stepsdevice	timestepssigmasc                 K   s  |dur|durt d|dur>dtt| jj v }|s(t d| j d| jd||d| | j}t	|}||fS |durpdtt| jj v }|sZt d| j d| jd||d	| | j}t	|}||fS | j|fd
|i| | j}||fS )a  
    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler to get timesteps from.
        num_inference_steps (`int`):
            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
            must be `None`.
        device (`str` or `torch.device`, *optional*):
            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
        timesteps (`list[int]`, *optional*):
            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
            `num_inference_steps` and `sigmas` must be `None`.
        sigmas (`list[float]`, *optional*):
            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
            `num_inference_steps` and `timesteps` must be `None`.

    Returns:
        `tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
        second element is the number of inference steps.
    NzYOnly one of `timesteps` or `sigmas` can be passed. Please choose one to set custom valuesr0   zThe current scheduler class zx's `set_timesteps` does not support custom timestep schedules. Please check whether you are using the correct scheduler.)r0   r/   r1   zv's `set_timesteps` does not support custom sigmas schedules. Please check whether you are using the correct scheduler.)r1   r/   r/   r   )
r   setinspect	signatureset_timesteps
parameterskeys	__class__r0   len)	schedulerr.   r/   r0   r1   kwargsaccepts_timestepsaccept_sigmasr   r   r   retrieve_timesteps   s2   r>   c                   @   s   e Zd ZdZedefddZedee fddZ	edee
 fddZedee fd	d
Zdd Ze dededefddZdS )WanTextInputStepwanr   c                 C   s   	 dS )Na  Input processing step that:
  1. Determines `batch_size` and `dtype` based on `prompt_embeds`
  2. Adjusts input tensor shapes based on `batch_size` (number of prompts) and `num_videos_per_prompt`

All input tensors are expected to have either batch_size=1 or match the batch_size
of prompt_embeds. The tensors will be duplicated across the batch dimension to
have a final batch_size of batch_size * num_videos_per_prompt.r   selfr   r   r   description   s   zWanTextInputStep.descriptionc                 C      t dtgS )Ntransformer)r
   r   rA   r   r   r   expected_components      z$WanTextInputStep.expected_componentsc                 C   s,   t dddt ddtjddt dtjd	d
gS )Nr   r   defaultprompt_embedsTzGPre-generated text embeddings. Can be generated from text_encoder step.required	type_hintrC   negative_prompt_embedszPPre-generated negative text embeddings. Can be generated from text_encoder step.rM   rC   )r   r   r   rA   r   r   r   inputs   s   
zWanTextInputStep.inputsc                 C   s   t dtddt dtjddgS )Nr   zdNumber of prompts, the final batch size of model inputs should be batch_size * num_videos_per_promptrO   dtypezDData type of model tensor inputs (determined by `transformer.dtype`))r   intr   rQ   rA   r   r   r   intermediate_outputs   s   z%WanTextInputStep.intermediate_outputsc                 C   sN   |j d ur!|jd ur#|j j|jjkr%td|j j d|jj dd S d S d S )Nzu`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but got: `prompt_embeds` z != `negative_prompt_embeds` .)rJ   rN   r   r   )rB   
componentsblock_stater   r   r   check_inputs   s   zWanTextInputStep.check_inputsrU   statec                 C   s   |  |}| || |jjd |_|jj|_|jj\}}}|jd|jd|_|j|j|j |d|_|j	d urX|j	j\}}}|j	d|jd|_	|j	|j|j |d|_	| 
|| ||fS )Nr   r   )get_block_staterW   rJ   r   r   rQ   repeatr   viewrN   set_block_state)rB   rU   rX   rV   r&   seq_lenr   r   r   __call__   s&   


zWanTextInputStep.__call__N)__name__
__module____qualname__
model_namepropertystrrC   listr
   rF   r   rP   rS   rW   r   no_gradr   r	   r_   r   r   r   r   r?      s    
	r?   c                       sx   e Zd ZdZdgg fdee dee f fddZedefdd	Zedee	 fd
dZ
dededefddZ  ZS )WanAdditionalInputsStepr@   image_condition_latentsimage_latent_inputsadditional_batch_inputsc                    s:   t |ts|g}t |ts|g}|| _|| _t   dS )a  Initialize a configurable step that standardizes the inputs for the denoising step. It:
"

        This step handles multiple common tasks to prepare inputs for the denoising step:
        1. For encoded image latents, use it update height/width if None, and expands batch size
        2. For additional_batch_inputs: Only expands batch dimensions to match final batch size

        This is a dynamic block that allows you to configure which inputs to process.

        Args:
            image_latent_inputs (list[str], optional): Names of image latent tensors to process.
                In additional to adjust batch size of these inputs, they will be used to determine height/width. Can be
                a single string or list of strings. Defaults to ["image_condition_latents"].
            additional_batch_inputs (List[str], optional):
                Names of additional conditional input tensors to expand batch size. These tensors will only have their
                batch dimensions adjusted to match the final batch size. Can be a single string or list of strings.
                Defaults to [].

        Examples:
            # Configure to process image_condition_latents (default behavior) WanAdditionalInputsStep() # Configure to
            process image latents and additional batch inputs WanAdditionalInputsStep(
                image_latent_inputs=["image_condition_latents"], additional_batch_inputs=["image_embeds"]
            )
        N)r   rf   _image_latent_inputs_additional_batch_inputssuper__init__)rB   rj   rk   r8   r   r   ro     s   

z WanAdditionalInputsStep.__init__r   c                 C   sT   d}d}| j s
| jr"d}| j r|d| j  7 }| jr"|d| j 7 }d}|| | S )NzInput processing step that:
  1. For image latent inputs: Updates height/width if None, and expands batch size
  2. For additional batch inputs: Expands batch dimensions to match final batch size z

Configured inputs:z
  - Image latent inputs: z
  - Additional batch inputs: zN

This block should be placed after the encoder steps and the text input step.)rl   rm   )rB   summary_sectioninputs_infoplacement_sectionr   r   r   rC   =  s   z#WanAdditionalInputsStep.descriptionc                 C   sl   t dddt dddt ddt d	dt d
dg}| jD ]
}|t |d q| jD ]
}|t |d q)|S )Nr   r   )namerI   r   T)ru   rL   r+   )ru   r,   r*   )r   rl   appendrm   )rB   rP   image_latent_input_namer   r   r   r   rP   T  s   


	
zWanAdditionalInputsStep.inputsrU   rX   c                 C   s   |  |}| jD ]9}t||}|d u rqt||j|j\}}}|jp#||_|jp)||_|jp/||_t	|||j
|jd}t||| q| jD ]}	t||	}
|
d u rQqEt	|	|
|j
|jd}
t||	|
 qE| || ||fS )N)r   r   r   r   )rZ   rl   getattrr-   r"   r#   r*   r+   r,   r    r   r   setattrrm   r]   )rB   rU   rX   rV   rw   image_latent_tensorr*   r+   r,   r   r   r   r   r   r_   h  s>   






z WanAdditionalInputsStep.__call__)r`   ra   rb   rc   rf   re   ro   rd   rC   r   rP   r   r	   r_   __classcell__r   r   rp   r   rh     s    %rh   c                   @   sl   e Zd ZdZedee fddZedefddZ	edee
 fddZe d	ed
edefddZdS )WanSetTimestepsStepr@   r   c                 C   rD   )Nr:   )r
   r   rA   r   r   r   rF     rG   z'WanSetTimestepsStep.expected_componentsc                 C      dS )Nz6Step that sets the scheduler's timesteps for inferencer   rA   r   r   r   rC        zWanSetTimestepsStep.descriptionc                 C   s   t dddt dt dgS )Nr.   2   rH   r0   r1   )r   rA   r   r   r   rP     s   
zWanSetTimestepsStep.inputsrU   rX   c                 C   sD   |  |}|j}t|j|j||j|j\|_|_| || ||fS )N)rZ   _execution_devicer>   r:   r.   r0   r1   r]   )rB   rU   rX   rV   r/   r   r   r   r_     s   
zWanSetTimestepsStep.__call__N)r`   ra   rb   rc   rd   rf   r
   rF   re   rC   r   rP   r   rg   r   r	   r_   r   r   r   r   r|     s    r|   c                   @   s   e Zd ZdZedefddZedee fddZ	edee
 fddZed	d
 Ze								ddedededededejdB dejdB dejeej B dB dejdB dejfddZe dededefddZdS ) WanPrepareLatentsStepr@   r   c                 C   r}   )NzWPrepare latents step that prepares the latents for the text-to-video generation processr   rA   r   r   r   rC     r~   z!WanPrepareLatentsStep.descriptionc                 C   s`   t dtdt dtdt dtdt dtjd B dt dtddt d	t d
dtddt dtjddgS )Nr+   )rM   r,   r*   r!   r   r   )rM   rI   	generatorr   TzNumber of prompts, the final batch size of model inputs should be `batch_size * num_videos_per_prompt`. Can be generated in input step.rK   rQ   zThe dtype of the model inputsrO   )r   rR   r   r   rQ   rA   r   r   r   rP     s   


zWanPrepareLatentsStep.inputsc                 C   s   t dtjddgS )Nr!   z4The initial latents to use for the denoising processrO   )r   r   r   rA   r   r   r   rS     s   z*WanPrepareLatentsStep.intermediate_outputsc                 C   s   |j d ur|j | j dks|jd ur+|j| j dkr+td| j d|j  d|j d|jd urL|jdk s?|jd | j dkrNtd| j d|j dd S d S )	Nr   z-`height` and `width` have to be divisible by z	 but are z and rT   r   zQ`num_frames` has to be greater than 0, and (num_frames - 1) must be divisible by r   )r+   r#   r,   r   r*   r"   )rU   rV   r   r   r   rW     s   
z"WanPrepareLatentsStep.check_inputs     @  Q   Nr   num_channels_latentsr+   r,   r*   rQ   r/   r   r!   c
                 C   s   |	d ur|	j ||dS |d | j d }
|||
t|| j t|| j f}t|tr=t||kr=tdt| d| dt||||d}	|	S )N)r/   rQ   r   z/You have passed a list of generators of length z+, but requested an effective batch size of z@. Make sure the batch size matches the length of the generators.)r   r/   rQ   )	tor"   rR   r#   r   rf   r9   r   r   )compr   r   r+   r,   r*   rQ   r/   r   r!   r'   r   r   r   r   prepare_latents  s"   z%WanPrepareLatentsStep.prepare_latentsrU   rX   c                 C   s   |  |}| || |j}tj}|jp|j|_|jp|j|_|j	p$|j
|_	| j||j|j |j|j|j|j	|||j|jd
|_| || ||fS )N)	r   r   r+   r,   r*   rQ   r/   r   r!   )rZ   rW   r   r   float32r+   default_heightr,   default_widthr*   default_num_framesr   r   r   r   r   r!   r]   )rB   rU   rX   rV   r/   rQ   r   r   r   r_     s*   

zWanPrepareLatentsStep.__call__)r   r   r   r   NNNN)r`   ra   rb   rc   rd   re   rC   rf   r   rP   r   rS   staticmethodrW   rR   r   rQ   r/   	Generatorr   r   rg   r   r	   r_   r   r   r   r   r     sT    
	
 r   )r   )NNNN)#r3   r   modelsr   
schedulersr   utilsr   utils.torch_utilsr   modular_pipeliner   r	   modular_pipeline_utilsr
   r   r   r   
get_loggerr`   loggerre   r   rR   r    tupler-   r/   rf   floatr>   r?   rh   r|   r   r   r   r   r   <module>   sb   

8

&


;Y &