o
    :i$                     @   s   d dl mZmZ d dlZG dd dejjZG dd deZG dd dZG d	d
 d
eZ	dddde
dfdededededej
dejfddZdS )    )OptionalUnionNc                       x   e Zd ZdZ	ddejjdef fddZ		dd	ej	d
ej	dej	dej	de
ej	 deeej	f dej	fddZ  ZS )DiffusionModelzA wrapper of diffusion models for inference.
    Args:
        model: The diffusion model.
        func_name: The function name to call.
    forward_fm_decodermodel	func_namec                    s(   t    || _|| _t| j|| _d S )N)super__init__r   r   getattr
model_funcselfr   r   	__class__ 5/home/ubuntu/LuxTTS/zipvoice/models/modules/solver.pyr
      s   
zDiffusionModel.__init__N        txtext_conditionspeech_conditionpadding_maskguidance_scalereturnc                 K   s  t |st j||j|jd}|dk r#| jd
|||||d|S | dks+J t j|gd dd}t j|gd dd}t jt 	||gdd}|dkr\t jt 	||gdd}n|d }t j||gdd}| jd
|||||d|j
ddd\}}	d| |	 ||  }
|
S )aK  
        Forward function that Handles the classifier-free guidance.
        Args:
            t: The current timestep, a tensor of a tensor of a single float.
            x: The initial value, with the shape (batch, seq_len, emb_dim).
            text_condition: The text_condition of the diffision model, with
                the shape (batch, seq_len, emb_dim).
            speech_condition: The speech_condition of the diffision model, with the
                shape (batch, seq_len, emb_dim).
            padding_mask: The mask for padding; True means masked position, with the
                shape (batch, seq_len).
            guidance_scale: The scale of classifier-free guidance, a float or a tensor
                of shape (batch, 1, 1).
        Retrun:
            The prediction with the shape (batch, seq_len, emb_dim).
        dtypedevicer   )r   xtr   r   r   r      )dimg      ?   Nr   )torch	is_tensortensorr   r   allr   r    cat
zeros_likechunk)r   r   r   r   r   r   r   kwargsdata_uncond	data_condresr   r   r   forward(   sR   

	
	zDiffusionModel.forwardr   Nr   __name__
__module____qualname____doc__r"   nnModulestrr
   Tensorr   r   floatr-   __classcell__r   r   r   r   r      s2    		r   c                       r   )DistillDiffusionModelzA wrapper of distilled diffusion models for inference.
    Args:
        model: The distilled diffusion model.
        func_name: The function name to call.
    r   r   r   c                    s   t  j||d d S )N)r   r   )r	   r
   r   r   r   r   r
   x   s   zDistillDiffusionModel.__init__Nr   r   r   r   r   r   r   r   c              	   K   s<   t |st j||j|jd}| jd||||||d|S )a?  
        Forward function that Handles the classifier-free guidance.
        Args:
            t: The current timestep, a tensor of a single float.
            x: The initial value, with the shape (batch, seq_len, emb_dim).
            text_condition: The text_condition of the diffision model, with
                the shape (batch, seq_len, emb_dim).
            speech_condition: The speech_condition of the diffision model, with the
                shape (batch, seq_len, emb_dim).
            padding_mask: The mask for padding; True means masked position, with the
                shape (batch, seq_len).
            guidance_scale: The scale of classifier-free guidance, a float or a tensor
                of shape (batch, 1, 1).
        Retrun:
            The prediction with the shape (batch, seq_len, emb_dim).
        r   )r   r   r   r   r   r   Nr   )r"   r#   r$   r   r   r   )r   r   r   r   r   r   r   r)   r   r   r   r-      s   

zDistillDiffusionModel.forwardr.   r/   r0   r   r   r   r   r;   q   s2    		r;   c                   @   sx   e Zd Z	ddejjdefddZ					dd	ejd
ejdejdejde	de
eejf dedededejfddZdS )EulerSolverr   r   r   c                 C      t ||d| _dS )zConstruct a Euler Solver
        Args:
            model: The diffusion model.
            func_name: The function name to call.
        r   N)r   r   r   r   r   r   r
      s   
zEulerSolver.__init__
   r         ?r   r   r   r   num_stepr   t_startt_endt_shiftr   c
              
   K   s   |j }t|trt|tsJ t||||	|d}t|D ]<}|| }||d  }| jd||||||d|
}|d| |  }|||  }||d k rVd| | ||  }q|}q|S )N)rB   rC   rA   rD   r   r!   )r   r   r   r   r   r   r@   r   )r   
isinstancer9   get_time_stepsranger   )r   r   r   r   r   rA   r   rB   rC   rD   r)   r   	timestepsstept_curt_nextvx_1_predx_0_predr   r   r   sample   s8   zEulerSolver.sampleNr.   )r?   r   r   r@   r@   )r1   r2   r3   r"   r5   r6   r7   r
   r8   intr   r9   rO   r   r   r   r   r<      sB    
	
r<   c                   @   s&   e Zd Z	ddejjdefddZdS )DistillEulerSolverr   r   r   c                 C   r=   )zwConstruct a Euler Solver for distilled diffusion models.
        Args:
            model: The diffusion model.
        r>   N)r;   r   r   r   r   r   r
      s   	zDistillEulerSolver.__init__Nr.   )r1   r2   r3   r"   r5   r6   r7   r
   r   r   r   r   rQ      s    rQ   r   r@   r?   cpurB   rC   rA   rD   r   r   c                 C   s4   t | ||d |}|| d|d |   }|S )a6  Compute the intermediate time steps for sampling.

    Args:
        t_start: The starting time of the sampling (default is 0).
        t_end: The starting time of the sampling (default is 1).
        num_step: The number of sampling.
        t_shift: shift the t toward smaller numbers so that the sampling
            will emphasize low SNR region. Should be in the range of (0, 1].
            The shifting will be more significant when the number is smaller.
        device: A torch device.
    Returns:
        The time step with the shape (num_step + 1,).
    r!   )r"   linspaceto)rB   rC   rA   rD   r   rH   r   r   r   rF      s   rF   )typingr   r   r"   r5   r6   r   r;   r<   rQ   r   r9   rP   r8   rF   r   r   r   r   <module>   s0   Z7G