o
    :idN                     @   sz   d dl mZmZ d dlZd dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZmZmZmZmZ G dd dejZdS )	    )ListOptionalN)DistributedDataParallel)EulerSolver)TTSZipformer)condition_time_maskget_tokens_indexmake_pad_mask
pad_labelsprepare_avg_tokens_durationsc                )       s  e Zd ZdZg dg dg ddddddddd	d	d	d
ddddddfdee dee dee dededededededededededededed ed!ed"ed#ef( fd$d%Z	&	&dWd'ejd(ejd)ejd*ejd+e	ej d,e	ej d-ejfd.d/Z
d0eee  fd1d2Zd3ejd4ejd5ejfd6d7Zd0eee  d5ejfd8d9Zd0eee  d5ejd:eee  d;ejfd<d=Zd0eee  d:eee  d;ejd>efd?d@Z	AdXd0eee  dBejd5ejdCejd'ejdDed-ejfdEdFZ	&	G	G	H	I	JdYd0eee  d:eee  dKejd;ejd5e	ej d>edLedMedNed,ed-ejfdOdPZ	Q	&dZd0eee  dBejd5ejdCejdRejdSedTedNed,ejd-ejfdUdVZ  ZS )[ZipVoicezThe ZipVoice model.)         r   r   )r   r   r   r   r   )         r   r   i   r   i   	             0   d      r   fm_decoder_downsampling_factorfm_decoder_num_layersfm_decoder_cnn_module_kernelfm_decoder_feedforward_dimfm_decoder_num_headsfm_decoder_dimtext_encoder_num_layerstext_encoder_feedforward_dimtext_encoder_cnn_module_kerneltext_encoder_num_headstext_encoder_dimtime_embed_dimtext_embed_dimquery_head_dimvalue_head_dimpos_head_dimpos_dimfeat_dim
vocab_sizepad_idc                    s   t    t|d |||||||||||d|d| _t||d||	|||
||||dd| _|| _|| _|| _t	||| _
t| dd| _d	S )
a  
        Initialize the model with specified configuration parameters.

        Args:
            fm_decoder_downsampling_factor: List of downsampling factors for each layer
                in the flow-matching decoder.
            fm_decoder_num_layers: List of the number of layers for each block in the
                flow-matching decoder.
            fm_decoder_cnn_module_kernel: List of kernel sizes for CNN modules in the
                flow-matching decoder.
            fm_decoder_feedforward_dim: Dimension of the feedforward network in the
                flow-matching decoder.
            fm_decoder_num_heads: Number of attention heads in the flow-matching
                decoder.
            fm_decoder_dim: Hidden dimension of the flow-matching decoder.
            text_encoder_num_layers: Number of layers in the text encoder.
            text_encoder_feedforward_dim: Dimension of the feedforward network in the
                text encoder.
            text_encoder_cnn_module_kernel: Kernel size for the CNN module in the
                text encoder.
            text_encoder_num_heads: Number of attention heads in the text encoder.
            text_encoder_dim: Hidden dimension of the text encoder.
            time_embed_dim: Dimension of the time embedding.
            text_embed_dim: Dimension of the text embedding.
            query_head_dim: Dimension of the query attention head.
            value_head_dim: Dimension of the value attention head.
            pos_head_dim: Dimension of the position attention head.
            pos_dim: Dimension of the positional encoding.
            feat_dim: Dimension of the acoustic features.
            vocab_size: Size of the vocabulary.
            pad_id: ID used for padding tokens.
           T)in_dimout_dimdownsampling_factornum_encoder_layerscnn_module_kernelencoder_dimfeedforward_dim	num_headsr'   r)   r(   r*   use_time_embedr%   r   F)r/   r0   r1   r2   r3   r4   r5   r6   r'   r)   r(   r*   r7   forward_fm_decoder)	func_nameN)super__init__r   
fm_decodertext_encoderr+   r&   r-   nn	Embeddingembedr   solver)selfr   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   	__class__ //home/ubuntu/LuxTTS/zipvoice/models/zipvoice.pyr;   &   sJ   
7zZipVoice.__init__Ntxttext_conditionspeech_conditionpadding_maskguidance_scalereturnc                 C   s
  t j|||gdd}| dv sJ | dkr1|ddkr1|d}| dkr1|ddks| dkr?||jd }|dur{| dkrb|ddkrb|d}| dkrb|ddksP| dkrp||jd }| j||||d}|S | j|||d	}|S )
ai  Compute velocity.
        Args:
            t:  A tensor of shape (N, 1, 1) or a tensor of a float,
                in the range of (0, 1).
            xt: the input of the current timestep, including condition
                embeddings and noisy acoustic features.
            text_condition: the text condition embeddings, with the
                shape (batch, seq_len, emb_dim).
            speech_condition: the speech condition embeddings, with the
                shape (batch, seq_len, emb_dim).
            padding_mask: The mask for padding, True means masked
                position, with the shape (N, T).
            guidance_scale: The guidance scale in classifier-free guidance,
                which is a tensor of shape (N, 1, 1) or a tensor of a float.

        Returns:
            predicted velocity, with the shape (batch, seq_len, emb_dim).
        r   )dim)r   r.   r   r   N)xrG   rK   rL   rP   rG   rK   )torchcatrN   sizesqueezerepeatshaper<   )rB   rG   rH   rI   rJ   rK   rL   vtrE   rE   rF   r8      s&   

zZipVoice.forward_fm_decodertokensc                 C   s|   t | tr| jnt|  j}t|| j|d}| |}tj	dd |D tj
|d}t||jd }| j|d|d}||fS )a  
        Get the text embeddings.
        Args:
            tokens: a list of list of token ids.
        Returns:
            embed: the text embeddings, shape (batch, seq_len, emb_dim).
            tokens_lens: the length of each token sequence, shape (batch,).
        )r-   devicec                 S      g | ]}t |qS rE   len.0tokenrE   rE   rF   
<listcomp>       z/ZipVoice.forward_text_embed.<locals>.<listcomp>dtyperZ   r   NrQ   )
isinstanceDDPrZ   next
parametersr
   r-   r@   rR   tensorint64r	   rW   r=   )rB   rY   rZ   tokens_paddedr@   tokens_lenstokens_padding_maskrE   rE   rF   forward_text_embed   s   
zZipVoice.forward_text_embedr@   rl   features_lensc           	   
   C   sh   t | }t||d}t||}t|||j}tj|d|	d
|d||dd}||fS )aP  
        Get the text condition with the same length of the acoustic feature.
        Args:
            embed: the text embeddings, shape (batch, token_seq_len, emb_dim).
            tokens_lens: the length of each token sequence, shape (batch,).
            features_lens: the length of each acoustic feature sequence,
                shape (batch,).
        Returns:
            text_condition: the text condition, shape
                (batch, feature_seq_len, emb_dim).
            padding_mask: the padding mask of text condition, shape
                (batch, feature_seq_len).
        )max_lenr   rO   r   )rN   index)intmaxr	   r   r   torZ   rR   gather	unsqueezeexpandrT   )	rB   r@   rl   ro   
num_framesrK   tokens_durationstokens_indexrI   rE   rE   rF   forward_text_condition   s   


zZipVoice.forward_text_conditionc                 C   s(   |  |\}}| |||\}}||fS )zX
        Process text for training, given text tokens and real feature lengths.
        )rn   r{   )rB   rY   ro   r@   rl   rI   rK   rE   rE   rF   forward_text_train   s   zZipVoice.forward_text_trainprompt_tokensprompt_features_lensc           	      C   sD   dd t ||D }|| }| |\}}| |||\}}||fS )zb
        Process text for inference, given text tokens, real feature lengths and prompts.
        c                 S      g | ]\}}|| qS rE   rE   r_   prompt_tokenr`   rE   rE   rF   ra         z?ZipVoice.forward_text_inference_gt_duration.<locals>.<listcomp>)ziprn   r{   )	rB   rY   ro   r}   r~   r@   rl   rI   rK   rE   rE   rF   "forward_text_inference_gt_duration  s   
z+ZipVoice.forward_text_inference_gt_durationspeedc                 C   s   t | tr| jnt|  j}dd t||D }tjdd |D tj|d}tjdd |D tj|d}| 	|\}	}
|t
|| | | jtjd }| |	|
|\}}||fS )z
        Process text for inference, given text tokens and prompts,
        feature lengths are predicted with the ratio of token numbers.
        c                 S   r   rE   rE   r   rE   rE   rF   ra   1  r   zBZipVoice.forward_text_inference_ratio_duration.<locals>.<listcomp>c                 S   r[   rE   r\   r^   rE   rE   rF   ra   6  rb   rc   c                 S   r[   rE   r\   r^   rE   rE   rF   ra   <  rb   )rd   )re   rf   rZ   rg   rh   r   rR   ri   rj   rn   ceilrt   r{   )rB   rY   r}   r~   r   rZ   
cat_tokensprompt_tokens_lensrl   	cat_embedcat_tokens_lensro   rI   rK   rE   rE   rF   %forward_text_inference_ratio_duration"  s2   z.ZipVoice.forward_text_inference_ratio_duration        featuresnoisecondition_drop_ratioc                 C   s   | j ||d\}}t|d|dd}	t|	dd|}
|dkr5t|ddd|j|k}|| }|| |d|   }|| }| j	||||
|d}|	| @ }t
|| ||  d	 }|S )
a3  Forward pass of the model for training.
        Args:
            tokens: a list of list of token ids.
            features: the acoustic features, with the shape (batch, seq_len, feat_dim).
            features_lens: the length of each acoustic feature sequence, shape (batch,).
            noise: the intitial noise, with the shape (batch, seq_len, feat_dim).
            t: the time step, with the shape (batch, 1, 1).
            condition_drop_ratio: the ratio of dropped text condition.
        Returns:
            fm_loss: the flow-matching loss.
        rY   ro   )gffffff?      ?r   )ro   mask_percentrp   rO   r   r   )rG   rH   rI   rJ   rK   r   )r|   r   rT   rR   whererv   randrt   rZ   r8   mean)rB   rY   r   ro   r   rG   r   rI   rK   speech_condition_maskrJ   	drop_maskrH   utrX   	loss_maskfm_lossrE   rE   rF   forwardL  s8   

zZipVoice.forwardr   predict         ?prompt_featurest_shiftdurationnum_stepc              
   C   s  |dv sJ |dkr| j ||||d\}}n|dusJ | j||||d\}}|j\}}}tjj|ddd||d f}t||}t	|
dt||}tj|||d|jd	}| jj|||||	|
|d
}| d| }tj|d| |d|jd	}tj|d| |d|jd	}t|dD ]2}|||| || ||  f ||d|| ddf< ||d|| f ||d|| ddf< q||||fS )a  
        Generate acoustic features, given text tokens, prompts feature
            and prompt transcription's text tokens.
        Args:
            tokens: a list of list of text tokens.
            prompt_tokens: a list of list of prompt tokens.
            prompt_features: the prompt feature with the shape
                (batch_size, seq_len, feat_dim).
            prompt_features_lens: the length of each prompt feature,
                with the shape (batch_size,).
            features_lens: the length of the predicted eature, with the
                shape (batch_size,). It is used only when duration is "real".
            duration: "real" or "predict". If "real", the predicted
                feature length is given by features_lens.
            num_step: the number of steps to use in the ODE solver.
            guidance_scale: the guidance scale for classifier-free guidance.
        )realr   r   )rY   r}   r~   r   N)rY   ro   r}   r~   r   r   rO   )rZ   )rP   rI   rJ   rK   r   rL   r   r   )r   r   rW   rR   r>   
functionalpadrT   r	   r   rv   
zeros_likerandnrZ   rA   samplesumzerosrs   range)rB   rY   r}   r   r~   ro   r   r   r   r   rL   rI   rK   
batch_sizerx   _rJ   r   x0x1x1_wo_prompt_lens	x1_promptx1_wo_promptirE   rE   rF   r     s|   

	zZipVoice.sampler   r   t_startt_endc
              
   C   sV   | j ||d\}
}t|dd|}| jj||
||||	||d}| d}||fS )a8  
        Generate acoustic features in intermediate timesteps.
        Args:
            tokens: List of list of token ids.
            features: The acoustic features, with the shape (batch, seq_len, feat_dim).
            features_lens: The length of each acoustic feature sequence,
                with the shape (batch,).
            noise: The initial noise, with the shape (batch, seq_len, feat_dim).
            speech_condition_mask: The mask for speech condition, True means
                non-condition positions, with the shape (batch, seq_len).
            t_start: The start timestep.
            t_end: The end timestep.
            num_step: The number of steps for sampling.
            guidance_scale: The scale for classifier-free guidance inference,
                with the shape (batch, 1, 1).
        r   rO   r   )rP   rI   rJ   rK   r   rL   r   r   )r|   rR   r   rv   rA   r   r   )rB   rY   r   ro   r   r   r   r   r   rL   rI   rK   rJ   x_t_endx_t_end_lensrE   rE   rF   sample_intermediate  s"   

zZipVoice.sample_intermediate)NN)r   )Nr   r   r   r   r   )r   N)__name__
__module____qualname____doc__r   rr   r;   rR   Tensorr   r8   rn   r{   r|   r   floatr   r   strr   r   __classcell__rE   rE   rC   rF   r   #   sj   	
g
4


'







1

>

	

m
	
r   )typingr   r   rR   torch.nnr>   torch.nn.parallelr   rf   zipvoice.models.modules.solverr   !zipvoice.models.modules.zipformerr   zipvoice.utils.commonr   r   r	   r
   r   Moduler   rE   rE   rE   rF   <module>   s   	