o
    :i4                  	   @   sD  d dl mZ d dlZd dlZd dlZd dlmZ d dlm	Z	m
Z
 h dZd3dee defd	d
Zd3dee defddZdeee  dededefddZ	d4deej dededejfddZdefddZdedefdd Zdejd!efd"d#Z	$	 d5d%ejded&ed'efd(d)Z	*d6d%ed+ed,efd-d.Zd/d0 Zd1d2 ZdS )7    )ListN)AudioSegment)detect_leading_silencesplit_on_silence>      。   ！   ，   ：   ；   ？,.?!:;d   tokens_list
max_tokensc                 C   s   g }g }| D ].}t |dkr$t |dkr$|tv s|dkr$|d | q|| |tv r4|| g }qt |dkr@|| g }g }|D ]}t |t | |krX|| qFt |dkrc|| |}qFt |dkrq|| |S )aB  
    Splits the input tokens list into chunks according to punctuations,
        each with a maximum number of tokens.

    Args:
        token_list (list of str): The list of tokens to be split.
        max_tokens (int): The maximum number of tokens per chunk.

    Returns:
        List[str]: A list of text chunks.
    r    )lenpunctuationappendextend)r   r   	sentencescurrent_sentencetokenchunkscurrent_chunksentence r!   +/home/ubuntu/LuxTTS/zipvoice/utils/infer.pychunk_tokens_punctuation   s2   




r#   c                 C   s   g }g }| D ]}|dkrt |dkr|| g }|| qt |dkr*|| g }g }|D ]}t |t | |krB|| q0t |dkrM|| |}q0t |dkr[|| |S )aN  
    Splits the input tokens list into chunks according to speaker-turn
        symbol [S1], each with a maximum number of tokens.

    Args:
        token_list (list of str): The list of tokens to be split.
        max_tokens (int): The maximum number of tokens per chunk.

    Returns:
        List[str]: A list of text chunks.
    z[S1]r   )r   r   r   )r   r   dialogscurrent_dialogr   r   r   dialogr!   r!   r"   chunk_tokens_dialogA   s*   



r'   max_durationprompt_durationtoken_durationc                    s   t t| }t|dd d  fddtt D } fddtt D }g }g }d}	|D ]2}
|	| t||  t|
|  |krP||
 |	t|
7 }	q0t|dkr[|| |
g}t|
}	q0t|dkrn|| ||fS )a  
    Sort and group the input list of token sequences into batches, where each batch's
        total duration does not exceed the maximum.

    Args:
        tokens_list (List[List[int]]): A list of token sequences, where each inner
            list represents a sequence of tokens.
        max_duration (float): The maximum allowed total duration for each batch.
        prompt_duration (float): The duration cost per prompt in the batch.
        token_duration (float): The duration cost per token.

    Returns:
        batches: List[List[List[int]]]: A list of batches, where each batch is a list of
            token sequences that fit within the max duration.
        index: List[int]: The original index of each sentence, used to recover the
            sequential order in the future.
    c                 S   s   t | d S )N   )r   )xr!   r!   r"   <lambda>   s    z!batchify_tokens.<locals>.<lambda>)keyc                       g | ]} | d  qS )r   r!   .0iindexed_sorted_tokensr!   r"   
<listcomp>   s    z#batchify_tokens.<locals>.<listcomp>c                    r/   )r+   r!   r0   r3   r!   r"   r5      s    r   )list	enumeratesortedranger   r   )r   r(   r)   r*   indexed_tokensindexsorted_tokensbatchesbatch
batch_sizetokensr!   r3   r"   batchify_tokensl   s4   







rA   皙?]  r   fade_durationsample_ratereturnc                 C   s  t | dkr| r| d S tg S t|| }|dkr"tj| ddS | d }| dd D ]T}t||jd |jd }|dkrHtj||gdd}q,tjdd||jdd }tj|dd| f |d| df | |dd|f d|   |d|df gdd}q,|S )a  
    Concatenates audio chunks with cross-fading between consecutive chunks.

    Args:
        chunks: List of audio tensors, each with shape (C, T) where
                C = number of channel, T = time dimension (samples)
        fade_duration: Duration of cross-fade in seconds
        sample_rate: Audio sample rate in Hz

    Returns:
        Concatenated audio tensor with shape (N, T_total)
    r+   r   r   )dimN)device.)	r   torchtensorintcatminshapelinspacerH   )r   rD   rE   fade_samplesfinal
next_chunkkfader!   r!   r"   cross_fade_concat   s,   
rU   textc                 C   s    |   } | d tvr| d7 } | S )z2Add punctuation if there is not in the end of textr   r   )stripr   )rV   r!   r!   r"   add_punctuation   s   rX   
prompt_wavsampling_ratec                 C   s2   t | \} }||krt jj||d}|| } | S )a  
    Load the waveform with torchaudio and resampling if needed.

    Parameters:
        prompt_wav: path of the prompt wav.
        sampling_rate: target sampling rate.

    Returns:
        Loaded prompt waveform with target sampling rate,
        PyTorch tensor of shape (C, T)
    )	orig_freqnew_freq)
torchaudioload
transformsResample)rY   rZ   prompt_sampling_rate	resamplerr!   r!   r"   load_prompt_wav   s   rc   
target_rmsc                 C   s2   t t t | }||k r| | | } | |fS )am  
    Normalize the rms of prompt_wav is it is smaller than target rms.

    Parameters:
        prompt_wav: PyTorch tensor with shape (C, T).
        target_rms: target rms value

    Returns:
        prompt_wav: normalized prompt wav with shape (C, T).
        promt_rms: rms of original prompt wav. Will be used to
            re-normalize the generated wav.
    )rI   sqrtmeansquare)rY   rd   
prompt_rmsr!   r!   r"   rms_norm  s   ri   Faudio	only_edge	trail_silc                 C   sb   t | |}|st|ddddd}tjdd}|D ]}||7 }qt|dd}|tj|d }t|S )a  
    Remove silences longer than 1 second, and edge silences longer than 0.1 seconds

    Parameters:
        audio: PyTorch tensor with shape (C, T).
        sampling_rate: sampling rate of the audio.
        only_edge: If true, only remove edge silences.
        trail_sil: the duration of added trailing silence in ms.

    Returns:
        PyTorch tensor with shape (C, T), where C is number of channels
            and T is number of audio samples
    i  
   )min_silence_lensilence_threshkeep_silence	seek_stepr   )durationr   )tensor_to_audiosegmentr   r   silentremove_silence_edgesaudiosegment_to_tensor)rj   rZ   rk   rl   wavenon_silent_segssegr!   r!   r"   remove_silence  s   
	
r{   rm   rq   silence_thresholdc                 C   s`   t | |d}td|| }| |d } |  } t | |d}td|| }| |d } |  } | S )a>  
    Remove edge silences longer than `keep_silence` ms.

    Parameters:
        audio: an AudioSegment object.
        keep_silence: kept silence in the edge.
        only_edge: If true, only remove edge silences.
        silence_threshold: the threshold of silence.

    Returns:
        An AudioSegment object
    )r|   r   N)r   maxreverse)rj   rq   r|   	start_idxr!   r!   r"   rv   H  s   rv   c                 C   sV   t |  }|t jd }| jdkrt|d}|S t|	d| jj
}|S )z>
    Convert a pydub.AudioSegment to PyTorch audio tensor
          @r+   r   r   )nparrayget_array_of_samplesastypefloat32channelsrI   
from_numpy	unsqueezereshapeT)aseg
audio_datatensor_datar!   r!   r"   rw   f  s   
rw   c                 C   s   |    }|jdkr|tjddf }|d ddtj}|jd dkr/|	dd
 }| }t|d|| jd d}|S )	z
    Convert a PyTorch audio tensor to pydub.AudioSegment

    Parameters:
        tensor: Tensor with shape (C, T), where C is the number of channels
            and T is the time steps
        sample_rate: Audio sample rate
    r+   Nr   i i  r      )datasample_width
frame_rater   )cpunumpyndimr   newaxisclipr   int16rN   	transposeflattentobytesr   )rJ   rE   audio_npaudio_bytesaudio_segmentr!   r!   r"   rt   z  s   

rt   )r   )rB   rC   )Fr   )r   rm   )typingr   r   r   rI   r]   pydubr   pydub.silencer   r   r   strrK   r#   r'   floatrA   TensorrU   rX   rc   ri   boolr{   rv   rw   rt   r!   r!   r!   r"   <module>   sh    5+

B
;
0
