o
    i<                     @   s&  d dl Z d dlmZ d dlmZmZmZmZmZm	Z	 d dl
Z
d dl
mZ d dlmZ d dlmZ d dlmZmZ d dlmZmZ d	d
lmZ d	dlmZ g ZdZG dd dejZG dd dejZG dd de
jj ej!Z"G dd de
jj ej!Z#G dd dZ$G dd dZ%eG dd dZ&eG dd dZ'G dd dZ(eG dd  d e'e&e$eZ)eG d!d" d"e'e&e%eZ*eG d#d$ d$e(e&e$eZ+eG d%d& d&e(e&e%eZ,e+d'ej-d(d)d*Z.d+e._/e,d,ej-d-d)d*Z0d.e0_/e)d/ej-d(d)d0e1 d1Z2d2e2_/e*d3ej-d-d)d0e1 d1Z3d4e3_/dS )5    N)	dataclass)AnyDictListOptionalTupleUnion)Tensor)load_state_dict_from_url)mu_law_decoding)	Tacotron2WaveRNN)
GriffinLimInverseMelScale   )utils)Tacotron2TTSBundlez.https://download.pytorch.org/torchaudio/modelsc                       sN   e Zd Z fddZedd Zdeeee f de	e
e
f fddZ  ZS )	_EnglishCharProcessorc                    s.   t    t | _dd t| jD | _d S )Nc                 S      i | ]\}}||qS  r   ).0isr   r   T/home/ubuntu/vllm_env/lib/python3.10/site-packages/torchaudio/pipelines/_tts/impl.py
<dictcomp>       z2_EnglishCharProcessor.__init__.<locals>.<dictcomp>)super__init__r   
_get_chars_tokens	enumerate_mappingself	__class__r   r   r      s   

z_EnglishCharProcessor.__init__c                 C      | j S Nr   r"   r   r   r   tokens      z_EnglishCharProcessor.tokenstextsreturnc                    s,   t |tr|g} fdd|D }t|S )Nc                    s"   g | ]} fd d|  D qS )c                    s    g | ]}| j v r j | qS r   r!   )r   cr"   r   r   
<listcomp>&   s     z=_EnglishCharProcessor.__call__.<locals>.<listcomp>.<listcomp>)lower)r   tr"   r   r   r/   &   s   " z2_EnglishCharProcessor.__call__.<locals>.<listcomp>)
isinstancestrr   
_to_tensor)r#   r+   indicesr   r"   r   __call__#   s   

z_EnglishCharProcessor.__call____name__
__module____qualname__r   propertyr)   r   r3   r   r   r	   r6   __classcell__r   r   r$   r   r      s
    
.r   c                       sT   e Zd Zdd fdd
Zedd Zdeeee f de	e
e
f fd	d
Z  ZS )_EnglishPhoneProcessorN	dl_kwargsc                   sD   t    t | _dd t| jD | _tjd|d| _d| _	d S )Nc                 S   r   r   r   )r   r   pr   r   r   r   .   r   z3_EnglishPhoneProcessor.__init__.<locals>.<dictcomp>zen_us_cmudict_forward.ptr>   z(\[[A-Z]+?\]|[_!'(),.:;? -]))
r   r   r   _get_phonesr   r    r!   _load_phonemizer_phonemizer_patternr#   r?   r$   r   r   r   +   s
   


z_EnglishPhoneProcessor.__init__c                 C   r&   r'   r(   r"   r   r   r   r)   2   r*   z_EnglishPhoneProcessor.tokensr+   r,   c                    sb   t |tr|g}g } j|ddD ]}dd t j|D }| fdd|D  qt|S )Nen_us)langc                 S   s   g | ]	}t d d|qS )z[\[\]] )resub)r   rr   r   r   r/   =   s    z3_EnglishPhoneProcessor.__call__.<locals>.<listcomp>c                    s   g | ]} j | qS r   r-   )r   r@   r"   r   r   r/   >   r   )	r2   r3   rC   rI   findallrD   appendr   r4   )r#   r+   r5   phonesretr   r"   r   r6   6   s   

z_EnglishPhoneProcessor.__call__r7   r   r   r$   r   r=   *   s
    
.r=   c                       sB   e Zd Zddedee f fddZedd Zdd	d
Z	  Z
S )_WaveRNNVocodermodelmin_level_dbc                    s    t    d| _|| _|| _d S )N"V  )r   r   _sample_rate_model_min_level_db)r#   rR   rS   r$   r   r   r   H   s   

z_WaveRNNVocoder.__init__c                 C   r&   r'   rU   r"   r   r   r   sample_rateN   r*   z_WaveRNNVocoder.sample_rateNc                 C   s   t |}dt t j|dd }| jd ur&| j| | j }t j|ddd}| j||\}}t|| jj	}t
|| jj}|d}||fS )N   gh㈵>)minr   r   )r[   max)torchexplog10clamprW   rV   inferr   _unnormalize_waveformn_bitsr   	n_classessqueeze)r#   mel_speclengthswaveformr   r   r   forwardR   s   


z_WaveRNNVocoder.forward)rQ   r'   )r8   r9   r:   r   r   floatr   r;   rY   ri   r<   r   r   r$   r   rP   G   s
    
rP   c                       s2   e Zd Z fddZedd ZdddZ  ZS )	_GriffinLimVocoderc              	      s@   t    d| _tdd| jddddd| _tdd	d
dd| _d S )NrT   i  P   g        g     @@slaney)n_stftn_melsrY   f_minf_max	mel_scalenormi   r      )n_fftpower
hop_length
win_length)r   r   rU   r   rY   _inv_melr   _griffin_limr"   r$   r   r   r   `   s"   
	z_GriffinLimVocoder.__init__c                 C   r&   r'   rX   r"   r   r   r   rY   s   r*   z_GriffinLimVocoder.sample_rateNc                 C   sF   t |}|  d}| |}| d}| |}||fS )NTF)r]   r^   clonedetachrequires_grad_ry   rz   )r#   rf   rg   spec	waveformsr   r   r   ri   w   s   


z_GriffinLimVocoder.forwardr'   )r8   r9   r:   r   r;   rY   ri   r<   r   r   r$   r   rk   _   s
    
rk   c                   @   s   e Zd ZdejfddZdS )
_CharMixinr,   c                 C      t  S r'   )r   r"   r   r   r   get_text_processor      z_CharMixin.get_text_processorNr8   r9   r:   r   TextProcessorr   r   r   r   r   r      s    r   c                   @   s"   e Zd ZdddejfddZdS )_PhoneMixinNr>   r,   c                C   s
   t |dS Nr>   )r=   rE   r   r   r   r      s   
z_PhoneMixin.get_text_processorr   r   r   r   r   r      s    r   c                   @   s:   e Zd ZU eed< eeef ed< dddefddZdS )_Tacotron2Mixin_tacotron2_path_tacotron2_paramsNr>   r,   c                C   V   t di | j}t d| j }|d u ri n|}t|fi |}|| |  |S N/r   )r   r   	_BASE_URLr   r
   load_state_dictevalr#   r?   rR   url
state_dictr   r   r   get_tacotron2      
z_Tacotron2Mixin.get_tacotron2)	r8   r9   r:   r3   __annotations__r   r   r   r   r   r   r   r   r      s   
 r   c                   @   sJ   e Zd ZU ee ed< eeeef  ed< ddddZddddZ	dS )	_WaveRNNMixin_wavernn_path_wavernn_paramsNr>   c                C   s   | j |d}t|S r   )_get_wavernnrP   )r#   r?   wavernnr   r   r   get_vocoder   s   z_WaveRNNMixin.get_vocoderc                C   r   r   )r   r   r   r   r
   r   r   r   r   r   r   r      r   z_WaveRNNMixin._get_wavernn)
r8   r9   r:   r   r3   r   r   r   r   r   r   r   r   r   r      s
   
 r   c                   @   s   e Zd Zdd ZdS )_GriffinLimMixinc                 K   r   r'   )rk   )r#   _r   r   r   r      r   z_GriffinLimMixin.get_vocoderN)r8   r9   r:   r   r   r   r   r   r      s    r   c                   @      e Zd ZdS )_Tacotron2WaveRNNCharBundleNr8   r9   r:   r   r   r   r   r          r   c                   @   r   )_Tacotron2WaveRNNPhoneBundleNr   r   r   r   r   r      r   r   c                   @   r   )_Tacotron2GriffinLimCharBundleNr   r   r   r   r   r      r   r   c                   @   r   )_Tacotron2GriffinLimPhoneBundleNr   r   r   r   r   r      r   r   z5tacotron2_english_characters_1500_epochs_ljspeech.pth&   )	n_symbols)r   r   a  Character-based TTS pipeline with :py:class:`~torchaudio.models.Tacotron2` trained on *LJSpeech* :cite:`ljspeech17` for 1,500 epochs, and
:py:class:`~torchaudio.transforms.GriffinLim` as vocoder.

The text processor encodes the input texts character-by-character.

You can find the training script `here <https://github.com/pytorch/audio/tree/main/examples/pipeline_tacotron2>`__.
The default parameters were used.

Please refer to :func:`torchaudio.pipelines.Tacotron2TTSBundle` for the usage.

Example - "Hello world! T T S stands for Text to Speech!"

   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH.png
      :alt: Spectrogram generated by Tacotron2

   .. raw:: html

      <audio controls="controls">
         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH.wav" type="audio/wav">
         Your browser does not support the <code>audio</code> element.
      </audio>

Example - "The examination and testimony of the experts enabled the Commission to conclude that five shots may have been fired,"

   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH_v2.png
      :alt: Spectrogram generated by Tacotron2

   .. raw:: html

      <audio controls="controls">
         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH_v2.wav" type="audio/wav">
         Your browser does not support the <code>audio</code> element.
      </audio>
z3tacotron2_english_phonemes_1500_epochs_ljspeech.pth`   a  Phoneme-based TTS pipeline with :py:class:`~torchaudio.models.Tacotron2` trained on *LJSpeech* :cite:`ljspeech17` for 1,500 epochs and
:py:class:`~torchaudio.transforms.GriffinLim` as vocoder.

The text processor encodes the input texts based on phoneme.
It uses `DeepPhonemizer <https://github.com/as-ideas/DeepPhonemizer>`__ to convert
graphemes to phonemes.
The model (*en_us_cmudict_forward*) was trained on
`CMUDict <http://www.speech.cs.cmu.edu/cgi-bin/cmudict>`__.

You can find the training script `here <https://github.com/pytorch/audio/tree/main/examples/pipeline_tacotron2>`__.
The text processor is set to the *"english_phonemes"*.

Please refer to :func:`torchaudio.pipelines.Tacotron2TTSBundle` for the usage.

Example - "Hello world! T T S stands for Text to Speech!"

   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH.png
      :alt: Spectrogram generated by Tacotron2

   .. raw:: html

      <audio controls="controls">
         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH.wav" type="audio/wav">
         Your browser does not support the <code>audio</code> element.
      </audio>

Example - "The examination and testimony of the experts enabled the Commission to conclude that five shots may have been fired,"

   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH_v2.png
      :alt: Spectrogram generated by Tacotron2

   .. raw:: html

      <audio controls="controls">
         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH_v2.wav" type="audio/wav">
         Your browser does not support the <code>audio</code> element.
      </audio>

z=tacotron2_english_characters_1500_epochs_wavernn_ljspeech.pthz%wavernn_10k_epochs_8bits_ljspeech.pth)r   r   r   r   a  Character-based TTS pipeline with :py:class:`~torchaudio.models.Tacotron2` trained on *LJSpeech* :cite:`ljspeech17` for 1,500 epochs and :py:class:`~torchaudio.models.WaveRNN` vocoder trained on 8 bits depth waveform of *LJSpeech* :cite:`ljspeech17` for 10,000 epochs.

The text processor encodes the input texts character-by-character.

You can find the training script `here <https://github.com/pytorch/audio/tree/main/examples/pipeline_tacotron2>`__.
The following parameters were used; ``win_length=1100``, ``hop_length=275``, ``n_fft=2048``,
``mel_fmin=40``, and ``mel_fmax=11025``.

You can find the training script `here <https://github.com/pytorch/audio/tree/main/examples/pipeline_wavernn>`__.

Please refer to :func:`torchaudio.pipelines.Tacotron2TTSBundle` for the usage.

Example - "Hello world! T T S stands for Text to Speech!"

   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_CHAR_LJSPEECH.png
      :alt: Spectrogram generated by Tacotron2

   .. raw:: html

      <audio controls="controls">
         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_CHAR_LJSPEECH.wav" type="audio/wav">
         Your browser does not support the <code>audio</code> element.
      </audio>

Example - "The examination and testimony of the experts enabled the Commission to conclude that five shots may have been fired,"

   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_CHAR_LJSPEECH_v2.png
      :alt: Spectrogram generated by Tacotron2

   .. raw:: html

      <audio controls="controls">
         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_CHAR_LJSPEECH_v2.wav" type="audio/wav">
         Your browser does not support the <code>audio</code> element.
      </audio>
z;tacotron2_english_phonemes_1500_epochs_wavernn_ljspeech.ptha  Phoneme-based TTS pipeline with :py:class:`~torchaudio.models.Tacotron2` trained on *LJSpeech* :cite:`ljspeech17` for 1,500 epochs, and
:py:class:`~torchaudio.models.WaveRNN` vocoder trained on 8 bits depth waveform of *LJSpeech* :cite:`ljspeech17` for 10,000 epochs.

The text processor encodes the input texts based on phoneme.
It uses `DeepPhonemizer <https://github.com/as-ideas/DeepPhonemizer>`__ to convert
graphemes to phonemes.
The model (*en_us_cmudict_forward*) was trained on
`CMUDict <http://www.speech.cs.cmu.edu/cgi-bin/cmudict>`__.

You can find the training script for Tacotron2 `here <https://github.com/pytorch/audio/tree/main/examples/pipeline_tacotron2>`__.
The following parameters were used; ``win_length=1100``, ``hop_length=275``, ``n_fft=2048``,
``mel_fmin=40``, and ``mel_fmax=11025``.

You can find the training script for WaveRNN `here <https://github.com/pytorch/audio/tree/main/examples/pipeline_wavernn>`__.

Please refer to :func:`torchaudio.pipelines.Tacotron2TTSBundle` for the usage.

Example - "Hello world! T T S stands for Text to Speech!"

   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_PHONE_LJSPEECH.png
      :alt: Spectrogram generated by Tacotron2

   .. raw:: html

      <audio controls="controls">
         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_PHONE_LJSPEECH.wav" type="audio/wav">
         Your browser does not support the <code>audio</code> element.
      </audio>


Example - "The examination and testimony of the experts enabled the Commission to conclude that five shots may have been fired,"

   .. image:: https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_PHONE_LJSPEECH_v2.png
      :alt: Spectrogram generated by Tacotron2

   .. raw:: html

      <audio controls="controls">
         <source src="https://download.pytorch.org/torchaudio/doc-assets/TACOTRON2_WAVERNN_PHONE_LJSPEECH_v2.wav" type="audio/wav">
         Your browser does not support the <code>audio</code> element.
      </audio>
)4rI   dataclassesr   typingr   r   r   r   r   r   r]   r	   torchaudio._internalr
   torchaudio.functionalr   torchaudio.modelsr   r   torchaudio.transformsr   r   rH   r   	interfacer   __all__r   r   r   r=   nnModuleVocoderrP   rk   r   r   r   r   r   r   r   r   r   _get_taco_params"TACOTRON2_GRIFFINLIM_CHAR_LJSPEECH__doc__#TACOTRON2_GRIFFINLIM_PHONE_LJSPEECH_get_wrnn_paramsTACOTRON2_WAVERNN_CHAR_LJSPEECH TACOTRON2_WAVERNN_PHONE_LJSPEECHr   r   r   r   <module>   sp     &
	
#
(
%

