o
    }oi;                  	   @   s`  d dl Z d dlmZmZ d dlmZmZ d dlmZm	Z	 d dl
Z
d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZ d dlmZ d dlmZ d dlmZmZ dZzd dlZW n e e!fyo   dZY nw G dd dZ"G dd de"eeZ#G dd deeZ$G dd de$Z%G dd deeZ&G dd de"eeZ'G dd deeZ(dS )    N)ABCabstractmethod)	ExitStackcontextmanager)ListOptional)instantiate)
DictConfig)tqdm)OperationMode)ModelPT)PretrainedModelInfo	typecheck)AudioSignal)
NeuralType)loggingmodel_utilsTFc                   @   s   e Zd ZdZdd ZdS )NeedsNormalizerz?Base class for all TTS models that needs text normalization(TN)c                 C   s   d|v r<t std td d S i }d|jv r#| d|jj|d< t|jfi || _| jj| _	d|v r>|j
| _
d S d S d S )Ntext_normalizerzj`nemo_text_processing` not installed, see https://github.com/NVIDIA/NeMo-text-processing for more details.z The normalizer will be disabled.	whitelistztext_normalizer.whitelisttext_normalizer_call_kwargs)PYNINI_AVAILABLEr   errorr   register_artifactr   r   
normalizer	normalizetext_normalizer_callr   )selfcfgnormalizer_kwargs r    T/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/tts/models/base.py_setup_normalizer*   s$   


z!NeedsNormalizer._setup_normalizerN)__name__
__module____qualname____doc__r"   r    r    r    r!   r   '   s    r   c                       sR   e Zd ZdZededdfddZeddd	ZedddZ	 fddZ
  ZS )SpectrogramGeneratorz?Base class for all TTS models that turn text into a spectrogram	str_inputreturntorch.tensorc                 K      dS )a  
        A helper function that accepts raw python strings and turns them into a tensor. The tensor should have 2
        dimensions. The first is the batch, which should be of size 1. The second should represent time. The tensor
        should represent either tokenized or embedded text, depending on the model.

        Note that some models have `normalize` parameter in this function which will apply normalizer if it is available.
        Nr    r   r(   kwargsr    r    r!   parseB       zSpectrogramGenerator.parsetokensc                 K   r+   )z
        Accepts a batch of text or text_tokens and returns a batch of spectrograms

        Args:
            tokens: A torch tensor representing the text to be generated

        Returns:
            spectrograms
        Nr    r   r0   r-   r    r    r!   generate_spectrogramL   r/   z)SpectrogramGenerator.generate_spectrogramList[PretrainedModelInfo]c                 C   <   g }|   D ]}| }|durt|dkr|| q|S 
        This method returns a list of pre-trained model which can be instantiated directly from NVIDIA's NGC cloud.
        Returns:
            List of available pre-trained models.
        Nr   __subclasses__list_available_modelslenextendclslist_of_modelssubclasssubclass_modelsr    r    r!   r9   X      
z*SpectrogramGenerator.list_available_modelsc                    sr   dD ]}||v rt || | j|< || qd|v r)t|d | jd< |d d|v r1tdt | d S )N)enable_volumeenable_ragged_batchesnum_speakers	emb_rangez$embedding range is not user-settable)boolexport_configpopint	Exceptionsuperset_export_config)r   argsk	__class__r    r!   rL   f   s   

z&SpectrogramGenerator.set_export_config)r0   r*   r)   r*   r)   r3   )r#   r$   r%   r&   r   strr.   r2   classmethodr9   rL   __classcell__r    r    rO   r!   r'   ?   s    	r'   c                   @   ,   e Zd ZdZedddZeddd	Zd
S )Vocoderz
    A base class for models that convert spectrograms to audios. Note that this class takes as input either linear
    or mel spectrograms.
    specr*   r)   c                 K   r+   )z
        Accepts a batch of spectrograms and returns a batch of audio.

        Args:
            spec:  ['B', 'n_freqs', 'T'], A torch tensor representing the spectrograms to be vocoded.

        Returns:
            audio
        Nr    )r   rW   r-   r    r    r!   convert_spectrogram_to_audioy   r/   z$Vocoder.convert_spectrogram_to_audior3   c                 C   r4   r5   r7   r<   r    r    r!   r9      rA   zVocoder.list_available_modelsN)rW   r*   r)   r*   rQ   )r#   r$   r%   r&   r   rX   rS   r9   r    r    r    r!   rV   s   s    rV   c                       s   e Zd ZdZ fddZedd Zedd Zedd	 Z	d
d Z
dd Zeede eddddede idddddefddZ  ZS )GlowVocoderzBase class for all Vocoders that use a Glow or reversible Flow-based setup. All child class are expected
    to have a parameter called audio_to_melspec_precessor that is an instance of
    nemo.collections.asr.parts.FilterbankFeaturesc                    s6   t  j|i | tj| _d | _d | _d | _d | _d S N)	rK   __init__r   infer_modestftistftn_mel
bias_spect)r   rM   r-   rO   r    r!   r[      s   
zGlowVocoder.__init__c                 C   s   | j S rZ   )r]   )r   r    r    r!   mode   s   zGlowVocoder.modec                 c   s*    | j }|| _ z	d V  W || _ d S || _ w rZ   )rb   )r   rb   old_moder    r    r!   	temp_mode   s   zGlowVocoder.temp_modec                 c   sT    t  }|| tj |t  d V  W d    d S 1 s#w   Y  d S rZ   )r   enter_contextrd   r   r\   torchno_grad)r   stackr    r    r!   
nemo_infer   s   "zGlowVocoder.nemo_inferc              
      s   | j d u rIz| jj| jj | jj| jj| jW n ty/ } zt|  d|d }~ww dd  fdd| _  fdd| _	| j
d u rkz| jj| _
W d S  tyj } zt|  d|d }~ww d S )Nz could not find a valid audio_to_melspec_precessor. GlowVocoder requires child class to have audio_to_melspec_precessor defined to obtain stft parameters. audio_to_melspec_precessor requires n_fft, hop_length, win_length, window, and nfilt to be defined.c                 S   sJ   t j| ||||dd}t |}t |ddt |d |d fS )NT)n_fft
hop_length
win_lengthwindowreturn_complex   ).rp   .r   )rf   r^   view_as_realsqrtpowsumatan2)audiorj   rk   rl   rm   rW   r    r    r!   yet_another_patch   s   
*z@GlowVocoder.check_children_attributes.<locals>.yet_another_patchc                    s   |  dS N)rj   rk   rl   rm   r    )xrk   rj   rl   rm   rx   r    r!   <lambda>   s    z7GlowVocoder.check_children_attributes.<locals>.<lambda>c                    s0   t jt | t | | t |  dS ry   )rf   r_   complexcossin)rz   y)rk   rj   rl   rm   r    r!   r|      s    z could not find a valid audio_to_melspec_precessor. GlowVocoder requires child class to have audio_to_melspec_precessor defined to obtain stft parameters. audio_to_melspec_precessor requires nfilt to be defined.)r^   audio_to_melspec_precessorrj   rk   rl   rm   todeviceAttributeErrorr_   r`   nfilt)r   er    r{   r!   check_children_attributes   s:   

z%GlowVocoder.check_children_attributesc                 C   s|   |    |  , td| jdf| j}| j|ddd}| |\}}|d d | _	W d    d S 1 s7w   Y  d S )N   X           F)rW   sigmadenoiserq   ).N)
r   ri   rf   zerosr`   r   r   rX   r^   ra   )r   spect
bias_audiora   _r    r    r!   update_bias_spect   s   
"zGlowVocoder.update_bias_spect)BTT)optional)rw   strengthrw   )input_typesoutput_types{Gz?r*   r   c                 C   sZ   |    | jd u r|   | |\}}|| j|j|  }t|d}| ||}|S )Nr   )	r   ra   r   r^   r   r   rf   clampr_   )r   rw   r   audio_spectaudio_anglesaudio_spect_denoisedaudio_denoisedr    r    r!   r      s   
zGlowVocoder.denoise)r   )r#   r$   r%   r&   r[   propertyrb   r   rd   ri   r   r   r   r   r   floatr   rT   r    r    rO   r!   rY      s     


4	rY   c                   @   rU   )	MelToSpeczb
    A base class for models that convert mel spectrograms to linear (magnitude) spectrograms
    melr*   r)   c                 K   r+   )a3  
        Accepts a batch of spectrograms and returns a batch of linear spectrograms

        Args:
            mel: A torch tensor representing the mel spectrograms ['B', 'mel_freqs', 'T']

        Returns:
            spec: A torch tensor representing the linear spectrograms ['B', 'n_freqs', 'T']
        Nr    )r   r   r-   r    r    r!   !convert_mel_spectrogram_to_linear  r/   z+MelToSpec.convert_mel_spectrogram_to_linearr3   c                 C   r4   r5   r7   r<   r    r    r!   r9     rA   zMelToSpec.list_available_modelsN)r   r*   r)   r*   rQ   )r#   r$   r%   r&   r   r   rS   r9   r    r    r    r!   r     s    r   c                   @   sB   e Zd ZdZededdfddZedd	d
ZedddZ	dS )TextToWaveformzKBase class for all end-to-end TTS models that generate a waveform from textr(   r)   r*   c                 K   r+   )aC  
        A helper function that accepts a raw python string and turns it into a tensor. The tensor should have 2
         dimensions. The first is the batch, which should be of size 1. The second should represent time. The tensor
         should represent either tokenized or embedded text, depending on the model.
        Nr    r,   r    r    r!   r.   %  r/   zTextToWaveform.parser0   List[torch.tensor]c                K   r+   )a3  
        Accepts a batch of text and returns a list containing a batch of audio
        Args:
            tokens: A torch tensor representing the text to be converted to speech
        Returns:
            audio: A list of length batch_size containing torch tensors representing the waveform output
        Nr    r1   r    r    r!   convert_text_to_waveform-  r/   z'TextToWaveform.convert_text_to_waveformr3   c                 C   r4   r5   r7   r<   r    r    r!   r9   7  rA   z$TextToWaveform.list_available_modelsN)r0   r*   r)   r   rQ   )
r#   r$   r%   r&   r   rR   r.   r   rS   r9   r    r    r    r!   r   "  s    	r   c                   @   sZ   e Zd Ze 				ddedededed	ed
ee dee fddZ	e
dddZdS )G2PModeltext_graphemes    r   	pred_textmanifest_filepathoutput_manifest_filepathgrapheme_field
batch_sizenum_workers
pred_fieldr)   c              	   C   s   ||dd||d}|  t|}t|dA}	t|ddd*}
tt|	D ]\}}t|}|| ||< |
tj|ddd  q$W d	   n1 sJw   Y  W d	   n1 sYw   Y  t	
d
| d |S )a  
        Main function for Inference. Converts grapheme entries from the manifest "graheme_field" to phonemes
        Args:
            manifest_filepath: Path to .json manifest file
            output_manifest_filepath: Path to .json manifest file to save predictions, will be saved in "target_field"
            grapheme_field: name of the field in manifest_filepath for input grapheme text
            pred_field:  name of the field in the output_file to save predictions
            batch_size: int = 32 # Batch size to use for inference
            num_workers: int = 0 # Number of workers to use for DataLoader during inference

        Returns: Predictions generated by the model
        F)r   r   	drop_lastshuffler   r   rwzutf-8)encoding)ensure_ascii
NzPredictions saved to .)_inferr	   openr
   	enumeratejsonloadswritedumpsr   info)r   r   r   r   r   r   r   config	all_predsf_inf_outiliner    r    r!   convert_graphemes_to_phonemesG  s(   	
z&G2PModel.convert_graphemes_to_phonemesr3   c                 C   s   t | }|S )r6   )r   &resolve_subclass_pretrained_model_info)r=   r>   r    r    r!   r9   q  s   
zG2PModel.list_available_modelsN)r   r   r   r   rQ   )r#   r$   r%   rf   rg   rR   rI   r   r   r   rS   r9   r    r    r    r!   r   F  s.    )r   ))r   abcr   r   
contextlibr   r   typingr   r   rf   hydra.utilsr   	omegaconfr	   r
   (nemo.collections.tts.parts.utils.helpersr   nemo.core.classesr   nemo.core.classes.commonr   r   nemo.core.neural_types.elementsr   "nemo.core.neural_types.neural_typer   
nemo.utilsr   r   r   nemo_text_processingImportErrorModuleNotFoundErrorr   r'   rV   rY   r   r   r   r    r    r    r!   <module>   s6   4!n $