o
    }oiV                     @   s   d dl mZmZmZmZmZmZ d dlZd dlm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZ d dlmZmZmZmZ G dd dZG dd deZG dd deZ G dd deZ!dS )    )CallableDictListOptionalTupleUnionN)_audio_feature_collate_fn)ExternalFeatureLoader)normalize_batch)ChannelSelectorType)load_speech_segments_from_rttm)
tokenizers)collectionsparsers)Dataset)AcousticEncodedRepresentation
LabelsTypeLengthsType
NeuralTypec                   @   s   e Zd Z							ddedeeef dee dee ded	ee d
ee dede	fddZ
dedeee ef fddZdedeee ef fddZdejjdeee ef fddZdS )ASRFeatureManifestProcessorNr   Fmanifest_filepathparsermax_durationmin_durationmax_uttsbos_ideos_idpad_idindex_by_file_idc
           
      C   s4   || _ tj||||||	d| _|| _|| _|| _d S )N)manifests_filesr   r   r   
max_numberr   )r   r   ASRFeatureText
collectionr   r   r   )
selfr   r   r   r   r   r   r   r   r    r$   ]/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/asr/data/feature_to_text.py__init__   s   	
z$ASRFeatureManifestProcessor.__init__indexreturnc                 C   s   | j | }| |S N)r"   process_text_by_sample)r#   r'   sampler$   r$   r%   process_text_by_id9   s   

z.ASRFeatureManifestProcessor.process_text_by_idfile_idc                 C   s$   | j j| d }| j | }| |S )Nr   )r"   mappingr*   )r#   r-   manifest_idxr+   r$   r$   r%   process_text_by_file_id=   s   

z3ASRFeatureManifestProcessor.process_text_by_file_idr+   c                 C   sV   |j t|j }}| jd ur| jg| }|d7 }| jd ur'|| jg }|d7 }||fS )N   )text_tokenslenr   r   )r#   r+   ttlr$   r$   r%   r*   B   s   

z2ASRFeatureManifestProcessor.process_text_by_sample)NNr   NNr   F)__name__
__module____qualname__strr   r   r   floatintboolr&   r   r   r,   r0   r   ASRAudioTextOUTPUT_TYPEr*   r$   r$   r$   r%   r      s<    
	

&r   c                *   @   s"  e Zd ZdZdZddgZddgZedee	e
ef  fdd	Z		
																d4de
dee
ef dee
 dee
ef dede
dedee dedee dddee dee ded ed!ee d"ee d#ed$ed%ee f(d&d'Zd(d) Zd*d+ Zd,d- Zd.d/ Zd0d1 Zd2d3 ZdS )5_FeatureTextDataseta4
  
    Dataset that loads tensors via a json file containing paths to audio feature files, transcripts,
    durations (in seconds) and optional RTTM files. Each new line is a different sample. Example below:
    {"feature_filepath": "/path/to/audio_feature.pt", "text_filepath": "/path/to/audio.txt",
    "rttm_filepath": "/path/to/audio_rttm.rttm", "duration": 23.147}
    ...
    {"feature_filepath": "/path/to/audio_feature.pt", "text": "the transcription", "offset": 301.75, "duration": 0.82, "utt":
    "utterance_id", "ctm_utt": "en_4156", "side": "A"}
    Args:
        manifest_filepath (str): Path to manifest json as described above. Can be comma-separated paths.
        parser: Str for a language specific preprocessor or a callable.
        normalize (bool): whether and where to normalize feature, must be one of [None, "post_norm", "pre_norm"]
        normalize_type (Union[str, dict]): how to normalize feature, see `nemo.collections.asr.parts.preprocessing.features.normalize_batch`
        use_rttm (bool): whether to use RTTM files if there is any, default to False
        rttm_mode (str): how to use RTTM files, must be one of ['mask', 'drop'], default to 'mask'
        feat_min_len (int): minimum length of feature when rttm_mode=deop, default to 4.
        feat_mask_val (Optional[float]): value used to mask features with RTTM files, default to None to use zero mel-spectralgram
        frame_unit_time_secs (float): time in seconds for each frame
        sample_rate (int): Sample rate to resample loaded audio to
        int_values (bool): If true, load samples as 32-bit integers. Defauts to False.
        augmentor (nemo.collections.asr.parts.perturb.AudioAugmentor): An AudioAugmentor object used to augment loaded audio
        max_duration (float): If audio exceeds this length, do not include in dataset
        min_duration (float): If audio is less than this length, do not include in dataset
        max_utts (int): Limit number of utterances
        trim (bool): whether or not to trim silence. Defaults to False
        bos_id (int): Id of beginning of sequence symbol to append if not None
        eos_id (int): Id of end of sequence symbol to append if not None
        pad_id (int): Id of pad symbol. Defaults to 0
        return_sample_id (bool): whether to return the sample_id as a part of each sample
        channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels from multi-channel audio. If set to `'average'`, it performs averaging across channels. Disabled if set to `None`. Defaults to `None`. Uses zero-based indexing.
    g(\0pre_norm	post_normmaskdropr(   c              	   C   sH   t dt t tdt t dt t tdt t tdt dddS )z+Returns definitions of module output ports.)BDTrD   )rD   rF   T)optional)featuresfeature_lengthtranscriptstranscript_length	sample_id)r   r   tupler   r   r#   r$   r$   r%   output_typest   s   

z _FeatureTextDataset.output_typesper_featureF   N{Gz?>  r   r   r   	normalizenormalize_typeuse_rttm	rttm_modefeat_min_lenfeat_mask_valframe_unit_time_secssample_rate	augmentor3nemo.collections.asr.parts.perturb.FeatureAugmentorr   r   r   trimr   r   r   return_sample_idchannel_selectorc              
   C   s   t |tkr|d}|
| _|| _|| _|| _|| _| jr/| j| jvr/t	d| j d| d|| _
|d ur:|| _n|dkrBd| _n| j| _|d ur[|| jvr[t	d| j d| d|	| _t||||||||d| _t|d	| _|| _|| _|| _d S )
N,z`rttm_mode` must be one of z, got `z	` insteadr@   g        z`normalize` must be one of )r   r   r   r   r   r   r   r   )r\   )typer9   splitr[   rT   rU   rV   rW   
RTTM_MODES
ValueErrorrX   rY   ZERO_LEVEL_SPEC_DB_VAL
NORM_MODESrZ   r   manifest_processorr	   
featurizerr^   r_   r`   )r#   r   r   rT   rU   rV   rW   rX   rY   rZ   r[   r\   r   r   r   r^   r   r   r   r_   r`   r$   r$   r%   r&      s@   


z_FeatureTextDataset.__init__c                 C   s   | j j| S r)   )rh   r"   )r#   rL   r$   r$   r%   get_manifest_sample      z'_FeatureTextDataset.get_manifest_samplec           
      C   s4  | j j| }|j}|d u rd}| j|j}|t|jd 	 }}| j j
|d\}}| jd u rB| jrA|jrA| |||j| j}n0| jdkr]| jrW|jrW| |||j| j}| |}n| |}| jrr|jrr| |||j| j}| jr||t|	 t|	 |f}	|	S ||t|	 t|	 f}	|	S )Nr   r1   )r+   rA   )rh   r"   offsetri   processfeature_filetorchtensorshapelongr*   rT   rV   	rttm_fileprocess_features_with_rttmrY   normalize_featurer_   )
r#   r'   r+   rl   rH   fflr4   r5   outputr$   r$   r%   __getitem__   s.   


" z_FeatureTextDataset.__getitem__c                 C   sP  t |}| }d\}}t|dD ]j}	||	| j  }
|t|d k r@|| d |
k r@|d7 }|t|d k r@|| d |
k s,|| d dksX|
|| d k sX|
|| d krk| jdkr^q||d d |	f< |d7 }q|d d |	f |d d |f< |d7 }q|| jk r| jdkr||d d d | jf< |d d d | jf S |d d d |f S )N)r   r   r1   r   rC   )r   clonerangesizerZ   r3   rW   rX   )r#   rH   rl   rs   mask_valsegmentsnew_featuressidfidir4   r$   r$   r%   rt      s&     0


z._FeatureTextDataset.process_features_with_rttmc                 C   s   t | jjS r)   )r3   rh   r"   rN   r$   r$   r%   __len__   rk   z_FeatureTextDataset.__len__c                 C   s   t || j| jjdS )N)feat_pad_vallabel_pad_id)r   rY   rh   r   )r#   batchr$   r$   r%   _collate_fn  s   z_FeatureTextDataset._collate_fnc                 C   s6   | d}t|t|dg| j\}}}|dS )zH
        Args:
            feat: feature tensor of shape [M, T]
        r   )	unsqueezer
   ro   rp   r|   rU   squeeze)r#   feat_r$   r$   r%   ru     s   
"
z%_FeatureTextDataset.normalize_feature)rA   rP   FrB   rQ   NrR   rS   NNNr   FNNr   FN)r6   r7   r8   __doc__rf   rg   rd   propertyr   r   r9   r   rO   r   r   dictr<   r;   r:   r   r&   rj   ry   rt   r   r   ru   r$   r$   r$   r%   r?   O   s     

	

>$r?   c                0       s   e Zd ZdZ													
						
			d'dedeeee f dee deeef de	dede
dee dedee
 dddee
 dee
 de
de
de
de	dee
 d ee
 d!e
d"eeef d#e	d$ee f. fd%d&Z  ZS )(FeatureToCharDataseta
  
    Dataset that loads tensors via a json file containing paths to audio feature
    files, transcripts, durations (in seconds) and optional RTTM files. Each new line is a
    different sample. Example below:
    {"feature_filepath": "/path/to/audio_feature.pt", "text_filepath":
    "/path/to/audio.txt", "duration": 23.147, "rttm_filepath": "/path/to/audio_rttm.rttm",}
    ...
    {"feature_filepath": "/path/to/audio_feature.pt", "text": "the
    transcription", "offset": 301.75, "duration": 0.82, "utt":
    "utterance_id", "ctm_utt": "en_4156", "side": "A"}

    Args:
        manifest_filepath (str): Path to manifest json as described above. Can
            be comma-separated paths.
        labels (str): String containing all the possible characters to map to
        normalize (str): how to normalize feature, must be one of [None, "post_norm", "pre_norm"]
        normalize_type (Union[str, dict]): how to normalize feature, see `nemo.collections.asr.parts.preprocessing.features.normalize_batch`
        use_rttm (bool): whether to use RTTM files if there is any, default to False
        rttm_mode (str): how to use RTTM files, must be one of ['mask', 'drop'], default to 'mask'
        feat_min_len (int): minimum length of feature, default to 4
        feat_mask_val (Optional[float]): value used to mask features with RTTM files, default to None to use zero mel-spectralgram
        frame_unit_time_secs: time in seconds for each frame
        sample_rate (int): Sample rate to resample loaded audio to
        int_values (bool): If true, load samples as 32-bit integers. Defauts to False.
        augmentor (nemo.collections.asr.parts.perturb.AudioAugmentor): An AudioAugmentor
            object used to augment loaded audio
        max_duration: If audio exceeds this length, do not include in dataset
        min_duration: If audio is less than this length, do not include
            in dataset
        max_utts: Limit number of utterances
        blank_index: blank character index, default = -1
        unk_index: unk_character index, default = -1
        bos_id: Id of beginning of sequence symbol to append if not None
        eos_id: Id of end of sequence symbol to append if not None
        return_sample_id (bool): whether to return the sample_id as a part of each sample
        channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels from multi-channel audio. If set to `'average'`, it performs averaging across channels. Disabled if set to `None`. Defaults to `None`. Uses zero-based indexing.
    rA   rP   FrB   rQ   NrR   rS   r   r   enr   labelsrT   rU   rV   rW   rX   rY   rZ   r[   r\   r]   r   r   r   blank_index	unk_indexr^   r   r   r   r   r_   r`   c                    s   || _ tj|||||d}t jdi d|d|d|d|d|d|d|d	|d
|	d|
d|d|d|d|d|d|d|d|d|d| d S )N)r   nameunk_idblank_iddo_normalizer   r   rT   rU   rV   rW   rX   rY   rZ   r[   r\   r   r   r   r^   r   r   r   r_   r`   r$   )r   r   make_parsersuperr&   )r#   r   r   rT   rU   rV   rW   rX   rY   rZ   r[   r\   r   r   r   r   r   r^   r   r   r   r   r_   r`   	__class__r$   r%   r&   8  sZ   
	
zFeatureToCharDataset.__init__)rA   rP   FrB   rQ   NrR   rS   NNNr   r   r   FNNr   r   FN)r6   r7   r8   r   r9   r   r   r   r   r<   r;   r:   r   r   r&   __classcell__r$   r$   r   r%   r     s    *
	

r   c                &       s   e Zd ZdZ													
				d"dedddee deeef dedede	dee
 de
dee	 dddee	 dee	 de	dedededee f$ fd d!Z  ZS )#FeatureToBPEDataseta  
    Dataset that loads tensors via a json file containing paths to audio feature
    files, transcripts, durations (in seconds) and optional RTTM files. Each new line is a different sample.
    Example below:
    {"audio_filepath": "/path/to/audio.wav", "text_filepath":
    "/path/to/audio.txt", "duration": 23.147, "rttm_filepath": "/path/to/audio_rttm.rttm",}
    ...
    {"audio_filepath": "/path/to/audio.wav", "text": "the
    transcription", "offset": 301.75, "duration": 0.82, "utt":
    "utterance_id", "ctm_utt": "en_4156", "side": "A"}

    In practice, the dataset and manifest used for character encoding and byte pair encoding
    are exactly the same. The only difference lies in how the dataset tokenizes the text in
    the manifest.

    Args:
        manifest_filepath (str): Path to manifest json as described above. Can
            be comma-separated paths.
        tokenizer: A subclass of the Tokenizer wrapper found in the common collection,
            nemo.collections.common.tokenizers.TokenizerSpec. ASR Models support a subset of
            all available tokenizers.
        normalize (str): how to normalize feature, must be one of [None, "post_norm", "pre_norm"]
        normalize_type (Union[str, dict]): how to normalize feature, see `nemo.collections.asr.parts.preprocessing.features.normalize_batch`
        use_rttm (bool): whether to use RTTM files if there is any, default to False
        rttm_mode (str): how to use RTTM files, must be one of ['mask', 'drop'], default to 'mask'
        feat_min_len (int): minimum length of feature, default to 4
        feat_mask_val (Optional[float]): value used to mask features with RTTM files, default to None to use zero mel-spectralgram
        frame_unit_time_secs: time in seconds for each frame
        sample_rate (int): Sample rate to resample loaded audio to
        int_values (bool): If true, load samples as 32-bit integers. Defauts to False.
        augmentor (nemo.collections.asr.parts.perturb.AudioAugmentor): An AudioAugmentor
            object used to augment loaded audio
        max_duration: If audio exceeds this length, do not include in dataset
        min_duration: If audio is less than this length, do not include
            in dataset
        max_utts: Limit number of utterances
        trim: Whether to trim silence segments
        use_start_end_token: Boolean which dictates whether to add [BOS] and [EOS]
            tokens to beginning and ending of speech respectively.
        return_sample_id (bool): whether to return the sample_id as a part of each sample
        channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels from multi-channel audio. If set to `'average'`, it performs averaging across channels. Disabled if set to `None`. Defaults to `None`. Uses zero-based indexing.
    rA   rP   FrB   rQ   NrR   rS   r   Tr   	tokenizerz0nemo.collections.common.tokenizers.TokenizerSpecrT   rU   rV   rW   rX   rY   rZ   r[   r\   r]   r   r   r   use_start_end_tokenr^   r_   r`   c                    s  |rt |dr|jdkr|j}nd }|r"t |dr"|jdkr"|j}nd }t |dr2|jdkr2|j}nd}G dd d}t jdi d|d||d	|d
|d|d|d|d|d|	d|
d|d|d|d|d|d|d|d|d|d| d S )Nr   r   r   r   c                   @   s   e Zd Zdd Zdd ZdS )z6FeatureToBPEDataset.__init__.<locals>.TokenizerWrapperc                 S   s&   t |tjjrd| _nd| _|| _d S )NTF)
isinstancer   aggregate_tokenizerAggregateTokenizeris_aggregate
_tokenizer)r#   r   r$   r$   r%   r&     s   
z?FeatureToBPEDataset.__init__.<locals>.TokenizerWrapper.__init__c                 W   sV   t |d tr#| jr#g }|d D ]}|| j|d |d  q|S | jj| }|S )Nr   r9   lang)r   r   r   extendr   text_to_ids)r#   argsr4   spanr$   r$   r%   __call__  s   z?FeatureToBPEDataset.__init__.<locals>.TokenizerWrapper.__call__N)r6   r7   r8   r&   r   r$   r$   r$   r%   TokenizerWrapper  s    r   r   r   rT   rU   rV   rW   rX   rY   rZ   r[   r\   r   r   r   r^   r_   r`   r$   )hasattrr   r   r   r   r&   )r#   r   r   rT   rU   rV   rW   rX   rY   rZ   r[   r\   r   r   r   r   r^   r_   r`   r   r   r   r   r   r$   r%   r&     sf   	
zFeatureToBPEDataset.__init__)rA   rP   FrB   rQ   NrR   rS   NNNr   TFFN)r6   r7   r8   r   r9   r   r   r   r<   r;   r:   r   r&   r   r$   r$   r   r%   r   p  sn    /
	
r   )"typingr   r   r   r   r   r   ro   *nemo.collections.asr.data.feature_to_labelr   7nemo.collections.asr.parts.preprocessing.feature_loaderr	   1nemo.collections.asr.parts.preprocessing.featuresr
   0nemo.collections.asr.parts.preprocessing.segmentr   *nemo.collections.asr.parts.utils.vad_utilsr   nemo.collections.commonr   +nemo.collections.common.parts.preprocessingr   r   nemo.core.classesr   nemo.core.neural_typesr   r   r   r   r   r?   r   r   r$   r$   r$   r%   <module>   s     1 C_