o
    }oiO                     @   s   d dl mZmZmZ d dlZd dlmZ d dlmZ d dl	m
Z
 d dlmZmZmZmZ d dlmZ dd	 Zd
d Zdd ZG dd de
ZG dd deZG dd de
ZG dd de
ZdS )    )DictListOptionalN)ExternalFeatureLoader)collections)Dataset)AcousticEncodedRepresentation
LabelsTypeLengthsType
NeuralType)loggingc                 C   s   t t|  }t|dkr|\}}}}}nt|dkr#d}|\}}}}ntdg g }}| D ]}|d |d }	}
||	 ||
 q.t|}t|}t|}t|}|du rb||||fS tj|tjd}|||||fS )a  collate batch of feat sig, feat len, labels, labels len, assuming all features have the same shape.
    Args:
        batch (FloatTensor, LongTensor, LongTensor, LongTensor):  A tuple of tuples of feature, feature lengths,
               encoded labels, and encoded labels length. 
          N$Expects 4 or 5 tensors in the batch!r      dtype)	listziplen
ValueErrorappendtorchstacktensorint32)batchpacked_batch_feat_lengthslabels_lengths
sample_idsfeatureslabelsbfeat_ilabels_i r'   ^/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/asr/data/feature_to_label.py_feature_collate_fn   s(   





r)   c                 C   s  t t|  }t|dkr|\}}}}}nt|dkr#d}|\}}}}ntdd}|d du}	|	r7t| }t| }
g g }}| D ]P}|d |d |d |d f\}}}}|	rw| }||k rrd|| f}tjjj	|||d	}|
| | }||
k rd|
| f}tjjj	|||d	}|
| qD|	rt|}t|}nd
\}}t|}t|}|du r||||fS tj|tjd}|||||fS )a}  collate batch of audio feature, audio len, labels, labels len
    Args:
        batch (Optional[FloatTensor], Optional[LongTensor], LongTensor,
               LongTensor):  A tuple of tuples of feature, feature lengths,
               labels, and label lengths.  This collate func assumes the 
               features are torch tensors of Log-Melspectrogram (i.e. [N_MEL, T]).
    r   r   Nr   r      r      )valueNNr   )r   r   r   r   maxitemr   nn
functionalpadr   r   r   r   )r   feat_pad_vallabel_pad_idr   r   r   r    r!   max_feat_lenhas_featmax_labels_lenr"   r#   r$   r%   
feat_i_lenlabel_ilabel_i_lenr2   feature_lengthsr'   r'   r(   _audio_feature_collate_fn;   sH   
$



r<   c                 C   s  t || }t|  \}}}}t t|t|}t || }	|d du}
|d jd }g g g g f\}}}}tj|ddd}|tj|ddd }| D ]b\}}}}t||}t||}tj|||fdd}||7 }|
rtdtj|| |	dd}t	|D ]}||	 }|| }|dd||f }|
| q{|
| ||g|  ||g|  qH|
rt|}t|}nd\}}t|}t|}||||fS )	a  collate batch of audio features, features len, tokens, tokens len
    Args:
        batch (Optional[FloatTensor], Optional[LongTensor], LongTensor,
            LongTensor):  A tuple of tuples of signal, signal lengths,
            encoded tokens, and encoded tokens length.  This collate func
            assumes the signals are 1d torch tensors (i.e. mono audio).
            batch size equals to 1.
    r   Nr   trunc)rounding_moder*   )dimr-   )intr   minr.   shaper   divzeroscatranger   extendr   r   )r   window_length_in_secshift_length_in_secframe_unit_in_secslice_lengthaudio_featuresr   r   tokens_lengthsshift	has_audiof_dim
num_slicestokensappend_len_startappend_len_endr%   r8   tokens_istartendslicesslice_id	start_idxend_idx
feat_slicer'   r'   r(   _vad_feature_segment_collate_fnq   s@   	



r]   c                       sf   e Zd ZdZedeeeef  fddZ	dddede
e d	ef fd
dZdd Zdd Z  ZS )_FeatureSeqSpeakerLabelDataseta.  
    Dataset that loads tensors via a json file containing paths to feature files, sequences of labels. 
    Each new line is a different sample. Example below:
    and their target labels. JSON files should be of the following format:
        {"feature_filepath": "/path/to/feature_0.p", "seq_label": speakerA speakerB SpeakerA ....}         ...
        {"feature_filepath": "/path/to/feature_n.p", "seq_label": target_seq_label_n} 
    target_seq_label_n is the string of sequence of speaker label, separated by space.

    Args:
        manifest_filepath (str): Dataset parameter. Path to JSON containing data.
        labels (Optional[list]): Dataset parameter. List of unique labels collected from all samples.
        feature_loader : Dataset parameter. Feature loader to load (external) feature.       
    returnc                 C   s   t dt t tdt d}| jr2|t dt t tdt t dt t tdt d |S |t dt t tdt d |S )4Returns definitions of module output ports.
        BDTrb   )external_featfeat_lengthrb   rd   )embsembs_lengthlabellabel_length)rj   rk   )r   r   tupler
   is_speaker_embupdater	   selfoutput_typesr'   r'   r(   rq      s    


z+_FeatureSeqSpeakerLabelDataset.output_typesF)rm   manifest_filepathr#   rm   c                   s   t    tj|dd| _|| _|r|n| jj| _|| _	i i | _
| _t| jD ]\}}|| j
|< || j|< q*tt| jd d D ]}td|| j|  qDd S )N,)manifests_filesr   $ label id {} and its mapped label {})super__init__r   ASRFeatureSequenceLabelsplit
collectionfeature_loaderuniq_labelsr#   rm   label2idid2label	enumeraterF   r   r   debugformat)rp   rr   r#   r{   rm   label_idrj   idx	__class__r'   r(   rw      s   

z'_FeatureSeqSpeakerLabelDataset.__init__c                 C   
   t | jS Nr   rz   rp   r'   r'   r(   __len__      
z&_FeatureSeqSpeakerLabelDataset.__len__c                 C   sb   | j | }| j|j}|t|jd  }}t|j	 }tt
|j }||||fS Nr   )rz   r{   processfeature_filer   r   rB   long	seq_labelfloatr   rp   indexsampler"   fflttlr'   r'   r(   __getitem__   s   
z*_FeatureSeqSpeakerLabelDataset.__getitem__)__name__
__module____qualname____doc__propertyr   r   strr   rq   r   boolrw   r   r   __classcell__r'   r'   r   r(   r^      s    r^   c                   @   s   e Zd ZdZdd ZdS )FeatureToSeqSpeakerLabelDataseta#  
    Dataset that loads tensors via a json file containing paths to feature
    files and sequence of speakers. Each new line is a
    different sample. Example below:
    {"feature_filepath": "/path/to/feature_0.p", "seq_label": speakerA speakerB SpeakerA ....}     ...
    {"feature_filepath": "/path/to/feature_n.p", "seq_label": target_seq_label_n} 
    target_seq_label_n is the string of sequence of speaker label, separated by space.

    Args:
        manifest_filepath (str): Path to manifest json as described above. Canbe comma-separated paths.
        labels (Optional[list]): String containing all the possible labels to map to
            if None then automatically picks from ASRFeatureSequenceLabel collection.
        feature_loader, Feature load to loader (external) feature.
    
    c                 C   s   t |S r   )r)   rp   r   r'   r'   r(   _collate_fn   s   z+FeatureToSeqSpeakerLabelDataset._collate_fnN)r   r   r   r   r   r'   r'   r'   r(   r      s    r   c                       s   e Zd ZdZdZdZedeee	e
f  fddZddddd	d	dddd
	de	dee	 dddedededee dedee dee f fddZdd Zdd Zdd Zdd Z  ZS ) FeatureToLabelDataseta  
    Dataset that loads tensors via a json file containing paths to feature files and their labels. 
    Each new line is a different sample. Example below:
    and their target labels. JSON files should be of the following format:
        {"feature_filepath": "/path/to/audio_feature.pt", "label": "1"}
        ...
        {"feature_filepath": "/path/to/audio_feature.pt", "label": "0"} 
    Args:
        manifest_filepath (str): Path to JSON containing data.
        labels (Optional[list]): List of unique labels collected from all samples.
        augmentor (Optional): feature augmentation
        window_length_in_sec (float): Window length in seconds.
        shift_length_in_sec (float): Shift length in seconds.
        is_regression_task (bool): if True, the labels are treated as for a regression task.
        cal_labels_occurrence (bool): if True, the labels occurrence will be calculated.
        zero_spec_db_val (float): Value to replace non-speech signals in log-melspectrogram.
        min_duration (float): Minimum duration of the audio file in seconds.
        max_duration (float): Maximum duration of the audio file in seconds.
    (\0g{Gz?r_   c                 C   s:   t dt t tdt t dt t tdt d}|S )r`   ra   rb   
audio_featrf   r#   labels_lengthr   r   rl   r
   r	   ro   r'   r'   r(   rq        

z"FeatureToLabelDataset.output_typesNg)\(?F)	r#   	augmentorrH   rI   is_regression_taskcal_labels_occurrencezero_spec_db_valmin_durationmax_durationrr   r#   r   1nemo.collections.asr.parts.perturb.AudioAugmentorrH   rI   r   r   r   r   r   c       
            sZ  t    | _| _| _t|tr|d}tj	||||	|
d _
t|d _|r-|n j
j _| _|s|r;|n j
j _ jd urJt jnd _i i  _ _i g  _ _t jD ]\}}| j|< | j|< |ry j
j|  j|< q`|r fddt jD  _tt jd d D ]}td| j|  qd S g  _d _d S )	Nrs   )rt   r   r   r   r   r   r*   c                       g | ]} j | qS r'   )id2occurrence).0kr   r'   r(   
<listcomp>Y      z2FeatureToLabelDataset.__init__.<locals>.<listcomp>r   ru   )rv   rw   rH   rI   r   
isinstancer   ry   r   ASRFeatureLabelrz   r   r{   r|   r#   r   r   num_classesr}   r~   r   labels_occurrencer   sortedrF   r   r   r   )rp   rr   r#   r   rH   rI   r   r   r   r   r   r   rj   r   r   r   r(   rw   )  sD   





zFeatureToLabelDataset.__init__c                 C   r   r   r   r   r'   r'   r(   r   a  r   zFeatureToLabelDataset.__len__c                 C   s^   | j | }| j|j}|t|jd  }}t| j|j	 }td }||||fS )Nr*   )
rz   r{   r   r   r   r   rB   r   r}   rj   r   r'   r'   r(   r   d  s   
z!FeatureToLabelDataset.__getitem__c                 C      t || jdS r   r<   r   r   r'   r'   r(   r   o     z!FeatureToLabelDataset._collate_fnc                 C   s   t || j| j| jS r   )r]   rH   rI   FRAME_UNIT_TIME_SECSr   r'   r'   r(   _vad_segment_collate_fnr  s   z-FeatureToLabelDataset._vad_segment_collate_fn)r   r   r   r   ZERO_LEVEL_SPEC_DB_VALr   r   r   r   r   r   rq   r   r   r   rw   r   r   r   r   r   r'   r'   r   r(   r     sP    	
8r   c                       s   e Zd ZdZdZedeeee	f  fddZ
dddddddddd	ed
ee dddee dedee dedee dee f fddZdd ZdefddZdd Zdd Zdd Z  ZS ) FeatureToMultiLabelDatasetaK  
    Dataset that loads tensors via a json file containing paths to feature files and their labels. 
    Each new line is a different sample. Example below:
    and their target labels. JSON files should be of the following format:
        {"feature_filepath": "/path/to/audio_feature.pt", "label": "1 1 0 0 1"}
        ...
        {"feature_filepath": "/path/to/audio_feature.pt", "label": "0 1 0 0"} 
    Args:
        manifest_filepath (str): Path to JSON containing data.
        labels (Optional[list]): List of unique labels collected from all samples.
        augmentor (Optional): feature augmentation
        delimiter (str): delimiter to split the labels.
        is_regression_task (bool): if True, the labels are treated as for a regression task.
        cal_labels_occurrence (bool): if True, the labels occurrence will be calculated.
        zero_spec_db_val (float): Value to replace non-speech signals in log-melspectrogram.
        min_duration (float): Minimum duration of the audio file in seconds.
        max_duration (float): Maximum duration of the audio file in seconds.
    r   r_   c                 C   s:   t dt t tdt t dt t tdt d}|S )r`   ra   rb   rg   r   r   ro   r'   r'   r(   rq     r   z'FeatureToMultiLabelDataset.output_typesNF)r#   r   	delimiterr   r   r   r   r   rr   r#   r   r   r   r   r   r   r   r   c       	            sJ  t    || _|| _t|tr|d}tj||||||	d| _	|| _
t|d| _|r.|n| j	j| _i i | _| _|s|r@|n|  | _| jd urOt| jnd| _i i | _| _t| jD ]"\}
}|
| j|< || j|
< |r| j	j| | j|
< | j| j|
  q^tt| jd d D ]}td|| j|  qd S g | _d| _d S )Nrs   )rt   r   r   r   r   r   r   r*   r   ru   )rv   rw   r   r   r   r   ry   r   r   rz   r   r   r{   r|   r#   r}   r~   _get_label_setr   r   r   r   r   r   rF   r   r   r   )rp   rr   r#   r   r   r   r   r   r   r   r   rj   r   r   r'   r(   rw     sB   


	


z#FeatureToMultiLabelDataset.__init__c                 C   sJ   g }| j D ]}|j}|r| jr|| jn| }|| qtt|S r   )rz   rj   r   ry   rG   r   set)rp   r#   r   	label_strlabel_str_listr'   r'   r(   r     s   

z)FeatureToMultiLabelDataset._get_label_setr   c                    sd    j r	| j n| } jr dd |D }t| }|S  fdd|D }t| }|S )Nc                 S   s   g | ]}t |qS r'   )r   r   sr'   r'   r(   r     s    zCFeatureToMultiLabelDataset._label_str_to_tensor.<locals>.<listcomp>c                    r   r'   )r}   r   r   r'   r(   r     r   )r   ry   r   r   r   r   r   )rp   r   r#   r'   r   r(   _label_str_to_tensor  s   z/FeatureToMultiLabelDataset._label_str_to_tensorc                 C   r   r   r   r   r'   r'   r(   r     r   z"FeatureToMultiLabelDataset.__len__c                 C   s^   | j | }| j|j}|t|jd  }}| |j	}t|
d }||||fS )Nr*   r   )rz   r{   r   r   r   r   rB   r   r   rj   sizer   r'   r'   r(   r     s   
z&FeatureToMultiLabelDataset.__getitem__c                 C   r   r   r   r   r'   r'   r(   r     r   z&FeatureToMultiLabelDataset._collate_fn)r   r   r   r   r   r   r   r   r   r   rq   r   r   r   rw   r   r   r   r   r   r   r'   r'   r   r(   r   x  sJ    	
3	r   )typingr   r   r   r   7nemo.collections.asr.parts.preprocessing.feature_loaderr   +nemo.collections.common.parts.preprocessingr   nemo.core.classesr   nemo.core.neural_typesr   r	   r
   r   
nemo.utilsr   r)   r<   r]   r^   r   r   r   r'   r'   r'   r(   <module>   s   "62Kt