o
    2wiZ                     @   s  d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZ d dl	m
Z
mZmZmZmZ d dlmZmZmZmZ d dlmZmZ d dlmZmZ d d	lmZmZmZmZmZm Z m!Z! d
e"de"fddZ#dedee" fddZ$					d3dede%dee dee& de'de%de&deeee ee f fddZ(		d4dededed ee& d!ee' f
d"d#Z)			d5d$ed%edee deedf fd&d'Z*	d6ded(e'd)e'dee&ee& f fd*d+Z+d,ee&e
f defd-d.Z,	d7d/ede%d0eee  dee%e&f fd1d2Z-dS )8    N)defaultdict)ProcessPoolExecutor)Path)AnyDictListOptionalTuple)AudioSource	RecordingRecordingSetinfo)Features
FeatureSet)SupervisionSegmentSupervisionSet)PathlikeSecondsadd_durationscompute_num_samplesfastcopyis_module_availableto_listdurationreturnc                 C   s   t d|  d S )a  
    Floor the duration to multiplies of 0.001 seconds.
    This is to avoid float precision problems with workflows like:
      lhotse kaldi import ...
      lhotse fix ...
      ./local/compute_fbank_imported.py (from icefall)
      lhotse cut trim-to-supervisions ...
      ./local/validate_manifest.py ... (from icefall)

    Without flooring, there were different lengths:
      Supervision end time 1093.33995833 is larger than cut end time 1093.3399375

    This is still within the 2ms tolerance in K2SpeechRecognitionDataset::validate_for_asr():
      https://github.com/lhotse-speech/lhotse/blob/master/lhotse/dataset/speech_recognition.py#L201
    i  )mathfloor)r    r   I/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/lhotse/kaldi.pyfloor_duration_to_milliseconds   s   r   pathc                 C   s   t | } |  dr<tdstdddl}z|| }|jjd dks0J d|jjd  t	|j
W S    Y dS t| }t	|j
S )z
    Read a audio file, it supports pipeline style wave path and real waveform.

    :param path: Path to an audio file or a Kaldi-style pipe.
    :return: float duration of the recording, in seconds or `None` in case of read error.
    |kaldi_native_iozeTo read Kaldi's data dir where wav.scp has 'pipe' inputs, please 'pip install kaldi_native_io' first.r   N   zExpect 1 channel. Given )strstripendswithr   
ValueErrorr"   	read_wavedatashaper   r   r   )r    r"   wave
audio_infor   r   r   get_duration,   s"   	

r-   Tr#   kaldi-fbanksampling_rateframe_shiftmap_string_to_underscoresuse_reco2durnum_jobsfeature_typec           %         s   t | } |  s
J dtdtffddt| d dd}| d }|r:| r:t|dd	 t t|ks9J d
n5tdt||d  }	t|d}
t|
j	t
| |	d}W d   n1 saw   Y  tt| |   D ]\}}|du rtd| d||  d ||= qst|t d k rtd|  dt fdd| D }d}| d }| d }| d }t|||d}| rKg }| }dd |D }W d   n1 sw   Y  t| d t| d }t| d t| d  |D ]E\}}}}|r|| \}}nt|d!krt|n | t| d"}|t||t||d#| | || ||  d$	 qt|}n:| rt| d }t|t|ksaJ t| d t| d t| d  t fd%d| D }d}| rtd&r|durd#dl}d#d'l m!} g } t|Q}|D ]F}!|!" j#dd(\}"}#|j$%|#}$|r||" \}}nd#}|$j&| }| t'||$j&|$j(||||j)|#|"|dur||" j*n|"d#d) qW d   n	1 sw   Y  t+,| }nt-.d* |||fS )+a>  
    Load a Kaldi data directory and convert it to a Lhotse RecordingSet and
    SupervisionSet manifests. For this to work, at least the wav.scp file must exist.
    SupervisionSet is created only when a segments file exists. reco2dur is used by
    default when exists (to enforce reading the duration from the audio files
    themselves, please set use_reco2dur = False.
    All the other files (text, utt2spk, etc.) are optional, and some of them might
    not be handled yet. In particular, feats.scp files are ignored.

    :param path: Path to the Kaldi data directory.
    :param sampling_rate: Sampling rate of the recordings.
    :param frame_shift: Optional, if specified, we will create a Features manifest
        and store the frame_shift value in it.
    :param map_string_to_underscores: optional string, when specified, we will replace
        all instances of this string in SupervisonSegment IDs to underscores.
        This is to help with handling underscores in Kaldi
        (see :func:`.export_to_kaldi`). This is also done for speaker IDs.
    :param use_reco2dur: If True, we will use the reco2dur file to read the durations
        of the recordings. If False, we will read the durations from the audio files
        themselves.
    :param num_jobs: Number of parallel jobs to use when reading the audio files.
    tr   c                    s    d u r| S |   dS )N_)replace)r5   )r1   r   r   fix_idn   s   z#load_kaldi_data_dir.<locals>.fix_idwav.scpT)
must_existreco2dur)
float_valszMThe duration file reco2dur does not have the same length as the  wav.scp filer#   
   )max_workers)	chunksizeN[z5] Could not get duration. Failed to read audio from `z(`. Dropping the recording from manifest.g?z9Failed to load more than 20% utterances of the dataset: ""c              	   3   sf    | ].\}}t |t|d rdnddg|d r|dd n|dgt |  | dV  qdS )r!   commandfiler   N)typechannelssource)idsourcesr/   num_samplesr   )r   r
   r&   r   ).0recording_idpath_or_cmd)	durationsr/   r   r   	<genexpr>   s$    	
z&load_kaldi_data_dir.<locals>.<genexpr>segmentsutt2spkz	feats.scp)segments_path
feats_pathr0   c                 S   s   g | ]}|   qS r   r%   split)rK   
sup_stringr   r   r   
<listcomp>   s    z'load_kaldi_data_dir.<locals>.<listcomp>text
spk2genderutt2langz-1)r/   r   	rH   rL   startr   channelrX   languagespeakergenderc                 3   sF    | ]\}}t ||d  | d| | || d	V  qdS )g        r   r[   N)r   )rK   rec_idspkr)rN   r8   genders	languagestextsr   r   rO      s    
r"   )KaldiReadermaxsplit)rE   
num_framesnum_featuresr0   r/   r\   r   storage_typestorage_pathstorage_keyrL   rF   zcFailed to import Kaldi 'feats.scp' to Lhotse: frame_shift must be not None. Feature import omitted.)/r   is_dirr$   load_kaldi_text_mappingis_filelenmaxr   listmapr-   valuesdictzipkeysitemsloggingwarningRuntimeErrorr   from_recordingsload_start_and_durationopenr   floatappendr   r   from_segmentsexistsr   r"   lhotse.features.iorf   r%   rU   MatrixShapereadnum_rowsr   num_colsnamerL   r   from_featureswarningswarn)%r    r/   r0   r1   r2   r3   r4   
recordingsr;   r?   exdur_valsrL   	dur_valuerecording_setsupervision_setrP   	utt2spk_f	feats_scputt_id_to_start_and_durationsupervisionsfsupervision_segmentsspeakers
segment_idr\   endr6   r   feature_setr"   rf   featureslineutt_idark	mat_shaper   )rN   r8   rc   rd   r1   r/   re   r   load_kaldi_data_dirL   s   











r   Fr   r   
output_dirmap_underscores_toprefix_spk_idc                    sP  t |}|jddd  dur| fdd}|r!|dd }tdd | D rtd	d
 | D |d d tdd
 |D |d d tdd
 | D |d d tdd
 |D |d d tdd
 |D |d d tdd
 |D |d d tdd |D rtdd
 |D |d d tdd |D rtdd
 |D |d d dS dS tdd
 | D |d d tdd
 | D |d d tdd
 |D |d d td d
 |D |d d td!d
 |D |d d td"d
 |D |d d td#d |D rtd$d
 |D |d d td%d |D r&td&d
 |D |d d dS dS )'a  
    Export a pair of ``RecordingSet`` and ``SupervisionSet`` to a Kaldi data
    directory. It even supports recordings that have multiple channels but
    the recordings will still have to have a single ``AudioSource``.

    The ``RecordingSet`` and ``SupervisionSet`` must be compatible, i.e. it must
    be possible to create a ``CutSet`` out of them.

    :param recordings: a ``RecordingSet`` manifest.
    :param supervisions: a ``SupervisionSet`` manifest.
    :param output_dir: path where the Kaldi-style data directory will be created.
    :param map_underscores_to: optional string with which we will replace all
        underscores. This helps avoid issues with Kaldi data dir sorting.
    :param prefix_spk_id: add speaker_id as a prefix of utterance_id (this is to
        ensure correct sorting inside files which is required by Kaldi)

    .. note:: If you export a ``RecordingSet`` with multiple channels, then the
        resulting Kaldi data directory may not be back-compatible with Lhotse
        (i.e. you won't be able to import it back to Lhotse in the same form).
        This is because Kaldi does not inherently support multi-channel recordings,
        so we have to break them down into single-channel recordings.
    T)parentsexist_okNc                    s"   t | | jd | jd dS )Nr6   rH   r_   )r   rH   r7   r_   sr   r   r   <lambda>I  s
    z!export_to_kaldi.<locals>.<lambda>c                 S   s   t | | j d| j dS )N-)rH   )r   r_   rH   r   r   r   r   r   Q      c                 s   s    | ]}|j d kV  qdS )r#   N)num_channels)rK   rr   r   r   rO   S      z"export_to_kaldi.<locals>.<genexpr>c              	   S   s2   i | ]}|j D ]}|jt||j|jd d qqS )r/   
transformsr   )rI   rH   make_wavscp_channel_string_mapr/   r   )rK   	recordingrG   r   r   r   
<dictcomp>[  s    z#export_to_kaldi.<locals>.<dictcomp>r9   )r)   r    c                 S   s*   i | ]}|j |j d |j d |j qS ) )rH   rL   r\   r   rK   supr   r   r   r   h  s    rP   c                 S      i | ]}|j |jqS r   rH   r   )rK   r   r   r   r   r   p      r;   c                 S   r   r   )rH   rX   r   r   r   r   r   v  r   rX   c                 S   r   r   r   r   r   r   r   r   {  r   rQ   c                 S   r   r   r   r   r   r   r   r     r   utt2durc                 s       | ]}|j d uV  qd S Nr^   rK   r   r   r   r   rO     r   c                 S   r   r   )rH   r^   r   r   r   r   r     r   rZ   c                 s   r   r   r`   r   r   r   r   rO     r   c                 S   r   r   )rH   r`   r   r   r   r   r     r   
utt2genderc              
   S   sH   i | ] }|j D ]}|jD ]}|j d | t||j|jd| qqqS )r6   r   )rI   rF   rH   r   r/   r   rK   r   rG   r]   r   r   r   r     s     
c                 S   s8   i | ]}|j D ]}|jD ]}|j d | |jqqqS )r6   )rI   rF   rH   r   r   r   r   r   r     s    
c                 S   sJ   i | ]!}t |jD ]}|jd |  |j d| d|j d|j q	qS )r   r6   r   )r   r]   rH   rL   r\   r   rK   r   r]   r   r   r   r     s    c                 S   0   i | ]}t |jD ]}|jd |  |jq	qS r   )r   r]   rH   rX   r   r   r   r   r         c                 S   r   r   )r   r]   rH   r_   r   r   r   r   r     r   c                 S   r   r   )r   r]   rH   r   r   r   r   r   r     r   c                 s   r   r   r   r   r   r   r   rO     r   c                 S   r   r   )r   r]   rH   r^   r   r   r   r   r     r   c                 s   r   r   r   r   r   r   r   rO     r   c                 S   r   r   )r   r]   rH   r`   r   r   r   r   r     r   )r   mkdirrt   allsave_kaldi_text_mapping)r   r   r   r   r   r   r   r   export_to_kaldi'  s   
	

				
r   rR   rS   c              	   C   s  i }|   r|  rtdr|durddl}|  d}| G}t||D ]9\}}|  \}	}
}}
| jdd\}}|	|krKt|  d| d|j	|}|j
| }t||f||< q%W d   n1 siw   Y  W d   |S W d   |S 1 sw   Y  |S )zt
    Load start time from segments and duration from feats,
    when both segments and feats.scp are available.
    r"   Nr   r#   rg   z and z not aligned.)rp   r   r"   r   rw   r%   rU   r'   r   r   r   r   )rR   rS   r0   r   r"   
segments_ffeats_fsegments_line
feats_liner   r6   r\   r   r   r   r   r   r   r   r~     s6   	

(r~   r:   r<   c                 C   s~   t dd }|  r4|  }tdd |D }W d   n1 s"w   Y  |r2dd | D }|S |r=td|  |S )	zCLoad Kaldi files such as utt2spk, spk2gender, text, etc. as a dict.c                   S   s   d S r   r   r   r   r   r   r     s    z)load_kaldi_text_mapping.<locals>.<lambda>c                 s   s     | ]}|  jd dV  qdS )r#   rg   NrT   )rK   r   r   r   r   rO     s    z*load_kaldi_text_mapping.<locals>.<genexpr>Nc                 S   s   i | ]	\}}|t |qS r   )r   )rK   keyvalr   r   r   r     r   z+load_kaldi_text_mapping.<locals>.<dictcomp>zNo such file: )r   rp   r   rv   ry   r'   )r    r:   r<   mappingr   r   r   r   ro   
  s   
ro   r)   c                 C   sT   | d}t|  D ]\}}t|||d qW d   dS 1 s#w   Y  dS )zFSave flat dicts to Kaldi files such as utt2spk, spk2gender, text, etc.w)rC   N)r   sortedry   print)r)   r    r   r   valuer   r   r   r     s
   "r   rG   r   c                 C   sF  | j dkr	td| j dkr!t| jdkrtdd| j diS | j dkrt| jjd	krIt| jdkrI|d u rIt }| jD ]}| j||< q?|S t| jjd
krmt }| jD ]}d| j d|d  d| d||< qW|S t }| jD ]%}t| jdkrd| j d| d||< qsd| j d| d| d||< qs|S td| j  )Nurlz-URL audio sources are not supported by Kaldi.rB   r#   z9Command audio multichannel sources are not supported yet.r   z |rC   z.wavz.sphz	sph2pipe z -f wav -c z& -p | ffmpeg -threads 1 -i pipe:0 -ar z -f wav -threads 1 pipe:1 |zffmpeg -threads 1 -i z -ar z/ -map_channel 0.0.0  -f wav -threads 1 pipe:1 |z -map_channel 0.0.z  -f wav -threads 1 pipe:1 |zUnknown AudioSource type: )rE   r'   rq   rF   rG   r   suffixrv   )rG   r/   r   audiosr]   r   r   r   r      sF   





r   )NNTr#   r.   )NF)NNN)FFr   ).rz   r   r   collectionsr   concurrent.futuresr   pathlibr   typingr   r   r   r   r	   lhotse.audior
   r   r   r   lhotse.featuresr   r   lhotse.supervisionr   r   lhotse.utilsr   r   r   r   r   r   r   r   r   r-   intr$   boolr   r   r~   ro   r   r   r   r   r   r   <module>   s    $

#
 `
 B

$


