o
    Si]                     @   s  d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZ d dl	m
Z
mZmZmZmZ d dlmZmZmZmZ d dlmZmZ d dlmZmZ d d	lmZmZmZmZmZm Z m!Z! d
e"de"fddZ#dedee" fddZ$					d6dede%dee dee& de'de%de&deeee ee f fddZ(		d7dededed ee& d!ee' f
d"d#Z)			d8d$ed%edee deedf fd&d'Z*d9ded(e'fd)d*Z+	d:ded+e'd,e'dee&ee& f fd-d.Z,d/ee&e
f defd0d1Z-	d;d2ede%d3eee  dee%e&f fd4d5Z.dS )<    N)defaultdict)ProcessPoolExecutor)Path)AnyDictListOptionalTuple)AudioSource	RecordingRecordingSetinfo)Features
FeatureSet)SupervisionSegmentSupervisionSet)PathlikeSecondsadd_durationscompute_num_samplesfastcopyis_module_availableto_listdurationreturnc                 C   s   t d|  d S )a  
    Floor the duration to multiplies of 0.001 seconds.
    This is to avoid float precision problems with workflows like:
      lhotse kaldi import ...
      lhotse fix ...
      ./local/compute_fbank_imported.py (from icefall)
      lhotse cut trim-to-supervisions ...
      ./local/validate_manifest.py ... (from icefall)

    Without flooring, there were different lengths:
      Supervision end time 1093.33995833 is larger than cut end time 1093.3399375

    This is still within the 2ms tolerance in K2SpeechRecognitionDataset::validate_for_asr():
      https://github.com/lhotse-speech/lhotse/blob/master/lhotse/dataset/speech_recognition.py#L201
    i  )mathfloor)r    r   @/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/kaldi.pyfloor_duration_to_milliseconds   s   r   pathc                 C   s   t | } |  dr<tdstdddl}z|| }|jjd dks0J d|jjd  t	|j
W S    Y dS t| }t	|j
S )z
    Read a audio file, it supports pipeline style wave path and real waveform.

    :param path: Path to an audio file or a Kaldi-style pipe.
    :return: float duration of the recording, in seconds or `None` in case of read error.
    |kaldi_native_iozeTo read Kaldi's data dir where wav.scp has 'pipe' inputs, please 'pip install kaldi_native_io' first.r   N   zExpect 1 channel. Given )strstripendswithr   
ValueErrorr"   	read_wavedatashaper   r   r   )r    r"   wave
audio_infor   r   r   get_duration,   s"   	

r-   Tr#   kaldi-fbanksampling_rateframe_shiftmap_string_to_underscoresuse_reco2durnum_jobsfeature_typec           %         s&  t | } |  s
J dtdtffddt| d dd}| d }|r:| r:t|dd	 t t|ks9J d
n5tdt||d  }	t|d}
t|
j	t
| |	d}W d   n1 saw   Y  tt| |   D ]\}}|du rtd| d||  d ||= qst|t d k rtd|  dt fdd| D }d}| d }| d }| d }t|||d}| rNg }| }dd |D }W d   n1 sw   Y  t| d ddt| d }t| d  t| d! |D ]F\}}}}|r|| \}}nt|d"krt|n | t| d#}|t||t||d$| | || ||  d%	 qt|}n:| rt| d }t|t|ksdJ t| d t| d  t| d! t fd&d| D }d}| rtd'r|dur	d$dl }d$d(l!m"} g } t|Q}|D ]F}!|!# j$dd)\}"}#|j%&|#}$|r||" \}}nd$}|$j'| }| t(||$j'|$j)||||j*|#|"|dur||" j+n|"d$d* qW d   n	1 sw   Y  t,-| }nt./d+ |||fS ),a>  
    Load a Kaldi data directory and convert it to a Lhotse RecordingSet and
    SupervisionSet manifests. For this to work, at least the wav.scp file must exist.
    SupervisionSet is created only when a segments file exists. reco2dur is used by
    default when exists (to enforce reading the duration from the audio files
    themselves, please set use_reco2dur = False.
    All the other files (text, utt2spk, etc.) are optional, and some of them might
    not be handled yet. In particular, feats.scp files are ignored.

    :param path: Path to the Kaldi data directory.
    :param sampling_rate: Sampling rate of the recordings.
    :param frame_shift: Optional, if specified, we will create a Features manifest
        and store the frame_shift value in it.
    :param map_string_to_underscores: optional string, when specified, we will replace
        all instances of this string in SupervisonSegment IDs to underscores.
        This is to help with handling underscores in Kaldi
        (see :func:`.export_to_kaldi`). This is also done for speaker IDs.
    :param use_reco2dur: If True, we will use the reco2dur file to read the durations
        of the recordings. If False, we will read the durations from the audio files
        themselves.
    :param num_jobs: Number of parallel jobs to use when reading the audio files.
    tr   c                    s    d u r| S |   dS )N_)replace)r5   )r1   r   r   fix_idn   s   z#load_kaldi_data_dir.<locals>.fix_idwav.scpT)
must_existreco2dur)
float_valszMThe duration file reco2dur does not have the same length as the  wav.scp filer#   
   )max_workers)	chunksizeN[z5] Could not get duration. Failed to read audio from `z(`. Dropping the recording from manifest.g?z9Failed to load more than 20% utterances of the dataset: ""c              	   3   sf    | ].\}}t |t|d rdnddg|d r|dd n|dgt |  | dV  qdS )r!   commandfiler   N)typechannelssource)idsourcesr/   num_samplesr   )r   r
   r&   r   ).0recording_idpath_or_cmd)	durationsr/   r   r   	<genexpr>   s     	
z&load_kaldi_data_dir.<locals>.<genexpr>segmentsutt2spkz	feats.scp)segments_path
feats_pathr0   c                 S   s   g | ]}|   qS r   r%   split)rK   
sup_stringr   r   r   
<listcomp>   s    z'load_kaldi_data_dir.<locals>.<listcomp>text)allow_empty_ref
spk2genderutt2langz-1)r/   r   	rH   rL   startr   channelrX   languagespeakergenderc                 3   sF    | ]\}}t ||d  | d| | || d	V  qdS )g        r   r\   N)r   )rK   rec_idspkr)rN   r8   genders	languagestextsr   r   rO      s    
r"   )KaldiReadermaxsplit)rE   
num_framesnum_featuresr0   r/   r]   r   storage_typestorage_pathstorage_keyrL   rF   zcFailed to import Kaldi 'feats.scp' to Lhotse: frame_shift must be not None. Feature import omitted.)0r   is_dirr$   load_kaldi_text_mappingis_filelenmaxr   listmapr-   valuesdictzipkeysitemsloggingwarningRuntimeErrorr   from_recordingsload_start_and_durationopenload_kaldi_text_filer   floatappendr   r   from_segmentsexistsr   r"   lhotse.features.iorg   r%   rU   MatrixShapereadnum_rowsr   num_colsnamerL   r   from_featureswarningswarn)%r    r/   r0   r1   r2   r3   r4   
recordingsr;   r?   exdur_valsrL   	dur_valuerecording_setsupervision_setrP   	utt2spk_f	feats_scputt_id_to_start_and_durationsupervisionsfsupervision_segmentsspeakers
segment_idr]   endr6   r   feature_setr"   rg   featureslineutt_idark	mat_shaper   )rN   r8   rd   re   r1   r/   rf   r   load_kaldi_data_dirL   s   










 
r   Fr   r   
output_dirmap_underscores_toprefix_spk_idc                    sP  t |}|jddd  dur| fdd}|r!|dd }tdd | D rtd	d
 | D |d d tdd
 |D |d d tdd
 | D |d d tdd
 |D |d d tdd
 |D |d d tdd
 |D |d d tdd |D rtdd
 |D |d d tdd |D rtdd
 |D |d d dS dS tdd
 | D |d d tdd
 | D |d d tdd
 |D |d d td d
 |D |d d td!d
 |D |d d td"d
 |D |d d td#d |D rtd$d
 |D |d d td%d |D r&td&d
 |D |d d dS dS )'a  
    Export a pair of ``RecordingSet`` and ``SupervisionSet`` to a Kaldi data
    directory. It even supports recordings that have multiple channels but
    the recordings will still have to have a single ``AudioSource``.

    The ``RecordingSet`` and ``SupervisionSet`` must be compatible, i.e. it must
    be possible to create a ``CutSet`` out of them.

    :param recordings: a ``RecordingSet`` manifest.
    :param supervisions: a ``SupervisionSet`` manifest.
    :param output_dir: path where the Kaldi-style data directory will be created.
    :param map_underscores_to: optional string with which we will replace all
        underscores. This helps avoid issues with Kaldi data dir sorting.
    :param prefix_spk_id: add speaker_id as a prefix of utterance_id (this is to
        ensure correct sorting inside files which is required by Kaldi)

    .. note:: If you export a ``RecordingSet`` with multiple channels, then the
        resulting Kaldi data directory may not be back-compatible with Lhotse
        (i.e. you won't be able to import it back to Lhotse in the same form).
        This is because Kaldi does not inherently support multi-channel recordings,
        so we have to break them down into single-channel recordings.
    T)parentsexist_okNc                    s"   t | | jd | jd dS )Nr6   rH   r`   )r   rH   r7   r`   sr   r   r   <lambda>K  s
    z!export_to_kaldi.<locals>.<lambda>c                 S   s   t | | j d| j dS )N-)rH   )r   r`   rH   r   r   r   r   r   S      c                 s   s    | ]}|j d kV  qdS )r#   N)num_channels)rK   rr   r   r   rO   U      z"export_to_kaldi.<locals>.<genexpr>c              	   S   s2   i | ]}|j D ]}|jt||j|jd d qqS )r/   
transformsr   )rI   rH   make_wavscp_channel_string_mapr/   r   )rK   	recordingrG   r   r   r   
<dictcomp>]  s    z#export_to_kaldi.<locals>.<dictcomp>r9   )r)   r    c                 S   s*   i | ]}|j |j d |j d |j qS ) )rH   rL   r]   r   rK   supr   r   r   r   j  s    rP   c                 S      i | ]}|j |jqS r   rH   r   )rK   r   r   r   r   r   r      r;   c                 S   r   r   )rH   rX   r   r   r   r   r   x  r   rX   c                 S   r   r   r   r   r   r   r   r   }  r   rQ   c                 S   r   r   r   r   r   r   r   r     r   utt2durc                 s       | ]}|j d uV  qd S Nr_   rK   r   r   r   r   rO     r   c                 S   r   r   )rH   r_   r   r   r   r   r     r   r[   c                 s   r   r   ra   r   r   r   r   rO     r   c                 S   r   r   )rH   ra   r   r   r   r   r     r   
utt2genderc              
   S   sH   i | ] }|j D ]}|jD ]}|j d | t||j|jd| qqqS )r6   r   )rI   rF   rH   r   r/   r   rK   r   rG   r^   r   r   r   r     s     
c                 S   s8   i | ]}|j D ]}|jD ]}|j d | |jqqqS )r6   )rI   rF   rH   r   r   r   r   r   r     s    
c                 S   sJ   i | ]!}t |jD ]}|jd |  |j d| d|j d|j q	qS )r   r6   r   )r   r^   rH   rL   r]   r   rK   r   r^   r   r   r   r     s    c                 S   0   i | ]}t |jD ]}|jd |  |jq	qS r   )r   r^   rH   rX   r   r   r   r   r         c                 S   r   r   )r   r^   rH   r`   r   r   r   r   r     r   c                 S   r   r   )r   r^   rH   r   r   r   r   r   r     r   c                 s   r   r   r   r   r   r   r   rO     r   c                 S   r   r   )r   r^   rH   r_   r   r   r   r   r     r   c                 s   r   r   r   r   r   r   r   rO     r   c                 S   r   r   )r   r^   rH   ra   r   r   r   r   r     r   )r   mkdirru   allsave_kaldi_text_mapping)r   r   r   r   r   r   r   r   export_to_kaldi)  s   
	

				
r   rR   rS   c              	   C   s  i }|   r|  rtdr|durddl}|  d}| G}t||D ]9\}}|  \}	}
}}
| jdd\}}|	|krKt|  d| d|j	|}|j
| }t||f||< q%W d   n1 siw   Y  W d   |S W d   |S 1 sw   Y  |S )zt
    Load start time from segments and duration from feats,
    when both segments and feats.scp are available.
    r"   Nr   r#   rh   z and z not aligned.)rq   r   r"   r   rx   r%   rU   r'   r   r   r   r   )rR   rS   r0   r   r"   
segments_ffeats_fsegments_line
feats_liner   r6   r]   r   r   r   r   r   r   r   r     s6   	

(r   rY   c                 C   s   |   std|  t }|  6}|D ]*}| }d|v r,|jdd\}}|||< q|r5|}d||< qtd| d|  dW d	   |S 1 sKw   Y  |S )
z[
    Load Kaldi file `text` as a dict.
    Allow entry with empty ref. text (default).
    No such file: r   r#   rh    zEmpty ref. text in: z ()N)rq   r'   rw   r   r%   rU   )r    rY   mappingr   r   keyvaluer   r   r   r     s$   



r   r:   r<   c                 C   s~   t dd }|  r4|  }tdd |D }W d   n1 s"w   Y  |r2dd | D }|S |r=td|  |S )	z=Load Kaldi files such as utt2spk, spk2gender, etc. as a dict.c                   S   s   d S r   r   r   r   r   r   r   (  s    z)load_kaldi_text_mapping.<locals>.<lambda>c                 s   s     | ]}|  jd dV  qdS )r#   rh   NrT   )rK   r   r   r   r   rO   +  s    z*load_kaldi_text_mapping.<locals>.<genexpr>Nc                 S   s   i | ]	\}}|t |qS r   )r   )rK   r   valr   r   r   r   -  r   z+load_kaldi_text_mapping.<locals>.<dictcomp>r   )r   rq   r   rw   rz   r'   )r    r:   r<   r   r   r   r   r   rp   $  s   
rp   r)   c                 C   sT   | d}t|  D ]\}}t|||d qW d   dS 1 s#w   Y  dS )zFSave flat dicts to Kaldi files such as utt2spk, spk2gender, text, etc.w)rC   N)r   sortedrz   print)r)   r    r   r   r   r   r   r   r   3  s
   "r   rG   r   c                 C   sF  | j dkr	td| j dkr!t| jdkrtdd| j diS | j dkrt| jjd	krIt| jdkrI|d u rIt }| jD ]}| j||< q?|S t| jjd
krmt }| jD ]}d| j d|d  d| d||< qW|S t }| jD ]%}t| jdkrd| j d| d||< qsd| j d| d| d||< qs|S td| j  )Nurlz-URL audio sources are not supported by Kaldi.rB   r#   z9Command audio multichannel sources are not supported yet.r   z |rC   z.wavz.sphz	sph2pipe z -f wav -c z& -p | ffmpeg -threads 1 -i pipe:0 -ar z -f wav -threads 1 pipe:1 |zffmpeg -threads 1 -i z -ar z/ -map_channel 0.0.0  -f wav -threads 1 pipe:1 |z -map_channel 0.0.z  -f wav -threads 1 pipe:1 |zUnknown AudioSource type: )rE   r'   rr   rF   rG   r   suffixrw   )rG   r/   r   audiosr^   r   r   r   r   :  sF   





r   )NNTr#   r.   )NF)NNN)T)FFr   )/r{   r   r   collectionsr   concurrent.futuresr   pathlibr   typingr   r   r   r   r	   lhotse.audior
   r   r   r   lhotse.featuresr   r   lhotse.supervisionr   r   lhotse.utilsr   r   r   r   r   r   r   r   r   r-   intr$   boolr   r   r   r   rp   r   r   r   r   r   r   <module>   s    $

#
 b
 B

#


