o
    Si'                     @   sf  d Z ddlZddlZddlmZ ddlmZ ddlmZ ddl	m
Z
mZmZmZmZmZ ddlmZ ddlmZmZmZ dd	lmZ dd
lmZmZ ddlmZmZ ddlmZm Z  ddl!m"Z"m#Z# dZ$dZ%dZ&de"de"de'deeeef  fddZ(	d de'de"de'de)deee f f
ddZ*			d!de"dee" de'de)de
e'e
e'eee f f f f
ddZ+dS )"au  
The ICMC-ASR Grand Challenge dataset is collected in a hybrid electric vehicle with speakers sitting in different positions, including the driver seat and passenger seats. The total number of speakers is over 160 and all of them are native Chinese speakers speaking Mandarin without strong accents. To comprehensively capture speech signals of the entire cockpit, two types of recording devices are used: far-field and near-field recording devices. 8 distributed microphones are placed at four seats in the car, which are the driver's seat (DS01C01, DX01C01), the passenger seat (DS02C01, DX02C01), the rear right seat (DS03C01, DX03C01) and the rear left seat (DS04C01, DX04C01). Additionally, 2 linear microphone arrays, each consisting of 2 microphones, are placed on the display screen (DL01C01, DL02C02) and at the center of the inner sunroof (DL02C01, DL02C02), respectively. All 12 channels of far-field data are time-synchronized and included in the released dataset as far-field data. For transcription purposes, each speaker wears a high-fidelity headphone to record near-field audio, denoted by the seat where the speaker is situated. Specifically, DA01, DA02, DA03, and DA04 represent the driver seat, passenger seat, rear right seat and rear left seat, respectively. The near-field data only have single-channel audio recordings. Additionally, a sizable real noise dataset is provided, following the recording setup of the far-filed data but without speaker talking, to facilitate research of in-car scenario data simulation technology.

Participants can obtain the datasets at https://icmcasr.org - please download the datasets manually.
    N)defaultdict)ThreadPoolExecutor)Path)DictListOptionalSequenceTupleUnion)tqdm)AudioSource	RecordingRecordingSet)info)fix_manifests$validate_recordings_and_supervisions)manifests_existnormalize_text_alimeeting)SupervisionSegmentSupervisionSet)Pathlikeis_module_available)traindeveval_track1)DA01DA02DA03DA04)DX01C01DX02C01DX03C01DX04C01
corpus_dirsection_pathmicreturnc                    s  t dstddd l}g }g }tD ].d   }| s"q|dkrCd   g}t t d ddd	g}n@|d
kr\fddtD } fddtD }n'|dkr|dg}td t d ddd	d	  g}ntd| t	||D ]\}	}
|dkrfddtD }t
|d d dd}|t|
dd t|D d|j|jd n|	 std|	 d q|tj|	|
d |jt|}t|jdksJ dt|j d|jd }|j}t|jD ]H\}}|jdkr?|j}|j}|j}t|
 d	t|d dd	t|d d|
|t|| d |d!v r-dnttd d"|t |d#}|| qqq||fS )$Ntextgridz>To prepare ICMC ASR data, please 'pip install textgrid' first.r   z	.TextGridihm.wav/ -sdmc                       g | ]
} |d     qS r)   resolve.0sdm_positionr$    J/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/icmcasr.py
<listcomp><       z$_parse_utterance.<locals>.<listcomp>c                    s:   g | ]}t | t  d  dd dd  qS )r*   r+   r,   )strreplacer2   r#   positionr$   r6   r7   r8   @   s    
mdmfake_audio_path_for_mdmDXmixC01zUnsupported mic type: c                    r.   r/   r0   )r3   r=   r5   r6   r7   r8   T   r9   F)force_opus_sampling_rateforce_read_audioc                 S   s$   g | ]\}}t d |gt|dqS )file)typechannelssource)r   r:   )r3   idx
audio_pathr6   r6   r7   r8   `   s    i>  )idsourcessampling_ratenum_samplesdurationzAudio file z does not exist - skipping.)pathrecording_id   zExpected 1 tier, found z tiers.i  06   )r-   r(   Chinese)rI   rO   startrM   channellanguagespeakertext)!r   
ValueErrorr'   POSITIONr1   is_filer:   r;   SDM_POSITIONzipr   appendr   	enumerateframesrM   loggingwarning	from_fileTextGridfromFilelentiersname	intervalsmarkminTimemaxTimer   roundlistranger   )r#   r$   r%   r'   
recordingssegments	text_pathaudio_pathsrecording_idsrH   rO   channel_paths
audio_infotgtierrW   iintervalrT   endrX   segmentr6   r<   r7   _parse_utterance#   s   





$
&

<r}   rP   subsetnum_jobsc              
   C   s   t |}||  }t|}t|_}g }g }g }	t|ddD ]}
||
 }||t||| qt|ddD ]}| }|du rAq6|\}}|	| |		| q6t
|}t|	}	t||	\}}	t||	 W d   ||	fS 1 ssw   Y  ||	fS )z
    Returns the RecodingSet and SupervisionSet given a dataset part.
    :param subset: str, the name of the subset.
    :param corpus_dir: Pathlike, the path of the data dir.
    :return: the RecodingSet and SupervisionSet for train and valid.
    zDistributing tasksdesc
ProcessingN)r   oslistdirr   r   r^   submitr}   resultextendr   from_recordingsr   from_segmentsr   r   )r~   r#   r%   r   	part_pathsectionsexfuturesrecording_setsupervision_setsectionr$   futurer   rp   rq   r6   r6   r7   _prepare_subset   s2   





r   r(   
output_dirc           	   	   C   s  t | } |  sJ d|  td t}|dkrd}|dur+t |}|jddd tt}t|dd	D ]O}td
|  t	||d| ddrTtd| d q5t
|| ||\}}|dur}||d| d| d  ||d| d| d  ||d||< q5|S )aF  
    Returns the manifests which consist of the Recordings and Supervisions
    :param corpus_dir: Path to the ICMC-ASR dataset.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    zNo such directory: zPreparing ICMC-ASR...r(   )r   r   NT)parentsexist_okzDataset partsr   zProcessing ICMC-ASR subset: zicmcasr-zjsonl.gz)partr   prefixsuffixzICMC-ASR subset: z already prepared - skipping._supervisions_z	.jsonl.gz_recordings_)rp   supervisions)r   is_dirra   r   ICMCASRmkdirr   dictr   r   r   to_file)	r#   r   r%   r   subsets	manifestsr   r   r   r6   r6   r7   prepare_icmcasr   s@   
r   )rP   )Nr(   rP   ),__doc__ra   r   collectionsr   concurrent.futures.threadr   pathlibr   typingr   r   r   r   r	   r
   	tqdm.autor   lhotse.audior   r   r   lhotse.audio.backendr   	lhotse.qar   r   lhotse.recipes.utilsr   r   lhotse.supervisionr   r   lhotse.utilsr   r   r   rZ   r\   r:   r}   intr   r   r6   r6   r6   r7   <module>   sh     
r

,