o
    SiO                     @   s   d Z ddlZddlZddlZddlmZ ddlmZ ddlm	Z	m
Z
mZmZ ddlmZ ddlmZmZmZ ddlmZ dd	lmZmZ dd
lmZmZ G dd de
Z	ddedee dede	ee	eeeef f f fddZdS )a  
This is a data preparation script for the ASpIRE dataset. The following description
is taken from the LDC website:

ASpIRE Development and Development Test Sets was developed for the Automatic Speech
recognition In Reverberant Environments (ASpIRE) Challenge sponsored by IARPA
(the Intelligent Advanced Research Projects Activity). It contains approximately 226
hours of English speech with transcripts and scoring files.

The ASpIRE challenge asked solvers to develop innovative speech recognition systems
that could be trained on conversational telephone speech, and yet work well on far-
field microphone data from noisy, reverberant rooms. Participants had the opportunity
to evaluate their techniques on a common set of challenging data that included
significant room noise and reverberation.

The data is provided in LDC catalog LDC2017S21. The audio data is a subset of Mixer 6
Speech (LDC2013S03), audio recordings of interviews, transcript readings and
conversational telephone speech collected by the Linguistic Data Consortium in 2009
and 2010 from native English speakers local to the Philadelphia area. The transcripts
were developed by Appen for the ASpIRE challenge.

Data is divided into development and development test sets.

There are 2 versions: "single" and "multi", which stand for single-channel and
multi-channel audio respectively. All audio is presented as single channel, 16kHz
16-bit Signed Integer PCM *.wav files. Transcripts are plain text tdf files or as STM
files. Scoring files (glm) are also included.
    N)defaultdict)Path)Dict
NamedTupleOptionalUnion)$validate_recordings_and_supervisions)AudioSource	RecordingRecordingSet)fix_manifests)SupervisionSegmentSupervisionSet)PathlikeSecondsc                   @   s6   e Zd ZU eed< eed< eed< eed< eed< dS )AspireSegmentAnnotationsessionspeakerstartendtextN)__name__
__module____qualname__str__annotations__r    r   r   I/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/aspire.pyr   ,   s   
 r   single
corpus_dir
output_dirmicreturnc                    s  t | } |  sJ d|   dv sJ d  | d d } | d }| d } dkr?|d	 |d
 d}|d |d d}n|d |d d}|d |d d}tt}|durdt |}|jddd dD ]}g }	g }
 dkryt|| dnIddl}dd tj	t
|| ddd dD }| D ]&\}}|t|d }|	t|dd t
|D |j|j|j|j d qt|	g }t|| *}|D ]}| jdd \}}}}|tt|t|| qW d   n1 sw   Y  tt}|D ]}||j|jf | qg }
| D ]\}}|\|
 fd!dt|D 7 }
qt|
}t|\}t | |dur]|!|d"| d#  !|d$| d#  |d%||< qf|S )&a  
    Returns the manifests which consist of the Recordings and Supervisions
    :param corpus_dir: Pathlike, the path of the corpus dir (LDC2017S21).
    :param output_dir: Pathlike, the path where to write the manifests.
    :param mic: str, the microphone type, either "single" or "multi".
    :return: a Dict whose key is the dataset part ('dev' and 'dev_test'), and the value is Dicts with the keys 'recordings' and 'supervisions'.
    zNo such directory: )r   multiz,mic must be either 'single' or 'multi', got zIARPA-ASpIRE-Dev-Sets-v2.0datadev_and_dev_test_audiodev_and_dev_test_STM_filesr   ASpIRE_single_devASpIRE_single_dev_test)devdev_testzdev.stmzdev_test.stmASpIRE_multi_devASpIRE_multi_dev_testzmulti_dev.stmzmulti_dev_test.stmNT)parentsexist_okz*.wavr   c                 S   s   i | ]	\}}|t |qS r   )list).0kvr   r   r   
<dictcomp>j   s    z"prepare_aspire.<locals>.<dictcomp>c                 S   s   d | jdd d S )N_)joinstemsplit)xr   r   r   <lambda>n   s    z prepare_aspire.<locals>.<lambda>)keyc                 S   s2   g | ]}t d t|jdd d gt|dqS )fileN   )typechannelssource)r	   intr7   r   )r0   audior   r   r   
<listcomp>v   s    z"prepare_aspire.<locals>.<listcomp>)idsourcessampling_ratenum_samplesduration   )maxsplitc                    s^   g | ]+\}}t  d  d |d|jt|j|j d|jd dkr&dn jdqS )-03d   Englishr   r   )rE   recording_idr   rI   r   r   languagechannel)r   r   roundr   r   channel_ids)r0   isegr!   recording_setr   r   r   r   rD      s    aspire_supervisions_z	.jsonl.gzaspire_recordings_)
recordingssupervisions)"r   is_dirr   dictmkdirr   from_dir	soundfile	itertoolsgroupbysortedglobitems	SoundFiler   appendr
   
samplerateframesfrom_recordingsopenstripr8   r   floatr/   r   r   	enumerater   from_segmentsr   r   to_file)r   r    r!   	audio_dirstm_diraudio_pathsstm_file	manifestspartr[   r\   sfaudio_groupssession_nameaudiosaudio_sfsegmentsfliner4   r   r   r   segments_groupedsegmentr1   segssupervision_setr   rW   r   prepare_aspire4   s   







r   )Nr   )__doc__rb   loggingtarfilecollectionsr   pathlibr   typingr   r   r   r   lhotser   lhotse.audior	   r
   r   	lhotse.qar   lhotse.supervisionr   r   lhotse.utilsr   r   r   r   r   r   r   r   r   <module>   s.    	