o
    Si^                     @   s  d Z ddlZddlZddlZddlmZ ddlmZ ddlm	Z	m
Z
mZmZmZmZ ddlmZ ddlmZ ddlmZmZmZmZmZ dd	lmZ dd
lmZ ddlmZ ee Z!e!"ej# g dZ$eG dd dZ%eG dd dZ&dede
e'e&f fddZ(dd Z)			dCdede'dee de
e'eeef f fddZ*	dDde'de%de+fd d!Z,	dDde'de%de+fd"d#Z-	dDde'de%de+fd$d%Z.	dDde'de%de+fd&d'Z/	dDde'de%de+fd(d)Z0	dDde'de%de+fd*d+Z1	dDde'de%d,e	ee'ef gee
e'f f de+fd-d.Z2	dEd/ed0ed1e+d2ee fd3d4Z3	dEd/ed0ed1e+d2ee de
eef f
d5d6Z4d7edeee ee f fd8d9Z5d:ed;e'd<e'deee ee f fd=d>Z6d?eded@ fdAdBZ7dS )Fa
  
This is a data preparation recipe for the National Corpus of Speech in Singaporean English.

The entire corpus is organised into a few parts.

Part 1 features about 1000 hours of prompted recordings of phonetically-balanced scripts from about 1000 local English speakers.

Part 2 presents about 1000 hours of prompted recordings of sentences randomly generated from words based on people, food, location, brands, etc, from about 1000 local English speakers as well. Transcriptions of the recordings have been done orthographically and are available for download.

Part 3 consists of about 1000 hours of conversational data recorded from about 1000 local English speakers, split into pairs. The data includes conversations covering daily life and of speakers playing games provided.

Parts 1 and 2 were recorded in quiet rooms using 3 microphones: a headset/ standing microphone (channel 0), a boundary microphone (channel 1), and a mobile phone (channel 3). Recordings that are available for download here have been down-sampled to 16kHz. Details of the microphone models used for each speaker as well as some corresponding non-personal and anonymized information can be found in the accompanying spreadsheets.

Part 3's recordings were split into 2 environments. In the Same Room environment where speakers were in same room, the recordings were done using 2 microphones: a close-talk mic and a boundary mic. In the Separate Room environment, speakers were separated into individual rooms. The recordings were done using 2 microphones in each room: a standing mic and a telephone.

Under Part 4, speakers were encouraged as best as possible to switch from Singapore English to their Mother Tongue languages. These recordings were done under two environments. In the Same Room recordings, speakers sit at least two metres apart and record using their mobile phones. In the Different Room environment, speakers would speak through each other via Zoom on their laptops, and recording using their mobile phones.

Under Part 5, speakers were made to speak following the 4 styles: Debate, Finance topics, Positive Emotion and Negative Emotions. All recordings were done in a Separate room session, via Zoom, where the audio is recorded using the mobile phone.

Under Part 6, speakers were made to speak following the 3 styles within either of the 3 designs: Design 1 (holiday/hotel/restaurant), Design 2 (bank, telephone, insurance), Design 3 (HDB, MOE, MSF). All recordings were done in a Separate room session, via Zoom, where the audio is recorded using the mobile phone.

We currently only support the part 3 recordings, in "same room close mic" and "separate rooms phone mic" environments.
    N)	dataclass)Path)CallableDictListOptionalTupleUnion)tqdm)logging_redirect_tqdm)	RecordingRecordingSetSupervisionSegmentSupervisionSet$validate_recordings_and_supervisions)parallel_map)fix_manifests)Pathlike)PART1_CHANNEL0PART1_CHANNEL1PART1_CHANNEL2PART2_CHANNEL0PART2_CHANNEL1PART2_CHANNEL2PART3_SameBoundaryMicPART3_SameCloseMicPART3_SeparateIVRPART3_SeparateStandingMicPART4_CodeswitchingDiffRoomPART4_CodeswitchingSameRoomPART5_DebatePART5_FinanceEmotionPART6_CallCentreDesign1PART6_CallCentreDesign2PART6_CallCentreDesign3c                   @   sH   e Zd ZU eeef ed< eeef ed< deeef dd fddZdS )ScriptAudioDir
script_dir	audio_dirparentreturnc                 C   s    t |}t|| j || j dS )Nr&   r'   )r   r%   r&   r'   )selfr(    r,   F/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/nsc.pyrelative_toJ   s
   zScriptAudioDir.relative_toN)__name__
__module____qualname__r	   strr   __annotations__r.   r,   r,   r,   r-   r%   E   s   
 r%   c                   @   s,   e Zd ZU eeeegef ed< eed< dS )HandlerMappinghandlerscript_audioN)	r/   r0   r1   r   r2   r%   intdictr3   r,   r,   r,   r-   r4   R   s   
 r4   
corpus_dirr)   c                 C   s  | d }| d d }i dt ttddd|dd	t ttd
dd|ddt ttddd|ddt ttddd|ddt ttddd|ddt ttddd|ddt ttddd|ddt ttddd|ddt ttddd|dd t ttdd!d|dd"t ttd#d$d|dd%t ttd&d'd|dd(t ttd)d*d|dd+t ttd,d-d|dd.t ttd/d0d|dd1t ttd2d3d|dd4t ttd5d6d|d}|S )7NzIMDA - National Speech Corpusz*IMDA - National Speech Corpus - Additionalz*IMDA - National Speech Corpus (Additional)r   zPART1/DATA/CHANNEL0/SCRIPTzPART1/DATA/CHANNEL0/WAVEr*   )r5   r6   r   zPART1/DATA/CHANNEL1/SCRIPTzPART1/DATA/CHANNEL1/WAVEr   zPART1/DATA/CHANNEL2/SCRIPTzPART1/DATA/CHANNEL2/WAVEr   zPART2/DATA/CHANNEL0/SCRIPTzPART2/DATA/CHANNEL0/WAVEr   zPART2/DATA/CHANNEL1/SCRIPTzPART2/DATA/CHANNEL1/WAVEr   zPART2/DATA/CHANNEL2/SCRIPTzPART2/DATA/CHANNEL2/WAVEr   zPART3/Scripts SamezPART3/Audio Same BoundaryMicr   zPART3/Audio Same CloseMicr   zPART3/Scripts SeparatezPART3/Audio Separate IVRr   z PART3/Audio Separate StandingMicr   z%PART4/Codeswitching/Diff Room Scriptsz#PART4/Codeswitching/Diff Room Audior   z%PART4/Codeswitching/Same Room Scriptsz#PART4/Codeswitching/Same Room Audior    zPART5/Debate ScriptszPART5/Debate Audior!   zPART5/Finance + Emotion ScriptszPART5/Finance + Emotions Audior"   z"PART6/Call Centre Design 1/Scriptsz PART6/Call Centre Design 1/Audior#   z"PART6/Call Centre Design 2/Scriptsz PART6/Call Centre Design 2/Audior$   z"PART6/Call Centre Design 3/Scriptsz PART6/Call Centre Design 3/Audio)	r4   prepare_part1r%   r.   prepare_part2prepare_part3prepare_part4prepare_part5prepare_part6)r9   part_1_3_parent_dirpart_4_6_parent_dirpart_handler_mappingr,   r,   r-   get_part_handler_mapY   sV   	
rC   c                  C   s   zdd l } W d S    td)Nr   zNSC data preparation requires the forked 'textgrids' package to be installed. Please install it with 'pip install git+https://github.com/pzelasko/Praat-textgrids' and try again.)	textgridsImportError)rD   r,   r,   r-   check_dependenciesx   s   rF   r      dataset_part
output_dirc           	      C   s   t   t| } |  sJ d|  t| }||v r'|| }|||j|}ntd| td	i |\}}t|| |durat|}|j	ddd |
|d| d  |
|d| d  |S )
a  
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply read and return them.

    :param corpus_dir: Pathlike, the path to the raw corpus distribution.
    :param dataset_part: str, name of the dataset part to be prepared.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'.
    zNo such directory: zUnknown dataset part: NT)parentsexist_oknsc_supervisions_z	.jsonl.gznsc_recordings_r,   )rF   r   is_dirrC   r5   r6   
ValueErrorr   r   mkdirto_file)	r9   rH   rI   num_jobspart_handler_maphandler_map	manifests
recordingssupervisionsr,   r,   r-   prepare_nsc   s(   
rX   	part_namescript_audio_dirrR   c              	   O   s   g }g }t |j}t |j}t| dd  }	|	dv sJ |d }
|
 s(|
  dd |dD }ttt	|t
|t
|	t
|
|dt|d|  d	D ]}||d
  ||d  qOt|t|dS )N>   r   rG      	extractedc                 S      g | ]}|qS r,   r,   .0fr,   r,   r-   
<listcomp>       z!prepare_part1.<locals>.<listcomp>zSPEAKER*.zip)rR   Creating manifests for totaldescrV   rW   rV   rW   )r   r'   r&   r7   existsrP   globr
   r   _parse_part1_speaker	itertoolsrepeatlenextendr   from_recordingsr   from_segments)rY   rZ   rR   argskwargsrV   rW   r'   r&   channelextract_to_dirspeaker_zip_filesspeaker_manifestsr,   r,   r-   r:      s6   


r:   c                 O      t || ||d|S N)rY   rZ   rR   )r:   rY   rZ   rR   rr   rs   r,   r,   r-   r;         r;   c                    s^   ddl m  dksJ ddtttf dtttf f fdd}t|||d	|S )
Nr   TextGridr   z1The recipe too different, currently not supported
audio_filer)   c                    sr   t | } t j}dkr | jjd | j }|| d }|}n|| j d }| j}t|} ||d}||fS )Nr   _	.TextGridcoding)r   r&   r(   namestem_detect_textgrid_encoding)r~   r&   script_file_stemscript_filetextgrid_keyr   tgr}   rY   rZ   r,   r-   textgrid_field_id_resolver   s   
z1prepare_part3.<locals>.textgrid_field_id_resolverrY   rZ   textgrid_loaderrR   )rD   r}   r	   r2   r   r   r   prepare_textgrid_based_partrY   rZ   rR   rr   rs   r   r,   r   r-   r<      s   
*r<   c                    L   ddl m  dtttf dt tf f fdd}t|| ||d|S )Nr   r|   r~   r)   c                    N   t | } t j}|| j d }t|} ||d}tt| }||fS Nr   r   r   r&   r   r   nextiterkeysr~   r&   r   r   r   r   r}   rZ   r,   r-   r        
z1prepare_part4.<locals>.textgrid_field_id_resolverr   rD   r}   r	   r2   r   r   r   r   r,   r   r-   r=        

r=   c                    r   )Nr   r|   r~   r)   c                    r   r   r   r   r   r,   r-   r   /  r   z1prepare_part5.<locals>.textgrid_field_id_resolverr   r   r   r,   r   r-   r>   &  r   r>   c                 O   rx   ry   )r>   rz   r,   r,   r-   r?   E  r{   r?   r   c                    sN  t   g }g }t|j}dd t|d|dD }	t }
t|	t|	d|  dD ]o}zH|  d|j	 |
vsFJ d d	| d
|

 tj|d ||\}}dd  fddt|| D D }|| |  W q, ty   t  td| d W d   n1 sw   Y  Y q,w t|t|dS )ah  Prepare part that use textgrid to storing script like: PART3, PART4, PART5, PART6

    Args:
        part_path (Path): path to part
        script_audio_dir (Path): root dir of audios
        script_file_resolver (Callable): a function resolve an audio path in to textgrid script file and it's key
        num_jobs (int): number of workers to process data
    c                 S   r^   r,   r,   r_   r,   r,   r-   rb   j  s    z/prepare_textgrid_based_part.<locals>.<listcomp>z**/*.wavz**/*.WAVrd   re   r   zDuplicated recording id "z", audio path: ""recording_idc                 S   s   g | ]	}|j d kr|qS )r   )duration)r`   sr,   r,   r-   rb   ~  s
    
c              
   3   sd    | ]-\}}|j d vrt j d|  j|jtt|j|j dd j|j |j ddV  qdS ))z<S>z<Z>-   )ndigitszSingaporean English)idr   startr   textlanguagespeakerN)r   r   r   xminminroundxmaxr   )r`   idxsegment	recordingr   r,   r-   	<genexpr>  s$    

z.prepare_textgrid_based_part.<locals>.<genexpr>zError when processing "z" - skipping...Nrh   )rF   r   r'   rl   chainrglobsetr
   rn   r   addr   	from_file	enumeratero   append	Exceptionr   loggerwarningr   rp   r   rq   )rY   rZ   r   rR   rr   rs   rV   rW   r'   audio_filesprocessed_recordings
audio_pathr   r   segmentsr,   r   r-   r   U  sN   





r   speaker_zip_filer&   rt   ru   c                 C   s   t | |||d}g }g }| D ]\}}|| || qg }	g }
dd t||D D ]\}}|	| |
| q-t|	t|
dS )N)r   r&   rt   ru   c                 S   s   g | ]	\}}t ||qS r,   )_parse_part1_script)r`   sc_fss_dr,   r,   r-   rb     s    z(_parse_part1_speaker.<locals>.<listcomp>rh   )	_preprocess_part1_speakeritemsr   zipro   r   rp   r   rq   )r   r&   rt   ru   script_session_dir_mappingsessions_dirscripts_filer   session_dirrV   rW   rr   r,   r,   r-   rk     s*   

rk   c                 C   s  t | } t |}|du r| j}|| j }| s3t| }|| W d   n1 s-w   Y  nt  t	d| d|  d W d   n1 sMw   Y  dd |
dD }g }|D ]}|jd}	|jd	}
|| |	 |
 d
 }|| q`dd t||D S )a  Extract PART1/PART2 speaker audio

    Args:
        speaker_zip_file (Pathlike): Path to speaker zipped audio file
        script_dir (Pathlike): Path to script dir of the channel
        channel (int): Channel of the PART, we can parse from $script_dir but it is not necessary
        extract_to_dir (Optional[Path]): Directory to extract zipped audio file, default to parent dir of $speaker_zip_file

    Returns:
        Dict[Path, Path]: Mapping of script file -> speaker's session dir
    Nz	Reusing "z" as extracted "z" since it is existed alreadyc                 S   r^   r,   r,   r_   r,   r,   r-   rb     rc   z-_preprocess_part1_speaker.<locals>.<listcomp>zSESSION*SPEAKERSESSIONz.TXTc                 S   s   i | ]\}}||qS r,   r,   )r`   sr_fr   r,   r,   r-   
<dictcomp>  s    z-_preprocess_part1_speaker.<locals>.<dictcomp>)r   r(   r   ri   zipfileZipFile
extractallr   r   r   rj   removeprefixr   r   )r   r&   rt   ru   speaker_audio_dirzfr   r   r   spk_idsession_numbersession_script_filer,   r,   r-   r     s.   
r   r   c                 C   s$  g }g }t | ddd{}d}d}|D ]?}|dd}|dkrI|d |krI|d dkr0|d }t|||\}	}
|	rD||	 ||
 d }}q|d }|d }q|rrt|||\}	}
|	r|||	 ||
 W d    ||fS W d    ||fS W d    ||fS 1 sw   Y  ||fS )	Nr   z	utf-8-sig)encoding 
	r   rG   )openrstripsplit_create_part1_single_recordr   )r   r   rV   rW   frprevious_audio_idprevious_textlinecolumnsr   r   r,   r,   r-   r     sH   







r   r   	_audio_idr   c                 C   s   | | d }zt j||d}t|j|jd|j|d}||fW S  tyJ   t  td| d|  d W d    Y d	S 1 sBw   Y  Y d	S  t	y } z*t  t
d| | W d    n1 shw   Y  W Y d }~d	S W Y d }~d	S d }~ww )
Nz.WAVr   r   )r   r   r   r   r   zRecording audio of script "z" can not be found in "r   zError occurred with )NN)r   r   r   r   r   FileNotFoundErrorr   r   r   r   error)r   r   r   r~   r   r   er,   r,   r-   r     s>   
&
r   textgrid_filer2   c                 C   s   ddl }d}t| d}|d}W d   n1 sw   Y  |dt| |kr,dS || }|du r9dS |jdkrA|jS dS )zk_summary_

    Returns:
        str: encoding of the file or None if it is binary file or undetectable
    r   Ns   ooBinaryFileTextGridrbi (  asciizutf-8)charset_normalizerr   readrn   
from_bytesbestr   )r   r   textgrid_binary_markr   checking_bytescharset_matchr,   r,   r-   r   &  s   r   )r   NrG   )rG   )N)8__doc__rl   loggingr   dataclassesr   pathlibr   typingr   r   r   r   r   r	   	tqdm.autor
   tqdm.contrib.loggingr   lhotser   r   r   r   r   lhotse.parallelr   	lhotse.qar   lhotse.utilsr   	getLoggerr/   r   setLevelINFO	NSC_PARTSr%   r4   r2   rC   rF   rX   r7   r:   r;   r<   r=   r>   r?   r   rk   r   r   r   r   r,   r,   r,   r-   <module>   s    

-
(

(
"
"

O
$

)
#
