o
    Si.                     @   s   d Z ddlZddlZddlmZ ddlmZmZmZ ddl	m	Z	 ddl
mZ ddlmZmZ ddlmZ dd	lmZmZ dd
lmZmZ edZddedee defddZ			ddedee dee dee deeeeef f f
ddZdS )aF  
Data preparation recipe for CSLU Kids corpus (https://catalog.ldc.upenn.edu/LDC2007S18):

Summary of corpus from LDC webpage:

Collection of spontaneous and prompted speech from 1100 children between Kindergarten
and Grade 10 in the Forest Grove School District in Oregon. All children -- approximately
100 children at each grade level -- read approximately 60 items from a total list of 319
phonetically-balanced but simple words, sentences or digit strings. Each utterance of
spontaneous speech begins with a recitation of the alphabet and contains a monologue of
about one minute in duration. This release consists of 1017 files containing approximately
8-10 minutes of speech per speaker. Corresponding word-level transcriptions are also included.

Prompted speech is verified and divided into following categories:

1 Good: Only the target word is said.
2 Maybe: Target word is present, but there's other junk in the file.
3 Bad: Target word is not said.
4 Puff: Same as good, but w/ an air puff.

This data is not available for free - your institution needs to have an LDC subscription.
    N)Path)DictOptionalUnion)tqdm)$validate_recordings_and_supervisions)	RecordingRecordingSet)fix_manifests)SupervisionSegmentSupervisionSet)Pathlikecheck_and_rglobz<.*?>Tfile	normalizereturnc                 C   sV   t | d}| dd}|rttd|n|}W d    |S 1 s$w   Y  |S )Nr
  )openreadreplaceresubNOISE_TAGS_REGEX)r   r   ftext r   L/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/cslu_kids.py	read_text'   s   
r    
corpus_dir
output_dirabsolute_pathsnormalize_textc                 C   s|  t | tr	t| n| } t| d}i }t| dD ]/}t|d }|D ]}|  \}	}
t|	j}t|
||< q!W d   n1 sAw   Y  qi }t| d d d%}|D ]}| dkro| jdd	\}}|dd
 ||< qUW d   n1 szw   Y  g }g }t	|ddD ]p}|j}|j
j}|j
j
j}|j
j
j
j}|j
j
j
j
j}tj||rdndd}|| |dkr|| }||v r|| nd}||d}n|dkrt| d | | | | | d |d}d|i}|t||d|j|d||d qt|}t|}t||\}}t|| ||d}|dur<td t|}|jddd |d |d  |d |d   |S )!a  
    Prepare manifests for CSLU Kids corpus. The supervision contains either the
    prompted text, or a transcription of the spontaneous speech, depending on
    whether the utterance was scripted or spontaneous.

    Additionally, the following information is present in the `custom` tag:
    scripted/spontaneous utterance, and verification label (rating between 1 and 4)
    for scripted utterances (see https://catalog.ldc.upenn.edu/docs/LDC2007S18/verification-note.txt
    or top documentation in this script for more information).

    :param corpus_dir: Path to downloaded LDC corpus.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param absolute_paths: Wheter to write absolute paths to audio sources (default = False)
    :param normalize_text: remove noise tags (<bn>, <bs>) from spontaneous speech transcripts (default = True)
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    z*.wavz*-verified.txtr   Ndocszall.mapr      )maxsplitzPreparing manifests)desc   )relative_path_depthscripted)typeverification_labelspontaneoustransz.txt)r   r-   r   English)idrecording_idstartdurationspeakerlanguager   custom)
recordingssupervisionszWriting manifests to JSON filesT)parentsexist_okr9   z!cslu-kids_recordings_all.jsonl.gzr:   z#cslu_kids_supervisions_all.jsonl.gz)
isinstancestrr   r   r   stripsplitstemintr   parentr   	from_fileappendr    r   r5   r	   from_recordingsr   from_segmentsr
   r   logginginfomkdirto_file)r!   r"   r#   r$   audio_pathsverificationr   r   linepathlabeluttpromptspromptr   r9   r:   puttidspkcatr-   	recordingr.   r8   	manifestsr   r   r   prepare_cslu_kids.   s   



 




rZ   )T)NTT)__doc__rH   r   pathlibr   typingr   r   r   r   lhotser   lhotse.audior   r	   	lhotse.qar
   lhotse.supervisionr   r   lhotse.utilsr   r   compiler   boolr>   r    rZ   r   r   r   r   <module>   s6    
	