o
    Si                     @   s   d Z ddlZddlmZ ddlmZmZmZ ddlm	Z	 ddl
mZmZ ddlmZ ddlmZmZ dd	lmZ 		
ddedee dee deeeeef f fddZdS )aT  
Data preparation recipe for CMU Kids corpus (https://catalog.ldc.upenn.edu/LDC97S63):

Summary of corpus from LDC webpage:

This database is comprised of sentences read aloud by children. It was originally designed
in order to create a training set of children's speech for the SPHINX II automatic speech
recognizer for its use in the LISTEN project at Carnegie Mellon University.

The children range in age from six to eleven (see details below) and were in first through
third grades (the 11-year-old was in 6th grade) at the time of recording. There were 24 male
and 52 female speakers. There are 5,180 utterances in all.

The speakers come from two separate populations:

 1. SIM95: They were recorded in the summer of 1995 and were enrolled in either the Chatham
    College Summer Camp or the Mount Lebanon Extended Day Summer Fun program in Pittsburgh.
    They were recorded on-site. There are 44 speakers and 3,333 utterances in this set. They
    "good" reading examples.
 2. FP: These are examples of errorful reading and dialectic variants. The readers come from
    Fort Pitt School in Pittsburgh and were recorded in April 1996. There are 32 speakers and
    1,847 utterances in this set.

The user should be aware that the speakers' dialect partly reflects what is locally called "Pittsburghese."

The corpus does not come with a train/dev/test split, and the Kaldi recipe splits it randomly
into 70%/30% train-test. We do not perform any splits, and just return the complete recording
and supervision manifests.

This data is not available for free - your institution needs to have an LDC subscription.
    N)Path)DictOptionalUnion)$validate_recordings_and_supervisions)	RecordingRecordingSet)fix_manifests)SupervisionSegmentSupervisionSet)PathlikeT
corpus_dir
output_dirabsolute_pathsreturnc                 C   s  t | tr	t| n| } | jdkr| jn| } g }g }i }t| d d d d}|D ]}| d\}}	}
|
||< q)W d   n1 sDw   Y  i }t| d d d d2}tdD ]}t	| q[|D ]}| d\}}}}}|d	\}}|||f||< qdW d   n1 sw   Y  t| d d d
 d}|D ]t}| jdd\}}|dd }|dd }t
|d }|| \}}}| d d | d | d }tj||rdndd}|| |t||d|j||d dkrdndd|| |dkr|nd|dkrt
|nd|||dd	 qW d   n	1 sw   Y  t|}t|}t||\}}t|| ||d}|duretd t|}|jddd |d  |d!  |d" |d#  |S )$a  
    Prepare manifests for CMU Kids corpus. The prepared supervisions contain the
    prompt text as the `text`. Additionally, in the `custom` tag, we provide the
    following data: speaker grade/age, population where the speaker came from
    (SIM95/FP), spoken transcript, and transcription bin (1/2).

    Here, bin `1` means utterances where the speaker followed the prompt and no
    noise/mispronunciation is present, and `2` refers to noisy utterances.

    The tag `spoken_transcript` is the transcription that was actually spoken. It
    contains noise tags and phone transcription in case the pronunciation differed
    from that in CMU Dict.

    :param corpus_dir: Path to downloaded LDC corpus.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param absolute_paths: Wheter to write absolute paths to audio sources (default = False)
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    cmu_kidstableszsentence.tblr	Nzspeaker.tbl   /ztranscrp.tbl   )maxsplitr         kidssignalz.sph   )relative_path_depthmMaleFemaleEnglishNA)speaker_gradespeaker_agespeaker_populationbinspoken_transcript)	idrecording_idstartdurationspeakergenderlanguagetextcustom)
recordingssupervisionsz Writing manifests to JSONL filesT)parentsexist_okr2   z cmu-kids_recordings_all.jsonl.gzr3   z"cmu-kids_supervisions_all.jsonl.gz)
isinstancestrr   stemparentopenstripsplitrangenextintr   	from_fileappendr
   r,   r   from_recordingsr   from_segmentsr	   r   logginginfomkdirto_file)r   r   r   r2   r3   
utterancesflineuttcountr0   speaker_info_spkpopgr_agegradeagetrn_id
transcriptr'   
audio_path	recording	manifests rY   K/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/cmu_kids.pyprepare_cmu_kids,   s   



$



r[   )NT)__doc__rD   pathlibr   typingr   r   r   lhotser   lhotse.audior   r   	lhotse.qar	   lhotse.supervisionr
   r   lhotse.utilsr   boolr7   r[   rY   rY   rY   rZ   <module>   s(     