o
    Si                     @   s0  d Z ddlZddlZddlZddlmZ ddlmZmZm	Z	m
Z
 ddlmZ ddlmZmZmZmZmZ ddlmZ ddlmZmZmZ d	Zd
ZddddddddZddddddddZdedefdede	e dee dee def
ddZ	d#dedee deee
eef f fddZ d edefd!d"Z!dS )$a  
The CMU_ARCTIC databases were constructed at the Language Technologies Institute at Carnegie Mellon University
as phonetically balanced, US English single speaker databases designed for unit selection speech synthesis research.

A detailed report on the structure and content of the database and the recording environment etc is available as a
Carnegie Mellon University, Language Technologies Institute Tech Report CMU-LTI-03-177 and is also available here:
http://www.festvox.org/cmu_arctic/cmu_arctic_report.pdf

The databases consist of around 1150 utterances carefully selected from out-of-copyright texts from Project Gutenberg.
The databses include US English male (bdl) and female (slt) speakers (both experinced voice talent) as well as
other accented speakers.

The 1132 sentence prompt list is available from cmuarctic.data:
http://www.festvox.org/cmu_arctic/cmuarctic.data

The distributions include 16KHz waveform and simultaneous EGG signals.
Full phoentically labelling was perfromed by the CMU Sphinx using the FestVox based labelling scripts.
Complete runnable Festival Voices are included with the database distributions, as examples though better voices
can be made by improving labelling etc.

Note: The Lhotse recipe is currently not downloading or using the phonetic labeling.
    N)Path)DictOptionalSequenceUnion)tqdm)	RecordingRecordingSetSupervisionSegmentSupervisionSet$validate_recordings_and_supervisions)fix_manifests)Pathlikeresumable_downloadsafe_extractz%http://festvox.org/cmu_arctic/packed/)aewahwaupawbaxbbdlclbeeyfemgkajmkkspljmlnhrmsrxrslpsltmalefemale)r   r"   r   r   r   r   r   z
US MidwestUSzCanadian OntariozScottish South EasternIndian.F
target_dirspeakersforce_downloadbase_urlreturnc              	   C   s   t | } | jddd t|ddD ]]}d| d}| d}| | }| | }| | }	|	d }
|
 rAtd	| d
|
 d qt|||d tj|	dd t	
|}t|| d W d   n1 sew   Y  |
  q| S )a  
    Download and untar the CMU Arctic dataset.

    :param target_dir: Pathlike, the path of the dir to storage the dataset.
    :param speakers: a list of speakers to download. By default, downloads all.
    :param force_download: Bool, if True, download the tars no matter if the tars exist.
    :param base_url: str, the url of CMU Arctic download site.
    :return: the path to downloaded and extracted directory with data.
    T)parentsexist_okz)Downloading/unpacking CMU Arctic speakers)desccmu_us__arcticz.tar.bz2z
.completedzSkiping z	 because z exists.)filenamer*   )ignore_errors)pathN)r   mkdirr   is_filelogginginfor   shutilrmtreetarfileopenr   touch)r(   r)   r*   r+   spknametar_namefull_urltar_pathpart_dircompleted_detectortar rF   M/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/cmu_arctic.pydownload_cmu_arcticW   s&   

rH   
corpus_dir
output_dirc           
      C   s0  t | } |  sJ d|  tdd | dD }g }| dD ]H}|  }t|jjj	}|D ]6}|dd }|j
dd	\}}	| d
| }|t||d|| j|	ddd|t|dt|id	 q4q#t|}t||\}}t|| |durt |}||d  ||d  ||dS )a.  
    Prepares and returns the CMU Arctic manifests,
    which consist of Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict of {'recordings': ..., 'supervisions': ...}
    zNo such directory: c                 s   s4    | ]}t j|t|jjj d |j dV  qdS )-)recording_idN)r   	from_file_get_speakerparentr?   stem).0wavrF   rF   rG   	<genexpr>   s    
z%prepare_cmu_arctic.<locals>.<genexpr>z*.wavztxt.done.data      )maxsplitrK   r   " Englishaccent)	idrL   startdurationtextlanguagespeakergendercustomNz"cmu-arctic_recordings_all.jsonl.gzz$cmu-arctic_supervisions_all.jsonl.gz)
recordingssupervisions)r   is_dirr	   from_recordingsrglob	read_text
splitlinesrN   rO   r?   splitappendr
   r^   replace
GENDER_MAPget
ACCENT_MAPr   from_segmentsr   r   to_file)
rI   rJ   rd   re   r4   linesra   lseg_idr_   rF   rF   rG   prepare_cmu_arctic|   sF   




rv   dirnamec                 C   s   |  dd S )N_rT   )rk   )rw   rF   rF   rG   rN      s   rN   )N)"__doc__r7   r9   r;   pathlibr   typingr   r   r   r   	tqdm.autor   lhotser   r	   r
   r   r   	lhotse.qar   lhotse.utilsr   r   r   BASE_URLSPEAKERSrn   rp   strboolrH   rv   rN   rF   rF   rF   rG   <module>   sh    
'
9