o
    Si                     @   s>  d Z ddlZddlZddlZddlmZ ddlmZmZm	Z	m
Z
 ddlmZ ddlmZmZmZmZmZ ddlmZ ddlmZmZmZ d	Zd
ZddddddddddddddZdddddddddZdedefdede	e dee dee def
ddZ	d'd ed!ee deee
eef f fd"d#Z d$edefd%d&Z!dS )(a  
The CMU_INDIC databases were constructed at the Language Technologies Institute at Carnegie Mellon University as phonetically balanced, single speaker databases designed for corpus based speech synthesis research. They are covering major languages spoken in the Indian subcontinet.
The distributions include the raw waveform files, with transcriptions in the language's native script (etc/txt.done.data file), and also complete built synthesis voices from these databases using CMU Clustergen statistical parameteric speech synthesizer.

Complete android voices for CMU Flite are voice built from these databases are available in the Google Play store. You can hear voices built from these databases here

CMU INDIC Databases

All 13 voices are available from packed
do_indic a script to download and build a full voice from these databases (assuming FestVox build tools are all installed.
These packed versions contain only the waveform files, and the txt.done.data file.
Acknowledgements

These datasets were collected and developed with help from Hear2Read. We acknowledge their contributions to making these practical languages for festvox. Special Thanks for to Suresh Bazaj.

Source: http://festvox.org/cmu_indic/
    N)Path)DictOptionalSequenceUnion)tqdm)	RecordingRecordingSetSupervisionSegmentSupervisionSet$validate_recordings_and_supervisions)fix_manifests)Pathlikeresumable_downloadsafe_extractzhttp://festvox.org/h2r_indic/)ben_rmguj_adguj_dpguj_kthin_abkan_plvmar_aupmar_slppan_amptam_sdrtel_kpntel_sktel_ssFM)r   r   r   r   r   r   r   r   r   r   r   r   r   BengaliGujaratiKannadaHindiMarathiPunjabiTamilTelugu)bengujkanhinmarpantamtel.F
target_dirspeakersforce_downloadbase_urlreturnc              	   C   s   t | } | jddd t|ddD ]\}d| }| d}| | }| | }| | }	|	d }
|
 r@td| d	|
 d
 qt|||d tj|	dd t	
|}t|| d W d   n1 sdw   Y  |
  q| S )a  
    Download and untar the CMU Indic dataset.

    :param target_dir: Pathlike, the path of the dir to storage the dataset.
    :param speakers: a list of speakers to download. By default, downloads all.
    :param force_download: Bool, if True, download the tars no matter if the tars exist.
    :param base_url: str, the url of CMU Arctic download site.
    :return: the path to downloaded and extracted directory with data.
    T)parentsexist_okz(Downloading/unpacking CMU Indic speakers)desc
cmu_indic_z.tar.bz2z
.completedzSkiping z	 because z exists.)filenamer3   )ignore_errors)pathN)r   mkdirr   is_filelogginginfor   shutilrmtreetarfileopenr   touch)r1   r2   r3   r4   spknametar_namefull_urltar_pathpart_dircompleted_detectortar rN   L/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/cmu_indic.pydownload_cmu_indicT   s&   


rP   
corpus_dir
output_dirc                 C   s  t | } |  sJ d|  tdd | dD }g }| dD ]}|  }t|jjj	}|
dd }zt|jd   d	 d
d }W n   d}Y |D ]U}	|	dd }	|	j
d	d\}
}| d|
 }
t| }d|
v }d}|s~|duri }|r||d< |dur||d< |t|
|
d||
 j|dd|rdn||t||d	 qWq#t|}t||\}}t|| |durt |}||d  ||d  ||dS )a-  
    Prepares and returns the CMU Indic manifests,
    which consist of Recordings and Supervisions.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a dict of {'recordings': ..., 'supervisions': ...}
    zNo such directory: c                 s   s4    | ]}t j|t|jjj d |j dV  qdS )-)recording_idN)r   	from_file_get_speakerparentrG   stem).0wavrN   rN   rO   	<genexpr>   s    
z$prepare_cmu_indic.<locals>.<genexpr>z*.wavztxt.done.data_r   zvoice.feats   zage  N   maxsplitrS   arcticaccentage"English)	idrT   startdurationtextlanguagespeakergendercustomz!cmu-indic_recordings_all.jsonl.gzz#cmu-indic_supervisions_all.jsonl.gz)
recordingssupervisions)r   is_dirr	   from_recordingsrglob	read_text
splitlinesrV   rW   rG   splitintreplacestripLANGUAGE_MAPappendr
   rj   
GENDER_MAPgetr   from_segmentsr   r   to_file)rQ   rR   rp   rq   r<   linesrm   	lang_codere   lseg_idrk   rl   
is_englishro   rN   rN   rO   prepare_cmu_indicy   sp   





r   dirnamec                 C   s   | j dddd S )Nr\   r_   ra   )rw   )r   rN   rN   rO   rV      s   rV   )N)"__doc__r?   rA   rC   pathlibr   typingr   r   r   r   	tqdm.autor   lhotser   r	   r
   r   r   	lhotse.qar   lhotse.utilsr   r   r   BASE_URLSPEAKERSr}   r{   strboolrP   r   rV   rN   rN   rN   rO   <module>   sv    
'
W