o
    Si,+                     @   s,  d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZm	Z	m
Z
mZmZ d dlmZmZ d dlmZmZmZ d dlmZmZ d dlmZmZmZ d	Zd
ZdZdZ			ddedee dee  defddZ!G dd de
Z"G dd de
Z#	d dededee de	e e	e eeef f f fddZ$dS )!    N)defaultdict)Path)AnyDict
NamedTupleOptionalUnion)fix_manifests$validate_recordings_and_supervisions)AudioSource	RecordingRecordingSet)SupervisionSegmentSupervisionSet)Pathlikeresumable_downloadsafe_extractzheroico-answers.txtzheroico-recordings.txtzusma-prompts.txt)traindevtesttest.F#http://www.openslr.org/resources/39
target_dirforce_downloadurlreturnc                 C   s   t | } | jddd d}| | }| d }| r'td| d| d | S t| d| ||d	 t|}t|| d
 W d    n1 sIw   Y  |	  | S )NTparentsexist_okzLDC2006S37.tar.gzz
.completedz	Skipping z	 because z exists./)filenamer   )path)
r   mkdiris_filelogginginfor   tarfileopenr   touch)r   r   r   tar_nametar_pathcompleted_detectortar r-   J/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/heroico.pydownload_heroico   s    r/   c                   @   s&   e Zd ZU eed< eed< eed< dS )HeroicoMetaData
audio_path
audio_infotextN)__name__
__module____qualname__r   __annotations__r   strr-   r-   r-   r.   r0   ,   s   
 r0   c                   @   s>   e Zd ZU eed< eed< eed< eed< eed< eed< dS )UttInfofoldspeaker	prompt_id	subcorpusutterance_id
transcriptN)r4   r5   r6   r8   r7   r-   r-   r-   r.   r9   2   s   
 r9   
speech_dirtranscript_dir
output_dirc           (   	      s@  ddl }t| } t|}|  sJ d|  | s"J d| |dur1t|}|jddd tt}td}td}td}td	}td	}	td
}
td}td}td}td}td}tt}t|t}t	|dd2}|D ]'}|
 }||sq~|jdd\}}|d\}}dd||g}|||< q~W d   n1 sw   Y  t|t}t	|dd*}|D ]}|
 }||sq|jdd\}}dd|g}|||< qW d   n1 sw   Y  t|t}t	|dd-}|D ]"}|
 }|
|sq |jdd\}}dd|g}|||< q W d   n	1 s.w   Y  | d}i }|D ]8}t|}|j} |j}!t|t|r~|jd }"dd|"|!g}||vrld|t|< q<td|"|!d||| d|t|< q<t|t|r|jd }"dd|"|!g}dd|!g}#||"sd|t|< ||!sd|t|< q<td|"|!d|||# d|t|< q<t|t|r|jd }"dd|"|!g}dd|!g}#||"sd|t|< q<td|"|!d|||# d|t|< q<t|!dkst|!dkr8|jd }"dd|"|!g}dd|!g}#td|"|!d|||# d|t|< q<t|!dkrlt|!dk rl|jd }"dd |"|!g}dd |!g}#td!|"|!d |||# d|t|< q<td"|  q<| d}d#d$ |D }$tD ]}%i |$D ]I}t|}|t| sq|t| j|%krq|j} |j}|t|}&|jd }"d|t| j|"|g}t||&|t| jd%|< qt fd&d'D  t!" fd(d' j#D }'t$ |'\ }'t% |' |dur|'&|d)|% d*   &|d+|% d*   |'d,||%< q|S )-a  
    Returns the manifests which consist of the Recordings and Supervisions

    :param speech_dir: Pathlike, the path of the speech data dir.
    :param transcripts_dir: Pathlike, the path of the transcript data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the fold, and the value is Dicts with the keys 'audio' and 'supervisions'.
    r   NzNo such directory: Tr   z\d+/\d+\t.+Answers_Spanishz\d+\t.+Recordings_Spanishzs\d+\t.+z2usma/native\-[fm]\-\w+\-\S+\-\S+\-\S+\-\S+\-\w+\d+zusma/nativezs\d+zWnonnative\-[fm]\-[a-zA-Z]+\d*\-[a-zA-Z]+\-[a-zA-Z]+\-[a-zA-Z]+\-[a-zA-Z]+\-[a-zA-Z]+\d+znonnative.+\.wavz
iso-8859-1)encoding   )maxsplitr   -answerszheroico-recitationsusmaz*.wavr   )r:   r;   r<   r=   r>   r?   r   ib  i2  zheroico-recitations-repeatsr   zNo such file: c                 S   s   g | ]}|qS r-   r-   ).0wr-   r-   r.   
<listcomp>   s    z#prepare_heroico.<locals>.<listcomp>)r1   r2   r3   c              	   3   sV    | ]&}t |td dgt | jdgt | jj | jj | jjdV  qdS )filer   )typechannelssource)idsourcessampling_ratenum_samplesdurationN)	r   r   r8   r1   intr2   
samplerateframesrW   rL   idx)metadatar-   r.   	<genexpr>   s    


z"prepare_heroico.<locals>.<genexpr>c                 3   s@    | ]}t ||d  j| jdd|dd | jdV  qdS )g        r   SpanishrH   rK   )rS   recording_idstartrW   channellanguager;   r3   N)r   
recordingsrW   splitr3   r[   audior]   r-   r.   r^   
  s    

heroico_supervisions_z	.jsonl.gzheroico_recordings_)rd   supervisions)'	soundfiler   is_dirr"   r   dictrecompileheroico_dataset_answersr'   rstripmatchre   joinheroico_dataset_recordingsusma_datasetrglobpartsstemfindallr8   r9   rX   r$   warningfoldsr:   r%   r=   r0   r?   r   from_recordingsr   from_segmentsrd   r	   r
   to_file)(r@   rA   rB   rk   	manifestsanswers_line_patternanswers_path_pattern heroico_recitations_line_pattern(heroico_recitations_devtest_path_pattern&heroico_recitations_train_path_patternusma_line_patternusma_native_demo_patternusma_native_path_patternusma_native_prompt_id_patternusma_nonnative_demo_patternusma_nonnative_path_patterntranscriptsanswers_trans_pathflinespk_uttr3   spk_idr<   utt_idheroico_recitations_trans_pathr\   usma_trans_pathaudio_pathsuttdatawav_filewav_pathpath_componentspidspktrans_idaudio_filesfldr%   supervisionr-   rf   r.   prepare_heroico;   sR  

























	



r   )r   Fr   )N)%r$   rn   r&   collectionsr   pathlibr   typingr   r   r   r   r   lhotser	   r
   lhotse.audior   r   r   lhotse.supervisionr   r   lhotse.utilsr   r   r   rp   rt   ru   r{   boolr8   r/   r0   r9   r   r-   r-   r-   r.   <module>   sL    
