o
    Sib                     @   sV  d Z ddlZddlmZ ddlmZ ddlmZ ddlm	Z	m
Z
mZmZmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZmZ ddlmZmZ ddlmZ dZdZdZde
e de de	e e	e eeef f f fddZ!de dedede
eeef  fddZ"			d!dede
e dee ee  f de#de	e e	e eeef f f f
dd Z$dS )"z
Description taken from the abstract of paper:
"GigaSpeech 2: An Evolving, Large-Scale and Multi-domain ASR Corpus for Low-Resource Languages with Automated Crawling, Transcription and Refinement"
https://arxiv.org/abs/2406.11546
    N)defaultdict)ProcessPoolExecutor)Path)DictOptionalSequenceTupleUnion)tqdm)load_manifest)	RecordingRecordingSet)fix_manifests$validate_recordings_and_supervisions)SupervisionSegmentSupervisionSet)Pathlikez7https://huggingface.co/datasets/speechcolab/gigaspeech2)thidvi	train_rawtrain_refineddevtest
output_dirlanguagereturnc              
   C   sb   | du ri S t t}dD ]"}dD ]}| d| d| d| d }| s%qt||| |< qq|S )z
    Returns:
        {
          "train_raw": {"recordings": ..., "supervisions": ...},
          "train_refined": ...,
          "dev": ...,
          "test": ...,
        }
    Nr   
recordingssupervisionsgigaspeech2-_	.jsonl.gz)r   dictis_filer   )r   r   	manifestspartmanifestpath r*   N/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/gigaspeech2.py_read_manifests_if_cached   s   r,   langpart_dir
audio_infoc              	   C   s   | d\}}|j| dd d  | d }| }| s*td|  d S tj||d}t||d|j	d| |
 d	}||fS )
N	-z.wavzNo such file: )r)   recording_idg        r   )r   r3   startdurationchannelr   text)splitjoinpathresolver%   loggingwarningr   	from_filer   r5   strip)r-   r.   r/   
segment_idr7   
audio_path	recordingsegmentr*   r*   r+   _parse_utterance4   s(   "
rC   auto   
corpus_dir	languagesnum_jobsc                 C   s  t | } |  sJ d|  | d } |dkr0ttdd | dD }|s/td|  nt|tr8|g}|durGt |}|j	d	d	d
 t
t}t|ddD ]}td|  | | }t||d}ttddD ]}td|  ||v rtd| d| d qlt|d| d| d }	t|d| d| d }
||dddd }|| d }g }t|}|  }W d   n1 sw   Y  t|Y}g }t|ddD ]}||t||| qt|ddD ]4}| }|du rq|\}}tt|gt|gd\}}t ||d |	!|d  |
!|d  qW d   n	1 s5w   Y  t"|	j#t"|
j#d||< W d   n	1 sTw   Y  W d   n	1 sdw   Y  ql|||< qQt|S ) a  
    Returns the manifests which consist of the Recordings and Supervisions
    :param corpus_dir: Path to the GigaSpeech 2 dataset.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param languages: 'auto' (prepare all discovered data) or a list of language codes.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    zNo such directory: datarD   c                 s   s    | ]}|j V  qd S )N)name).0r)   r*   r*   r+   	<genexpr>e   s    
z&prepare_gigaspeech2.<locals>.<genexpr>*z1Could not find any of GigaSpeech 2 languages in: NT)parentsexist_okz!Processing GigaSpeech 2 languages)descz
Language: )r   r   zProcessing GigaSpeech 2 subsetz Processing GigaSpeech 2 subset: zGigaSpeech 2  z already prepared - skipping.r!   _recordings_r#   _supervisions__raw _refinedz.tsvzDistributing tasks
Processingr   r   )$r   is_dirsetGIGASPEECH2_LANGSintersectionglob
ValueError
isinstancestrmkdirr   r$   r
   r;   infor,   GIGASPEECH2_SPLITSr   open_writerr   replaceopenread
splitlinesr   appendsubmitrC   resultr   from_recordingsfrom_segmentsr   writefrom_jsonl_lazyr)   )rF   r   rG   rH   r&   r-   lang_dirlang_manifestsr'   
rec_writer
sup_writerr.   tsv_pathaudio_infosfexfuturesr/   futurerj   rA   rB   r   segmentsr*   r*   r+   prepare_gigaspeech2S   s   








 
+rz   )NrD   rE   )%__doc__r;   collectionsr   concurrent.futures.processr   pathlibr   typingr   r   r   r   r	   	tqdm.autor
   lhotser   lhotse.audior   r   	lhotse.qar   r   lhotse.supervisionr   r   lhotse.utilsr   GIGASPEECH2_URLrZ   rb   r_   r,   rC   intrz   r*   r*   r*   r+   <module>   sZ    

!