o
    Si:#                     @   s  d Z ddlZddlZddlZddlmZ ddlmZ ddlm	Z	m
Z
mZmZmZmZ ddlmZ ddlmZmZmZ ddlmZmZ dd	lmZ dd
lmZmZ ddlmZmZ ddl m!Z!m"Z" dZ#dd dD Z$dZ%dZ&e'(e%e&Z)dd Z*dd Z+			d'de!deee'ee' f  de,defddZ-	d(d e!d!ee! de,de
e'e
e'eeef f f fd"d#Z.d$e	deeeef  fd%d&Z/dS ))a  
ReazonSpeech is an open-source dataset that contains a diverse set of natural Japanese speech,
collected from terrestrial television streams. It contains more than 35,000 hours of audio.

The dataset is available on Hugging Face. For more details, please visit:

Dataset: https://huggingface.co/datasets/reazon-research/reazonspeech
Paper: https://research.reazon.jp/_static/reazonspeech_nlp2023.pdf
    N)defaultdict)Path)AnyDictOptionalSequenceTupleUnion)tqdm)CutSetfix_manifests$validate_recordings_and_supervisions)	RecordingRecordingSet)parallel_map)manifests_existread_manifests_if_cached)SupervisionSegmentSupervisionSet)Pathlikeis_module_available)tinysmallmediumlargeallsmall-v1z	medium-v1zall-v1c                 C   s   i | ]}t |d qS ) )ord).0x r!   O/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/reazonspeech.py
<dictcomp>&   s    r#   u!   、。「」『』，,？！!!?!?u   ａｂｃｄｅｆｇｈｉｊｋｌｍｎｏｐｑｒｓｔｕｖｗｘｙｚＡＢＣＤＥＦＧＨＩＪＫＬＭＮＯＰＱＲＳＴＵＶＷＸＹＺ０１２３４５６７８９>abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789c                    sD   t dr	ddl ntd| tt}  fdd}td|| S )z
    Convert full-width characters to half-width, and remove punctuations.
    :param s: str, input string.
    :return: str, normalized string.
    	num2wordsr   Nz]To process the ReazonSpeech corpus, please install optional dependency: pip install num2wordsc                    s    j | dddS )Nr   ja)lang)r%   group)mr%   r!   r"   <lambda>9   s    znormalize.<locals>.<lambda>z	\d+\.?\d*)r   r%   ImportError	translatePUNCTUATIONSZEN2HANresub)sconvr!   r*   r"   	normalize,   s   
r4   c                 C   sF   t |ddd}tj| |ddd W d   dS 1 sw   Y  dS )z
    Writes data to a JSON file.
    :param data: The data to write.
    :param filename: The name of the file to write to.
    wutf-8encodingF   )ensure_asciiindentN)openjsondump)datafilenamefr!   r!   r"   write_to_json=   s   "rB   .auto   
target_dirdataset_partsnum_jobsreturnc           	         s   t drddl ddlm}m} ntdt| } | jddd | d }|d	kr+d
}nt|t	r3|g}|D ]}t
d|  |d|d||dd }q5dtdtdtf fdd}|d|dd}|j|d|j|d}|j|d |ddd|jd |S )aY  
    Download the ReazonSpeech dataset.
    :param target_dir: Pathlike, the path of the dir to storage the dataset.
    :param dataset_parts: the parts of the dataset to download (e.g. small, medium, or large).
    :param num_jobs: the number of processes to download and format.
    :return: the path to downloaded data and the JSON file.
    datasetsr   N)Audioload_datasetzhTo process the ReazonSpeech corpus, please install optional dependencies: pip install datasets soundfileTparentsexist_okReazonSpeechrD   )r   zDownloading ReazonSpeech part: zreazon-research/reazonspeech)trust_remote_code	cache_dirnum_proctrainexampleidxrI   c                    sH   t || d< | d d | d< t| d | d<  | d d j| d< | S )Nidaudiopathaudio_filepathtranscriptiontextduration)strr4   infor]   )rU   rV   sfr!   r"   format_exampleo   s
   z-download_reazonspeech.<locals>.format_examplerX   )decode)with_indicesremove_columnsrS   dataset.jsonFr9   )rS   force_asciir;   lines
batch_size)r   	soundfilerJ   rK   rL   r,   r   mkdir
isinstancer^   loggingr_   dictintcast_columnmapcolumn_namesto_jsonnum_rows)	rF   rG   rH   rK   rL   
corpus_dirpartdsrb   r!   r`   r"   download_reazonspeechH   sR   
		rx   ru   
output_dirc                 C   s  t | } |  sJ d|  t| d ddd4}t|}|dd }|dd }|dd }t|| d	  t|| d
  t|| d  W d   n1 sOw   Y  d}t |}|jddd t||dddd}	|D ]}
t	d|
  t
|
|dddrt	d|
 d ql| |
 d }t|ddd}t|}W d   n1 sw   Y  t|d|
 d }t|d|
 d f}t|d|
 d J}ttt||dddD ]6\}}tt|gt|gd\}}t||d tj||d}||d  ||d  ||d  qW d   n	1 sw   Y  W d   n	1 s,w   Y  W d   n	1 s<w   Y  t|jt|jt|jd |	|
< qlt|	S )!a  
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply read and return them.
    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param num_jobs: int, number of parallel threads used for 'parse_utterance' calls.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'.
    zNo such directory: rf   rr6   r7   Ni  iL  z
train.jsonzdev.jsonz	test.json)rT   devtestTrM   reazonspeechzjsonl.gz)rG   ry   prefixsuffixlazyz Processing ReazonSpeech subset: )rv   ry   r~   r   zReazonSpeech subset: z already prepared - skipping.z.jsonreazonspeech_recordings_z	.jsonl.gzreazonspeech_supervisions_reazonspeech_cuts_)rH   z$Processing reazonspeech JSON entries)desc)
recordingssupervisionsr   )r   r   cuts)r   is_dirr<   r=   loadrB   rk   r   rm   r_   r   r   open_writerr   r   r
   r   parse_utterancer   from_recordingsfrom_segmentsr   from_manifestswritefrom_jsonl_lazyrY   rn   )ru   ry   rH   filefullr{   r|   rT   parts	manifestsrv   r@   items
rec_writer
sup_writer
cut_writer	recordingsegmentr   segmentsr   r!   r!   r"   prepare_reazonspeech   s   

	


  
"

r   itemc              	   C   sD   t j| d | d d}t| d | d d| d dd| d d	}||fS )
z
    Process a single utterance from the ReazonSpeech dataset.
    :param item: The utterance to process.
    :return: A tuple containing the Recording and SupervisionSegment.
    rZ   rW   )recording_idg        r]   r   Japaneser\   )rW   r   startr]   channellanguager\   )r   	from_filer   )r   r   r   r!   r!   r"   r      s   	r   )rC   rD   rE   )rE   )0__doc__r=   rm   r0   collectionsr   pathlibr   typingr   r   r   r   r   r	   	tqdm.autor
   lhotser   r   r   lhotse.audior   r   lhotse.parallelr   lhotse.recipes.utilsr   r   lhotse.supervisionr   r   lhotse.utilsr   r   REAZONSPEECHr.   ZENKAKUHANKAKUr^   	maketransr/   r4   rB   ro   rx   r   r   r!   r!   r!   r"   <module>   sX    
 
F
"^