o
    Siq                     @   s"  d Z ddlZddlZddlmZ ddlmZ ddlmZm	Z	m
Z
mZmZmZ ddlmZ ddlmZmZmZ ddlmZmZ dd	lmZ dd
lmZmZ ddlmZmZ dZdede
eeef  fddZ de!dedeeef fddZ"	ddede
e dee!ee!eeef f f fddZ#dS )z
The SpeechIO Chinese data is a collection of test sets covering wide range of speech recognition tasks & scenarios.

Participants can obtain the datasets at https://github.com/SpeechColab/Leaderboard - please download the datasets manually.
    N)defaultdict)Path)DictListOptionalSequenceTupleUnion)tqdm)AudioSource	RecordingRecordingSet)fix_manifests$validate_recordings_and_supervisions)manifests_exist)SupervisionSegmentSupervisionSet)Pathlikeis_module_available   
corpus_dirreturnc                 C   s  g }g }t dstddd l}|jt|  ddd}|d  }|d  }|d	  }t|D ]J\}}	t| |	 }	tj	|	sLt
d
|	 d q2t|	}
||
 || }|| }|dd }t|  d| |d|
jdd||d}|| q2||fS )Npandasz<To prepare speechio data, please 'pip install pandas' first.r   z/metadata.tsv	)sepIDTEXTAUDIOzAudio file z does not exist - skipping._-Chinese)idrecording_idstartdurationchannellanguagespeakertext)r   
ValueErrorr   read_csvstrtolist	enumerateospathexistsloggingwarningr   	from_fileappendsplitr   r$   )r   
recordingssegmentspddfrecording_idstexts	wav_pathsidx
audio_path	recordingr"   r(   r'   segment rA   K/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/speechio.py_parse_one_subset   s>   


rC   subsetc                 C   sP   t |}||  }t|\}}t|}t|}t||\}}t|| ||fS )z
    Returns the RecodingSet and SupervisionSet given a dataset part.
    :param subset: str, the name of the subset.
    :param corpus_dir: Pathlike, the path of the data dir.
    :return: the RecodingSet and SupervisionSet for train and valid.
    )r   rC   r   from_recordingsr   from_segmentsr   r   )rD   r   	part_pathrecording_setsupervision_setrA   rA   rB   _prepare_subsetB   s   



rJ   
output_dirc           	      C   s  t | } |  sJ d|  td g }ttd D ]}| d}|d|  q|dur<t |}|jddd t	t
}t|d	d
D ]D}td|  t||dddrbtd| d qFt|| \}}|dur||d| d  ||d| d  ||d||< qF|S )aF  
    Returns the manifests which consist of the Recordings and Supervisions
    :param corpus_dir: Path to the SpeechIO dataset.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    zNo such directory: zPreparing SpeechIO...      SPEECHIO_ASR_ZH000NT)parentsexist_okzDataset parts)desczProcessing SpeechIO subset: speechiozjsonl.gz)partrK   prefixsuffixzSpeechIO subset: z already prepared - skipping.speechio_supervisions_z	.jsonl.gzspeechio_recordings_)r6   supervisions)r   is_dirr1   inforangeSPEECHIO_TESTSET_INDEXzfillr4   mkdirr   dictr
   r   rJ   to_file)	r   rK   subsetsir=   	manifestsrS   rH   rI   rA   rA   rB   prepare_speechioZ   s:   

rd   )N)$__doc__r1   r.   collectionsr   pathlibr   typingr   r   r   r   r   r	   	tqdm.autor
   lhotse.audior   r   r   	lhotse.qar   r   lhotse.recipes.utilsr   lhotse.supervisionr   r   lhotse.utilsr   r   r\   rC   r+   rJ   rd   rA   rA   rA   rB   <module>   sB     
*

