o
    2wiL                     @   s   d Z ddlZddlmZ ddlmZmZmZmZm	Z	 ddl
ZddlmZmZ ddlmZmZ ddlmZmZ ddlmZmZ d	Zd
Z			ddededee dededeee	eef f fddZddefddZdS )a=  
 About the eval2000 corpus
     2000 HUB5 English Evaluation was developed by the Linguistic Data Consortium (LDC) and
     consists of approximately 11 hours of English conversational telephone speech used in the
     2000 HUB5 evaluation sponsored by NIST (National Institute of Standards and Technology).
     The source data consists of conversational telephone speech collected by LDC:
     (1) 20 unreleased telephone conversations from the Swtichboard studies in which recruited
      speakers were connected through a robot operator to carry on casual conversations about a
      daily topic announced by the robot operator at the start of the call; and
     (2) 20 telephone conversations from CALLHOME American English Speech which consists of
      unscripted telephone conversations between native English speakers.
    N)Path)DictListOptionalTupleUnion)	RecordingRecordingSet)fix_manifests$validate_recordings_and_supervisions)SupervisionSegmentSupervisionSet)Pathlikecheck_and_rglob
LDC2002S09
LDC2002T43F   
corpus_dir
output_dirtranscript_pathabsolute_pathsnum_jobsreturnc                    s0  t | } t |}|jddd | t d d }| s"J d| | t d d }|du r0|nt |}| s?J d| g }|dD ]}	t |	j}
|d	|	i qFt	 fd
d|D }t
|}t|}t||\}}t|| |durt |}|jddd ||d  ||d  ||dS )a  
    Prepares manifests for Eval2000.

    :param corpus_path: Path to global corpus
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param absolute_paths: Whether to return absolute or relative (to the corpus dir) paths for recordings.
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    T)parentsexist_okhub5e_00englishzNo such directory:	referenceNz*.sphaudioc                 3   s*    | ]}t j|d   rdnddV  qdS )r   N   )relative_path_depth)r   	from_file).0groupr    T/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/lhotse/recipes/eval2000.py	<genexpr>?   s    
z#prepare_eval2000.<locals>.<genexpr>z eval2000_recordings_all.jsonl.gzz%eval2000_supervisions_unnorm.jsonl.gz)
recordingssupervisions)r   mkdirEVAL2000_AUDIO_DIRis_dirEVAL2000_TRANSCRIPT_DIRrglobstemappendr	   from_recordingsmake_segmentsr   from_segmentsr
   r   to_file)r   r   r   r   r   audio_partition_dir_pathdefault_transcript_pathtranscript_dir_pathgroupspathbaser(   segmentsr)   r%   r$   r&   prepare_eval2000   s<   



r<   Tomit_silencec                 C   s  g }|  dD ]~}t|j}dd t|D }d}tdt|D ]d}|| r|| }d|d vr|d }t|d }	t|d }
t|
|	 dd	}|d
 dd }|dkrXd}nd}d	|dd  }|d t
| }|}|d | }t|||	||d||d}|| q q|S )Nz*.txtc                 S   s   g | ]}|  qS r%   )split)r"   lr%   r%   r&   
<listcomp>U   s    z!make_segments.<locals>.<listcomp>r   #r      )ndigits   :A r   -English)idrecording_idstartdurationchannellanguagespeakertext)r.   r   r/   openrangelenfloatroundr>   joinstrr   r0   )r7   r=   segment_supervision	text_path
trans_filetrans_file_linesrK   i
trans_linerM   endrN   siderO   	text_line
segment_idrL   rQ   segmentr%   r%   r&   r2   Q   sF   


r2   )NFr   )T)__doc__ospathlibr   typingr   r   r   r   r   numpynplhotse.audior   r	   	lhotse.qar
   r   lhotse.supervisionr   r   lhotse.utilsr   r   r+   r-   boolintrY   r<   r2   r%   r%   r%   r&   <module>   s8    
4