o
    Si%                     @   s  d Z ddlZddlZddlZddlZddlZddlmZm	Z	 ddl
mZ ddlmZ ddlmZmZmZmZmZ ddlmZ ddlmZmZ dd	lmZmZ dd
lmZmZ ddlm Z m!Z! ddgZ"ddgZ#dee e$f dee fddZ%dee ee&df f dee fddZ'deee$ee$e$f f e f dee fddZ(dee  de$de$dee fddZ)e"e#dd fd!e d"e d#ee$ d$ee$ d%e*d&e&dee$eeef f fd'd(Z+dS ))aO  
About the Fisher English Part 1,2 corpus

    This is conversational telephone speech collected as 2-channel, 8kHz-sampled data.
    The catalog number LDC2004S13 and LDC2005S13 for audio corpora and LDC2004T19 LDC2005T19 for transcripts.

    This data is not available for free - your institution needs to have an LDC subscription.
    N)ProcessPoolExecutorThreadPoolExecutorPath)CalledProcessError)DictListOptionalTupleUnion)tqdm)	RecordingRecordingSet)fix_manifests$validate_recordings_and_supervisions)SupervisionSegmentSupervisionSet)Pathlikecheck_and_rglob
LDC2004S13
LDC2005S13
LDC2004T19
LDC2005T19fold_path_and_patternreturnc                 C   s   t |  S )N)r   )r    r   Q/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/fisher_english.py	get_paths   s   r   audio_path_and_rel_path_depthc                 C   s.   | \}}zt j||dW S  ty   Y d S w )N)relative_path_depth)r   	from_filer   )r   
audio_pathrel_path_depthr   r   r   create_recording#   s   r#   sessions_and_transcript_pathc                    s   | \t  sg S ddd jdd tdd>}dd	 | D d
d  dd	 D dd	 D dkrIdd	 D  fdd	tD }W d    |S 1 scw   Y  |S )Nr      AB_   rutf8c                 S   s   g | ]}| d qS )
)rstrip.0lr   r   r   
<listcomp>;   s    z&create_supervision.<locals>.<listcomp>   c                 S   s    g | ]}|  d kr| qS  )stripsplitr/   r   r   r   r2   <   s     c                 S   sL   g | ]"}t |d  t |d |d dd ddd |dd D gqS )	r   r%   r*   N c                 S   s   g | ]
}|  d kr|qS r4   )r6   )r0   wr   r   r   r2   B   s    z1create_supervision.<locals>.<listcomp>.<listcomp>r3   )floatjoinr/   r   r   r   r2   =   s    

11487c                 S   s:   g | ]}|d  dkr|d dkrdg|dd n|qS )r   gףp=
?@r%   gRAm@g{Gl@Nr   r/   r   r   r   r2   I   s    ,c                    s~   g | ];\}}t jd  t|ttt jt|d dt|d |d  d |d  |d d |d  dqS )-r   r3   r%   r*   English)idrecording_idstartdurationchanneltextlanguagespeaker)r   stemstrzfilllenround)r0   kr1   channel_to_intlines
session_idsessionstranscript_pathr   r   r2   N   s    "
)r   is_filerH   r7   codecsopen	readlines	enumerate)r$   trans_fsegmentsr   rN   r   create_supervision0   s.   

""r[   dirspattern	pbar_descc           
   	      s    fdd| D }d gt |  }tt | t d }t|5}tt ||d}t|t|D ]\}}	|	||< |	  q1W d    n1 sHw   Y  W d    n1 sWw   Y  t
tj|}|S )Nc                    s   g | ]}t | fqS r   r   )r0   dir_pathr]   r   r   r2   c   s    z&walk_dirs_parallel.<locals>.<listcomp>   totaldesc)rK   minos	cpu_countr   r   rX   mapr   updatesorteditchainfrom_iterable)
r\   r]   r^   get_path_inputsoutput_pathsnjobsexecutorpbarrM   tmp_output_pathsr   r`   r   walk_dirs_parallel_   s    


rt   Fr%   
corpus_dir
output_dir
audio_dirstranscript_dirsabsolute_pathsnum_jobsc           #   
      sl  t | } t |}|jddd || D ]}| | }| s(td| d|  dqg }|D ]}	| |	 }
|
 D ]}|d }|| 7 }q7q-g }|D ]}| | d d }|| 7 }qIt|d	d
}t|dd}i |D ];}t| | d dd }t|dd}dd |	 D dd }
dd |D  W d   n1 sw   Y  qjt|t|ksJ t| dt| t|tkrtdt dt| d |d }| rtd|  t|}ntd  fdd|D }d}t|V}t|@}tt|d d!'}|t|D ]}|dur"|j|dd" n|d7 }|
  qW d   n	1 s7w   Y  W d   n	1 sGw   Y  W d   n	1 sWw   Y  |rmtd#t| d$| d% | }|d& }| rtd'|  t|}ntd( fd)d|D }d} tt d* W}t|A}tt|d+d!(}|t|D ]}!|!s| d7 } |!D ]}"||" q|
  qW d   n	1 sw   Y  W d   n	1 sw   Y  W d   n	1 sw   Y  | }| rtd#t| d,|  d% t||\}}t || |!|j"d-  |!|j"d.  ||d/S )0a  
    Prepares manifests for Fisher English Part 1, 2.
    Script assumes that audio_dirs and transcript_dirs are in the corpus_path.
    We create two manifests: one with recordings, and the other one with text supervisions.

    :param corpus_path: Path to Fisher corpus
    :param audio_dirs: List of dirs of audio corpora.
    :param transcripts_dirs: List of dirs of transcript corpora.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param absolute_paths: Whether to return absolute or relative (to the corpus dir) paths for recordings.
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    T)parentsexist_okzCould not find 'z' directory inside 'z'.audiodatatransz*.sphzParsing audio sub-dirsz*.txtzParsing transcript sub-dirsdocz*_calldata.tblr   r+   r,   c                 S   s   g | ]
}| d dqS )r-   ,)r.   r7   r/   r   r   r   r2          z*prepare_fisher_english.<locals>.<listcomp>r%   Nc                 S   s$   i | ]}|d  |d |d dqS )r      
   r&   r   r/   r   r   r   
<dictcomp>   s   $ z*prepare_fisher_english.<locals>.<dictcomp>z == z7Fisher's *_calldata.tbl files indicate there should be zS sessions, but our scanning of audio and transcript files indicates there are only .zrecordings_notfixed.jsonl.gzz%Using existing recording manifest at z!Building fresh recording manifestc                    s   g | ]
}| r	d ndfqS )Nr   r   r0   p)ry   r   r   r2      r   zCollect recordingsrb   )flushzOut of z recordings, z had errors and were omitted.zsupervisions_notfixed.jsonl.gzz'Using existing supervision manifest at z#Building fresh supervision manifestc                    s   g | ]} |fqS r   r   r   )rR   r   r   r2      s    ra   zCreate supervisionsz transcript files, z&fisher-english_recordings_all.jsonl.gzz(fisher-english_supervisions_all.jsonl.gz)
recordingssupervisions)#r   mkdiris_dir
ValueErroriterdirrt   r   rU   rV   rW   ri   rK   warningswarnrT   logginginfor   from_jsonl_lazyr   open_writerr   rh   r#   writeopen_manifestr   r   rf   rg   r[   r   r   to_fileparent)#ru   rv   rw   rx   ry   rz   workdirworkdir_pathaudio_subdir_paths	audio_diraudio_dir_pathaudio_partition_diraudio_partition_dir_pathtranscript_subdir_pathstranscript_dirtranscript_dir_pathaudio_pathstranscript_pathssessions_data_pathsessions_data_ftmp_sessions	recs_pathr   create_recordings_input	err_recosrq   writerrr   reco	sups_pathr   create_supervisions_inputerr_supstmp_supervisionssr   )ry   rR   r   prepare_fisher_englishr   s  


 



 

r   ),__doc__rU   	itertoolsrk   r   rf   r   concurrent.futuresr   r   pathlibr   
subprocessr   typingr   r   r	   r
   r   	tqdm.autor   lhotse.audior   r   	lhotse.qar   r   lhotse.supervisionr   r   lhotse.utilsr   r   FISHER_AUDIO_DIRSFISHER_TRANSCRIPT_DIRSrI   r   intr#   r[   rt   boolr   r   r   r   r   <module>   sp    	

/
