o
    Sif                     @   s  d Z ddlZddlZddlZddlmZ ddlmZ ddl	m
Z
mZmZmZmZ ddlmZ ddlmZ ddlmZmZ dd	lmZ dd
lmZmZ ddlmZmZ dee
ee
eef f ef dee fddZ		ddededee de de
eeeef f f
ddZ!dS )u2  
About the Fisher Spanish corpus

    This is conversational telephone speech collected as 2-channel μ-law, 8kHz-sampled data.
    The catalog number LDC2010S01 for audio corpus and LDC2010T04 for transcripts.

    This data is not available for free - your institution needs to have an LDC subscription.
    N)ThreadPoolExecutor)Path)DictListOptionalTupleUnion)tqdm)RecordingSet)fix_manifests$validate_recordings_and_supervisions)create_recording)SupervisionSegmentSupervisionSet)Pathlikecheck_and_rglobsessions_and_transcript_pathreturnc                    s   | \t tdd1}dd | D dd   dd  D  dd  D   fddt D }W d    |S 1 sCw   Y  |S )	Nrutf8c                 S   s   g | ]}| d qS )
)rstrip.0l r   Q/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/fisher_spanish.py
<listcomp>"   s    z&create_supervision.<locals>.<listcomp>   c                 S   s"   g | ]}|  d kr|dqS ) 	)stripsplitr   r   r   r   r   #   s   " c              
   S   sH   g | ] }t |d  t |d t|d ddd |d  D gqS )   r       c                 S   s   g | ]
}|  d kr|qS )r   )r!   )r   wr   r   r   r   )       z1create_supervision.<locals>.<listcomp>.<listcomp>   )floatintjoinr"   r   r   r   r   r   $   s    


c                    s   g | ]?\}}t jd  t|ttt  jt|d dt|d |d  d|d |d djdd  |d  dqS )	-r   
   r$   r#   r   Spanish_)idrecording_idstartdurationchanneltextlanguagespeaker)r   stemstrzfilllenroundr"   )r   kr   linessessionstranscript_pathr   r   r   .   s    ")r   codecsopen	readlines	enumerate)r   trans_fsegmentsr   r>   r   create_supervision   s   

rH   Faudio_dir_pathtranscript_dir_path
output_dirabsolute_pathsc              	      s  t | t |} }t| d}t|d}t|dd }t|dd}dd | D d	d
 }dd |D W d
   n1 s@w   Y  t|t  krVt|ksYJ  J  fdd|D }	d
gt| }
tt d 5}t	t|dd}t
|t|	D ]\}}||
|< |  qW d
   n1 sw   Y  W d
   n1 sw   Y  t|
}
fdd|D }d
gt| }tt d 5}t	t|dd}t
|t|D ]\}}|||< |  qW d
   n1 sw   Y  W d
   n	1 sw   Y  ttj|}t|dd }t|
|\}
}t|
| |d
urEt |}|jddd |
|d  ||d  |
|dS )aT  
    Prepares manifests for Fisher Spanish.
    We create two manifests: one with recordings, and the other one with text supervisions.

    :param audio_dir_path: Path to audio directory (usually LDC2010S01).
    :param transcript_dir_path: Path to transcript directory (usually LDC2010T04).
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param absolute_paths: Whether to return absolute or relative (to the corpus dir) paths for recordings.
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    z*.sphz*.tdfz
*_call.tblr   r   r   c                 S   s   g | ]
}| d dqS )r   ,)r   r"   r   r   r   r   r   [   s    z*prepare_fisher_spanish.<locals>.<listcomp>r$   Nc                 S   s$   i | ]}|d  |d |d dqS )r   r#      )r   r$   r   r   r   r   r   
<dictcomp>^   s   $ z*prepare_fisher_spanish.<locals>.<dictcomp>c                    s   g | ]
}| r	d ndfqS )N   r   r   p)rL   r   r   r   b   r'   rP   zCollect recordings)totaldescc                    s   g | ]} |fqS r   r   rQ   )r@   r   r   r   m   s    zCreate supervisionsc                 S   s
   | j dkS )Ng        )r3   )sr   r   r   <lambda>z   s   
 z(prepare_fisher_spanish.<locals>.<lambda>T)parentsexist_okz#fisher-spanish_recordings_all.jsonlz%fisher-spanish_supervisions_all.jsonl)
recordingssupervisions)r   r   rB   rC   rD   r;   r   os	cpu_countr	   rE   mapr   updater
   from_recordingsrH   listitchainfrom_iterabler   from_segmentsfilterr   r   mkdirto_file)rI   rJ   rK   rL   audio_pathstranscript_pathssessions_data_pathsessions_data_fsession_linescreate_recordings_inputrY   executorpbarirecocreate_supervisions_inputrZ   tmp_supervisionsr   )rL   r@   r   prepare_fisher_spanish?   sr   

(




	



rt   )NF)"__doc__rB   	itertoolsra   r[   concurrent.futuresr   pathlibr   typingr   r   r   r   r   	tqdm.autor	   lhotse.audior
   	lhotse.qar   r   lhotse.recipes.fisher_englishr   lhotse.supervisionr   r   lhotse.utilsr   r   r9   rH   boolrt   r   r   r   r   <module>   s>    	
(