o
    Si[#                     @   s  d Z ddlZddlmZ ddlmZ ddlmZ ddlm	Z	m
Z
mZmZ ddlmZ ddlmZmZ dd	lmZ dd
lmZmZ ddlmZmZmZ g dZdd Z		dde
e de
e dee dede	eeeef f f
ddZde
e de
e fddZ dS )a  
The GALE Arabic Broadcast corpus consists of the following LDC corpora:

GALE Arabic phase 2 Conversation Speech
LDC2013S02: http://catalog.ldc.upenn.edu/LDC2013S02
LDC2013S07: http://catalog.ldc.upenn.edu/LDC2013S07
LDC2013T17: http://catalog.ldc.upenn.edu/LDC2013T17
LDC2013T04: http://catalog.ldc.upenn.edu/LDC2013T04

# GALE Arabic phase 2 News Speech
LDC2014S07: http://catalog.ldc.upenn.edu/LDC2014S07
LDC2015S01: http://catalog.ldc.upenn.edu/LDC2015S01
LDC2014T17: http://catalog.ldc.upenn.edu/LDC2014T17
LDC2015T01: http://catalog.ldc.upenn.edu/LDC2015T01

# GALE Arabic phase 3 Conversation Speech
LDC2015S11: http://catalog.ldc.upenn.edu/LDC2015S11
LDC2016S01: http://catalog.ldc.upenn.edu/LDC2016S01
LDC2015T16: http://catalog.ldc.upenn.edu/LDC2015T16
LDC2016T06: http://catalog.ldc.upenn.edu/LDC2016T06

# GALE Arabic phase 3 News Speech
LDC2016S07: http://catalog.ldc.upenn.edu/LDC2016S07
LDC2017S02: http://catalog.ldc.upenn.edu/LDC2017S02
LDC2016T17: http://catalog.ldc.upenn.edu/LDC2016T17
LDC2017T04: http://catalog.ldc.upenn.edu/LDC2017T04

# GALE Arabic phase 4 Conversation Speech
LDC2017S15: http://catalog.ldc.upenn.edu/LDC2017S15
LDC2017T12: http://catalog.ldc.upenn.edu/LDC2017T12

# GALE Arabic phase 4 News Speech
LDC2018S05: http://catalog.ldc.upenn.edu/LDC2018S05
LDC2018T14: http://catalog.ldc.upenn.edu/LDC2018T14

# Training: 941h Testing: 10.4h

The data has two types of speech: conversational and report.
There is no separate dev set provided with the corpus.

The `S` corpora contain speech data and the `T` corpora contain the corresponding
transcriptions. This recipe prepares any subset of these corpora provided as
arguments, but pairs of speech and transcript corpora must be present. E.g.
to only prepare phase 3 news speech, the arguments
`audio_dirs = ["/export/data/LDC2016S07","/export/data/LDC2017S02"]` and
`transcript_dirs = ["/export/data/LDC2016T17","/export/data/LDC2017T04"]` must
be provided to the `prepare_gale_arabic` method.

This data is not available for free - your institution needs to have an LDC subscription.
    N)defaultdict)chain)Path)DictListOptionalUnion)$validate_recordings_and_supervisions)	RecordingRecordingSet)fix_manifests)SupervisionSegmentSupervisionSet)Pathlikecheck_and_rglobis_module_available)
"ALAM_WITHEVENT_ARB_20070116_205800"ALAM_WITHEVENT_ARB_20070206_205801"ALAM_WITHEVENT_ARB_20070213_205800"ALAM_WITHEVENT_ARB_20070227_205800"ALAM_WITHEVENT_ARB_20070306_205800"ALAM_WITHEVENT_ARB_20070313_205800$ARABIYA_FROMIRAQ_ARB_20070216_175800$ARABIYA_FROMIRAQ_ARB_20070223_175801$ARABIYA_FROMIRAQ_ARB_20070302_175801$ARABIYA_FROMIRAQ_ARB_20070309_175800c                   C   s   t dstdd S )NpandaszGale Arabic data preparation requires the 'pandas' package to be installed. Please install it with 'pip install pandas' and try again)r   ImportError r   r   N/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/gale_arabic.pycheck_dependenciesP   s
   r    T
audio_dirstranscript_dirs
output_dirabsolute_pathsreturnc           
         s|  t | t |ksJ dtd ttdd tdd | D D }tdd |D }dd |D }td	 t fd
d|	 D }td t
t|}t||\}}t|| tt}|dd |dd d|d< |dd |dd d|d< |durtd t|}|jddd dD ] }	||	 d |d|	 d  ||	 d |d|	 d  q|S )a}  
    Prepare manifests for GALE Arabic Broadcast speech corpus.

    :param audio_dirs: List of paths to audio corpora.
    :param transcripts_dirs: List of paths to transcript corpora.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    z@Paths to the same speech and transcript corpora must be providedz5Reading audio and transcript paths from provided dirsc                 S   s   i | ]}|j |qS r   )stem.0pr   r   r   
<dictcomp>p   s    z'prepare_gale_arabic.<locals>.<dictcomp>c                 S   s$   g | ]}d D ]	}t ||ddqqS ))z*.wavz*.flacF)strictr   )r(   dirextr   r   r   
<listcomp>s   s    z'prepare_gale_arabic.<locals>.<listcomp>c                 S   s   g | ]}t |d qS )z*.tdfr,   )r(   r-   r   r   r   r/   |       c                 S   s   g | ]}|qS r   r   r'   r   r   r   r/   ~       zPreparing recordings manifestc                 3   s&    | ]}t j| rd nddV  qd S )N   )relative_path_depth)r
   	from_filer'   r$   r   r   	<genexpr>   s
    
z&prepare_gale_arabic.<locals>.<genexpr>zPreparing supervisions manifestc                 S   
   | j tv S NidTESTrr   r   r   <lambda>      
 z%prepare_gale_arabic.<locals>.<lambda>c                 S   r7   r8   recording_idr;   sr   r   r   r>      r?   )
recordingssupervisionstestc                 S   
   | j tvS r8   r9   r<   r   r   r   r>      r?   c                 S   rG   r8   r@   rB   r   r   r   r>      r?   trainNz Writing manifests to JSONL filesT)parentsexist_ok)rH   rF   rD   zgale-arabic_recordings_z	.jsonl.gzrE   zgale-arabic_supervisions_)lenlogginginfor   r   r   from_iterabler   from_recordingsvaluesr   from_segmentsparse_transcriptsr   r	   dictfiltermkdirto_file)
r!   r"   r#   r$   audio_pathstranscript_pathsrD   rE   	manifestspartr   r5   r   prepare_gale_arabicX   s^   






r[   rX   c           	         sr  t   dd l g }t }| D ]} j|ddtdg dttttttddddd		}||jd
k }|d 	dd |d< |d 	 fdd|d< |d 	 fdd|d< |
 D ]\\}}|d  d|d  d| }t|d |d  dd}||v s|dkrqY|| |t||d |d ||d |d d|d |d |d |d |d |d |d |d d d!
 qYq|S )"Nr   	r2      )reco_idchannelstartendspeakergenderdialecttextsectionturnsegmentsection_typesu_type)r^   r_   r`   ra   rb   re   TF)	delimiterskiprowsusecolsnamesdtypeskipinitialspaceerror_bad_lineswarn_bad_linesz
no speakerr^   c                 S   s   |   ddS )Nz.sph )stripreplacexr   r   r   r>      r1   z#parse_transcripts.<locals>.<lambda>rb   c                    s     | s| dd S | S )N*rs   )isnullru   rt   rv   pdr   r   r>      s    re   c                    s     | s	|  S | S r8   )ry   rt   rv   rz   r   r   r>      r0   -ra   r`      )ndigitsrc   Arabicr_   rd   rf   rg   rh   ri   rj   )rd   rf   rg   rh   ri   rj   )
r:   rA   r`   durationrb   rc   languagere   r_   custom)r    r   setread_csvrangestrintfloatrb   applyiterrowsroundaddappendr   )	rX   rE   supervision_idsfiledfidxrowsupervision_idr   r   rz   r   rR      sp   !

rR   )NT)!__doc__rL   collectionsr   	itertoolsr   pathlibr   typingr   r   r   r   lhotser	   lhotse.audior
   r   	lhotse.qar   lhotse.supervisionr   r   lhotse.utilsr   r   r   r;   r    boolr   r[   rR   r   r   r   r   <module>   s6    3
O