o
    SiF                     @   s  d Z ddlZddlZddlZddlZddlmZ ddlmZ ddl	m
Z
mZ ddlmZ ddlmZmZmZmZmZmZ ddlmZ dd	lmZmZ i d
ddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd d!ddddddddd"d#ddddddddd$d%ddddddddd&d'ddddddddd(d)ddddddddd*d+ddddddddd,d-ddddddddd.d/ddddddddd0d1ddddddddd2i d3ddddddddd4d5ddddddddd6d7ddddddddd8d9ddddddddd:d;ddddddddd<d=ddddddddd>d?ddddddddd@dAdddddddddBdCdddddddddDdEdddddddddFdGdddddddddHdIdddddddddJdKdddddddddLdMdddddddddNdOdddddddddPdQdddddddddRdSdddddddddTi dUdddddddddVdWdddddddddXdYdddddddddZd[ddddddddd\d]ddddddddd^d_ddddddddd`dadddddddddbdcdddddddddddedddddddddfdgdddddddddhdidddddddddjdkdddddddddldmdddddddddndodddddddddpdqdddddddddrdsdddddddddtdudddddddddvdddddddddwdddddddddxdddddddddydddddddddzddddddddd{ddddddddd|ddddddddd}ddddddddd~dddddddddd	Zg dZddededefddZ			ddedededede
ee
eeeeef f f f
ddZdd ZdS )a  
LibriCSS is a multi-talker meeting corpus formed from mixing together LibriSpeech utterances
and replaying in a real meeting room. It consists of 10 1-hour sessions of audio, each
recorded on a 7-channel microphone. The sessions are recorded at a sampling rate of 16 kHz.
For more information, refer to the paper:
Z. Chen et al., "Continuous speech separation: dataset and analysis,"
ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP),
Barcelona, Spain, 2020
    N)defaultdict)Path)DictUnion)tqdm)CutSetRecordingSetSupervisionSegmentSupervisionSetfix_manifests$validate_recordings_and_supervisions)	Recording)Pathlikefastcopyz/overlap_ratio_0.0_sil0.1_0.5_session0_actual0.0                     )10891320158040774992682969307176z/overlap_ratio_0.0_sil0.1_0.5_session1_actual0.0)r   121296135755105r   84638555z/overlap_ratio_0.0_sil0.1_0.5_session2_actual0.0)r    4970r"   563961r   77298224z/overlap_ratio_0.0_sil0.1_0.5_session3_actual0.0)r   r   260r"   r'   672r   908z/overlap_ratio_0.0_sil0.1_0.5_session4_actual0.0)118812211995r    4507r%   5683r+   z/overlap_ratio_0.0_sil0.1_0.5_session5_actual0.0)r   r.   2300237r0   r%   7021r#   z/overlap_ratio_0.0_sil0.1_0.5_session6_actual0.0)r*   r!   3729r0   r%   r1   r   r(   z/overlap_ratio_0.0_sil0.1_0.5_session7_actual0.0)r   r2   r*   r5   r   r)   8230r#   z/overlap_ratio_0.0_sil0.1_0.5_session8_actual0.0)r-   r/   r3   3570r&   r1   r'   7127z/overlap_ratio_0.0_sil0.1_0.5_session9_actual0.0)r'   r+   r   r4   r8   r(   r6   r#   z/overlap_ratio_0.0_sil2.9_3.0_session0_actual0.0)r   r*   r!   r"   r1   r   r)   r6   z/overlap_ratio_0.0_sil2.9_3.0_session1_actual0.0)1284r   r3   r    r!   4446r0   r8   z/overlap_ratio_0.0_sil2.9_3.0_session2_actual0.0)r-   r   r/   r:   r4   r(   r#   r$   z/overlap_ratio_0.0_sil2.9_3.0_session3_actual0.0)r/   20942830r    r5   r   r"   r4   z/overlap_ratio_0.0_sil2.9_3.0_session4_actual0.0)r   r-   r    r4   r(   r6   r#   r$   z/overlap_ratio_0.0_sil2.9_3.0_session5_actual0.0)r   r;   r*   r5   r   r+   r4   8455z/overlap_ratio_0.0_sil2.9_3.0_session6_actual0.0)r-   r   r/   r2   r5   r0   r8   r=   z/overlap_ratio_0.0_sil2.9_3.0_session7_actual0.0)r   r   r<   r   r   r8   r6   r,   z/overlap_ratio_0.0_sil2.9_3.0_session8_actual0.0)r    r   5142r+   r   r   r#   r,   z/overlap_ratio_0.0_sil2.9_3.0_session9_actual0.0)r   r-   r2   r*   r   r+   r$   r,   z1overlap_ratio_10.0_sil0.1_1.0_session0_actual10.1)r   r/   r*   r   r+   r   r=   r#   z1overlap_ratio_10.0_sil0.1_1.0_session1_actual10.2)r-   r   r;   r7   r)   r#   r$   r,   z1overlap_ratio_10.0_sil0.1_1.0_session2_actual10.0)r-   r7   r5   r1   r'   r8   r(   r#   z1overlap_ratio_10.0_sil0.1_1.0_session3_actual10.1)r   r/   r2   r!   r+   r   r(   r)   z1overlap_ratio_10.0_sil0.1_1.0_session4_actual10.0)r-   r   r2   r*   r+   r   r4   r)   z0overlap_ratio_10.0_sil0.1_1.0_session5_actual9.9)r3   r!   r5   r0   r%   r+   r   r6   z0overlap_ratio_10.0_sil0.1_1.0_session6_actual9.9)r   r   r   r!   r:   r   r   r$   z1overlap_ratio_10.0_sil0.1_1.0_session7_actual10.1)r   r.   r/   r   r'   r(   r#   r,   z1overlap_ratio_10.0_sil0.1_1.0_session8_actual10.0)r   r   r2   r   r:   r+   r   r(   z1overlap_ratio_10.0_sil0.1_1.0_session9_actual10.0)r   r<   r7   r&   r   r)   r=   r$   z1overlap_ratio_20.0_sil0.1_1.0_session0_actual20.8)r   r   r9   r0   r%   r   r8   r$   z1overlap_ratio_20.0_sil0.1_1.0_session1_actual20.5)r   r   r   r*   r:   r"   r>   r)   z1overlap_ratio_20.0_sil0.1_1.0_session2_actual21.1)r   r<   r    r7   r0   r&   r   r6   z1overlap_ratio_20.0_sil0.1_1.0_session3_actual20.0)r   r*   r   r"   r>   r(   r=   r,   z1overlap_ratio_20.0_sil0.1_1.0_session4_actual20.0)r   r   r<   r7   r5   r"   r8   r6   z1overlap_ratio_20.0_sil0.1_1.0_session5_actual19.6)r   r-   r9   r    r7   r!   r'   r=   z1overlap_ratio_20.0_sil0.1_1.0_session6_actual20.0)r   r:   r0   r"   r   r   r)   r#   z1overlap_ratio_20.0_sil0.1_1.0_session7_actual20.1)r2   r3   r<   r    r%   r   r+   r   z1overlap_ratio_20.0_sil0.1_1.0_session8_actual19.8)r.   r/   r2   r+   r8   r)   r6   r,   z1overlap_ratio_20.0_sil0.1_1.0_session9_actual20.7)r   r9   r   r:   r"   r&   r   r(   z1overlap_ratio_30.0_sil0.1_1.0_session0_actual29.7)r   r/   r3   r<   r    r!   r+   r4   z1overlap_ratio_30.0_sil0.1_1.0_session1_actual30.4)r   r!   r%   r   r>   r(   r6   r=   z1overlap_ratio_30.0_sil0.1_1.0_session2_actual29.6)r9   r/   r!   r0   r&   r'   r)   r#   z1overlap_ratio_30.0_sil0.1_1.0_session3_actual30.2)r   r;   r*   r!   r:   r"   r   r(   z1overlap_ratio_30.0_sil0.1_1.0_session4_actual29.8)r   r   r*   r<   r1   r   r#   r$   z1overlap_ratio_30.0_sil0.1_1.0_session5_actual29.7)r   r*   r<   r5   r   r:   r)   r,   z1overlap_ratio_30.0_sil0.1_1.0_session6_actual30.1)r;   r3   r   r1   r'   r   r$   r,   z1overlap_ratio_30.0_sil0.1_1.0_session7_actual30.2)r   r-   r9   r2   r<   r7   r:   r%   z1overlap_ratio_30.0_sil0.1_1.0_session8_actual29.7)r-   r9   r7   r!   r%   r4   r6   r,   z1overlap_ratio_30.0_sil0.1_1.0_session9_actual29.8)r-   r   r'   r   r4   r8   r   r(   z1overlap_ratio_40.0_sil0.1_1.0_session0_actual39.5)r   r9   r   r<   r5   r:   r8   r(   )r   r   r3   r*   r:   r4   r(   r=   )r-   r9   r   r*   r0   r   r)   r6   )r   r   r!   r   r%   r"   r8   r#   )r-   r   r/   r5   r   r(   r$   r,   )r   r9   r3   r    r   r:   r0   r)   )r-   r;   r!   r%   r"   r+   r4   r6   )r.   r   r<   r>   r4   r6   r=   r#   )r   r*   r5   r%   r&   r'   r   r6   )r9   r/   r    r!   r   r   r)   r#   )	z1overlap_ratio_40.0_sil0.1_1.0_session1_actual39.7z1overlap_ratio_40.0_sil0.1_1.0_session2_actual41.2z1overlap_ratio_40.0_sil0.1_1.0_session3_actual40.2z1overlap_ratio_40.0_sil0.1_1.0_session4_actual39.0z1overlap_ratio_40.0_sil0.1_1.0_session5_actual42.0z1overlap_ratio_40.0_sil0.1_1.0_session6_actual39.9z1overlap_ratio_40.0_sil0.1_1.0_session7_actual40.5z1overlap_ratio_40.0_sil0.1_1.0_session8_actual40.5z1overlap_ratio_40.0_sil0.1_1.0_session9_actual39.9)0L0SOV10OV20OV30OV40F
target_dirforce_downloadreturnc                 C   s   d}t | } | jddd | d }| d }|s$| r$t| d ntj|d| d | r2|retd| d	|   t|d
}t	|
 ddD ]}|||  qLW d   | S 1 s`w   Y  | S )aE  
    Downloads the LibriCSS data from the Google Drive and extracts it.
    :param target_dir: the directory where the LibriCSS data will be saved.
    :param force_download: if True, it will download the LibriCSS data even if it is already present.
    :return: the path to downloaded and extracted directory with data.
    a  wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1Piioxd5G_85K9Bhcr8ebdhXx0CnaHy7l' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/
/p')&id=1Piioxd5G_85K9Bhcr8ebdhXx0CnaHy7l" -O for_release.zip && rm -rf /tmp/cookies.txtT)parentsexist_okzfor_release.zipfor_releasez# already exists. Skipping download.)shellcwdzExtracting z to r
Extracting)descN)r   mkdirexistslogginginfo
subprocessrunzipfileZipFiler   infolistextract)rE   rF   command
corpus_zip
corpus_dirmember r^   K/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/libricss.pydownload_libricssg   s$   
r`   mdmr\   
output_dirtypesegmented_cutsc                    s\  |dv sJ t | } | jdkr| d n| } g }g }i }tD ]}| |  D ]  jd\}}}}}}	}
t|
dd }
| d|	 }|| j< |dkrS d d n|d	kr] d d
 n d d }tj||d}|dkry|	t
|dgd n|	| tt d d D ]C\}}|dks|dkrd}n|d	krt j |d  }nttd}|	t| d| ||d |d |d  |d d|d |d qq%qt|}t|}t||\}}t|| ||d}|rxtt}t| d d }t|}| D ]\ }||   | qW d   n	1 sw   Y  tj||d}g }| D ]A\ }| fdd  d }t|D ]*\}}|d d }|d |d  d }|j!||d d!"  d| }|	| qBq-t#|}||d"< |durt |}|j$d#d#d$ |%|d%| d&  |%|d%| d'  |r|%|d%| d(  |S ))a}  
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply read and return them.

    NOTE: The recordings contain all 7 channels. If you want to use only one channel, you can
    use either ``recording.load_audio(channel=0)`` or ``MonoCut(id=...,recording=recording,channel=0)``
    while creating the CutSet.

    :param corpus_dir: Pathlike, the path to the extracted corpus.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param type: str, the type of data to prepare ('mdm', 'sdm', 'ihm-mix', or 'ihm'). These settings
        are similar to the ones in AMI and ICSI recipes.
    :param segmented_cuts: bool, if True, it will return 1-minute (as described in the original paper)
        in the form of a CutSet. These are saved under the index ``segments`` in the returned Dict.
        May be useful for evaluating multi-talker ASR systems, e.g., in this paper: https://arxiv.org/abs/2109.08555.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'.

    )ra   sdmihm-mixihmrJ   _actualr   rf   cleanzmix.wavrg   zeach_spk.wavrecordzraw_recording.wavrecording_idre   r   )channel_idstranscriptionzmeeting_info.txtr   r   -r   English)idrm   startdurationtextlanguagespeakerchannel)
recordingssupervisionszall_res.jsonrM   Nc                    s
   | j  kS )Nrl   )csessionr^   r_   <lambda>   s   
 z"prepare_libricss.<locals>.<lambda>i>  F)offsetrt   keep_excessive_supervisionssegmentsT)rI   rH   z	libricss-z_recordings_all.jsonl.gzz_supervisions_all.jsonl.gzz_segments_all.jsonl.gz)&r   stemOVERLAP_RATIOSiterdirnamesplitfloatr   	from_fileappendr   	enumerateparse_transcriptSPK_TO_CHANNEL_MAPlistranger	   r
   from_segmentsr   from_recordingsr   r   r   openjsonloaditemsextendr   from_manifestsfilterto_eagertruncatewith_id	from_cutsrP   to_file)r\   rb   rc   rd   ry   r   session_name_mapovrh   r   	actual_ovrm   
audio_path	recordingidxsegrx   rz   result_dictfressegs	cuts_recocuts_segmentedsession_segmentssession_cutrs   rt   new_cutr^   r|   r_   prepare_libricss   s   



)






r   c           	   	   C   sv   g }t | d*}t| |D ]}|d\}}}}}|t|t||||f qW d   |S 1 s4w   Y  |S )zV
    Parses the transcript file and returns a list of SupervisionSegment objects.
    rM   	N)r   nextr   r   r   )		file_namer   r   liners   endrw   utt_idru   r^   r^   r_   r     s   
r   )F)Nra   F) __doc__r   rR   rT   rV   collectionsr   pathlibr   typingr   r   r   lhotser   r   r	   r
   r   r   lhotse.audior   lhotse.utilsr   r   r   r   boolr`   strr   r   r^   r^   r^   r_   <module>   s   	 	
 !"#$%&'()*+,-./01234? 
 
