o
    Si                     @   s@  d Z ddlZddlmZ ddlmZ ddlmZ ddlm	Z	m
Z
mZmZ ddlmZ ddlmZmZ dd	lmZmZ dd
lmZmZ ddlmZmZ 			d!dededee dee dee de	eeeef f fddZdede	ee	eeeef f f fddZdedede	de
e fddZ dedede
e fdd Z!dS )"a!  
About the DIHARD III corpus

    The DIHARD III corpus consists of multi-domain data prepared to evaluate
    "hard" speaker diarization. It was used for evaluation in the Third DIHARD
    Challenge, organized by NIST and LDC in Winter 2020. It consists of monologues,
    map task dialogues, broadcast interviews, sociolinguistic interviews, meeting
    speech, speech in restaurants, clinical recordings, and YouTube videos.
    More details can be found at:
    https://dihardchallenge.github.io/dihard3/docs/third_dihard_eval_plan_v1.2.pdf
    N)defaultdict)chain)Path)DictListOptionalUnion)tqdm)fix_manifests$validate_recordings_and_supervisions)	RecordingRecordingSet)SupervisionSegmentSupervisionSet)Pathlikecheck_and_rglobT   dev_audio_direval_audio_dir
output_diruem_manifestnum_jobsreturnc                    s|  t t}tddgddD ]}|dkr| n|}|du s t| s)td|  qtt|dtt|dt	j
|d	|d
}ttt|dd  tt fdd|D }	|rkttfdd|D }
t||	\}}	t||	 |durt|}|jddd ||d| d  |	|d| d  |r|
|d| d  ||	d||< |r|| d|
i q|S )aY  
    Prepare manifests for the DIHARD III corpus.
    We create two manifests: one with recordings, and the other one with supervisions containing speaker id
    and timestamps.

    :param dev_audio_dir: Path to downloaded DIHARD III dev corpus (LDC2020E12), e.g.
        /data/corpora/LDC/LDC2020E12
    :param eval_audio_dir: Path to downloaded DIHARD III eval corpus (LDC2021E02), e.g.
        /data/corpora/LDC/LDC2021E02`
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param uem_manifest: If True, also return a SupervisionSet describing the UEM segments (see use in
        dataset.DiarizationDataset)
    :param num_jobs: int (default = 1), number of jobs to scan corpus directory for recordings
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    devevalzPreparing DIHARD parts)descNzNothing to be done for z*.rttmz*.uemz*.flac)r   zrecordings.tblr   c                 3   s6    | ] t  fd dD d   j dV  qdS )c                       g | ]
}|j  jkr|qS  stemid.0x	recordingr   J/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/dihard3.py
<listcomp>A       -prepare_dihard3.<locals>.<genexpr>.<listcomp>r   )	rttm_pathr%   metadataN)make_rttm_segmentsr    r"   )r+   
rttm_pathsr$   r&   	<genexpr>?   s    
z"prepare_dihard3.<locals>.<genexpr>c                 3   s.    | ] t  fd dD d  dV  qdS )c                    r   r   r   r!   r$   r   r&   r'   L   r(   r)   r   )uem_pathr%   N)make_uem_segmentsr-   )	uem_pathsr$   r&   r/   J   s    
T)parentsexist_okdihard3_recordings_z	.jsonl.gzdihard3_supervisions_dihard3_uem_)
recordingssupervisionsuem)r   dictr	   r   existsloggingwarninglistr   r   from_dirparse_metadatar   from_segmentsr   from_iterabler
   r   mkdirto_fileto_jsonupdate)r   r   r   r   r   	manifestspart	audio_dirr8   r9   r:   r   )r+   r.   r2   r&   prepare_dihard3   sF   


rK   metadata_pathc           
      C   s~   t t}t| d,}t| |D ]}|  \	}}}}}}	}	}	}	|dk|||d||< qW d   |S 1 s8w   Y  |S )z
    Parses the recordings.tbl file and creates a dictionary from recording id to
    metadata containing the following keys: in_core, lang, domain, source
    rTrue)in_corelangdomainsourceN)r   r;   opennextstripsplit)
rL   r+   flinereco_idrO   rP   rQ   rR   _r   r   r&   rA   c   s   


rA   r*   r%   r+   c                    s(   |    } fddttj|D S )Nc                    s   g | ]<\
}}}}}}}}}}t j d | d tdt| dd tdt|t|  djt|t|| d  dqS )-d   06drP   )r    recording_idstartdurationspeakerlanguagecustom)r   r    intfloat)r"   rZ   channelr_   r`   ra   r+   r%   r   r&   r'   z   s    
>z&make_rttm_segments.<locals>.<listcomp>	read_text
splitlinesmapstrrV   )r*   r%   r+   linesr   rg   r&   r,   v   s   

r,   r0   c                    s&   |    } fddttj|D S )Nc                    sl   g | ]2\}}}}t  j d tdt| dd tdt| d jt|tt|t| dddqS )r[   r\   r]      )ndigits)r    r^   r_   r`   )r   r    rd   re   round)r"   rZ   r_   endr$   r   r&   r'      s    
0z%make_uem_segments.<locals>.<listcomp>rh   )r0   r%   rm   r   r$   r&   r1      s   

r1   )NTr   )"__doc__r=   collectionsr   	itertoolsr   pathlibr   typingr   r   r   r   	tqdm.autor	   lhotser
   r   lhotse.audior   r   lhotse.supervisionr   r   lhotse.utilsr   r   boolrd   rl   rK   rA   r,   r1   r   r   r   r&   <module>   sX    
*I
