o
    Si'                     @   s  d Z ddlZddlmZ ddlmZmZ ddlmZ ddl	m
Z
mZmZ ddlmZ ddlmZmZmZmZ dd	lmZmZ dd
lmZmZmZmZ 				d"dedee dee dee dede
eeeef f fddZ			d#dedee dee dede
eeeef f f
ddZ		d$dededee dede
eeeef f f
ddZ 			d%dedededefddZ!dedefd d!Z"dS )&a  
About the Callhome American English

    CALLHOME American English Speech was developed by the Linguistic Data
    Consortium (LDC) and consists of 120 unscripted 30-minute telephone
    conversations between native speakers of English.

    All calls originated in North America; 90 of the 120 calls were placed
    to various locations outisde of North America, while the remaining 30 calls
    were made within North America. Most participants called family members or
    close friends.

    This script support setup of two different tasks -- either ASR or SRE
    For ASR, the following LDC corpora are relevant
      Speech : LDC97S42
      Transcripts : LDC97T14
      Lexicon : LDC97L20 (not actually used)

    For SRE,  this script prepares data for speaker diarization on a portion
    of CALLHOME used in the 2000 NIST speaker recognition evaluation.
    The 2000 NIST SRE data is required. LDC catalog number LDC2001S97.
    N)Counter)DecimalInvalidOperation)Path)DictOptionalUnion)tqdm)	RecordingRecordingSetSupervisionSegmentSupervisionSet)fix_manifests$validate_recordings_and_supervisions)Pathlikecheck_and_rglobresumable_downloadsafe_extractF	audio_dirrttm_dirtranscript_dir
output_dirabsolute_pathsreturnc                 C   s$   |durt | |||S t| |||S )a<  
    Prepare manifests for the CallHome American English corpus.
    We create two manifests: one with recordings, and the other one with text
    supervisions.

    Depending on the value of transcript_dir, will prepare either
        * data for ASR task (expected LDC corpora ``LDC97S42`` and ``LDC97T14``)
        * or the SRE task (expected corpus ``LDC2001S97``)

    :param audio_dir: Path to ``LDC97S42``or ``LDC2001S97`` content
    :param transcript_dir: Path to the ``LDC97T14`` content
    :param rttm_dir: Path to the transcripts directory. If not provided,
        the transcripts will be downloaded.
    :param absolute_paths: Whether to return absolute or relative
        (to the corpus dir) paths for recordings.
    :return: A dict with manifests. The keys are:
        ``{'recordings', 'supervisions'}``.
    N)prepare_callhome_english_asrprepare_callhome_english_sre)r   r   r   r   r    r   S/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/callhome_english.pyprepare_callhome_english%   s   r   c                    s   |du rt  }|d }t|}t| d}t fddt|D }t||\}}t|| |durKt|}|j	ddd |
|d  |
|d	  ||d
S )a  
    Prepare manifests for the Callhome American English portion prepartion.
    We create two manifests: one with recordings, and the other one with text
    supervisions.

    :param audio_dir: Path to ``LDC2001S97`` package.
    :param rttm_dir: Path to the transcripts directory. If not provided,
        the transcripts will be downloaded.
    :param output_dir: Directory where the manifests should be written.
        Can be omitted to avoid writing.
    :param absolute_paths: Whether to return absolute or relative
        (to the corpus dir) paths for recordings.
    :return: A dict with manifests.
        The keys are: ``{'recordings', 'supervisions'}``.
    Nzfullref.rttm*.sphc                 3   &    | ]}t j| rd nddV  qd S N   )relative_path_depthr
   	from_file.0pr   r   r   	<genexpr>f   
    
z/prepare_callhome_english_sre.<locals>.<genexpr>Tparentsexist_okzrecordings.jsonzsupervisions.json
recordingssupervisions)download_callhome_metadata	read_rttmr   r   from_recordingsr	   r   r   r   mkdirto_json)r   r   r   r   	rttm_pathr1   audio_pathsr0   r   r)   r   r   K   s    


r   c                    sb  t | } t |}i }dD ]!}t| d |dd d}t fddt|D }t|d | d	}g }	|D ]}
d
}t }|
  D ]U}|	 }|sMqD|
drSqDz!|jdd\}}}}tt|t| }|d
krnW qD|| W qD ty   |d d | |d< Y qD ty   |d d | |d< Y qDw |D ]R}|
j}|jdd\}}}}|dd}tt|t| }|d
krqt|}|	t|||t|d
 td | d|d| d|dd|d|d |d7 }qq7t|	}	t||	\}}	t||	 |dur't |}|jddd ||d| d  |	|d| d  ||	d||< q|S )aX  
    Prepare manifests for the CallHome American English corpus.
    We create two manifests: one with recordings, and the other one with text
    supervisions.

    :param audio_dir: Path to ``LDC97S42`` content
    :param transcript_dir: Path to the ``LDC97T14`` content
    :param output_dir: Directory where the manifests should be written.
        Can be omitted to avoid writing.
    :param absolute_paths: Whether to return absolute or relative
        (to the corpus dir) paths for recordings.
    :return: A dict with manifests. The keys are:
        ``{'recordings', 'supervisions'}``.
    )evaltesttraindevtestdatar9   evltestr   c                 3   r    r!   r$   r&   r)   r   r   r*      r+   z/prepare_callhome_english_asr.<locals>.<genexpr>	transcrptz*.txtr   #   )maxsplit : A_z0>2sz0>5d)recording_idstartdurationchannelspeakeridtext   NTr,   zcallhome-english_recordings_z	.jsonl.gzzcallhome-english_supervisions_r/   )r   r   replacer   r4   r	   list	read_text
splitlinesstrip
startswithsplitfloatr   appendr   
ValueErrorstemr   ordr   from_segmentsr   r   r5   to_file)r   r   r   r   	manifestsrV   r8   r0   transcript_pathsr1   r(   idxpostprocessed_lineslinerI   endspkrN   rJ   rH   r   r)   r   r   v   s   






r   .6http://www.openslr.org/resources/10/sre2000-key.tar.gz
target_dirforce_downloadurlc                 C   s   t | } | d }| r|S | jddd d}| | }t|||d t|}t|| d W d    |S 1 s9w   Y  |S )Nzsre2000-keyTr,   zsre2000-key.tar.gz)filenamerh   )path)r   is_dirr5   r   tarfileopenr   )rg   rh   ri   sre_dirtar_nametar_pathtarr   r   r   r2      s   
r2   rk   c                 C   s   t |   }g }t }|D ]E}| \
}}}}}	}}}
}}t|t|	t|}}	}|	dkr2q||  d7  < |t| d||  |||	|| d|
 dd qt	
|S )Ng        rO   rG   English)rM   rH   rI   rJ   rK   rL   language)r   rR   rS   r   rV   rW   intrX   r   r   r\   )rk   linessupsrec_cntrrb   rG   rH   rK   rI   rJ   rL   r   r   r   r3      s*   
r3   )NNNF)NNF)NF)re   Frf   )#__doc__rm   collectionsr   decimalr   r   pathlibr   typingr   r   r   	tqdm.autor	   lhotser
   r   r   r   	lhotse.qar   r   lhotse.utilsr   r   r   r   boolstrr   r   r   r2   r3   r   r   r   r   <module>   s    
(
.
k
