o
    Si                     @   s(  d Z ddlZddlmZ ddlmZ ddlmZmZm	Z	 ddl
mZmZ ddlmZmZ ddlmZmZ dd	lmZmZmZmZ d
Z					d$dedee dee dee dededeee	eef f fddZ	d%dedededefddZddefdedededefdd Zded!efd"d#ZdS )&a;  
About the Switchboard corpus

    This is conversational telephone speech collected as 2-channel, 8kHz-sampled
    data.  We are using just the Switchboard-1 Phase 1 training data.
    The catalog number LDC97S62 (Switchboard-1 Release 2) corresponds, we believe,
    to what we have.  We also use the Mississippi State transcriptions, which
    we download separately from
    http://www.isip.piconepress.com/projects/switchboard/releases/switchboard_word_alignments.tar.gz

    This data is not available for free - your institution needs to have an LDC subscription.
    N)chain)Path)DictOptionalUnion)fix_manifests$validate_recordings_and_supervisions)	RecordingRecordingSet)SupervisionSegmentSupervisionSet)Pathlikecheck_and_rglobresumable_downloadsafe_extractz`http://www.isip.piconepress.com/projects/switchboard/releases/switchboard_word_alignments.tar.gzTF	audio_dirtranscripts_dirsentiment_dir
output_diromit_silenceabsolute_pathsreturnc                    s  |du rt  }t| d}t|d}g }dd |D }	|D ]}
|
jdd}||
|	| d |	| d	 d
 qt fdd|D }tt	
fddt||D }t||\}}t|| |durlt|| |durt|}|jddd ||d  ||d  ||dS )a  
    Prepare manifests for the Switchboard corpus.
    We create two manifests: one with recordings, and the other one with text supervisions.
    When ``sentiment_dir`` is provided, we create another supervision manifest with sentiment annotations.

    :param audio_dir: Path to ``LDC97S62`` package.
    :param transcripts_dir: Path to the transcripts directory (typically named "swb_ms98_transcriptions").
        If not provided, the transcripts will be downloaded.
    :param sentiment_dir: Optional path to ``LDC2020T14`` package which contains sentiment annotations
        for SWBD segments.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param omit_silence: Whether supervision segments with ``[silence]`` token should be removed or kept.
    :param absolute_paths: Whether to return absolute or relative (to the corpus dir) paths for recordings.
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    Nz*.sphz*trans.textc                 S   s   i | ]}|j d d |qS )-r   )stemsplit).0p r   N/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/switchboard.py
<dictcomp>7   s    z'prepare_switchboard.<locals>.<dictcomp>sw0swAB)audioztext-0ztext-1c                 3   s*    | ]}t j|d   rdnddV  qdS )r$   N   )relative_path_depth)r	   	from_file)r   group)r   r   r   	<genexpr>B   s    
z&prepare_switchboard.<locals>.<genexpr>c                 3   s8    | ]\}}d D ]}t |d|  || dV  qqdS ))r      ztext-)transcript_path	recordingchannelr   N)make_segments)r   r(   r,   r-   )r   r   r   r)   I   s    Tparentsexist_okzswbd_recordings_all.jsonl.gzzswbd_supervisions_all.jsonl.gz)
recordingssupervisions)download_and_untarr   r   replaceappendr
   from_recordingsr   from_segmentsr   from_iterablezipr   r   parse_and_add_sentiment_labelsr   mkdirto_file)r   r   r   r   r   r   audio_paths
text_pathsgroupsname_to_textapnamer2   r3   r   )r   r   r   prepare_switchboard   s@   




rD   r+   r,   r-   c                    s*   |    } fddttj|D S )Nc                    sd   g | ].^}}}}|d  dksst |jt|tt|t| dd d|dj ddqS )r   z	[silence]   )ndigits Englishr"   )idrecording_idstartdurationr-   textlanguagespeaker)r   rI   floatroundjoin)r   
segment_idrK   endwordsr-   r   r,   r   r   
<listcomp>g   s    

z!make_segments.<locals>.<listcomp>)	read_text
splitlinesmapstrr   )r+   r,   r-   r   linesr   rV   r   r.   c   s   
r.   .
target_dirforce_downloadurlc                 C   s   t | } | d }| r|S | jddd d}| | }t|||d t|}t|| d W d    |S 1 s9w   Y  |S )Nswb_ms98_transcriptionsTr/   z"switchboard_word_alignments.tar.gz)filenamer_   )path)r   is_dirr<   r   tarfileopenr   )r^   r_   r`   transcript_dirtar_nametar_pathtarr   r   r   r4   w   s   
r4   r3   c           
      C   s   ddl }t| } | d d }|  r| sJ |j|dg dd}| D ]8\}}|d d	d }t|j||d
 d |d d d}|sHq&|d d}|D ]}	dd t	|D |	_
qQq&dS )zQRead 'LDC2020T14' sentiment annotations and add then to the supervision segments.r   Ndatazsentiment_labels.tsv	)rI   rK   rT   	sentiment)	delimiternamesrI   _rK   g{Gz?rT   )rJ   start_after
end_beforerm   #c                 S   s   i | ]
\}}d | |qS )rm   r   )r   ilabelr   r   r   r      s    z2parse_and_add_sentiment_labels.<locals>.<dictcomp>)pandasr   rd   is_fileread_csviterrowsr   listfind	enumeratecustom)
r   r3   pdlabelsdfrp   rowcall_idmatchessegmentr   r   r   r;      s*   

r;   )NNNTF)T) __doc__re   	itertoolsr   pathlibr   typingr   r   r   lhotser   r   lhotse.audior	   r
   lhotse.supervisionr   r   lhotse.utilsr   r   r   r   SWBD_TEXT_URLboolr[   rD   intr.   r4   r;   r   r   r   r   <module>   sn    
J

