o
    2wi                     @   s@  d Z ddlZddlZddlZddlZddlZddlmZ ddlm	Z	 ddl
mZ ddlmZmZmZmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZmZ ddlmZmZmZ ddlm Z m!Z!m"Z" d"ddZ#de dee$ fddZ%				d#de de&fddZ'	d$de dee  fddZ(	d$de de$dee  fd d!Z)dS )%a  
This dataset consists of transcripts for 663 podcasts from the This American Life radio program from 1995 to 2020, covering 637 hours of audio (57.7 minutes per conversation) and an average of 18 unique speakers per conversation.

We hope that this dataset can serve as a new benchmark for the difficult tasks of speech transcription, speaker diarization, and dialog modeling on long, open-domain, multi-speaker conversations.

To learn more, please read our paper at: https://arxiv.org/pdf/2005.08072.pdf, and check the README.txt.
    N)defaultdict)ProcessPoolExecutor)Path)DictIterableOptionalUnion)	HTTPError)tqdm)fix_manifests$validate_recordings_and_supervisions)	RecordingRecordingSet)AlignmentItemSupervisionSegmentSupervisionSet)Pathlikeis_module_availableresumable_downloadi  i  c                 C   sr  t dstddd l}ddlm} i }t| D ]x}td| d |  d| }||}||jd}	t	 }
|	j
d	d
ddD ]}|d drS|
|  |d   q?tdt|
 d| d t|
D ]*}t|dd }||}||jd}	|	j
d	d
d
dD ]}|d |d| < qqeqtdt| d t|d}t|| W d    d S 1 sw   Y  d S )Nbs4z*Please 'pip install beautifulsoup4' first.r   )BeautifulSoupz	Scraping z...z/archive?year=zhtml.parseraTzgoto-episode)hrefclass_r   /zFound z episodes in .)r   downloadzep-zSaving results (z episodes)...w)r   ImportErrorrequestsr   r   rangeprintgettextsetfind_all
startswithaddlenr
   intsplitopenjsondump)website_urloutput_path
year_ranger!   r   urlsyearurlresponsesoup	page_urlsr   episode_url
episode_idf r<   ^/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/lhotse/recipes/this_american_life.pyscrape_urls   s8   

"r>   
target_dirreturnc              	   c   s`    dD ]*}t t| | d }t| D ]}|V  qW d    n1 s(w   Y  qd S )N)trainvalidtest-transcripts-aligned.json)r-   r   r.   loadkeys)r?   subsetr;   r:   r<   r<   r=   included_episodes;   s   rH   r   Fghttps://ipfs.io/ipfs/bafybeidyt3ch6t4dtu2ehdriod3jvuh34qu4pwjyoba2jrjpmqwckkr6q4/this_american_life.ziphttps://thisamericanlife.orgforce_downloadc                 C   st  t | } | jddd | d }| d }| r|rCt|||d t|d}td ||  W d    n1 s:w   Y  |  | d }| sPt	|| t
|}t|}	W d    n1 sdw   Y  | d	 }
|
jdd
 t| D ]<}td| d|	|  d zt|	| |
| d |d W qw ty } ztd| d| d W Y d }~qwd }~ww td d S )NTparentsexist_okzmetadata.zipz
README.txt)rK   rzExtracting...z	urls.jsonaudio)rN   zDownloading episode z... ().mp3zFailed to download : z. Skipping...zDone!)r   mkdiris_filer   zipfileZipFiler#   
extractallunlinkr>   r-   r.   rE   rH   r	   )r?   rK   metadata_urlr0   zip_pathcompleted_detectorzip_ref	urls_pathr;   r3   	audio_direp_ider<   r<   r=   download_this_american_lifeB   s>   


rb   
corpus_dir
output_dirc                 C   s$   i }dD ]}t | ||d||< q|S )N)rA   devrC   )rc   rG   rd   )!prepare_this_american_life_subset)rc   rd   	manifestsrG   r<   r<   r=   prepare_this_american_lifep   s   rh   rG   c                    s4  t dstdddlm} t|  } |dkrdn|}tt| | d }t|}W d    n1 s7w   Y  g }g }t	|
 }	|	D ]\}
}|	jd| d	|
 d
d | d |
 d }| sptd| d qH|t| t|D ][\}}|d }|| t |d krtd|
 d| d|d  dt  d	  fdd|d D }t|
 d| |
|d |d |d  d|d|d d}|d|}|| q|qHt|}t|}t||\}}t|| |d urt|}|jd d d! ||d"| d#  ||d$| d#  ||d%S )&Nnltkz Please 'pip install nltk' first.r   )word_tokenizere   rB   rD   zProcessing z	 subset (rQ   )descrP   rR   zFile z not found - skipping.	utterancen_wordszTranscript mismatch for -rS   z words in the transcript, z tokens in the text.c                    s6   g | ]\}}}|t  k rt t| ||| qS r<   )r*   r   r+   ).0startendixwordsr<   r=   
<listcomp>   s
    z5prepare_this_american_life_subset.<locals>.<listcomp>
alignmentsutterance_startutterance_endenspeaker)idrecording_idrp   durationchannelr%   languagerz   wordTrL   zthis-american-life_recordings_z	.jsonl.gzz this-american-life_supervisions_)
recordingssupervisions)r   r    ri   rj   r   absoluter-   r.   rE   r
   itemsset_descriptionrU   loggingwarningappendr   	from_file	enumerater*   r   with_alignmentr   from_recordingsr   from_segmentsr   r   rT   to_file)rc   rG   rd   rj   file_subsetr;   transcriptsr   r   pbarr`   
transcript
audio_pathutt_ixuttr%   rv   segmentrecording_setsupervision_setr<   rs   r=   rf      sn   $






rf   )r   )r   FrI   rJ   )N)*__doc__globr.   r   rerV   collectionsr   concurrent.futuresr   pathlibr   typingr   r   r   r   urllib.errorr	   	tqdm.autor
   lhotser   r   lhotse.audior   r   lhotse.supervisionr   r   r   lhotse.utilsr   r   r   r>   strrH   boolrb   rh   rf   r<   r<   r<   r=   <module>   sR    
 
0
