o
    Si                     @   s  d Z ddlZddlZddlmZ ddlmZmZmZm	Z	 ddl
mZmZ ddlmZmZ ddlmZmZ ddlmZ d	Zd
edefddZdedeeee f fddZddefdedee dee defddZdefddZ		d dedee dede	eef fddZdS )!aA  
About the Earnings 22 dataset:

    The Earnings 22 dataset ( also referred to as earnings22 ) is a 119-hour corpus
    of English-language earnings calls collected from global companies. The primary
    purpose is to serve as a benchmark for industrial and academic automatic speech
    recognition (ASR) models on real-world accented speech.

    This dataset has been submitted to Interspeech 2022. The paper describing our
    methods and results can be found on arXiv at https://arxiv.org/abs/2203.15591.

    @misc{https://doi.org/10.48550/arxiv.2203.15591,
    doi = {10.48550/ARXIV.2203.15591},
    url = {https://arxiv.org/abs/2203.15591},
    author = {Del Rio, Miguel and Ha, Peter and McNamara, Quinten and Miller, Corey and Chandra, Shipra},
    keywords = {Computation and Language (cs.CL), FOS: Computer and information sciences, FOS: Computer and information sciences},
    title = {Earnings-22: A Practical Benchmark for Accents in the Wild},
    publisher = {arXiv},
    year = {2022},
    copyright = {Creative Commons Attribution Share Alike 4.0 International}
    }

    N)Path)DictListOptionalUnion)fix_manifests$validate_recordings_and_supervisions)	RecordingRecordingSet)SupervisionSegmentSupervisionSet)Pathlikez,https://github.com/revdotcom/speech-datasetstextreturnc                 C   s"   |  tddtj} |  } | S )N )	translatestr	maketransstringpunctuationlower)r    r   M/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/earnings22.py	normalize'   s   r   pathc                 C   sf   t | %}|  t }|D ]}|d}|dd ||d < q|W  d    S 1 s,w   Y  d S )N,   r   )openreadlinedictsplit)r   foutliner   r   r   read_metadata/   s   

$r%   .F
target_dirforce_downloadurlc                 C   s"   t ddt d d d  dS )aG  Download and untar the dataset.
    :param target_dir: Pathlike, the path of the dir to store the dataset.
        The extracted files are saved to target_dir/earnings22/
        Please note that the github repository contains other additional datasets and
        using this call, you will be downloading all of them and then throwing them out.
    :param force_download: Bool, if True, download the tar file no matter
        whether it exists or not.
    :param url: str, the url to download the dataset.
    :return: the path to downloaded and extracted directory with data.
    zBDownloading Earnings22 from github repository is not implemented. zPlease visit z) and download the files manually. Please zGfollow the instructions closely as you need to use git-lfs to download zsome of the audio files.N)loggingerror_DEFAULT_URL)r'   r(   r)   r   r   r   download_earnings229   s   
r-   filenamec                 C   s`   t | "}t }|  |D ]}|d}||d  q|W  d    S 1 s)w   Y  d S )N|r   )r   listr   r!   append)r.   r"   
transcriptr$   r   r   r   parse_nlp_fileP   s   

$r3   
corpus_dir
output_dirnormalize_textc              
   C   sn  t | } |  sJ d|  |durt |}|jddd | d }t|d}t|dks1J |  tdd	 |D }| d
 d }t|d}t|dksTJ t	| d }|  t }	|D ],}
|
j
}dt|
}|rut|}t||d|| jdd|| d  |d}|	| qct|	}t||\}}t|| |dur||d  ||d  ||fS )ay  
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply
    read and return them.

    :param corpus_dir: Pathlike, the path of the data dir. The structure is
        expected to mimic the structure in the github repository, notably
        the mp3 files will be searched for in [corpus_dir]/media and transcriptions
        in the directory [corpus_dir]/transcripts/nlp_references
    :param output_dir: Pathlike, the path where to write the manifests.
    :param normalize_text: Bool, if True, normalize the text.
    :return: (recordings, supervisions) pair

    .. caution::
        The `normalize_text` option removes all punctuation and converts all upper case
        to lower case. This includes removing possibly important punctuations such as
        dashes and apostrophes.
    zNo such directory: NT)parentsexist_okmediaz*.mp3}   c                 s   s    | ]}t |V  qd S )N)r	   	from_file).0pr   r   r   	<genexpr>~   s    

z%prepare_earnings22.<locals>.<genexpr>transcriptsnlp_referencesz*.nlpzmetadata.csv g        r   zEnglish-   )idrecording_idstartdurationchannellanguager   z$earnings22_supervisions_all.jsonl.gzz"earnings22_recordings_all.jsonl.gz)r   is_dirmkdirr0   globlensortr
   from_recordingsr%   stemjoinr3   r   r   rF   r1   r   from_segmentsr   r   to_file)r4   r5   r6   	media_diraudio_filesrecording_setnlp_dir	nlp_filesmetadatasupervision_segmentsnlp_filerC   r   ssupervision_setr   r   r   prepare_earnings22Z   sP   
	

r]   )NF)__doc__r*   r   pathlibr   typingr   r   r   r   lhotser   r   lhotse.audior	   r
   lhotse.supervisionr   r   lhotse.utilsr   r,   r   r   r%   boolr-   r3   r]   r   r   r   r   <module>   sH    

