o
    2wiY                     @   s
  d Z ddlZddlZddlZddlZddlmZ ddlmZm	Z	m
Z
mZmZ ddlmZmZ ddlmZmZ ddlmZmZ ddlmZmZ d	Zd
edefddZddefdede
e de
e defddZdefddZ		ddede
e dedeeef fddZdS )a  
About the Earnings 21 dataset:

    The Earnings 21 dataset ( also referred to as earnings21 ) is a 39-hour corpus of
    earnings calls containing entity dense speech from nine different financial sectors.
    This corpus is intended to benchmark automatic speech recognition (ASR) systems
    in the wild with special attention towards named entity recognition (NER).

    This dataset has been submitted to Interspeech 2021. The paper describing methods
    and results can be found on arXiv at https://arxiv.org/pdf/2104.11348.pdf

    @misc{delrio2021earnings21,
        title={Earnings-21: A Practical Benchmark for ASR in the Wild},
        author={Miguel Del Rio and Natalie Delworth and Ryan Westerman and Michelle Huang and Nishchal Bhandari and Joseph Palakapilly and Quinten McNamara and Joshua Dong and Piotr Zelasko and Miguel Jette},
        year={2021},
        eprint={2104.11348},
        archivePrefix={arXiv},
        primaryClass={cs.CL}
    }

    N)Path)DictListOptionalTupleUnion)fix_manifests$validate_recordings_and_supervisions)	RecordingRecordingSet)SupervisionSegmentSupervisionSet)Pathlikeresumable_downloadzIhttps://codeload.github.com/revdotcom/speech-datasets/zip/refs/heads/maintextreturnc                 C   s"   |  tddtj} |  } | S )N )	translatestr	maketransstringpunctuationlower)r    r   V/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/lhotse/recipes/earnings21.py	normalize)   s   r   .F
target_dirforce_downloadurlc                 C   s   t d t| } | jddd | d }| d }|d }| r+t d| d |S t|||d	 tj|dd
 t	|}|
 D ]}d|v rP|j|| d qCW d   n1 s[w   Y  tt| d d t|  t| d  |  |S )aG  Download and untar the dataset.
    :param target_dir: Pathlike, the path of the dir to store the dataset.
        The extracted files are saved to target_dir/earnings21/
        Please note that the github repository contains other additional datasets and
        using this call, you will be downloading all of them and then throwing them out.
    :param force_download: Bool, if True, download the tar file no matter
        whether it exists or not.
    :param url: str, the url to download the dataset.
    :return: the path to downloaded and extracted directory with data.
    zDownloading Earnings21 from github repository is not very efficient way how to obtain the corpus. You will be downloading other data as well.Tparentsexist_ok
earnings21zspeech-datasets-main.zipz.lhotse-download.completedzSkipping - z exists.)filenamer   )ignore_errors)pathNzspeech-datasets-main)logginginfor   mkdiris_filer   shutilrmtreezipfileZipFilenamelistextractmover   touch)r   r   r   extracted_dirzip_pathcompleted_detectorzipfr   r   r   download_earnings211   s4   r8   r$   c                 C   s`   t | "}t }|  |D ]}|d}||d  q|W  d    S 1 s)w   Y  d S )N|r   )openlistreadlinesplitappend)r$   r7   
transcriptliner   r   r   parse_nlp_file`   s   

$rA   
corpus_dir
output_dirnormalize_textc              
   C   sL  t | } |  sJ d|  |durt |}|jddd | d }t|d}t|dks1J |  tdd	 |D }| d
 d }t|d}t|dksTJ t }|D ]%}	|	j	}
d
t|	}|rkt|}t|
|
d||
 jdd|d}|| qYt|}t||\}}t|| |dur||d  ||d  ||fS )ay  
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply
    read and return them.

    :param corpus_dir: Pathlike, the path of the data dir. The structure is
        expected to mimic the structure in the github repository, notably
        the mp3 files will be searched for in [corpus_dir]/media and transcriptions
        in the directory [corpus_dir]/transcripts/nlp_references
    :param output_dir: Pathlike, the path where to write the manifests.
    :param normalize_text: Bool, if True, normalize the text.
    :return: (recordings, supervisions) pair

    .. caution::
        The `normalize_text` option removes all punctuation and converts all upper case
        to lower case. This includes removing possibly important punctuations such as
        dashes and apostrophes.
    zNo such directory: NTr    mediaz*.mp3,   c                 s   s    | ]}t |V  qd S )N)r
   	from_file).0pr   r   r   	<genexpr>   s    

z%prepare_earnings21.<locals>.<genexpr>transcriptsnlp_referencesz*.nlp g        r   English)idrecording_idstartdurationchannellanguager   z$earnings21_supervisions_all.jsonl.gzz"earnings21_recordings_all.jsonl.gz)r   is_dirr)   r;   globlensortr   from_recordingsstemjoinrA   r   r   rR   r>   r   from_segmentsr   r	   to_file)rB   rC   rD   	media_diraudio_filesrecording_setnlp_dir	nlp_filessupervision_segmentsnlp_filerO   r   ssupervision_setr   r   r   prepare_earnings21j   sL   
	

rg   )NF) __doc__r'   r+   r   r-   pathlibr   typingr   r   r   r   r   lhotser   r	   lhotse.audior
   r   lhotse.supervisionr   r   lhotse.utilsr   r   _DEFAULT_URLr   r   boolr8   rA   rg   r   r   r   r   <module>   sL    	
/
