o
    Si                     @   s   d Z ddlZddlZddlmZ ddlmZmZ ddlm	Z	 ddl
mZmZ ddlmZ ddlmZmZ dd	lmZmZ dd
lmZmZ 	ddeddfddZdedefddZ		ddededededeeeeeeef f f f
ddZdS )uU  
Description taken from official website: https://datasets.kensho.com/datasets/spgispeech
SPGISpeech consists of 5,000 hours of recorded company earnings calls and their respective
transcriptions. The original calls were split into slices ranging from 5 to 15 seconds in
length to allow easy training for speech recognition systems. Calls represent a broad
cross-section of international business English; SPGISpeech contains approximately 50,000
speakers, one of the largest numbers of any speech corpus, and offers a variety of L1 and
L2 English accents. The format of each WAV file is single channel, 16kHz, 16 bit audio.

Transcription text represents the output of several stages of manual post-processing.
As such, the text contains polished English orthography following a detailed style guide,
including proper casing, punctuation, and denormalized non-standard words such as numbers
and acronyms, making SPGISpeech suited for training fully formatted end-to-end models.

Official reference:

O’Neill, P.K., Lavrukhin, V., Majumdar, S., Noroozi, V., Zhang, Y., Kuchaiev, O., Balam,
J., Dovzhenko, Y., Freyberg, K., Shulman, M.D., Ginsburg, B., Watanabe, S., & Kucsko, G.
(2021). SPGISpeech: 5, 000 hours of transcribed financial audio for fully formatted
end-to-end speech recognition. ArXiv, abs/2104.02014.

ArXiv link: https://arxiv.org/abs/2104.02014
    N)Path)DictUnion)tqdm)	RecordingRecordingSet)parallel_map)manifests_existread_manifests_if_cached)SupervisionSegmentSupervisionSet)PathlikeSeconds.
target_dirreturnc                 C   s   t d dS )z
    Download and untar the dataset.

    NOTE: This function just returns with a message since SPGISpeech is not available
    for direct download.

    :param target_dir: Pathlike, the path of the dir to storage the dataset.
    zSPGISpeech is not available for direct download. Please fill out the form at https://datasets.kensho.com/datasets/spgispeech to download the corpus.N)logginginfo)r    r   M/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/spgispeech.pydownload_spgispeech&   s   r   textc                 C   s"   |  tddtj} |  } | S )N )	translatestr	maketransstringpunctuationlower)r   r   r   r   	normalize7   s   r   T   
corpus_dir
output_dirnormalize_textnum_jobsc                 C   sN  t | } |  sJ d|  | d  r| n| d }ddg}i }t |}|jddd t||dddd}|D ]}td	|  t||ddd
rStd| d q7i }dt dtfddat	
|d| d &}	ttt|| d|dddD ]}
|
j||
j< |	|
 qzW d   n1 sw   Y  t
|d| d e}t| | d dK}t| t|ddD ]8}| d}|d ddd d!}|d" }|rt|}|dd }t||||d|| d#d$}|| qW d   n1 sw   Y  W d   n	1 sw   Y  t	|	jt|jd%||< q7td& |S )'a  
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply read and return them.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param normalize_text: Bool, if True, normalize the text (similar to ESPNet recipe).
    :param num_jobs: int, the number of jobs to use for parallel processing.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'.

    .. note::
        Unlike other recipes, output_dir is not Optional here because we write the manifests
        to the output directory while processing to avoid OOM issues, since it is a large dataset.

    .. caution::
        The `normalize_text` option removes all punctuation and converts all upper case to lower case.
        This includes removing possibly important punctuations such as dashes and apostrophes.
    zNo such directory: train
spgispeechvalT)parentsexist_okzjsonl.gz)dataset_partsr"   prefixsuffixlazyzProcessing SPGISpeech subset: )partr"   r+   r,   zSPGISpeech subset: z already prepared - skipping.pr   c                 S   s"   t j| | jj d| j d}|S )N_)recording_id)r   	from_fileparentstem)r/   rr   r   r   audio_read_workerz   s   r6   spgispeech_recordings_z	.jsonl.gzz*.wav)r$   z Processing SPGISpeech recordings)descNspgispeech_supervisions_z.csvr5   zProcessing utterances|r   /r0   z.wavr      English)idr1   r   speakerstartdurationlanguage)
recordingssupervisionszManifests are lazily materialized. You may want to call `lhotse.qa.fix_manifests()` to ensure that all supervisions fall within the corresponding recordings.)r   is_dirmkdirr
   r   r   r	   r   r6   r   open_writerr   r   rglobrA   r>   writer   opennextstripsplitreplacer   r   from_jsonl_lazypathwarning)r!   r"   r#   r$   	audio_dirr*   	manifestsr.   	durations
rec_writer	recording
sup_writerflinepartsr1   r   spkidsegmentr   r   r   prepare_spgispeech?   s   
	 

r]   )r   )Tr    )__doc__r   r   pathlibr   typingr   r   	tqdm.autor   lhotse.audior   r   lhotse.parallelr   lhotse.recipes.utilsr	   r
   lhotse.supervisionr   r   lhotse.utilsr   r   r   r   r   boolintr]   r   r   r   r   <module>   s>    
