o
    Si6                     @   sd  d Z ddlZddlZddlmZ ddlmZ ddlmZ ddl	m
Z
mZmZmZmZmZmZ ddlmZ ddlmZ dd	lmZmZmZ dd
lmZmZ ddlmZmZ ddlm Z m!Z!m"Z"m#Z# dZ$dZ%G dd dZ&			d!de dee'ee' f de(defddZ)		d"de de dee  dee'ee' f dee'ee' f dee'ee'eeef f f fdd Z*dS )#a  
About GigaST corpus:
GigaST: A large-scale speech translation corpus
https://arxiv.org/abs/2204.03939

GigaST is a large-scale speech translation corpus, by translating the transcriptions in GigaSpeech, a multi-domain English speech recognition corpus with 10,000 hours of labeled audio. The training data is translated by a strong machine translation system and the test data is produced by professional human translators.

Note:
This recipe assume you have downloaded and prepared GigaSpeech by lhotse.
We assure manifests_dir contains the following files:
  - gigaspeech_recordings_TEST.jsonl.gz
  - gigaspeech_recordings_XL.jsonl.gz
  - gigaspeech_supervisions_TEST.jsonl.gz
  - gigaspeech_supervisions_XL.jsonl.gz
    N)defaultdict)ProcessPoolExecutor)Path)AnyDictListOptionalSequenceTupleUnion)tqdm)CutSet)AudioSource	RecordingRecordingSet)manifests_existread_manifests_if_cached)SupervisionSegmentSupervisionSet)PathlikeSecondsis_module_availableresumable_download)XLLMSXSDEVTEST)dezhc                   @   s&   e Zd ZdedefddZdd ZdS )GigaST
corpus_dirlangc                 C   s`   t |d| d }tt|d | _W d    n1 sw   Y  tt| jd | _d S )NGigaST..jsonaudiossegments)openiterjsonloadaudio_generatornextsegment_generator)selfr#   r$   f r2   I/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/gigast.py__init__$   s   zGigaST.__init__c                 C   s>   zt | jW S  ty   tt | jd | _t | j Y S w )Nr(   )r.   r/   StopIterationr*   r-   )r0   r2   r2   r3   get_next_line)   s   zGigaST.get_next_lineN)__name__
__module____qualname__r   strr4   r6   r2   r2   r2   r3   r"   #   s    r"   .allF
target_dir	languagesforce_downloadreturnc                 C   s   t | } | jddd |dkrt}nt|tr|g}nt|}t|ddD ];}td|  | d| d }|	 rHtd	| d
| d q%d| d}| | }t
d| ||d |  q%| S )a~  
    Download GigaST dataset

    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param languages: one of: 'all' (downloads all known languages); a single language code (e.g., 'en'), or a list of language codes.
    :param force_download: bool, if True, download the archive even if it already exists.

    :return: the path to downloaded with data.
    Tparentsexist_okr<   zDownloading GigaSTdesczDownload language: r;   
_completedz	Skipping z	 because z exists.r%   r&   zJhttps://lf3-nlp-opensource.bytetos.com/obj/nlp-opensource/datasets/GigaST/)filenamer?   )r   mkdirGIGAST_LANGS
isinstancer:   listr   logginginfois_filer   touch)r=   r>   r?   r$   completed_detector	json_name	json_pathr2   r2   r3   download_gigast1   s,   

rS   autor#   manifests_dir
output_dirdataset_partsc                 C   sJ  t | } t |}|  sJ d|  td |dkrtn|}t|tr(|g}|dkr.dn|}t|tr8|g}|durGt |}|jddd td d	}d
}t||||d}|dus^J t	|t	|ksvJ t	|t	|t
| |ft|ddD ]}|tv sJ |tftd| d t| |}	| D ]\}
}g }td|
  t|
|dd
drtd| d|
 d q|	 }t|d ddD ]9}|d |jkr|}|
dkr|d |d d|_nd|d i|_|| z|	 }W q ty   Y  nw qtd| d|
  t|}||d| d|
 d   qq|dS )!a  
    Returns the manifests which consist of the Recordings and Supervisions
    :param corpus_dir: Path to the GigaST dataset
    :param manifests_dir: Path to the GigaSpeech manifests
    :param output_dir: Pathlike, the path where to write the manifests.
    :param languages: 'auto' (prepare all languages) or a list of language codes.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    zNo such directory: zPreparing GigaST...rT   )r   r   NTrA   zLoading manifest
gigaspeechzjsonl.gz)rW   rV   prefixsuffixzProcessing GigaSTrD   zLoading GigaST.r&   zProcessing z	gigast-de)partrV   rY   rZ   zGigaST z	 subset: z already prepared - skipping.supervisionsz"Generate the extented supervisionssidr   text_rawextra)r^   r_   zSaving GigaST zgigast-_supervisions_z	.jsonl.gz)r   is_dirrL   rM   rI   rJ   r:   rH   r   lenrK   keysr   r"   itemsr   r6   idcustomappendr5   r   from_segmentsto_file)r#   rU   rV   r>   rW   rY   rZ   	manifestsr$   gigast	partitionmr\   cur_linesupnew_supsupervisionsetr2   r2   r3   prepare_gigast\   s   









rr   )r;   r<   F)rT   rT   )+__doc__r+   rL   collectionsr   concurrent.futuresr   pathlibr   typingr   r   r   r   r	   r
   r   	tqdm.autor   lhotser   lhotse.audior   r   r   lhotse.recipes.utilsr   r   lhotse.supervisionr   r   lhotse.utilsr   r   r   r   GIGASPEECH_PARTSrI   r"   r:   boolrS   rr   r2   r2   r2   r3   <module>   sV    $
/