o
    Si                     @   s|  d Z ddlZddlZddlmZ ddlmZ ddlmZ ddl	m
Z
 ddlmZmZmZmZmZmZ ddlmZ dd	lmZmZ dd
lmZmZ ddlmZmZ ddlmZ ddlm Z m!Z! ddl"m#Z# dZ$ed#ddZ%	d$de#de#de&dee& deeee f  f
ddZ'		d%de&de#dee( de)deee!f f
ddZ*		d%de#d ee# de)dee&ee&eee!f f f fd!d"Z+dS )&a  
About the Bengali.AI Speech corpus

The competition dataset comprises about 1200 hours of recordings of Bengali speech.
Your goal is to transcribe recordings of speech that is out-of-distribution with respect to the training set.

Note that this is a Code Competition, in which the actual test set is hidden.
In this public version, we give some sample data in the correct format to help you author your solutions.
The full test set contains about 20 hours of speech in almost 8000 MP3 audio files.
All of the files in the test set are encoded at a sample rate of 32k, a bit rate of 48k, in one channel.

It is covered in more detail at https://arxiv.org/abs/2305.09688

Please download manually by
kaggle competitions download -c bengaliai-speech
    N)defaultdict)ProcessPoolExecutor)contextmanager)Path)DictListOptionalSequenceTupleUnion)tqdm"get_ffmpeg_torchaudio_info_enabled"set_ffmpeg_torchaudio_info_enabled)	RecordingRecordingSet)fix_manifests$validate_recordings_and_supervisions)manifests_exist)SupervisionSegmentSupervisionSet)Pathlike)trainvalidtestreturnc                  c   s0    t  } td z
d V  W t|  d S t|  w )NFr   )enabled r   S/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/bengaliai_speech.pydisable_ffmpeg_torchaudio_info)   s   r    
corpus_dir
audio_pathaudio_idtextc              	   C   sR   |  }| std|  d S tj||d}t|||d|jddd}||fS )NzNo such file: )pathrecording_idg        r   Bengali)idr&   r$   startdurationchannellanguage)resolveis_fileloggingwarningr   	from_filer   r*   )r!   r"   r#   r$   	recordingsegmentr   r   r   _parse_utterance3   s$   
r4      subset
audio_infonum_jobsc                 C   sj  t |}| dkr|d }n|d }t|d}t  t|m}g }g }g }	t|ddD ].}
tjt	|
d 
dd	}|d
urM|| vrHq-|| }nd
}||t||
|| q-t|ddD ]}| }|d
u rmqb|\}}|| |	| qbt|}t|	}W d
   n1 sw   Y  W d
   ||fS W d
   ||fS 1 sw   Y  ||fS )z
    Returns the RecodingSet and SupervisionSet given a dataset part.
    :param subset: str, the name of the subset.
    :param corpus_dir: Pathlike, the path of the data dir.
    :return: the RecodingSet and SupervisionSet for train and valid.
    r   	test_mp3s
train_mp3sz*.mp3zDistributing tasksdescr5   z.mp3r    N
Processing)r   listrglobr   r   r   osr%   splitstrreplacekeysappendsubmitr4   resultr   from_recordingsr   from_segments)r6   r!   r7   r8   	part_pathaudio_pathsexfutures
recordingssupervisionsr"   r#   r$   futurerG   r2   r3   recording_setsupervision_setr   r   r   _prepare_subsetP   sJ   






rS   
output_dirc                 C   s  t | } |  sJ d|  td t}|dur%t |}|jddd tt}t| d }|	 
 }W d   n1 s@w   Y  i }i }|dd D ]/}	d|	v rh|	dd	d
d}	|	d ||	d < qOd|	v r~|	dd	d
d}	|	d ||	d < qOt|ddD ]_}
td|
  t|
|dddrtd|
 d qt|
| |
dkr|n|
dkr|nd|d\}}t||\}}t|| |dur||d|
 d  ||d|
 d  ||d||
< q|S )aO  
    Returns the manifests which consist of the Recordings and Supervisions
    :param corpus_dir: Path to the Bengali.AI Speech dataset.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    zNo such directory: zPreparing Bengali.AI Speech...NT)parentsexist_okz	train.csvr5   z,trainr    ,r   z,validzDataset partsr;   z%Processing Bengali.AI Speech subset: bengaliai_speechzjsonl.gz)partrT   prefixsuffixzBengali.AI Speech subset: z already prepared - skipping.r   r   )r6   r!   r7   r8   bengaliai_speech_supervisions_z	.jsonl.gzbengaliai_speech_recordings_)rN   rO   )r   is_dirr/   infoBENGALIAI_SPEECHmkdirr   dictopenread
splitlinesrC   rA   r   r   rS   r   r   to_file)r!   rT   r8   subsets	manifestsfr7   
train_info
valid_infolinerY   rQ   rR   r   r   r   prepare_bengaliai_speech   sl   



rm   )r   N)r    )Nr5   ),__doc__r/   r@   collectionsr   concurrent.futures.processr   
contextlibr   pathlibr   typingr   r   r   r	   r
   r   	tqdm.autor   lhotser   r   lhotse.audior   r   	lhotse.qar   r   lhotse.recipes.utilsr   lhotse.supervisionr   r   lhotse.utilsr   r`   r   rB   r4   rb   intrS   rm   r   r   r   r   <module>   sl     
 

5