o
    Si                     @   sp  d Z ddlZddlZddlZddlZddlmZ ddlmZ ddl	m
Z
 ddlmZmZmZmZmZmZ ddlmZ ddlmZmZ dd	lmZmZ dd
lmZ ddlmZmZ ddlm Z m!Z! dZ"dZ#dZ$		d#de de%de
fddZ&de de'deeeef  fddZ(	d$de'de de)deeef fddZ*		d%de d ee  de)dee'ee'eeef f f fd!d"Z+dS )&a  
About the medical corpus

A dataset of simulated patient-physician medical interviews with a focus on respiratory cases.
The simulated medical conversation dataset is available on figshare.com.
The dataset is divided into two sets of files: audio files of the simulated conversations in mp3 format, and the transcripts of the audio files as text files.
There are 272 mp3 audio files and 272 corresponding transcript text files.
Each file is titled with three characters and four digits.
RES stands for respiratory, GAS represents gastrointestinal, CAR is cardiovascular, MSK is musculoskeletal, DER is dermatological, and the four following digits represent the case number of the respective disease category.

It is covered in more detail at https://www.nature.com/articles/s41597-022-01423-1.pdf
    N)defaultdict)ProcessPoolExecutor)Path)DictListOptionalSequenceTupleUnion)tqdm)	RecordingRecordingSet)fix_manifests$validate_recordings_and_supervisions)manifests_exist)SupervisionSegmentSupervisionSet)Pathlikeresumable_download)testdevtrain)zaudio.tar.gzzcleantext.tar.gzzmedical_test.infozmedical_dev.infozmedical_train.infoz=https://huggingface.co/datasets/yfyeung/medical/resolve/main/.F
target_dirforce_downloadreturnc              	   C   s   t | } | jddd t}t|ddD ]D}td|  | | }t|dd}tt	| ||d d	|v rWt
j|dd
 t|}||  W d   n1 sRw   Y  q| S )a  
    Download and unzip Medical dataset.

    :param target_dir: Pathlike, the path of the dir to store the dataset.
    :param force_download: bool, if True, download the archive even if it already exists.

    :return: the path to downloaded and extracted directory with data.
    Tparentsexist_okzDownloading MedicaldesczDownloading part: z.tar.gz )filenamer   ztar.gz)ignore_errorsN)r   mkdirMEDICAL_SPLITSr   logginginfostrreplacer   MEDICAL_BASE_URLshutilrmtreetarfileopen
extractall)r   r   dataset_partspart	part_pathpart_dirtar r5   J/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/medical.pydownload_medical*   s&   r7   
corpus_dir
audio_infoc           	   	   C   s   | dd dd ddd\}}}}t| dd dd}| |  }| s5td|  d S tj||d	}t	|d
 tt
| |t|t|t| dd|d}||fS )N,	[]r!   z.mp3zaudio/zNo such file: )pathrecording_id_r   English)idr?   startdurationchannellanguagetext)r)   splitr(   resolveis_filer&   warningr   	from_filer   hashfloat)	r8   r9   
audio_pathrC   endrG   	file_name	recordingsegmentr5   r5   r6   _parse_utteranceN   s$    

rT      subsetnum_jobsc              	   C   s(  t |}|d|  d  }t|}|  }W d   n1 s!w   Y  t|^}g }g }g }	t|ddD ]}
||t||
 q7t|ddD ]}|	 }|du rUqJ|\}}||vrb|| |	| qJt
|}t|	}t||\}}t|| W d   ||fS 1 sw   Y  ||fS )z
    Returns the RecodingSet and SupervisionSet given a dataset part.
    :param subset: str, the name of the subset.
    :param corpus_dir: Pathlike, the path of the data dir.
    :return: the RecodingSet and SupervisionSet for train and valid.
    medical_z.infoNzDistributing tasksr   
Processing)r   r.   read
splitlinesr   r   appendsubmitrT   resultr   from_recordingsr   from_segmentsr   r   )rV   r8   rW   	text_pathfaudio_infosexfutures
recordingssupervisionsr9   futurer^   rR   rS   recording_setsupervision_setr5   r5   r6   _prepare_subsetj   s6   





rk   
output_dirc                 C   s   t | } |  sJ d|  td t}|dur%t |}|jddd tt}t|ddD ]E}td|  t	||d	d
drKtd| d q/t
|| |\}}|durm||d| d  ||d| d  ||d||< q/|S )aF  
    Returns the manifests which consist of the Recordings and Supervisions.
    :param corpus_dir: Path to the Medical dataset.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'recordings' and 'supervisions'.
    zNo such directory: zPreparing Medical...NTr   zDataset partsr   zProcessing Medical subset: medicalzjsonl.gz)r1   rl   prefixsuffixzMedical subset: z already prepared - skipping.medical_supervisions_z	.jsonl.gzmedical_recordings_)rf   rg   )r   is_dirr&   r'   MEDICALr$   r   dictr   r   rk   to_file)r8   rl   rW   subsets	manifestsr1   ri   rj   r5   r5   r6   prepare_medical   s4   
rx   )r   F)rU   )NrU   ),__doc__r&   osr+   r-   collectionsr   concurrent.futures.processr   pathlibr   typingr   r   r   r   r	   r
   	tqdm.autor   lhotse.audior   r   	lhotse.qar   r   lhotse.recipes.utilsr   lhotse.supervisionr   r   lhotse.utilsr   r   rs   r%   r*   boolr7   r(   rT   intrk   rx   r5   r5   r5   r6   <module>   sn     
$


-