o
    Si                     @   s  d Z ddlZddlZddlZddlZddlZddlmZmZ ddl	m
Z
 ddlmZmZmZ ddlmZ ddlmZmZmZmZmZ ddlmZmZ dd	lmZmZ d
Z		ddedede
fddZ			d dedee dede dee!eeef f f
ddZ"de
de!de
defddZ#dS )!a  
The Grid Corpus is a large multitalker audiovisual sentence corpus designed to support joint
computational-behavioral studies in speech perception. In brief, the corpus consists of high-quality
audio and video (facial) recordings of 1000 sentences spoken by each of 34 talkers (18 male, 16 female),
for a total of 34000 sentences. Sentences are of the form "put red at G9 now".

Source: https://zenodo.org/record/3625687
    N)ProcessPoolExecutoras_completed)Path)DictOptionalUnion)tqdm)	RecordingRecordingSetSupervisionSetfix_manifests$validate_recordings_and_supervisions)AlignmentItemSupervisionSegment)Pathlikeis_module_availablez10.5281/zenodo.3625687.F
target_dirforce_downloadreturnc              	   C   s  t dstdt| }|jddd |d }| r|r-tjdt dd|d |  t	|
dd	d
D ]}t|}|| W d   n1 sMw   Y  q6i ddddddddddddddddddddddddddddddddddi dddddd d!d"d"d!d#d#d$d$d%d&d&d%d'd(d(d'd)d)d*d*d+d,d,d+d-d-}|d. }tj|t_t }| D ]"\}	}
tj||
}tj||	}t|| td/|
 d0|	  qt| t|| |S )1a  
    Download and untar the dataset, supporting both LibriSpeech and MiniLibrispeech

    :param target_dir: Pathlike, the path of the dir to storage the dataset.
    :param dataset_parts: "librispeech", "mini_librispeech",
        or a list of splits (e.g. "dev-clean") to download.
    :param force_download: Bool, if True, download the tars no matter if the tars exist.
    :param alignments: should we download the alignments. The original source is:
        https://github.com/CorentinJ/librispeech-alignments
    :param base_url: str, the url of the OpenSLR resources.
    :param alignments_url: str, the url of LibriSpeech word alignments
    :return: the path to downloaded and extracted directory with data.
    
zenodo_getzLTo download Grid Audio-Visual Speech Corpus please 'pip install zenodo_get'.T)parentsexist_okz.downloadedzzenodo_get )shellcheckcwdz*.zipzUnzipping files)descNs1s2s3s4s5s6s7s8s9s10s13s11s12s14s15s16s17s18s19s20s21s22s23s24s25s26s27s28s29s30s31s32s33s34
alignmentszCopied entire folder from z to )r   RuntimeErrorr   mkdirexists
subprocessrunGRID_ZENODO_IDtouchr   globzipfileZipFile
extractallospathabspathtempfiletempdirmkdtempitemsjoinshutilcopytreeprintrmtreerename)r   r   
corpus_dirdownload_markerpfspeaker_fix_map	input_dirtemp_alignment_dir
tgt_folder
src_foldersrc_pathtgt_path rc   G/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/grid.pydownload_grid!   s   	
 !%
re   T   rX   
output_dirwith_supervisionsnum_jobsc                 C   s  t | } | d }| sJ | d }| sJ g }g }g }tt | d}	dd |	D }	t|\}
|	D ]}|jj}||
t	|||| q5t
t|t|ddD ]3}z| }|du r`W qSW n tys } zW Y d}~qSd}~ww |\}}|| |dur|| qSW d   n1 sw   Y  t|}|rt|}t||\}}t|| |durt |}||d	  |r||d
  d|i}|r|j|d |S )a  
    Returns the manifests which consist of the Recordings and Supervisions.
    When all the manifests are available in the ``output_dir``, it will simply read and return them.

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param with_supervisions: bool, when False, we'll only return recordings; when True, we'll also
        return supervisions created from alignments, but might remove some recordings for which
        they are missing.
    :param num_jobs: int, number of parallel jobs.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'.
    r?   	audio_25kz*.mpgc                 S   s   g | ]
}d t |vr|qS )MACOSX)str).0r[   rc   rc   rd   
<listcomp>   s    z prepare_grid.<locals>.<listcomp>zScanning videos)totalr   Nzgrid_recordings.jsonl.gzzgrid_supervisions.jsonl.gz
recordings)supervisions)r   is_dirlistrglobr   parentnameappendsubmitprocess_singler   r   lenresult	Exceptionr
   from_recordingsr   from_segmentsr   r   to_fileupdate)rX   rg   rh   ri   ali_dir	audio_dirrp   rq   futuresall_mpg_filesex
video_pathspeakerr[   r{   e	recordingmaybe_supervisionansrc   rc   rd   prepare_grid{   sf   







r   r   r   r   c           
      C   s   | j }ztj| | d| d}W n ty. } ztd|  d|  W Y d }~d S d }~ww d }|| | d}|rk| rkdd dd	 |  D D }	t	|j
|j
d
|j|jddd	 |	D d|d|	id	}||fS )N_)recording_idzUnexpected error for z: z.alignc              	   S   s<   g | ]\}}}t |t|d  tt|t| d  dqS )i  )symbolstartduration)r   floatint)rm   br   wrc   rc   rd   rn      s    
z"process_single.<locals>.<listcomp>c                 s   s    | ]}|  V  qd S )N)split)rm   linerc   rc   rd   	<genexpr>   s    z!process_single.<locals>.<genexpr>g         c                 s   s     | ]}|j d kr|j V  qdS )silN)r   )rm   itemrc   rc   rd   r      s    Englishword)	idr   r   r   channeltextlanguager   	alignment)stemr	   	from_filer|   rU   with_suffixis_file	read_text
splitlinesr   r   r   channel_idsrR   )
r   r   r   rh   video_idr   r   supervisionali_pathalirc   rc   rd   ry      s8   
ry   )r   F)NTrf   )$__doc__rK   rS   rC   rN   rH   concurrent.futuresr   r   pathlibr   typingr   r   r   	tqdm.autor   lhotser	   r
   r   r   r   lhotse.supervisionr   r   lhotse.utilsr   r   rE   boolre   r   rl   r   ry   rc   rc   rc   rd   <module>   s\    
\
M