o
    Si"                     @   s  d Z ddlZddlZddlZddlZddlZddlZddlZddlm	Z	 ddl
mZmZmZ ddlmZ ddlmZ ddlmZmZ ddlmZ dd	lmZmZ dd
lmZmZmZmZmZ 	d.dedee  de	fddZ!dddddZ"e#dZ$e#dZ%e#dZ&e#dZ'e#dZ(de)de)de)de)d e)d!e)fd"d#Z*d$ed%e+defd&d'Z,		(	(	)	)	)d/d*ed+ee dee) dee) dee) d ee) d!ee) dee)eeef f fd,d-Z-dS )0a  
The ATCOSIM Air Traffic Control Simulation Speech corpus is a speech database of air traffic control (ATC) operator speech, provided by Graz University of Technology (TUG) and Eurocontrol Experimental Centre (EEC). It consists of ten hours of speech data, which were recorded during ATC real-time simulations using a close-talk headset microphone. The utterances are in English language and pronounced by ten non-native speakers. The database includes orthographic transcriptions and additional information on speakers and recording sessions. It was recorded an annotated by Konrad Hofbauer.

See https://www.spsc.tugraz.at/databases-and-tools/atcosim-air-traffic-control-simulation-speech-corpus.html for more details.
    N)Path)DictOptionalUnion)tqdm)$validate_recordings_and_supervisions)	RecordingRecordingSet)fix_manifests)SupervisionSegmentSupervisionSet)PathlikeSecondscompute_num_samplesis_module_availableresumable_download.F
target_dirforce_downloadreturnc                 C   s  t dstddd l}t| } | jddd d}| | d }| | }|d }| r:td	| d
| d |S td| d|d|d t	
t|d  dkrXtdtj|dd | }|| | rndn| rtdn| rzdndd }t|jdi |dig}	|	r|	 }
|j|
|dkd}|d}|
 rt||  |jdi ||iD ]}|r| s|  s|	!| qn|
" rt#d n|j$|| fi ||i |	s|%  |&  |S )Npycdlibz#Please 'pip install pycdlib' first.r   Tparentsexist_okatcosimz.isoz
.completedz	Skipping z	 because z exists.z3https://www2.spsc.tugraz.at/databases/ATCOSIM/.ISO/l     5 )filenamecompleted_file_sizer   rb cd5f0c82be46242a75d3382e340f6dcazMD5 checksum does not match)ignore_errorsudfrrjolietiso_path/rr_path)	rockridgezsymlink not implemented )'r   ImportErrorr   r   mkdiris_filelogginginfor   hashlibmd5openread	hexdigestRuntimeErrorshutilrmtreePyCdlibhas_udfhas_rock_ridge
has_jolietcollectionsdeque
get_recordpopleftfull_path_from_dirrecordlstripis_dirosmakedirslist_childrenis_dot	is_dotdotappend
is_symlinkwarningget_file_from_isoclosetouch)r   r   r   dataset_nameiso_path
corpus_dircompleted_detectorr#   path_argrecordsrabs_pathrel_pathchildr(   r(   J/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/atcosim.pydownload_atcosim!   sb   



rW   hmmohhellovicinity)hmohhhallo	viscinityz<FL>\s*</FL>z<OT>(.*?)</OT>z=(\w+)z(\w+)=z  +textsilence_sym
breath_symforeign_sympartial_symunknown_symc           	      C   s  t d| } g }|  D ])}|d dks|d dkr$||dd   q|tv r0|t|  q|| qd| } | d|} | d|} t|| } |d krZ| d	d
} nt	|| } t
|| } dD ]}| ||} qh| dd} td| } |  } | S )Nz\1r   @~    z[EMPTY]z[HNOISE]= )z
[FRAGMENT]z
[NONSENSE]z	[UNKNOWN]z	AIR SPACEAIRSPACE)OFF_TALK_PATTERNsubsplitrF   	FIX_TYPOSjoinupperreplaceFOREIGN_PATTERNINTERRUPTED_PATTERN1INTERRUPTED_PATTERN2WHITESPACE_PATTERNstrip)	r`   ra   rb   rc   rd   re   resultwunkr(   r(   rV   text_normalizeg   s,   	r|   durationsampling_ratec                 C   s   t | || S )aL  
    A handful of supervision durations do not compute to a round number of
    samples at the original recording sampling rate.

    This causes problem later using compute_num_frames(). Full description:
    https://github.com/lhotse-speech/lhotse/issues/1064

    Return: duration that computes to a round number of samples.
    )r   )r}   r~   r(   r(   rV   fix_duration   s   
r   rk   <unk>rN   
output_dirc                 C   s  t dstdddl}t| } |  sJ d|  |dur*t|}|jddd | d }|j|tjd	}	t	j
|d
 dd}
tj
|d dd}t|	 dt|	dD ]k\}}|jr]qUt|j|||||d}|dkrmqUt| d |j |j |j d }tj||jd}t|jd }td|j dddd|d|jdt|j|jdd||j|jd  d|jid
}|
 | | | qUW d   n1 sw   Y  W d   n1 sw   Y  t	!|
j"}t!|j"}t#$d ||fS )a7  
    Returns the manifests which consist of the Recordings and Supervisions

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param silence_sym: str, silence symbol
    :param breath_sym: str, breath symbol
    :param foreign_sym: str, foreign symbol.
    :param partial_sym: str, partial symbol. When set to None, will output partial words
    :param unknown_sym: str, unknown symbol
    :return: The RecordingSet and SupervisionSet with the keys 'audio' and 'supervisions'.
    pandasz"Please 'pip install pandas' first.r   NzNo such directory: Tr   zTXTdata/fulldata.csv)quotingzatcosim_recordings_all.jsonl.gz)	overwritez!atcosim_supervisions_all.jsonl.gz	Preparing)desctotal)ra   rb   rc   rd   re   rk   WAVdataz.wav)recording_idd   atcosim__06dg        Englishrh   	orig_text)
idr   startr}   channellanguager`   speakergendercustomzManifests are lazily materialized. You may want to call `lhotse.qa.fix_manifests()` to ensure that all supervisions fall within the corresponding recordings.)%r   r)   r   r   r@   r*   read_csvcsv
QUOTE_NONEr	   open_writerr   r   iterrowslenrecording_corruptr|   transcriptionstr	directorysubdirectoryr   r   	from_filer   int
length_secr   r   r~   
speaker_idrr   writefrom_jsonl_lazypathr,   rH   )rN   r   ra   rb   rc   rd   re   pdcsv_pathdfrecs_writersups_writeridxrowr`   wav_path	recording	length100segment
recordingssupervisionsr(   r(   rV   prepare_atcosim   s   	

 7r   )r   F)Nrk   rk   r   r   r   ).__doc__r:   r   r.   r,   rA   rer4   pathlibr   typingr   r   r   	tqdm.autor   lhotser   lhotse.audior   r	   	lhotse.qar
   lhotse.supervisionr   r   lhotse.utilsr   r   r   r   r   boolrW   rp   compilert   rm   ru   rv   rw   r   r|   r   r   r   r(   r(   r(   rV   <module>   s    
9





*