o
    SiO                     @   s^  d Z ddlZddlZddlZddlZddlZddlm  mZ	 ddl
mZ ddlmZmZmZmZmZ ddlmZmZ ddlmZmZ ddlmZmZ ddlmZmZmZmZ 	
ddedee  defddZ!dd Z"ddddddddZ#dZ$i ddddddd d!d"d#d$d%d&d'd(d)d*d+d,d-d.d/d0d1d2d3d4d5d6d7d8d9d:d;d<d=d>d?d@dAdBdCdDdE	Z%dFZ&i dGdHdIdJdKdLdMdNdOdPdQdNdRdNdSdTdUdVdWdXdYdZd[d\d]dZd^d\d_d`dadbdcddi dedddfdgdhdgdidjdkdjdldjdmdndodpdpdpdqdrdsdtdudvdwdxdydzd{d|d}d~ddi ddzdd|dd|ddddddddddddddddddddddddddddi ddddddddddddddddddddddddddddddddddi ddddddddddddddddddddddddēddƓddȓddʓdd̓ddΓi ddГddғddԓdd֓ddؓddړddddddddߓddddddddddddddi ddddddddzddddddddddddddddddd dddddddi ddddd	d
ddddddddddddddddddddddddd d!d"d!i d#d$d%d&d'd(d)d*d+d,d-d!d.d/d0d1d2d3d4d5d6d7d8d9d8d9d:d;d<d=d>d=d?d;i d@dAdBdCdDdEdFdGdHdEdIdJdKdLdMdLdNdOdPdQdRdSdTdSdUdVdWdVdXdYdZd[d\d]i d^d_d`d_dadbdcdbdddedfdgdhdidjdkdldmdndodpdqdrdsdtdudvdudwdxdydzd{d|d}d}d}d~ddddddddddZ'dZ(de)de)de)de)de)de)de)de)fddZ*ddddZ+dd Z,								ddedee dee) dee) dee) dee) dee) dee) dee) dee)eeef f fddZ-dS (  u  
University of West Bohemia Air Traffic Control Communication (UWB-ATCC)

Šmídl, Luboš, 2011, Air Traffic Control Communication, LINDAT/CLARIAH-CZ digital library at the Institute of Formal and Applied Linguistics (ÚFAL), Faculty of Mathematics and Physics, Charles University, http://hdl.handle.net/11858/00-097C-0000-0001-CCA1-0.

Corpus contains recordings of communication between air traffic controllers and pilots. The speech is manually transcribed and labeled with the information about the speaker (pilot/controller, not the full identity of the person). The corpus is currently small (20 hours) but we plan to search for additional data next year. The audio data format is: 8kHz, 16bit PCM, mono.
    N)Path)AnyDictListOptionalUnion)fix_manifests$validate_recordings_and_supervisions)	RecordingRecordingSet)SupervisionSegmentSupervisionSet)Pathlikeis_module_availableresumable_downloadsafe_extract_rar.F
target_dirforce_downloadreturnc                 C   s   t dstddd l}t| } | jddd d}| | d }| | }|d }| r:td	| d
| d |S td| d|d|d t	
t|d  dkrXtdtj|dd ||}t||d W d    n1 suw   Y  |  |S )Nrarfilez#Please 'pip install rarfile' first.r   Tparentsexist_ok
ZCU_CZ_ATCz.rarz
.completedz	Skipping z	 because z exists.z\https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11858/00-097C-0000-0001-CCA1-0/i")filenamecompleted_file_sizer   rb 44b4ea6ffe0ac0bf8fd29f14a735d23azMD5 checksum does not match)ignore_errors)path)r   ImportErrorr   r   mkdiris_filelogginginfor   hashlibmd5openread	hexdigestRuntimeErrorshutilrmtreeRarFiler   touch)r   r   r   dataset_namerar_path
corpus_dircompleted_detectorrar r5   K/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/uwb_atcc.pydownload_uwb_atcc   s8   
r7   c                 C   s   d dd td| D S )N c                 s   s"    | ]}t |d kr|V  qdS )MnN)unicodedatacategory).0cr5   r5   r6   	<genexpr>@   s    z strip_accents.<locals>.<genexpr>NFD)joinr:   	normalize)sr5   r5   r6   strip_accents?   s   

rC   DECIMALzDECIMAL DECIMALzFLIGHT LEVEL	LOCALIZERz
PAPA ROMEORUNWAY)r   z..FLFlLLZPRRWY)HDOVOZVLMAALFABBRAVOCCHARLIEDDELTAEECHOFFOXTROTGGOLFHHOTELIINDIAJJULIETTKKILOLLIMAMMIKENNOVEMBEROOSCARPPAPAQQUEBECROMEOSIERRATANGOUNIFORMVICTORWHISKEYXRAYYANKEEZULU)	RSTUVWXYZ)ATRCRJCSACTODEMDMEEFCIFRILSKLMQNHTMAUPSVFRVMCVORACCELARATINGACCELERATINGACCPETACCEPTACTUALYACTUALLY	AFETRNOON	AFTERNOONAFFRIMAFFIRMAFTENOONAFTERNONAIRBORNAIRBORNEALLRIGHTz	ALL RIGHTALTITUEDALTITUDEAPPORACHAPPROACHAPPORACHINGAPPROACHINGAPPRAOCH
APPROCHINGAPPRONAPRONAPROVEDAPPROVEDAPROXIMATELYAPPROXIMATELYAPROXIMETLYAUSRTIANAUSTRIANAUSTRAIN	AVAILBALE	AVAILABLE	AVALIABLE	AVIALABLE	BOARDLINEz
BROAD LINE	BRUSSELESBRUSSELSCANCELED	CANCELLED	CANCELING
CANCELLING	CHALENGER
CHALLENGERCHECHCZECHCIMBCLIMBCIMBINGCLIMBINGCLEARDCLEARED	CLEARENCE	CLEARANCECLIBMCLIMBINCLMBINGCOMMINGCOMINGCONACTCONTACTCONATACT	CONNTINUECONTINUECONTACCONTACECONTATCCONTROLECONTROLCONTROLO
COORECTION
CORRECTIONCOPPIEDCOPIED	CORECTIONCOTACTCOTINUECOTNACTCURCUITCIRCUIT
DEAPARTURE	DEPARTURE	DEAPRTUREDECENDDESCENDDEGEESDEGREESDEGRES
DENCENDING
DESCENDINGDEPARURE	DESCEDINGDESCENDESCENG
DESCENIDNGDESCNEDDESECENDDESENDDESSCENDDIRECDIRECT	DISCRTION
DISCRETIONEADINGHEADING	ESTABLSIH	ESTABLISHESTALBISHEDESTABLISHED
ETABLISHEDETIOPIAN	ETHIOPIANEVNINGEVENINGEXEPECTEXPECTEXPERIANCINGEXPERIENCING	EXTANSION	EXTENSIONFAVOURFAVORFINNARIFINNAIRFLIGTHFLIGHTFOLOWFOLLOWFOURTYFORTY
GERMANWINGGERMANWINGSGOAHEADzGO AHEADGODDGOODGOODBYEzGOOD BYEGROSJETGROSSJETGROUDNGROUNDHALLOHELLOHEADINTHEADNIGHEDINGHODLINGHOLDINGHUDREDHUNDRED
IFORMATIONINFORMATIONINBOUDINBOUNDINBOUDNINFOMRATIONINITIALY	INITIALLYINTERESCTIONINTERSECTIONKDNOTSKNOTSKNTOSLANDALANDLCIMBLENGHTLENGTHLENGTLEVELEDLEVELLEVLELIGHERLIGHTERLOUNDLOUDLUFHANSA	LUFTHANSA	LUFHTANSA	LUFTAHNSA	LUFTHASNAMAINATINANINGMAINTAINING	MAINTAING
MAINTANINGMAITAINMAINTAINMINTUESMINUTESMOLDAVAMOLDOVAMOORNINGMORNING	NEAGATIVENEGATIVENINTEENNINETEENNINTYNINETYNOICENOISENORTHSHUTTLE
NORSHUTTLENORTHSTHUTTELNORTHSTHUTTLENOSIGNOSING	NOSRHUTLEOPOSITEOPPOSITEOTTOPASSINFPASSINGPASSINPLESEPLEASEPOSSBILEPOSSIBLEPREFERED	PREFERRED
PROCCEDING
PROCEEDING	PROCEEDTOz
PROCEED TOPSSINGQHNr   QUANTASQANTASQUATARIQATARIRADRRADARREADBACKz	READ BACKRECOMEND	RECOMMEND	REQEUSTED	REQUESTEDREQEUSTREQUESTREQUESTEREQUSTED	REQUSTING
REQUESTINGRESETING	RESETTINGRESRTICTIONRESTRICTIONRESTRCTIONSRESTRICTIONSRESTRISCTIONRIGHRIGHTROGGERROGERROGRESESIONDECISIONSHOTRCUTSHORTCUT
SINAGAPORE	SINGAPORE	SINGAPOORSKYRAVEL	SKYTRAVELSKYTAVEL	SMARTWING
SMARTWINGS	SPEEDBIRG	SPEEDBIRDSQUAKING	SQUAWKINGSQUAKSQUAWKSQUWAKSTANDARSTANDARDSTANDARTSTARTUPzSTART UP	SUFFICIAN
SUFFICIENT	SWTICHING	SWITCHINGTAHNKTHANK	TECHNICAN
TECHNICIANTELAVIVzTEL AVIVzTHAT'TzTHAT'STHIRDYTHIRTYTHOSUANDTHOUSAND	THOUASAND	TIMECHECKz
TIME CHECKTRAFICTRAFFICTRESHOLD	THRESHOLD
TURBULENCE
UNREADABLEUNTILVACATEz	VECTOR INWHICHWITHWIENWIZZAIR	WONDERFUL)	TUBULENCE
TURBOLENCE
TURUBLENCE
UNREADEBLEUNTILLUTNILVACATVECTORINWCHICHWIHTWINEWIZZIAR	WONDREFUL))z	AIR SPACEAIRSPACE)z	CLEAR FORzCLEARED FOR)z
DESCENT TOz
DESCEND TO)zDESCENT FLIGHTzDESCEND FLIGHT)zDESCEND RATEzDESCENT RATE)z	STAND BYESTANDBYtextsilence_sym
breath_sym	noise_symforeign_symunintelligble_sympartial_symunknown_symc                  C   s  t dsJ dddlm} td}	td}
td}td}td	}td
}td}td}td}td}td}td}td}td}td}td}td}td}td}|	d| } |
d| } | dd} | dd} | dd} | dd} | dd } |d| } |d!| } |d!| } |d!| } |d"| } |d#| } |d$d% | } |d&d% | } |d!| } | d'd(} |d| } |d| } t| } |||||d)}g }|  D ]d}||v r|||  q|t	v r|| q|t
v r|t
|  q|tv r.|d*g |  q|tv r;|t|  q| rR|||d+d*d,d  q||  qd*|} |d krw| d-|} |d!| } |d!| } n| d-|} ||| } ||| } |d kr|d!| } n||| } |d kr||| } ||| } | d.d} |d*| } |  } d*d/d0 |  D } tD ]}| |d |d1 } q| S )2N	num2wordszDPlease run 'pip install num2words' for number to word normalization.r   )r  z([\w\.\+])(\[|\()z(\]|\))([\w\+])z\[comment_\|].*?\[\|_comment]z3\[background_speech_\|](.*?)\[\|_background_speech]z\[noise_\|](.*?)\[\|_noise]z\[speaker_\|](.*?)\[\|_speaker]z	\.([0-9])z	([0-9])\.z
([A-Z]+\+)z
(\+[A-Z]+)z(\w+\+)z(\+\w+)z#\(((\w*|\s*|\+)*)\(((\w*|\s*)*)\)\)z([0-9])([A-Za-z])z([A-Za-z])([0-9])z\[NO_ENG_\|](.*?)\[\|_NO_ENG]z\[CZECH_\|](.*?)\[\|_CZECH]z-\[UNINTELLIGIBLE_\|](.*?)\[\|_UNINTELLIGIBLE]  +z\1 \2z](z] (   °r8   ?   ¨   ´'z\1z. \1z\1 .c                 S      |  d S N   grouplowermr5   r5   r6   <lambda>      z text_normalize.<locals>.<lambda>c                 S   r  r  r  r  r5   r5   r6   r    r  6rahapraha)z[ehm_]z[noise]z[unintelligible]z[background_speech]z	[speaker] -,z[NO_ENG]+c                 S   s    g | ]}|t v rt | n|qS r5   )	FIX_TYPOS)r<   wr5   r5   r6   
<listcomp>  s     z"text_normalize.<locals>.<listcomp>r  )r   r  recompilesubreplacerC   splitappendUNKNOWN_ABBREVIATIONSABBREVIATIONSINDIVIDUALLY_PRONOUNCEDr@   upperPHONETIC_ALPHABETisdigitstripCOLLAPSE_WORDS) r  r  r  r  r  r  r  r  r  BRACKET_PADDING_PATTERN1BRACKET_PADDING_PATTERN2COMMENT_PATTERNBACKGROUND_SPEECH_PATTERNNOISE_PATTERNSPEAKER_PATTERNDECIMAL_NUMBER_PATTERNNUMBER_DECIMAL_PATTERNPHONETIC_INTERRUPTED_PATTERN1PHONETIC_INTERRUPTED_PATTERN2INTERRUPTED_PATTERN1INTERRUPTED_PATTERN2ABBREVIATION_PATTERNSPLIT_NUMERIC_ALPHASPLIT_ALPHA_NUMERICNO_ENG_PATTERNCZECH_PATTERNUNINTELLIGIBLE_PATTERNWHITESPACE_PATTERNsimple_replaceresultr  pairr5   r5   r6   text_normalizeZ  s   






















$



r  PIATATPI)
air_groundgroundairc                 C   s:   ||j  |_| jd|d t|j f 7  _| | d S )Nz_%06d_%sd   )startdurationidSPEAKER_TO_ID_SUFFIXspeakerr  )supervisionssegmentend_timer5   r5   r6   finish_segment  s    r+  r8   <unk>r2   
output_dirc	                 C   s  t | } |  sJ d|  |durt |}|jddd t| ddd d}	t|	d	ks1J g }
g }td
}ddlm	} ||	ddD ]}|j
dv rnt|dd}t| d }W d   n1 shw   Y  nt| }| |jd tdd  }| std|  qFt|}|
| d}|dD ]}|D ]}|jdkrtd|j  qt|jd }|D ]}|jdkrtd|j  qt|jd }|rt||| d}|j }|dkrq|}d|v sd|v rd}nd |v rd!}n	d"|v rd#}nq|d$d%}d&D ]	}||d}qt||||||||d'}|dkr+qt d(|j
|d) f |j!|ddd*|||jd+ |"d,| d-d.	}q|rXt||| d}qqqFt#$|
}t%&|}t'||\}}t(|| |dur|)|d/  |)|d0  ||d1S )2a  
    Returns the manifests which consist of the Recordings and Supervisions

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param silence_sym: str, silence symbol
    :param breath_sym: str, breath symbol
    :param noise_sym: str, noise symbol
    :param foreign_sym: str, foreign symbol. when set to None, will output foreign words
    :param partial_sym: str, partial symbol. When set to None, will output partial words
    :param unintelligble_sym: str, unintellible symbol. When set to None, will output unintelligble words
    :param unknown_sym: str, unknown symbol
    :return: The RecordingSet and SupervisionSet with the keys 'audio' and 'supervisions'.
    zNo such directory: NTr   z*.trsc                 S   s   | j S )N)name)pr5   r5   r6   r    s    z"prepare_uwb_atcc.<locals>.<lambda>)keyia
  r  r   )tqdm	Preparing)desc)zACCU-80UXVVzACCU-7NqzYvzACCU-PhR5OjzACCU-JaeNLHz
TWR-XgqNSkcp1250)encodingz#</Turn></Section></Episode></Trans>audio_filenamee2_zNo such file: z
.//SectionTurnzUnexpected tag: endTimeSynctimer8   [air_|]
[ground_|]r  [air]r!  [ground]r   z][z] [)r<  z[|_air]r=  z
[|_ground]r>  r?  )r  r  r  r  r  r  r  zuwb-atcc_%s_%06dr"  Englishtyper  )rA  	orig_text)	r%  recording_idr#  r$  channellanguager  r'  customz"uwb_atcc_supervisions_all.jsonl.gzz uwb_atcc_recordings_all.jsonl.gz)
recordingsr(  )*r   is_dirr"   sortedgloblenr  r  	tqdm.autor1  stemr(   ET
fromstringr)   parsegetrootattribr#   r$   warningr
   	from_filer  findalltagfloatr+  tailr  r  r  r   r%  r  r   from_recordingsr   from_segmentsr   r	   to_file)r2   r-  r  r  r  r  r  r  r  	trs_filesrG  r(  r  r1  tfroot
audio_path	recordinglast_segmentsectionturnr*  syncr;  r  rB  r'  labelrecording_setsupervision_setr5   r5   r6   prepare_uwb_atcc  s   









P



ri  )r   F)Nr8   r8   r8   r,  r,  r,  r,  ).__doc__r&   r$   r  r,   r:   xml.etree.ElementTreeetreeElementTreerN  pathlibr   typingr   r   r   r   r   lhotser   r	   lhotse.audior
   r   lhotse.supervisionr   r   lhotse.utilsr   r   r   r   boolr7   rC   r  r  r  r  r  r  strr  r&  r+  ri  r5   r5   r5   r6   <module>   s   	
"
	
	
 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~                 	  
                                               !  "  #  $  %  &  '  (  )  *  +  ,  -  .  /  0  1  2  3  4  5  6  7  8  9  :  ;  <  = 
 L
 	
