o
    æS™i'¸  ã                   @   sF  d Z ddlZddlZddlZddlmZ ddlmZ ddlm	Z	 ddl
mZ ddlmZmZmZmZmZ ddlmZ dd	lmZmZmZmZmZ dd
lmZmZmZmZmZ dZe dgd ƒe dgd ƒe dgƒe dgd ƒe dgd ƒe dgd dg ƒe dgd dgd  ƒe dgd ƒe dgƒe dgd ƒe dgd ƒe dgd ƒe dgƒe dgd ƒe dgdgd  ƒdœZ!i dd“dd“d d!“d"d#“d$d%“d&d'“d(d)“d*d+“d,d+“d-d.“d/d0“d1d2“d3d4“d5d%“d6d7“d8d9“d:d;“Z"g d<¢Z#G d=d>„ d>ƒZ$e$ƒ Z%	?	@dpdAedBee& dCefdDdE„Z'		@	@dqdFedGee dHee& dIee& dCee(eeef f f
dJdK„Z)dLedMe*fdNdO„Z+dPefdQdR„Z,dSedTe*dMe*fdUdV„Z-dWe(fdXdY„Z.eG dZd[„ d[ƒƒZ/d\e(dCee/ fd]d^„Z0dCee/ fd_d`„Z1dae(fdbdc„Z2ddedee/dCe3fdfdg„Z4dhee( diedjee/ dCefdkdl„Z5dhee( dmedCeeef fdndo„Z6dS )ra‡  
This script downloads and prepares the data directory for the Santa Barbara
Corpus of Spoken American English.

The Santa Barbara Corpus of Spoken American English is based on a large body of
recordings of naturally occurring spoken interaction from all over the United
States. The Santa Barbara Corpus represents a wide variety of people of
different regional origins, ages, occupations, genders, and ethnic and social
backgrounds. The predominant form of language use represented is face-to-face
conversation, but the corpus also documents many other ways that that people use
language in their everyday lives: telephone conversations, card games, food
preparation, on-the-job talk, classroom lectures, sermons, story-telling, town
hall meetings, tour-guide spiels, and more.

The Santa Barbara Corpus was compiled by researchers in the Linguistics
Department of the University of California, Santa Barbara. The Director of the
Santa Barbara Corpus is John W. Du Bois, working with Associate Editors Wallace
L. Chafe and Sandra A. Thompson (all of UC Santa Barbara), and Charles Meyer
(UMass, Boston). For the publication of Parts 3 and 4, the authors are John W.
Du Bois and Robert Englebretson.

If you use the corpus or our data preparation scripts, please cite the following:
@misc{dubois_2005,
  author={Du Bois, John W. and Chafe, Wallace L. and Meyer, Charles and Thompson, Sandra A. and Englebretson, Robert and Martey, Nii},
  year={2000--2005},
  title={{S}anta {B}arbara corpus of spoken {A}merican {E}nglish, {P}arts 1--4},
  address={Philadelphia},
  organization={Linguistic Data Consortium},
}
@inproceedings{maciejewski24_interspeech,
  author={Matthew Maciejewski and Dominik Klement and Ruizhe Huang and Matthew Wiesner and Sanjeev Khudanpur},
  title={Evaluating the {Santa Barbara} Corpus: Challenges of the Breadth of Conversational Spoken Language},
  year=2024,
  booktitle={Proc. Interspeech 2024}
}
é    N)Údeepcopy)Ú	dataclass)Úinf)ÚPath)ÚDictÚListÚOptionalÚTupleÚUnion)Útqdm)Ú	RecordingÚRecordingSetÚSupervisionSegmentÚSupervisionSetÚfix_manifests)ÚPathlikeÚfastcopyÚis_module_availableÚresumable_downloadÚsafe_extractz3https://www.openslr.org/resources/155/SBCSAE.tar.gzÚSpanishé   ÚFrenché   ÚGreeké
   ÚGermanÚLatiné   é   é$   é<   ÚJapaneseé>   ÚItalian)ÚSBC004ÚSBC006ÚSBC010ÚSBC012ÚSBC015ÚSBC025ÚSBC027ÚSBC031ÚSBC033ÚSBC034ÚSBC036ÚSBC037ÚSBC047ÚSBC057ÚSBC058zmetro St.L. ILzSaint Louis MOzmiddle Wes MOÚMissourizS.E.Texas TXzSouth East TexaszSouth Alabama mostly ALzAndalusia AlabamazSouth FLzSouth Bay FloridazWalnut Cre CAzWalnut Creek CAzSan Leandr CAzSan Leandro CAzBoston/Santa Fe MA/NMzBoston/Santa Fe	MA/NMzBoston/New Mexico MA/NMzMillstad ILzMillstadt ILzCleveland/San Francisco OH/CAzCleveland/San Fransisco	OH/CAzJamesville WIzJanesville WIzFalls Church/Albuquerque VA/NMzFalls Church/Albuquerque	VA/NMzSouthern FloridazMassachusetts MAÚMassachusettszNew Zealand n/azNew Zealandz
French n/aÚFrance)ÚSBC020ÚSBC021r+   ÚSBC028c                   @   s   e Zd Zdd„ Zddd„ZdS )ÚDummy_Spk_Iteratorc                 C   s
   d| _ d S )NéÕ   )Úind)Úself© r>   úI/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/sbcsae.pyÚ__init__p   s   
zDummy_Spk_Iterator.__init__ÚSBCXXX_Xc                 C   sN   | j d | _ d | d¡dd … ¡}| d¡s| d¡rd}| j d›d|› S )Né   Ú_ÚXÚAUDÚUNKÚ04d)r<   ÚjoinÚsplitÚ
startswith)r=   ÚspkÚnamer>   r>   r?   Únexts   s
   zDummy_Spk_Iterator.nextN)rA   )Ú__name__Ú
__module__Ú__qualname__r@   rM   r>   r>   r>   r?   r:   o   s    r:   Ú.FÚ
target_dirÚforce_downloadÚreturnc                 C   sž   t | ƒ} | d }|jddd | d }| d }| ¡ r&t d|› d¡ |S tt||d t |¡}t	||d	 | 
¡  W d
  ƒ |S 1 sHw   Y  |S )a+  
    Download and untar the dataset.

    :param: target_dir: Pathlike, the path of the directory where the SBCSAE
        dataset will be downloaded.
    :param force_download: bool, if True, download the archive even if it already exists.
    :return: The path to the directory with the data.
    ÚSBCSAET©ÚparentsÚexist_okzSBCSAE.tar.gzz.sbcsae_completedzSkipping download because z exists.)ÚfilenamerS   )ÚpathN)r   ÚmkdirÚis_fileÚloggingÚinfor   ÚSBCSAE_TAR_URLÚtarfileÚopenr   Útouch)rR   rS   Ú
corpus_dirÚtar_pathÚcompleted_detectorÚtarr>   r>   r?   Údownload_sbcsae~   s    

þürg   rc   Ú
output_dirÚgeolocationÚomit_realignmentsc                 C   s‚  t | tƒr	t| ƒ} t |tƒrt|ƒ}| d }t dd„ | d¡D ƒ¡}t|ƒdkr1t d|› ¡ | d }t	|ƒ\}}i }	|rDt
| |ƒ}	g }
| d }tt| d	¡ƒd
ƒD ]}t|||ƒD ]}|
 |¡ q\qTt|
ƒdkrst d|› ¡ g }|
D ]c}|jdk r™||j }td|jd ƒ}t||t|d |jƒd}n|}|j|	v r´|	|j d d |	|j d d dœ|_t ||j jtƒrÌt||j jƒdk sÌ|jtv rÕ||j jd |_| |¡ qwt |¡}
t||
ƒ\}}
|dur
t |tƒrõt|ƒ}|jddd | |d ¡ |
 |d ¡ ||
dœ}|s?tt|j ƒ|
ƒ\}}t||ƒ\}}t||ƒ\}}| |d ¡ | |d ¡ ||dœ|¥}|S )as  
    Prepares manifest for SBCSAE dataset.

    :param: corpus_dir: Path to the root where SBCSAE data was downloaded. It
        should be called SBCSAE. There is no consistent formatting between
        releases of the data. Check script comments for details if using an
        existing corpus download rather than Lhotse's download script.
    :param: output_dir: Root directory where .json manifests are stored.
    :param: geolocation: Include geographic coordinates of speakers' hometowns
        in the manifests.
    :param: omit_realignments: Only output original corpus segmentation.
    :return: The manifests.
    ÚWAVc                 s   s    | ]}t  |¡V  qd S ©N)r   Ú	from_file)Ú.0Úpr>   r>   r?   Ú	<genexpr>¸   s   € 

ÿz!prepare_sbcsae.<locals>.<genexpr>z*.wavr   zNo .wav files found in ÚdocsÚTRNz*.trnz*Collecting and normalizing transcripts ...zNo supervisions found in g{®Gáz”?g{®Gáz„?)ÚstartÚdurationrB   )ÚlatÚlonr   NTrV   zsbcsae_recordings.jsonl.gzzsbcsae_supervisions.jsonl.gz)Ú
recordingsÚsupervisionsz(sbcsae_supervisions_asr_aligned.jsonl.gzz)sbcsae_supervisions_diar_aligned.jsonl.gz)Úasr_supervisionsÚdiar_supervisions)!Ú
isinstanceÚstrr   r   Úfrom_recordingsÚglobÚlenr]   ÚwarningÚgenerate_speaker_map_dictsÚgenerate_geolocationsr   ÚlistÚ_filename_to_supervisionsÚappendrt   Úrecording_idÚmaxrs   r   ÚminÚspeakerÚcustomÚchannel_idsÚ
bad_stereoÚchannelr   Úfrom_segmentsr   r[   Úto_fileÚapply_aligned_stmsÚids)rc   rh   ri   rj   Ú	audio_dirrw   Údoc_dirÚspk2gen_dictÚspk2glob_dictÚ
spk_coordsrx   Útrn_dirro   ÚsupervisionÚsupervisions_ÚsÚs_recoÚ	new_startÚs_Ú	manifestsry   rz   rC   r>   r>   r?   Úprepare_sbcsaeœ   s’   


ÿ
ÿÿ

ý
þÿ





ÿÿÿþýrŸ   Úcorpusr•   c                 C   s  t dƒstdƒ‚ddlm} ddlm} |  d¡}|dd}i }tt|ƒd	ƒD ]Æ}t	|ƒ¸}|D ]­}	|	 
¡  d
¡}
t|
ƒdk r@q0|
d dv }|
d dv }|rW|sW|
d d }n|sj|
d d |
d  }t ||¡}nq0d|v rÍz1| dd¡\}}| d¡}| d¡}g }t||ƒD ]\}}| |j|› d|› d dd ¡ q‰W n6 tyÌ   | d
d¡\}}g }| d¡D ]}| |j|› d|› d dd ¡ q¶Y nw |j|d dd g}|||
d < q0W d   ƒ n1 sèw   Y  q'dd„ | ¡ D ƒ}i }|D ]}||v r
|| ||| < qû|S )NÚgeopyz>geopy package not found. Please install... (pip install geopy)r   )Ú	geocoders)Ú	Nominatimzdocs/Part_*/speaker.tblÚmyapplication)Ú
user_agentz"Generating speaker geolocations...ú,é   é   )Ú ú?z, United Statesú ú/ú	rB   )Útimeoutz, c                 S   s    i | ]\}}|  d ¡d |“qS )rC   r   )rI   )rn   rš   Únr>   r>   r?   Ú
<dictcomp>B  s     z)generate_geolocations.<locals>.<dictcomp>)r   ÚImportErrorr¡   r¢   Úgeopy.geocodersr£   Úrglobr   rƒ   ra   ÚstriprI   r   Úannotation_correctionsÚgetÚzipr…   ÚgeocodeÚ
ValueErrorÚitems)r    r•   r¢   r£   ÚspeakersÚ
geolocatorr–   rK   ÚfÚlÚvalsÚempty_hometownÚempty_stateÚlocÚorig_locÚ	hometownsÚstatesÚcoordsÚhrš   ÚcountryÚspknum2spk_nameÚspk_coords_r>   r>   r?   r‚     sn   ÿ




ÿÿÿÿýßÿ€#
€r‚   r“   c                 C   sì  t ƒ }t ƒ }t ƒ }dD ]C}| | d }| ¡  d¡D ]3}d|v rM| dd¡}t dd|¡}| d	¡d
 d d… }||vr@g ||< ||| vrM||  |¡ qqdD ]c}| | d }| ¡  d¡D ]S}d|vrgq`| dd¡}| d¡d d… \}}	}
|	 dd¡ ¡  d¡d
  d¡d }	|
 ¡ }
|
s“d }
|dv r˜q`|| D ]}|
||d |	 < |d |	 ||d |	 < qœq`qQdD ]¬}g }| | d }| ¡  d¡D ]"}d|v rêt dd|¡}| d¡d
  ¡  d¡d }	| |	|g¡ qÈg }| | d }| ¡  d¡D ]%}d|vrqú| d¡d d… \}}	}
|	 ¡  d¡d }	| |	||
g¡ qút||ƒD ]=\}}|d |d ks?J |d › d|d › ƒ‚|d ||d d |d  < |d d |d  ||d d |d  < q%q·dD ]}d ||< |||< qf||fS )N)ÚPart_1ÚPart_2ÚPart_4zsegment.tblÚ
zspeaker:z 0z	0zsbc0?([0-9]{3})\s.*zSBC\1r­   éÿÿÿÿr¨   zspeaker.tblr¦   z
0163,Dan,mz
0166,Dan,Mr   z (extra-corpus)r©   r«   r¬   r   )Ú0069Ú0091Ú0092Ú0097rC   )ÚPart_3z != r   rB   )Ú
SBC006_ALLÚ
SBC008_ALLÚSBC012_MANYÚ
SBC020_AUDÚSBC021_MANYÚSBC023_MANYÚ
SBC025_AUDÚ
SBC026_AUDÚSBC027_MANYÚ
SBC027_AUDÚSBC028_BOTHÚ
SBC030_AUDÚ
SBC038_AUDÚSBC053_RADIOÚ
SBC054_AUDÚSBC054_MANYÚ
SBC055_AUD)	ÚdictÚ	read_textrI   ÚreplaceÚreÚsubr…   Úupperr·   )r“   r”   r•   Úspk_num_to_reco_idsÚpartrY   ÚlineÚreco_idÚspk_numrL   ÚgenÚrecoÚseg_listÚspk_listÚseg_infoÚspk_infoÚspk_keyr>   r>   r?   r   J  s|   €ø
"ÿþò€
,ÿýr   rY   r”   c                 C   s¬  | j  d¡d }| jdd}g }| dd¡}| dd¡}| d	d
¡}|dkr.| dd¡}nâ|dkr@| dd¡}| dd¡}nÐ|dkrX| dd¡}| dd¡}| dd¡}n¸|dkrp| dd¡}| dd¡}| dd ¡}n |d!kr‚| d"d#¡}| d$d%¡}nŽ|d&krŽ| d'd(¡}n‚|d)krš| d*d+¡}nv|d,kr²| d- d.gd/ ¡d.¡}| d0d1¡}n^|d2krÐ| d3d4¡}| d5d6¡}| d7d8¡}| d9d:¡}n@|d;krâ| d<d-¡}| d=d>¡}n.|d?kr%| d@dA¡}| dBdC¡}| dDdE¡}| dFdG¡}| dHdI¡}| dJdK¡}| dLdM¡}| dNdO¡}| dPdQ¡}| dRdS¡}në|dTkr2| dUdV¡}nÞ|dWkr]| dXdY¡}| dFdG¡}| dZd[¡}| d\d]¡}| d^d_¡}| d`da¡}n³|dbkrp| dcdd¡}| dedf¡}n |dgkr­| dhdi¡}| djdk¡}| dldm¡}| dndo¡}| dpdq¡}| drds¡}| dtdu¡}| dvdw¡}| dxdy¡}nc|dzkrð| d{d|¡}| d}d~¡}| dd€¡}| dd‚¡}| dƒd„¡}| d…d†¡}| d‡dˆ¡}| d‰dŠ¡}| d‹dŒ¡}| ddŽ¡}n |dkr| dd‘¡}| d’d“¡}| d`da¡}| d”d•¡}n|d–krR| d—d˜¡}| d™dš¡}| d›dœ¡}| d}d~¡}| ddž¡}| dŸd ¡}| d¡d¢¡}| d£d¤¡}| d¥d¦¡}| d§d¨¡}n¾|d©krw| dªd«¡}| d¬d­¡}| d®d¯¡}| d°d±¡}| d²d³¡}n™|d´kr| dµd¶¡}| dFdG¡}| d·d¸¡}n€|d¹kr¯| dºd±¡}| dFdG¡}| d»d¼¡}| d½d¾¡}na|d¿kr¥| dÀdÁ¡}| dÂdÃ¡}| dÄdÅ¡}| dÆdÇ¡}| dÈdÉ¡}| dÊdË¡}| dÌdÍ¡}| dÎdÏ¡}| dÐdÑ¡}| dÒdÓ¡}| dÔdÕ¡}| dÖd×¡}| dØdÙ¡}| dÚdÛ¡}| dÜdÝ¡}| dÞdß¡}| dàdá¡}| dâdã¡}| dädå¡}| dædç¡}| dèdé¡}| dêdë¡}| dìdí¡}| dîdï¡}| dðdñ¡}| dòdó¡}| dôdõ¡}| död÷¡}| dødù¡}| dúdû¡}| düdý¡}| dþdÿ¡}| d½d ¡}| dd¡}| dd¡}| dd¡}| dd¡}| d	d
¡}nk|dkrë| dd¡}| dd¡}| dd¡}| dd¡}| dd¡}| dd¡}| dd¡}| dcdd¡}n%|dkr| dd¡}| dd‘¡}| dd¡}| d d!¡}| d"d#¡}nö|d$kr2| d%d&¡}| d'd(¡}nÞ|d)krB| d*d+¡}nÎ|d,krR| d-d.¡}n¾|d/kré| d0d1¡}| d2d3¡}| d4d5¡}| d6d7¡}| d8d9¡}| d:d;¡}| d<d=¡}| d>d?¡}| d@dA¡}| dBdC¡}| dDdE¡}| dFdG¡}| dHdI¡}| dJdK¡}| dLdM¡}| dNdO¡}| dPdQ¡}| dRdS¡}| dTdU¡}| dVdW¡}| dXdY¡}| dZd[¡}| d\d]¡}| d^d_¡}| d`da¡}| dbdc¡}| ddde¡}| dfdg¡}| dhdi¡}| djdk¡}| dldm¡}| dndo¡}| dpdq¡}| drds¡}| dtdu¡}| dvdw¡}| dxdy¡}| dîdz¡}| d{d|¡}| d}d~¡}| dd€¡}| dd‚¡}| dƒd„¡}| d…d†¡}| d‡dˆ¡}| d‰dŠ¡}| d‹dŒ¡}| ddŽ¡}| dd¡}| d‘d’¡}n'|d“kr9| d0d”¡}| d•d–¡}| d—d˜¡}| d™dš¡}| d›dœ¡}| ddž¡}| dŸd ¡}| d¡d¢¡}| d£d¤¡}n×|d¥krQ| d¦d§¡}| d¨d©¡}n¿|dªkra| d«d¬¡}n¯|d­krq| d®d¯¡}nŸ|d°kr¯| d±d²¡}| d	d³¡}| d½d¾¡}| d´dµ¡}| d¶d·¡}| d¸d¹¡}| dd€¡}na|dºkrß| d»d¼¡}| d½d¾¡}| d¿dÀ¡}| dÁdÂ¡}| dÃdÄ¡}n1|dÅkr÷| dÆdÇ¡}| dÈdÉ¡}n|dÊkrú| dËdÌ¡}| dÍdÎ¡}| dÏdÐ¡}| dÐdÑ¡}t dÒdÓ|¡}t dÔdÕ|¡}| dÖd×¡}| dØdÙ¡}| dÚdÛ¡}| dÜdÝ¡}t dÞdß|¡}| dàdá¡}| dâdã¡}| dädå¡}| dædç¡}| dèdé¡}| dédê¡}| dëdì¡}t dídî|¡}| dïdð¡}| dñdò¡}| dódô¡}| dõdö¡}| d÷dø¡}| dùdú¡}| dûdü¡}| dýdþ¡}| dÿd ¡}| dd¡}| dd¡}| dd¡}n|dkr| dd	¡}| d
d¡}d}d}| d-¡D ]¸}|dkr%q| d¡s3| d¡r5q| d¡rB| d¡ d|v rJq| 	¡ }t dd|¡}t dd|¡}| 	¡  d¡}	t
|	ƒdkr‰|	d/d … \}
}dd„ |	d d/… D ƒ\}}n6t
|	ƒdk	rt
|	d  ¡  d¡ƒdkrÅ|	dd … \}
}dd„ |	d  d¡d d/… D ƒ\}}|	d }núdd„ |	d d/… D ƒ\}}|	d/  d¡d }t d|¡rú|}
d |	d/  d¡dd … ¡}nÅd}
|	d/ }n¾t
|	ƒd/k	rl|	d  ¡  d¡}t
|ƒdk	rqdd„ |d d/… D ƒ\}}t
|ƒd/k	r<|d/ }
|	d }nƒ|	d  d¡d }t d|¡	rd|}
d |	d  d¡dd … ¡}n[d}
|	d }nS| d¡}t d|d ¡	r½t d|d ¡	r½dd„ |d d/… D ƒ\}}t d|d/ ¡	r°|d/ }
d |dd … ¡}nd}
d |d/d … ¡}nq|d k	rÉd!}n	|d"k	rÒd#}d$|v 	rÚq|
 	¡  d%¡ ¡  ¡ }
|
d&v 	rîq|
d'k	r÷d(}
|

rt d)d|
¡}
|
}|› d*t|d+ ƒd,›d*t|d+ ƒd,›d*|› }t|ƒ\}}d-|v 
rOt| d-¡ƒD ]	}tt| ƒ}
q6d
|v 
rLd.|› }n|}nd
|v 
rad.| d/¡d › }|d* | }||v
r~|d0k
r~d ||< t |¡||< ||v 
rŒ|| }|| }nt |¡}d }t d1|¡
r±| t||||| ddg||||d2	¡ |
rÒ|d d3k
rÀd}|d d-k
rÒ| d/¡d }q|S (4  NrQ   r   Úlatin1)Úencodingõ   Â’ú'úr©   ú ÚcÚSBC002z(TSK z(TSK) r%   ÚKATEÚKATHYz	sen~oritau	   seÃ±oritaÚSBC005z
good_/god/Úgoodz(H)@>z(H) @>z[@@ <@Mm@>]z[@@ <@ Mm @>]r&   z/pub/Úpubz<WH@@@@ (H) @@WH>z<WH @@@@ (H) @@ WH>z[2(H)2]1z[2(H)2]ÚSBC007z3\000000000 000000000 MARY: 1182.90 1186.92	        z
1182.90 1186.92	MARY:   z(YAWN0z(YAWN)ÚSBC008z[<X Go]=dX>z[<X Go]=d X>r'   z366.87 366.87z366.16 366.87r(   rÎ   z807.02 807.92	FRANK:  	.. Mhm.r   ÚMONTOYAÚMONTOYOÚSBC013z	[8<@She8]z
[8<@ She8]z[2(H) cou_ couch@>2]z[2(H) cou_ couch @>2]z	[4<@No=4]z
[4<@ No=4]zVOX2]zVOX>2]ÚSBC014z\000000000 000000000 z<@he thoughtz<@ he thoughtr)   z%243.055	244.080	KEN:	(H)] the little,z%243.465	244.670	KEN:	(H)] the little,z urch things.zchurch things.z2(H]=2z2(H)=2z 0.000000e+00Úez0m=,zum=,Ú0eopleÚpeopleÚ0idÚdidz
X 0ne %thozX uh line %thozand 0t [was]zand it [was]z0t was likezit was likeÚSBC016z/sed ai/zsed aiÚSBC017za	and names the] na=me,zand names the] na=me,z	[2I mean2z
[2I mean2]zno2.zno.Ú	0rganismsÚ	organismsÚ0ttleÚlittleÚSBC018Ú0fÚifz7129.916	130.324	LINDSEY:	Yeah.
129.915	130.325		[Mhm.]
z&129.915	130.325	LINDSEY:	[Mhm.] Yeah.
ÚSBC019zcello_(/cheller/)Úchellerz(sigh)z(SIGH)z<F<VOX> Mo=mz<F<VOX Mo=mz@@[3@=3z@@[3@=3]z[#5Jasonz[5#Jasonz[20nh2]z[2Unh2]zDraw 0nzDraw onÚ0oesÚDoesz0=kayzO=kayr7   z(COUGh)z(COUGH)z(throat)z(THROAT)z	S-  0emonzS- demonz 0.000000E+00ÚEznow 0mznow umzuh  0szuh iszbut  0nz	but uh inz	i- % 0t'sz
i- uh it'sÚ0rettyÚprettyzAUD:	YzX:	YÚSBC022z(h)z(H)z0.000000e+00ze-z	0ne thingzuh one thingÚSBC023ÚJANICDÚJANICEzNORA?ÚNORAzSUE?ÚSUEz2(SNIFF2z	2(SNIFF)2z
[<Xbu=tX>]z[<X bu=t X>]z
<or did itz<Q or did itzx>5]zX>5]Ú0nlyzuh onlyz[50r5]z[5Or5]ÚSBC024z >ENV: z>ENV:	z 0.000000irstÚFirstz2[causez[2causez 0oesÚdoesz0id]zdid]r*   z<ot,z<% not,Ú0mselfÚhimselfÚSBC026zdoes_(/uz/)Ú0ngoingÚongoingzAUD:	<XzX_2:	<Xr+   z142.870	144.790 :z142.870	144.790z451.510	452.130 :z451.510	452.130z 0oingÚdoingzAUD:	.. [Wez	X:	.. [WezAUD:	... LiquidzX_1:	... LiquidzAUD:	AddzX_2:	AddzAUD:	     [zX_3:	     [zAUD1:	... OnezX_4:	... Onez
AUD2:	[Onez	X_5:	[OnezAUD:	...X [Xz	X_6:	X [XzAUD1:	Eightz
X_7:	EightzAUD2:	... [@zAUD:	... [@zAUD3:	    [FourzX_8:	    [FourzAUD:	... SevenzX_9:	... SevenzAUD1:	.. <L2zX_10:	.. <L2zAUD2:	        [zX_11:	       [zAUD:	... <L2zX_12:	... <L2zAUD1:	... [EzX_13:	... [EzAUD2:	    [<L2zX_14:	    [<L2zAUD1:	     zX_15:	     zAUD2:	... TherezX_16:	... TherezAUD1:	[PullzX_17:	[Pullz	AUD2:	Youz	X_18:	YouzAUD:	[<Xz	X_19:	[<XzAUD:	... SolidzX_20:	... SolidzAUD:	.. HydrogenzX_21:	.. HydrogenzAUD:	.. OxygenzX_22:	.. Oxygenz
AUD:	.. [<zX_23:	.. [<zAUD:	       zX_24:	       zAUD:	They'rezX_25:	They'rez	AUD:	 XXXz
X_26:	 XXXzAUD:	... NozX_27:	... NozX_28:	<Xz
AUD:	ThrowzX_29:	ThrowzAUD:	HotterzX_30:	HotterzAUD:	.. LiquidzX_31:	.. Liquidz	AUD:	 Didz
X_32:	 DidzAUD:	XzX_33:	Xr9   z482.610	484.010	JILL_S: z482.610	484.010	JILL_S:	z	<@Oh[2=@>z<@ Oh[2= @>z	 0.000000r«   zi 0fzi- ifz0f wezif wezth- 0t'sz
th- that'sz0t'szit'sÚSBC029z96.230	98.240	>ENV: z96.230	98.240	>ENV:	z(H )z<0h=,z<% Oh=,zknowX>]zknow X>]Ú0verheatingÚoverheatingÚSBC030ÚDANNYÚBRADLEYzAUD:	YeszX:	Yesr.   z	13548.02 z1354.802r/   z#1558.463	1558.906		[thought he was,z#1558.906	1558.923		[thought he was,ÚSBC038zAUD:	... What'szX_2:	... What'sz
AUD:	... Uz
X_3:	... UzAUD:	... How farzX_2:	... How farzAUD:	<X QuitezX_4:	<X Quitez	AUD:	Yeahz	X_5:	Yeahz
AUD:	Aboutz
X_6:	AboutzAUD:	... ThatzX_7:	... ThatzAUD:	.. <X OhzX_8:	.. <X OhzAUD:	... How longzX_3:	... How longz	AUD:	<X @z	X_3:	<X @z	AUD:	Eachz	X_2:	EachzAUD:	The waterzX_2:	The waterzAUD:	[RightzX_9:	[RightzAUD:	... It'szX_9:	... It'sz
AUD:	[Perpz
X_9:	[PerpzAUD:	[2perpzX_9:	[2perpz
AUD:	[3Thez
X_9:	[3ThezAUD:	[4RightzX_9:	[4RightzAUD:	Oh yeahzX_9:	Oh yeahz
AUD:	[6Nowz
X_9:	[6NowzAUD:	with thezX_9:	with thezAUD:	[That-zX_9:	[That-zAUD:	[SpinningzX_9:	[SpinningzAUD:	[2YeahzX_9:	[2YeahzAUD:	[3XzX_9:	[3Xz	AUD:	[4<Xz	X_9:	[4<XzAUD:	And that'szX_9:	And that'szAUD:	[SozX_9:	[SozAUD:	[2that'szX_9:	[2that'szAUD:	that's3zX_9:	that's3zAUD:	WezX_9:	WezAUD:	.. AllzX_9:	.. AllzAUD:	.. What'szX_10:	.. What'szAUD:	... ArezX_3:	... ArezAUD:	The restzX_11:	The restzAUD:	... Y'allzX_12:	... Y'allzAUD:	... IszX_13:	... Isz	X_13:	[<Xz
AUD:	[YeahzX_13:	[YeahzAUD:	... What arezX_13:	... What areÚAUD_2rE   zAUD:	[What arezX_13:	[What arezAUD:	... SayzX_14:	... SayzAUD:	[what'szX_14:	[what'szAUD:	.. HmmzX_14:	.. HmmzAUD:	[3WhenzX_14:	[3Whenz
AUD:	[It'szX_15:	[It'szAUD:	... HavezX_16:	... HavezAUD:	ThankszX_17:	ThankszAUD:	... WowzX_13:	... WowÚSBC040zX:	... What'szAUD:	... HezX_2:	... Hez
AUD:	[Whatz
X_3:	[WhatzAUD:	.. Isn'tzX_4:	.. Isn'tzAUD:	ClaibornezX_4:	ClaibornezAUD:	... HowzX_4:	... HowzAUD:	.. HowzX_4:	.. HowzAUD:	.. ThezX_5:	.. ThezAUD:	... YeszX_6:	... YesÚSBC043z< HI any nights HI>z<HI any nights HI>ÚANNETTEÚANETTEÚSBC048z<@in San[2taz<@ in San[2taÚSBC052z~Janine	 saidz~Janine saidÚSBC054z<VOX Ugh VOX >z<VOX Ugh VOX>zX:	XzAUD_2:	[Tha-]zX_3:	[Tha-]zAUD_3:	[Tha-]zX_4:	[Tha-]zAUD:	[@rhinozX_5:	[@rhinoÚSBC055zin spite ..	of havingzin spite .. of havingzAUD:	... BeatricezX:	... BeatricezAUD:	How waszX_2:	How waszAUD:	CanzX_3:	CanzAUD_2:zX_4:ÚSBC056z@@@2]	[3@@@@3]z@@@2] [3@@@@3]z(sniff)z(SNIFF)r2   zHane-makikomiz<L2 Hane-makikomi L2>Úsenseiz<L2 sensei L2>ÚipponÚIpponz<L2 Ippon L2>z
gi([^a-z])z<L2 gi L2>\1zMakikomi([^-])z<L2 Makikomi L2>\1z
Hane-goshiz<L2 Hane-goshi L2>zSode-makikomiz<L2 Sode-makikomi L2>Úshiaiz<L2 shiai L2>Úrandoriz<L2 randori L2>z
Sode([^-])z<L2 Sode L2>\1ÚUkemiz<L2 Ukemi L2>zHa-jimez<L2 Ha-jime L2>z
Ude-garamiz<L2 Ude-garami L2>zHane-uchi-mataz<L2 Hane-uchi-mata L2>zUchi-<X mother X>z	Uchi-mataz<L2 Uchi-mata L2>zHande-maki- <L2 z<L2 Hande-maki- z
Hane([^-])z<L2 Hane L2>\1z%Sode-maki[komi]z<L2 %Sode-maki[komi] L2>z
Tsuri-komiz<L2 Tsuri-komi L2>z	Uchi-komiz<L2 Uchi-komi L2>zO-uchiz<L2 O-uchi L2>ÚGoshiz<L2 Goshi L2>z
Uchi]-mataz<L2 Uchi]-mata L2>ÚKomiz<L2 Komi L2>zTani-otoshiz<L2 Tani-otoshi L2>zHane-maki][2komi=z<L2 Hane-maki][2komi= L2>zMakikomi-wazaz<L2 Makikomi-waza L2>ÚSeoiz<L2 Seoi L2>Úukez<L2 uke L2>ÚSBC059z[<F 3And youz<F [3And youzhour[6=6 F>]zhour[6=6] F>ÚEnglishz77.200	77.540 :	(H)z000000000 000000000 z	0.00 0.00r­   z&and in his pamphlet the Liber Arbetrioú +z\t+r¨   c                 S   ó   g | ]}t | ¡ ƒ‘qS r>   ©ÚfloatÚrstrip©rn   Útimer>   r>   r?   Ú
<listcomp>ï  ó    z-_filename_to_supervisions.<locals>.<listcomp>r   rB   c                 S   ó   g | ]}t |ƒ‘qS r>   ©rP  rR  r>   r>   r?   rT  ó  ó    rÏ   c                 S   rN  r>   rO  rR  r>   r>   r?   rT  ö  rU  z[A-Z]+:c                 S   rV  r>   rW  rR  r>   r>   r?   rT    rX  z[0-9]+\.[0-9]+c                 S   rN  r>   rO  rR  r>   r>   r?   rT    rU  z[2<L2 Zocalo.z[2<L2 Zocalo L2>2].z[You're <L2 outre mer L2].z[You're <L2 outre mer L2>].z $ ú:)z>ENVÚENVz>MACz>DOGz>HORSEz>CATz>BABYz#READÚWALTz^[^A-Z]rC   iè  Ú07r¾   zEnglish-ú-r8   ú[A-Za-z])	Úidr†   rs   rt   r   ÚtextÚlanguager‰   ÚgenderÚr)ÚstemrI   rç   rè   rH   ré   rê   rJ   Úlstripr´   r   rQ  Ú	fullmatchrë   ÚintÚ_parse_raw_transcriptÚrangeÚcountrM   Úlang_iteratorsÚdummy_spk_iteratorÚsearchr…   r   )rY   r”   r•   rï   Úlinesrx   Ú
spk_bufferÚlang_bufferrî   ÚfieldsÚ	spk_fieldÚ	raw_transrs   ÚendÚspk_field_candidateÚtimesishrI   Úutt_idr`  Úlang_tagrC   Únew_langr÷   r‰   rb  r>   r>   r?   r„   ¡  sF  þþ
þ


þ







ÿþ" *$
&ÿ<




÷ÿ€r„   Ú
transcriptc              
   C   sâ  |   dd¡} |   dd¡} |   dd¡} |   dd¡} |   d	d
¡} |   dd¡} |   dd¡} |   dd¡} |   dd¡} |   dd¡} |   dd¡} |   dd¡} t dd| ¡} t d| ¡}|D ]}|   |t dd|¡¡} qWt d| ¡}|D ]}|   |t dd|¡¡} qmt dd| ¡} t dd| ¡} t d d| ¡} t d!d| ¡} t d"d#| ¡} t d$d#| ¡} t d%d| ¡} t d&d| ¡} |   d'd(¡} |   d)d(¡} |   d*d+¡} |   d,d(¡} |   d-d¡} |   d.d¡} |   d/d¡} |   d0d(¡} |   d1d(¡} t d2d| ¡} t d3d| ¡} |   d4d¡} |   d5d¡} |   d6d¡} |   d7d¡} t d8d#| ¡} t d9d(| ¡} t d:d;| ¡} |   d<d=¡} t d>d?| ¡} t d@d+| ¡} t dAdB| ¡} t dCdD| ¡} |   dEd+¡  dEd+¡} t dFdG| ¡} t dFdG| ¡} t dHdI| ¡} t dJd| ¡} t dKd| ¡} |   dLdM¡} |   dNdO¡} |   dPdQ¡} t dRd| ¡} t dSd| ¡} t|  dT¡ƒdUkr©t dV|  dT¡dW ¡s¿t|  dX¡ƒdUkrÂt dV|  dX¡dY ¡rÂdZ}nd}|   d+d[¡} |   dd\¡} d]| v rí|t d^d#t d_d#| ¡¡ }|  dTd`¡  dXda¡}| |fS )bNÚ0hÚohzs@sozs- sozla@terÚlaterzyou@.zyou @.z[N=]ÚNz[2C2]=ÚCz[MM=]ÚMMz[I=]ÚIz(YELL)z<yell>rC   r]  ú=r©   ú%z\[([2-9]?)([A-Z])+\1\]z\2z\([^a-z@ ]*\)z[^\[\]]z\[[^a-z@ ]+\]z[^\(\)]z<<[^a-z@ ]+>>z<<[^a-z@ ]+z[^a-z@ ]+>>z<[^a-z@ ]+>z<[^a-z2 ]*[^2 ]([ <])z\1z([ >])[^a-z2 ]*[^a-z 2]>z\[[2-9]?z[2-9]?\]z(Hx)r«   z(hx)z(@Hx)ú@z(COUGH COUGH)z(SNIFFú(ú)z< z >z[^A-Za-z-]-+z\.\.+ú+ú&ú#Ú*z!([A-Za-z])rM  zX+rD   zon@,zon @,z([a-z-])@([a-z])z\1\2z@+z(^| )@([^ ])z @ \2z([^ ])@( |$)z\1 @ z@ @z(^| )X([ ,.?']|$)z	\1<UNK>\2zX-($| )z<UNK>\1z^ z $z .rQ   z ,r¦   z ?rª   z^\. z^\.$z<L2rB   r^  r   zL2>rÏ   rþ   z<LAUGH>z<YELL>ÚL2z(<L2|L2>)(?!.*(<L2|L2>)).*$z.*?(<L2|L2>)r¾   rc  )rè   ré   rê   Úfindallr   rI   rm  )rz  Úparen_matchesÚparen_matchÚbrack_matchesÚbrack_matchrx  r>   r>   r?   rh  b  s¦   ÿÿÿÿ
ýrh  c                   @   sB   e Zd ZU eed< eed< eed< eed< eed< dZeed< dS )	Ú
StmSegmentr†   r‰   rs   rt  r`  Ú1r   N)rN   rO   rP   r|   Ú__annotations__rP  r   r>   r>   r>   r?   r‘  Þ  s   
 r‘  Údatac                 C   s†   |   d¡}g }|D ]7}|sq	| ¡   ¡ }|d d… \}}}dd„ |dd… D ƒ\}}	d |dd … ¡}
| t||||	|
|d¡ q	|S )NrÎ   r   c                 S   rV  r>   rW  rR  r>   r>   r?   rT  ò  rX  z"parse_stm_file.<locals>.<listcomp>r§   r«   )r†   r‰   rs   rt  r`  r   )rI   r´   rH   r…   r‘  )r”  rn  Ústm_segmentsrî   rq  rï   r   r‰   rs   rt  r`  r>   r>   r?   Úparse_stm_fileè  s(   
úÿr–  c                 C   s*   dd l }|j | ¡}| ¡  d¡}t|ƒS )Nr   zutf-8)Úurllib.requestÚrequestÚurlopenÚreadÚdecoder–  )ÚurlÚurllibÚresponser”  r>   r>   r?   Úretrieve_stm_file  s   rŸ  r`  c                 C   s   |   ¡ } |  ¡ } | S rl   )r´   Úlower)r`  r>   r>   r?   Únorm_txt  s   r¡  Úseg1Úseg2c                 C   sN   t | j|jƒ}t| j|jƒ}t d|| ƒ}| j| j |j|j  | }|| S )Nç        )r‡   rs   rˆ   rt  )r¢  r£  rs   rt  ÚintersectionÚunionr>   r>   r?   Úcompute_iou  s
   r§  Úrecording_idsrx   Úaligned_stm_segsc                    s(  t dƒstdƒ‚ddlm} t dƒstdƒ‚ddlm} t|ƒ}i }| D ]}|ƒ ||< q$t|dd	D ]}|||j |j	|j
…< q2t|d
d	D ]Ë‰ d}	tt‡ fdd„|ˆ j ˆ j	|	 ˆ j
|	 … ƒƒ}
|
ss|ˆ j ˆ j	|	 ˆ j
|	 … }
t}d }d }d}|
D ]I}|tˆ jƒt|jjƒdd}|d }||k r¡|}|}|}tˆ |jƒ}||krÆ|jj d¡d ˆ j d¡d krÆtˆ |jƒ}||krÆ|}|}|}q}ˆ j d¡d |jj d¡d kr|d |d   krèdkrn qF|dk sötˆ jƒdk r|jj	ˆ _	|jj
|jj	 ˆ _|jjˆ _|ˆ j  |¡ qF|S )NÚintervaltreezLintervaltree package not found. Please install... (pip install intervaltree)r   )ÚIntervalTreeÚjiwerzEjiwer package not found. Please install... (pip install jiwer==3.0.4))ÚcerzBuilding interval tree...)ÚdesczApplying STM...g       @c                    s   | j jˆ jkS rl   )r”  r‰   )Úx©rš   r>   r?   Ú<lambda>A  s    zapply_stm.<locals>.<lambda>r¤  T)Úreturn_dictr­  rC   rB   ÚsubstitutionsÚ	deletionsg      à?r   )r   r±   rª  r«  r¬  r­  r   r   r†   rs   rt  rƒ   Úfilterr   r¡  r`  r”  r§  r‰   rI   r   rt   Úremove)r¨  rx   r©  r«  r­  ÚssetÚper_rec_itsÚridÚstm_segÚcollarÚmatching_segmentsÚbest_cerÚbest_cer_resÚbest_matching_segÚbest_iouÚmatching_segÚcer_resÚcer_valÚcurrent_iour>   r°  r?   Ú	apply_stm  sv   ÿÿ
þÿÿÿ"€$"

€rÅ  Úprocessed_supervisionsc                 C   s0   t dƒ}t dƒ}t| ||ƒ}t| ||ƒ}||fS )Nzfhttps://raw.githubusercontent.com/domklement/SBCSAE_alignments/main/alignments/stm/aligned_for_asr.stmzghttps://raw.githubusercontent.com/domklement/SBCSAE_alignments/main/alignments/stm/aligned_for_diar.stm)rŸ  rÅ  )r¨  rÆ  Úaligned_for_asr_stmÚaligned_for_diar_stmÚasr_supÚdiar_supr>   r>   r?   r   v  s   ÿÿr   )rQ   F)NFF)7Ú__doc__r]   ré   r`   Úcopyr   Údataclassesr   Úmathr   Úpathlibr   Útypingr   r   r   r	   r
   r   Úlhotser   r   r   r   r   Úlhotse.utilsr   r   r   r   r   r_   Úiterrk  rµ   rŒ   r:   rl  Úboolrg   r|   rŸ   ræ   r‚   r   r„   rh  r‘  r–  rŸ  r¡  rP  r§  rÅ  r   r>   r>   r>   r?   Ú<module>   sî    $ñÿþýüûúùø	÷
öõôóòñðïþÿþ
ý üÿþýü
ûs;W   D|		
ÿþý
üZÿÿ
þ