o
    Si                     @   s  d Z ddlZddlZddlZddlZddlm  mZ	 ddl
mZ ddlmZ ddlmZmZmZmZmZmZ ddlmZ ddlmZ ddlmZmZmZ dd	lmZ dd
lm Z  ddl!m"Z"m#Z#m$Z$ ddl%m&Z&m'Z'm(Z(m)Z) i dg ddg dddgddgddgdddgdg ddg ddg dd g d!d"g d#d$g d%d&g d'd(g d)d*g d+d,g d-d.g d/i d0g d1d2g d3d4g d5d6g d7d8g d9d:d:gd;d;gd<d<gd=d=gd>d>gd?d?gd@d@gdAdAgdBdBgdCdCgdDdDgdEdEgi dFdFgdGdGgdHdHgdIdIgdJdJgdKg dLdMg dNdOg dPdQg dRdSg dTdUg dVdWg dXdYg dZd[g d\d]g d^d_g d`dag dbg dcg ddg deg dfg dgg dhg dig djdkZ*dldm dnD dodm dpD dqdm drD dsdtdm duD dvdm dwD dxdm dyD dsdzdm d{D d|dm d}D d~dm dD dsdZ+g dZ,ddgZ-g dZ.			ddedee/ dee0 dee0 ddf
ddZ1					dde&dee& dee/ dee0 dee0 defddZ2G dd deZ3				dde&de0dee4 de/de/dee0ee# f fddZ5			ddeee6e6e0f  dee4 de/de/deeee6e6e0f   f
ddZ7dee& defddZ8	ddee& dee0 defddZ9dedee0ee3 f de$fddZ:dedee0ee3 f de$fddZ;								dde&dee& dee& dee0 dee0 de0dee4 de/dee/ dee0ee0eee$f f f fddZ<dS )ac  
The data preparation recipe for the AMI Meeting Corpus.

NOTE on data splits and references:

- The official AMI documentation (http://groups.inf.ed.ac.uk/ami/corpus/datasets.shtml) recommends
three different data partitions: scenario-only, full-corpus, and full-corpus-asr, based on the
task that the data is used for. We provide an argument `partition` which specifies which
partition is to be used.

- We use the latest version of the official annotations: ami_public_manual_1.6.2. This differs from
the Kaldi s5 and s5b recipes which use 1.6.1 (known to have alignment and annotation issues). We
get word-level annotations with time-marks and combine adjacent words into one segment if: (i) they
belong to the same speaker, and (ii) there is no pause between the words. (These supervisions can
later be modified to get larger super-segments based on the task)

NOTE on mic settings: AMI comes with 4 different microphone settings:

- ihm (individual headset microphone)
- sdm (single distant microphone)
- ihm-mix (mix-headset sum)
- mdm (multiple distant microphone)

These can be specified using the `mic` argument.
    N)defaultdict)Path)DictList
NamedTupleOptionalTupleUnion)tqdm)$validate_recordings_and_supervisions)AudioSource	RecordingRecordingSet)fix_manifests)normalize_text_ami)AlignmentItemSupervisionSegmentSupervisionSet)PathlikeSecondsadd_durationsresumable_downloadEN2001)EN2001aEN2001bEN2001dEN2001eEN2002)EN2002aEN2002bEN2002cEN2002dEN2003EN2003aEN2004EN2004aEN2005EN2005aEN2006EN2006aEN2006bEN2009)EN2009bEN2009cEN2009dES2002)ES2002aES2002bES2002cES2002dES2003)ES2003aES2003bES2003cES2003dES2004)ES2004aES2004bES2004cES2004dES2005)ES2005aES2005bES2005cES2005dES2006)ES2006aES2006bES2006cES2006dES2007)ES2007aES2007bES2007cES2007dES2008)ES2008aES2008bES2008cES2008dES2009)ES2009aES2009bES2009cES2009dES2010)ES2010aES2010bES2010cES2010dES2011)ES2011aES2011bES2011cES2011dES2012)ES2012aES2012bES2012cES2012dES2013)ES2013aES2013bES2013cES2013dES2014)ES2014aES2014bES2014cES2014dES2015)ES2015aES2015bES2015cES2015dES2016)ES2016aES2016bES2016cES2016dIB4001IB4002IB4003IB4004IB4005IB4010IB4011IN1001IN1002IN1005IN1007IN1008IN1009IN1012IN1013IN1014IN1016IS1000)IS1000aIS1000bIS1000cIS1000dIS1001)IS1001aIS1001bIS1001cIS1001dIS1002)IS1002bIS1002cIS1002dIS1003)IS1003aIS1003bIS1003cIS1003dIS1004)IS1004aIS1004bIS1004cIS1004dIS1005)IS1005aIS1005bIS1005cIS1006)IS1006aIS1006bIS1006cIS1006dIS1007)IS1007aIS1007bIS1007cIS1007dIS1008)IS1008aIS1008bIS1008cIS1008dIS1009)IS1009aIS1009bIS1009cIS1009dTS3003)TS3003aTS3003bTS3003cTS3003dTS3004)TS3004aTS3004bTS3004cTS3004d)TS3005aTS3005bTS3005cTS3005d)TS3006aTS3006bTS3006cTS3006d)TS3007aTS3007bTS3007cTS3007d)TS3008aTS3008bTS3008cTS3008d)TS3009aTS3009bTS3009cTS3009d)TS3010aTS3010bTS3010cTS3010d)TS3011aTS3011bTS3011cTS3011d)TS3012aTS3012bTS3012cTS3012d)TS3005TS3006TS3007TS3008TS3009TS3010TS3011TS3012c                 C   s&   g | ]}t | D ]}|d vr|qqS ))IS1002aIS1005dMEETINGS.0sessionmeeting r   F/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/ami.py
<listcomp>n   s    
r   )r/   r>   rC   rH   rM   rR   rW   ra   rf   rp   ru   r   r   r   r   r   r   r   r   r   r   r   r   r   r   c                 C      g | ]}t | D ]}|qqS r   r   r   r   r   r   r   s   
    
)r4   r\   r   r   r   c                 C   r   r   r   r   r   r   r   r   v   r   )r9   rk   r   r   r   traindevtestc                 C   r   r   r   r   r   r   r   r   {   
    
))r/   r>   rC   rH   rM   rR   rW   ra   rf   rp   ru   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r"   r$   r&   r(   r+   r   r   r   r   r   r   r   r   r   r   c                 C   r   r   r   r   r   r   r   r      s
    
)r4   r\   r   r   r   rz   r{   r|   r}   r   r   c                 C   r   r   r   r   r   r   r   r      r   )r9   rk   r   r   r   r   c                 C   r   r   r   r   r   r   r   r      r   )-r/   r>   rC   rH   rM   rR   rW   ra   rf   rp   ru   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r"   r$   r&   r(   r+   r   r   r   r   r   r   r   r   r   r   rk   r   r4   r   c                 C   r   r   r   r   r   r   r   r      r   )	r\   r   r   rz   r{   r|   r}   r   r   c                 C   r   r   r   r   r   r   r   r      r   )r9   r   r   r   )zscenario-onlyfull-corpuszfull-corpus-asr)ihmihm-mixsdmmdmmdm8-bfArray1Array2)0102030405060708Fhttp://groups.inf.ed.ac.uk/amir   
target_dirforce_downloadurlmicreturnc              	   C   s8  t tjt ddD ]}|dkrO|dv rdnd}t|D ].}| d| d}| d	| d
| }| d | d }	|	jddd |	| }
t||
|d qq|dkr}| d}| d	| d
| }| d | d }	|	jddd |	| }
t||
|d q|dkr| d}| d	| d
| }| d | d }	|	jddd |	| }
t||
|dd q|dkrt	D ]7}t
D ]2}| d| d| d}| d	| d
| }| d | d }	|	jddd |	| }
t||
|dd qqq|dkr| d}| d| d| }| d | d }	|	jddd |	| }
t||
|d qd S )NzDownloading AMI meetingsdescr   )r   r   r         z	.Headset-z.wavz/AMICorpusMirror/amicorpus/z/audio/wav_dbaudioTparentsexist_ok)filenamer  r  z.Mix-Headset.wavr  z.Array1-01.wav)r  r  
missing_okr  .-r  z	_MDM8.wavz&/AMICorpusMirror/amicorpus/beamformed//)r
   	itertoolschainfrom_iterabler   valuesrangemkdirr   
MDM_ARRAYSMDM_CHANNELS)r  r  r  r  itemheadset_nummwav_namewav_urlwav_dirwav_patharraychannelr   r   r   download_audio   s   



r4  r   annotationsc                 C   sh   t | } |s
| d n|}t| ||| td | r&td|  | S | d}t|||d | S )a2  
    Download AMI audio and annotations for provided microphone setting.

    Example usage:
    1. Download AMI data for IHM mic setting:
    >>> download_ami(mic='ihm')
    2. Download AMI data for IHM-mix mic setting, and use existing annotations:
    >>> download_ami(mic='ihm-mix', annotations='/path/to/existing/annotations.zip')

    :param target_dir: Pathlike, the path to store the data.
    :param annotations: Pathlike (default = None), path to save annotations zip file
    :param force_download: bool (default = False), if True, download even if file is present.
    :param url: str (default = 'http://groups.inf.ed.ac.uk/ami'), AMI download URL.
    :param mic: str {'ihm','ihm-mix','sdm','mdm','mdm8-bf'}, type of mic setting.
    :return: the path to downloaded and extracted directory with data.
    ami_public_manual_1.6.2.zipzDownloading AMI annotationsz/Skip downloading annotations as they exist in: z1/AMICorpusAnnotations/ami_public_manual_1.6.2.zip)r  )r   r4  logginginfoexistsr   )r  r5  r  r  r  annotations_urlr   r   r   download_ami   s   

r;  c                   @   sB   e Zd ZU eed< eed< eed< eed< eed< ee ed< dS )AmiSegmentAnnotationtextspeakergender
start_timeend_timewordsN)__name__
__module____qualname__str__annotations__r   r   r   r   r   r   r   r<    s   
 r<  upperannotations_dir	normalizemax_words_per_segmentmerge_consecutivekeep_punctuationc           ,         s  t | dr*dd l}|| }|j| jd W d    n1 s"w   Y  | j} i }i }t| d d 5}	t|	}
|
	 D ]#}|j
d }|D ]}||j
d f}|j
d ||< t|j
d	 ||< qIq@W d    n1 snw   Y  i }| d
  D ]p}|jd\}}}||f|vrtd| d| d q{|||f }|||f }|||f}g ||< t|1}	t|	}
|
	 D ]}|jdkrqt|j
d }t|j
d }|| ||f qW d    n1 sw   Y  q{i }| d  D ]}|jd\}}}||f|vrq|||f }|||f }|||f}||vrqg ||< t|\}	t|	}
|
	 D ]J}|jdks>d|j
vr@q0t|j
d }t|j
d }|ddrWdnd}|ddrbdnd}|| |||rt||j | n|jf q0W d    n	1 sw   Y  qtt}| D ]\}}|| }|D ]\ tt fdd|} t| ||}!|!D ]}"|"d d }#|"d d }$g }%|"D ]o}&t|#t|&d dd }'t|$t|&d dd }(t|(|' d!d"})t|&d# |d$}*t |*dkrq|)dkr,|r
t |&d# dkr,td%|d  d|d  d|d#  d&|# d'|$ d(|&d#  d) q|%t!|'|)|*d* q|r=dnd"d+d, |%D # }+|| t$|+|d |d d |#|$|%d- qqq|S ).Nz.zipr   )pathcorpusResourceszmeetings.xmlobservation	nxt_agentglobal_namer3  segmentsr   zNo speaker z found! Skipping annotation.segmenttranscriber_starttranscriber_endrB  w	starttimeendtimepuncF  truncz- c                    s   | d ko| d  kS )Nr      r   )rW  seg_end	seg_startr   r   <lambda>w  s    z'parse_ami_annotations.<locals>.<lambda>r^  r  ndigits>  sampling_rate   )rJ  Segment z	 at time r!  z has a word `z+` with zero or negative duration. Skipping.)startdurationsymbolc                 s   s    | ]}|j V  qd S N)rm  )r   rW  r   r   r   	<genexpr>  s    z(parse_ami_annotations.<locals>.<genexpr>)r=  r>  r?  r@  rA  rB  )%rF  endswithzipfileZipFile
extractallparentopenETparsegetrootattribintiterdirstemsplitr7  warningtagfloatappendgetr=  r   listitemsfiltersplit_segmentmaxroundminr   r   lenr   joinstripr<  ),rI  rJ  rK  rL  rM  rq  zglobal_spk_id
channel_idftreer   meet_idr>  local_idrS  filelocal_spkid_spkr3  keysegr@  rA  rB  wordmaybe_spacemaybe_hyphenr5  segs	spk_words	seg_wordssubsegmentssubsegrk  endword_alignmentsrW  w_startw_endw_durw_symbolr=  r   r_  r   parse_ami_annotations  s   











",
+r  rB  c                    sv    fddfdd}fddt || }dur0fdd	|D }d
d	 |D }t tdd |}|S )a   
    Given a list of words, return a list of segments (each segment is a list of words)
    where each segment has at most max_words_per_segment words. If merge_consecutive
    is True, then consecutive segments with less than max_words_per_segment words
    will be merged together.
    c                 3   sf    g }| D ] }|d |kr  r| | t|dkr|V  g }q| | qt|dkr1|V  d S d S )Nrc  r   )r  r  )sequencesepchunkval)rM  r   r   split_  s   

zsplit_segment.<locals>.split_c                    s   t | d}t|dk r|S  r nd}rN|d g}|dd  D ])}|d d d |d d krFt|d t| |krF|d | q"|| q"|}|S )Nr   ri  i r   r^  rc  r  r  extendr  )r  subsegsmax_segment_lengthmerged_subsegsr  )rK  rL  r  r   r   split_on_fullstop_  s   
z)split_segment.<locals>.split_on_fullstop_c                    sp   t | d}t|dk r|S |d g}|dd  D ]}t|d t|  kr0|d | q|| q|S )N,ri  r   r^  rc  r  )rT  r  r  r  )rK  r  r   r   split_on_comma_  s   
z&split_segment.<locals>.split_on_comma_Nc                    s*   g | ]}t | krt|n|gqS r   )r  r  )r   r  )rK  r  r   r   r     s    z!split_segment.<locals>.<listcomp>c                 S   s   g | ]	}|D ]}|qqS r   r   )r   sublistr+  r   r   r   r     s    c                 S   s   t | dkS )Nr   )r  )sr   r   r   rb    s    zsplit_segment.<locals>.<lambda>)r  r  )rB  rK  rL  rM  r  r  r   )rM  rK  rL  r  r  r   r    s   r  audio_pathsc              
   C   s   dd l }ddlm} |dd | }g }t| ddD ]W\}}|t|d }g }d}	tt|D ]*\}
}|t|}|j	dkrOt
d	| d
 d}	 n|td|
gt|d q2|	s`q|t|||j|j|j|j d qt|S )Nr   )groupbyc                 S   s
   | j d S )N)parts)pr   r   r   rb    s   
 z'prepare_audio_grouped.<locals>.<lambda>Processing audio filesr  Tr^  zSkipping recording z since it has a stereo channelFr  typechannelssourceidsourcesrh  num_samplesrl  )	soundfilecytoolzr  r
   r  	SoundFilerF  	enumeratesortedr  r7  r~  r  r   r   
samplerateframesr   from_recordings)r  sfr  channel_wavs
recordingssession_namechannel_pathsaudio_sfr  all_monoidx
audio_pathr  r   r   r   prepare_audio_grouped  sB   



	r  r  c              
   C   s   dd l }g }t| ddD ]6}|dkr|jd n|jd }|t|}|t|tdtt	|j
t|dg|j|j|j|j d	 qt|S )
Nr   r  r  r  r  r  r  r  )r  r
   r  r  rF  r  r   r   r  r'  r  r  r  r   r  )r  r  r  r  r  r  r  r   r   r   prepare_audio_single-  s*   

r  r  c                    s$   fdd D }g }t | ddD ]{}|jD ]u}|j\}||j|f}|d u r7td|j d|j d qt|D ]O\}}	t	|	j
|	j dd	}
|	j
|jkr`td
|j d| d| d q;|
dkr|t|j d| d| |jt|	jdd|
|d|	j|	j|	jd|	jid
 q;qqt|S )Nc                    s"   i | ]}|d  |d f | qS )r   ri  r   )r   r  r5  r   r   
<dictcomp>T  s    z+prepare_supervision_ihm.<locals>.<dictcomp>Preparing supervisionsr  "No annotation found for recording z (file )rf  rg  rj  r!  z8 exceeds recording duration. Not adding to supervisions.r   r  rd  Englishr  
r  recording_idrk  rl  r3  languager>  r?  r=  	alignment)r
   r  r  r  r  r7  r~  r  r  r   rA  r@  rl  r  r   r  r>  r?  r=  rB  r   from_segments)r  r5  annotation_by_id_and_channelrS  	recordingr  r3  
annotationseg_idxseg_inforl  r   r  r   prepare_supervision_ihmO  sV   



'r  c                 C   s  t t}| D ]\}}||d  | qg }t| ddD ]^}||j}|d u r4td|j  qt	dd |j
D rItd|j d qt|D ].\}}	|	j|	j }
|
dkr{|t|j d	| |j|	j|
|jd
|	j|	j|	jd|	jid
 qMqt|S )Nr   r  r  r  c                 s   s    | ]
}t |jd kV  qdS )r^  N)r  r  )r   r  r   r   r   ro    s    z,prepare_supervision_other.<locals>.<genexpr>z"More than 1 channels in recording z. Skipping this recording.r!  r  r  r  )r   r  r  r  r
   r  r  r7  r~  anyr  r  rA  r@  r  r   channel_idsr>  r?  r=  rB  r   r  )r  r5  annotation_by_idr  valuerS  r  r  r  r  rl  r   r   r   prepare_supervision_other  sD   
r  r   kaldidata_dir
output_dir	partitionnormalize_textc	              	      s6  t | } |  sJ d|  |tv sJ d| d|tv s'J d| d|dur6t |}|jddd td |sZ| d	  rH| d	 }n| d
  rS| d
 }ntd|  t |}t	|||||d}	td | }
|dv r|dkr{|

dn|

d}tt|}n(|dv r|dkr|

d}n|dkr|

d}n	|dkr|

d}tt||}td |dkrt||	nt||	}tt}t|  dD ]J| fdd}| fdd}t||\}}t|| |dur||d| d d   ||d| d! d   ||d"|< qt|S )#a  
    Returns the manifests which consist of the Recordings and Supervisions
    :param data_dir: Pathlike, the path of the data dir.
    :param annotations: Pathlike, the path of the annotations dir or zip file.
    :param output_dir: Pathlike, the path where to write the manifests.
    :param mic: str {'ihm','ihm-mix','sdm','mdm','mdm8-bf'}, type of mic to use.
    :param partition: str {'full-corpus','full-corpus-asr','scenario-only'}, AMI official data split
    :param normalize_text: str {'none', 'upper', 'kaldi'} normalization of text
    :param max_words_per_segment: int, maximum number of words per segment. If not None, we will split
        longer segments similar to Kaldi's data prep scripts, i.e., split on full-stop and comma.
    :param merge_consecutive: bool, if True, merge consecutive segments split on full-stop.
        We will only merge segments if the number of words in the merged segment is less than
        max_words_per_segment.
    :param keep_punctuation: bool, if True, keep punctuation marks.
    :return: a Dict whose key is ('train', 'dev', 'eval'), and the values are dicts of manifests under keys
        'recordings' and 'supervisions'.

    Example usage:
    1. Prepare IHM-Mix data for ASR:
    >>> manifests = prepare_ami('/path/to/ami-corpus', mic='ihm-mix', partition='full-corpus-asr')
    2. Prepare SDM data:
    >>> manifests = prepare_ami('/path/to/ami-corpus', mic='sdm', partition='full-corpus')
    zNo such directory: zMic z not supportedz
Partition NTr  zParsing AMI annotationszami_public_manual_1.6.2r6  z<No annotations directory specified and no zip file found in )rJ  rK  rL  rM  zPreparing recording manifests)r   r  r   z*Headset-?.wavz*Array?-0?.wav)r  r  r  r  z*Mix-Headset.wavr  z*Array1-01.wavr  z	*MDM8.wavzPreparing supervision manifestsr   c                       | j   v S rn  )r  xdataset_partspartr   r   rb        zprepare_ami.<locals>.<lambda>c                    r  rn  )r  r  r  r   r   rb    r  zami-_recordings_z	.jsonl.gz_supervisions_)r  supervisions)r   is_dirMICS
PARTITIONSr(  r7  r8  is_file
ValueErrorr  rglobr  r  r  r  r  r   dictr  r   r   to_file)r  rI  r  r  r  r  rK  rL  rM  r5  r0  r  r  supervision	manifests
audio_partsupervision_partr   r  r   prepare_ami  s   "



	





r  )Fr  r   )r   NFr  r   )rH  NFF)NFF)r  )NNr   r   r  NFF)=__doc__r#  r7  osurllib.requesturllibxml.etree.ElementTreeetreeElementTreerv  collectionsr   pathlibr   typingr   r   r   r   r   r	   	tqdm.autor
   lhotser   lhotse.audior   r   r   	lhotse.qar   lhotse.recipes.utilsr   lhotse.supervisionr   r   r   lhotse.utilsr   r   r   r   r   r  r   r)  r*  boolrF  r4  r;  r<  rz  r  r  r  r  r  r  r  r  r   r   r   r   <module>   s    	
 !"#$%&'()*+,-./01234@/
G
,
 

X
4
"
6
-	
