o
    2wi$                     @   s  d Z ddlZddlmZ ddlmZ ddlmZmZm	Z	m
Z
 ddlmZ ddlmZ ddlmZmZ dd	lmZ dd
lmZmZ ddlmZmZmZ g dZ		ddedede	e dedeee
eef f f
ddZdededeeee f fddZdefddZ dd Z!dd Z"dS ) aV  
This is a part of English HUB4 corpus.
It contains Broadcast News data, i.e. audio and transcripts of TV news.
We currently support the following LDC packages:

1997 English Broadcast News Train (HUB4)
  Speech       LDC98S71
  Transcripts  LDC98T28

This data is not available for free - your institution needs to have an LDC subscription.
    N)chain)Path)DictListOptionalUnion)sliding_window)$validate_recordings_and_supervisions)	RecordingRecordingSet)fix_manifests)SupervisionSegmentSupervisionSet)Pathlikecheck_and_rglobrecursion_limit)z</timez<overlapz	</overlapF	audio_dirtranscripts_dir
output_dirabsolute_pathsreturnc           
         s  t | d}t |d}t fdd|D }td dd t||D }W d   n1 s/w   Y  ttd	d |D }ttd
d |D }	t	||	\}}	t
||	 |dur~t|}|jddd ||d  ||d  |	|d  |||	dS )a(  
    Prepare manifests for 1997 English Broadcast News corpus.
    We create three manifests: one with recordings, one with segments supervisions,
    and one with section supervisions. The latter can be used e.g. for topic segmentation.

    :param audio_dir: Path to ``LDC98S71`` package.
    :param transcripts_dir: Path to ``LDC98T28`` package.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :return: A dict with manifests. The keys are: ``{'recordings', 'sections', 'segments'}``.
    z*.sphz*.sgmlc                 3   s&    | ]}t j| rd nddV  qd S )N   )relative_path_depth)r
   	from_file).0pr    Z/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/lhotse/recipes/broadcast_news.py	<genexpr>3   s
    
z)prepare_broadcast_news.<locals>.<genexpr>i  c                 S   s   g | ]	\}}t ||qS r   )make_supervisions)r   r   rr   r   r   
<listcomp>;   s    z*prepare_broadcast_news.<locals>.<listcomp>Nc                 s       | ]}|d  V  qdS )sectionsNr   r   supsr   r   r   r   ?       c                 s   r#   )segmentsNr   r%   r   r   r   r   B   r'   T)parentsexist_okz&broadcast-news_recordings_all.jsonl.gzz$broadcast-news_sections_all.jsonl.gzz$broadcast-news_segments_all.jsonl.gz)
recordingsr$   r(   )r   r   from_recordingsr   zipr   from_segmentsr   from_iterabler   r	   r   mkdirto_file)
r   r   r   r   audio_paths
sgml_pathsr+   supervisions_listsection_supervisionssegment_supervisionsr   r   r   prepare_broadcast_news    s@   



r7   	sgml_path	recordingc                 C   s  t | }|d}g }g }d}t|ddD ]\}}t|jd }	|t|j d|d|j|	t	t|jd |	 dd	d|jd
 |jd |jd dd |dD ]}
|
j
D ]y}dd t|dD }|slq[g }g }t|D ]\}}td|}|t|d || qt|t|
jd  ttd||D ]/\\}}}|t|j d|d|j|t	|| dd	d|jd
 | |
jd |
jd d	 q|d7 }q[qVq||dS )zICreate supervisions for sections and segments for a given HUB4 recording.episoder   section	starttime_section03dendtimer   )ndigitslanguagetypeprogram)r;   rC   )idrecording_idstartdurationchannelrA   customturnc                    s.   g | ] t  rt fd dtD s qS )c                 3   s    | ]}  |V  qd S N
startswith)r   blr   r   r      s    z/make_supervisions.<locals>.<listcomp>.<genexpr>)lenanyEXCLUDE_BEGINNINGS)r   r   rO   r   r"      s    z%make_supervisions.<locals>.<listcomp>
zsec="?(\d+\.?\d*)"?      _segment04d   speakerspkrtype)	rD   rE   rF   rG   rH   rA   textrZ   gender)r$   r(   )	try_parsefind	enumeratefind_allfloatattrsappendr   rD   roundchildrenstrsplitgroup_lines_in_time_markerresearchgroupr-   r   strip)r8   r9   docr:   r5   text_supervisionstext_idxsec_idxr;   	sec_startrJ   childlinestimestextstime_markerr\   matchrF   endr   r   r   r    Z   sh   



,r    c                 C   s   zddl m} W n   tdz||  dW S  tyq   ddl}ddlm} |jd|  ddd		d
d 
dd }| $}|jd| d|j d|  dddd || dW  d    Y S 1 siw   Y  Y dS w )z
    Return a BeautifulSoup object created from an SGML file.
    If it runs into Unicode decoding errors, it will try to determine the file's encoding
    and use iconv to automatically convert it to UTF-8.
    r   )BeautifulSoupzVBefore running BroadcastNews data preparation, you should "pip install beautifulsoup4"zhtml.parserN)NamedTemporaryFilez	file -bi T)shellr\   ;zcharset= z	iconv -f z -t utf-8 -o  )r|   checkr\   )bs4rz   ImportError	read_textUnicodeDecodeError
subprocesstempfiler{   check_outputrh   replacerm   runnameread)r8   rz   r   r{   encodingfr   r   r   r^      s6   *r^   c                 C   s,   ddl m} tdd || dd dD dS )	zXThis is a helper for the situation when a <time> marker contains multiple lines of text.r   )groupbyc                 S   s$   g | ]\}}d  dd |D qS )r   c                 s   s    | ]}|  V  qd S rK   )rm   )r   rP   r   r   r   r      r'   z8group_lines_in_time_marker.<locals>.<listcomp>.<genexpr>)join)r   	is_markerrt   r   r   r   r"      s    z.group_lines_in_time_marker.<locals>.<listcomp>c                 S   s
   |  dS )Nz<timerL   rO   r   r   r   <lambda>   s   
 z,group_lines_in_time_marker.<locals>.<lambda>)keyrV   )	itertoolsr   rl   )
sgml_linesr   r   r   r   ri      s   
ri   c                    s   t  fddtD  S )aO  group([0,3,4,10,2,3], 2) => [(0,3), (4,10), (2,3)]

    Group a list into consecutive n-tuples. Incomplete tuples are
    discarded e.g.

    Source code by Brian Quinlan from:
    https://code.activestate.com/recipes/303060-group-a-list-into-sequential-n-tuples/

    >>> group(range(10), 3)
    [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
    c                    s   g | ]	} |d  qS rK   r   )r   ilstnr   r   r"      s    zgroup.<locals>.<listcomp>)r-   ranger   r   r   r   rl      s   rl   )NF)#__doc__rj   r   r   pathlibr   typingr   r   r   r   cytoolzr   lhotser	   lhotse.audior
   r   	lhotse.qar   lhotse.supervisionr   r   lhotse.utilsr   r   r   rS   boolrg   r7   r    r^   ri   rl   r   r   r   r   <module>   sF    
:
K"