o
    SiN                     @   s   d Z ddlZddlZddlmZ ddlmZ ddlmZm	Z	m
Z
 ddlmZ ddlmZmZ ddlmZ dd	lmZmZ dd
lmZ ddedefddZ			ddede	e dededeee
eef f f
ddZdS )u  
This recipe prepares data collected from radio streamed on the web. The data
have some metadata attached to them, including the geographic location of
broadcast, date and time of the recorded clip, as well as a unique station
identifier.

Obtaining the data
-----------------------------------------------------------
If you want to use this corpus please email: wiesner@jhu.edu

As the data are collected from radio stream, they cannot be broadly
disseminated or used for commercial purposes. In the email, include your
affiliated academic institution and the intended use for the data and we will
the data to you if it is indeed for non-commercial, academic purporses.

Description
------------------------------------------------------------
The data consist of ∼4000 hours of speech collected between
September 27, 2023 to October 1, 2023, in 9449 locations all over the world,
from 17171 stations. 

These data were used for Geolocation of speech in order to answer the question,
Where are you from? in the paper 

Where are you from? Geolocating Speech and Applications to Language
Identification, presented at NAACL 2024. Please read for a full descrption
and please cite as 
 
@inproceedings{foley2024you,
  title={Where are you from? Geolocating Speech and Applications to Language Identification},
  author={Foley, Patrick and Wiesner, Matthew and Odoom, Bismarck and Perera, Leibny Paola Garcia and Murray, Kenton and Koehn, Philipp},
  booktitle={Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)},
  pages={5114--5126},
  year={2024}
}
    N)partial)Path)DictOptionalUnion)tqdm)	RecordingRecordingSet)parallel_map)SupervisionSegmentSupervisionSet)Pathlike      ?sfmsdc                 C   s  | j d }|d }| dj}t| jjd}|d|  | d }tj||d}dg|_	g }d}	t
| }
t|
}W d    n1 sGw   Y  tdt| jdd	\}}t|d
d}t|dd}td| d }|d	}dd |dd D }|D ]?}t|d t|d }}|| }|d dv r||kr|t| d	td| d||t|dd|||||d dd q||fS )N   recosz.flac.zrecos.)recording_idr   zlat[^_]+_long[^_]+_lat longz#s_dur[0-9]+_(.*)_lat[^_]+_long[^_]+c                 S   s   g | ]	}t |d qS )hms)intstrip).0i r   H/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/recipes/radio.py
<listcomp>I   s    z1_make_reco_and_sups_from_file.<locals>.<listcomp>      )malefemaled   04   )dater   lonstation
est_gender)idr   startdurationchannelcustom)parentswith_suffixstemr   parentsuffixr   r   	from_filechannel_idsopenjsonloadresearchr   groupsplitfloatreplacegroupsappendr   round)r   r   
corpus_dir	audio_dirfname	chunk_idx	reco_filerecosupstotalfsegmentsr   r)   r*   
fname_valsr(   segr-   enddurr   r   r   _make_reco_and_sups_from_file3   sN   

"
rR   r'   rD   
output_dirmin_segment_durationnum_jobsreturnc              	   C   s  t | } | d}g g }}tt|d}|durt |nd}|jdddd t|d T}t|d 4}	tt	|||d	d
dD ]\}
}|
| |
D ]}|	
| qLqAt|jt|	jd}W d   n1 slw   Y  W d   |S W d   |S 1 sw   Y  |S )aI  
    Return the manifests which consist of recordings and supervisions
    :param corpus_dir: Path to the collected radio samples
    :param output_dir: Pathlike, the path where manifests are written
    :return: A Dict whose key is the dataset part and the value is a Dict with
        keys 'recordings' and 'supervisions'.
    zsegs/*/*.json)r   Ni  T)moder1   exist_okzradio_recordings.jsonl.gzzradio_supervisions.jsonl.gz)rU   z"Making recordings and supervisions)desc)
recordingssupervisions)r   rglobr   rR   mkdirr	   open_writerr   r   r
   writefrom_jsonl_lazypath)rD   rS   rT   rU   segment_filesr[   rZ   fun
rec_writer
sup_writerrJ   rI   sup	manifestsr   r   r   prepare_radioa   sJ   






rh   )r   )Nr   r'   )__doc__r9   r;   	functoolsr   pathlibr   typingr   r   r   r   lhotse.audior   r	   lhotse.parallelr
   lhotse.supervisionr   r   lhotse.utilsr   strr?   rR   r   rh   r   r   r   r   <module>   s4    $0