o
    pri0                  	   @   sB  d dl Z d dlmZ d dlZejeZ	 eg dZdd e	 D Z
eg dZdd eD Zdd e	 D Zeg d	g d
g dg dg dg dg ddZdd e	 D Zdd e	 D ZeZg ZeD ]Zee qled dZdZdZdZed Zed ZG dd dejZdd ZG dd dejZ dS )    N)OrderedDict)f)	Afrikaansaf)Amharicam)Arabicar)Armenianhy)Assameseas)Asturianast)Azerbaijaniaz)
Belarusianbe)Bengalibn)Bosnianbs)	Bulgarianbg)Burmesemy)Catalanca)Cebuanoceb)Mandarin Chinesecmn_hans)Cantonese Chineseyue_hant)Croatianhr)Czechcs)Danishda)Dutchnl)Englishen)Estonianet)Filipinofil)Finnishfi)Frenchfr)Fulaff)Galiciangl)Gandalg)Georgianka)Germande)Greekel)Gujaratigu)Hausaha)Hebrewhe)Hindihi)	Hungarianhu)	Icelandicis)Igboig)
Indonesianid)Irishga)Italianit)Japaneseja)Javanesejv)Kabuverdianukea)Kambakam)Kannadakn)Kazakhkk)Khmerkm)Koreanko)Kyrgyzky)Laolo)Latvianlv)Lingalaln)
Lithuanianlt)Luoluo)Luxembourgishlb)
Macedonianmk)Malayms)	Malayalamml)Maltesemt)Maorimi)Marathimr)	Mongolianmn)Nepaline)Northern-Sothonso)	Norwegiannb)Nyanjany)Occitanoc)Oriyaor)Oromoom)Pashtops)Persianfa)Polishpl)
Portuguesept)Punjabipa)Romanianro)Russianru)Serbiansr)Shonasn)Sindhisd)Slovaksk)	Sloveniansl)Somaliso)Sorani-Kurdishckb)Spanishes)Swahilisw)Swedishsv)Tajiktg)Tamilta)Telugute)Thaith)Turkishtr)	Ukrainianuk)Umbunduumb)Urduur)Uzbekuz)
Vietnamesevi)Welshcy)Wolofwo)Xhosaxh)Yorubayo)Zuluzuc                 C      i | ]\}}||qS  r   .0kvr   r   /home/ubuntu/.cache/huggingface/modules/datasets_modules/datasets/google--fleurs/80cb68d1b4d319aefbd8ea302274d3950d95f6242f0742c1452d1545c80a2d5f/fleurs.py
<dictcomp>       r   )faf_zaam_etar_egas_inast_esaz_azbe_bybn_inbs_baca_esceb_phcmn_hans_cnyue_hant_hkcs_czcy_gbda_dkde_deel_gren_uses_419et_eefa_irff_snfi_fifil_phfr_frga_iegl_esgu_inha_nghe_ilhi_inhr_hrhu_huhy_amid_idig_ngis_isit_itja_jpjv_idka_gekam_kekea_cvkk_kzkm_khkn_inko_krckb_iqky_kglb_lulg_ugln_cdlo_lalt_ltluo_kelv_lvmi_nzmk_mkml_inmn_mnmr_inms_mymt_mtmy_mmnb_none_npnl_nlnso_zany_mwoc_from_etor_inpa_inpl_plps_afpt_brro_roru_rubg_bgsd_insk_sksl_sisn_zwso_sosr_rssv_sesw_keta_inte_intg_tjth_thtr_truk_uaumb_aour_pkuz_uzvi_vnwo_snxh_zayo_ngzu_zac                 C   s.   i | ]}t d |d dd p| |qS )_N)_FLEURS_LANG_SHORT_TO_LONGjoinsplit)r   r   r   r   r   r      s   . c                 C   r   r   r   r   r   r   r   r       r   )r   r   r   r#   r'   r)   r+   r1   r3   r7   r=   r?   rI   rK   rQ   rS   rY   rq   ry   r   r   r   r   r   r   )r	   r   r   r%   r-   r;   ri   rm   rs   r   r   r   r   r   r   r   )r   r   rE   r_   re   r   r   r   r   r   r   r   )r   r   r5   r9   rC   rM   r[   rk   ro   r   r   r   r   r   r   r   r   r   r   r   )r   r   rA   rG   r]   rw   r}   r   r   r   r   r   r   r   )r   r   r/   rO   rW   ra   rg   ru   r{   r   r   )r   r!   rU   rc   )western_european_weeastern_european_ee%central_asia_middle_north_african_cmnsub_saharan_african_ssasouth_asian_sasouth_east_asian_seachinese_japanase_korean_cjkc                 C   s    i | ]\}}|D ]}||qqS r   r   )r   r   r   ar   r   r   r   +        c                 C   s   i | ]	\}}t | |qS r   )_FLEURS_LONG_TO_LANGr   r   r   r   r   ,       allzFLEURS is the speech version of the FLORES machine translation benchmark, covering 2000 n-way parallel sentences in n=102 languages. zdata/{langs}/zaudio/{split}.tar.gzz{split}.tsvc                       s    e Zd ZdZ fddZ  ZS )FleursConfigzBuilderConfig for xtreme-sc                    s>   t t| j| jtdd| jd || _|| _|| _|| _d S )Nz2.0.0rO  )nameversiondescription)	superrP  __init__rQ  datasetsVersionrS  citationhomepage)selfrQ  rS  rX  rY  	__class__r   r   rU  C   s   


zFleursConfig.__init__)__name__
__module____qualname____doc__rU  __classcell__r   r   r[  r   rP  @   s    rP  c                 C   s   t | tttdS )N)rQ  rS  rX  rY  )rP  _DESCRIPTION	_CITATION_HOMEPAGE_URL)rQ  r   r   r   _build_configQ   s   re  c                   @   s>   e Zd ZdZdd eD Zdd Zdd Zdd	 Zd
d Z	dS )Fleursi  c                 C   s   g | ]}t |qS r   )re  )r   rQ  r   r   r   
<listcomp>]   s    zFleurs.<listcomp>c                 C   s   t }ttdtdtdtjddtdtdtjg ddtj|dtdtjtt dd
}tj	| j
jd t |d	| j
j| j
jd t d
S )Nint32stringi>  )sampling_rate)malefemaleother)names)
rP   num_samplespathaudiotranscriptionraw_transcriptiongenderlang_idlanguagelang_group_id
)rq  rr  )rS  featuressupervised_keysrY  rX  )_ALL_CONFIGSrV  FeaturesValueAudio
ClassLabellist_FLEURS_GROUP_TO_LONGkeysDatasetInfoconfigrS  rb  rY  rX  rc  )rZ  langsry  r   r   r   _info_   s.   


zFleurs._infoc           	         sL  g d}j jdkrdd |D }dd |D }nfdd|D }fdd|D } |} js8 |ni } fdd| D } |}tjtjj	|
d	d gt|
d	 |
d	|
d	d
dtjtjj|
dd gt|
d |
d|
dd
dtjtjj|
dd gt|
d |
d|
dd
dgS )N)traindevtestrN  c                        i | ]   fd dt D qS )c                       g | ]	}t j| d qS )r  rB  )	_DATA_URLformatr   r  rB  r   r   rg     rM  7Fleurs._split_generators.<locals>.<dictcomp>.<listcomp>_FLEURS_LANGr   r   r  r   r      rK  z,Fleurs._split_generators.<locals>.<dictcomp>c                    r  )c                    r  r  )	_META_URLr  r  r  r   r   rg     rM  r  r  r  r   r  r   r      rK  c                    "   i | ]}|t j jj|d gqS r  )r  r  r  rQ  r   rB  rZ  r   r   r         " c                    r  r  )r  r  r  rQ  r  r  r   r   r      r  c                    s$   i | ]\}}| fd d|D qS )c                    s   g | ]}  |qS r   )iter_archive)r   rp  
dl_managerr   r   rg     r   r  r   )r   rB  pathsr  r   r   r      s   $ r  )local_extracted_archivesarchive_iters
text_paths)rQ  
gen_kwargsr  r  )r  rQ  downloadis_streamingextractitemsrV  SplitGeneratorSplitTRAINgetlen
VALIDATIONTEST)	rZ  r  splits	data_urls	meta_urlsarchive_pathsr  r  
meta_pathsr   )r  rZ  r   _split_generators{   s@   

zFleurs._split_generatorsc                 C   s   i }dddd}|D ]=}t |tr|d}| d\}}}}	}
}}t| }t|||	t||| t|t	| t
t |d||< q
|S )Nr         )MALEFEMALEOTHERutf-8	)rP   rs  rr  ro  rt  ru  rv  rw  )
isinstancebytesdecodestriprB  _FLEURS_LANG_TO_GROUPintr  index_FLEURS_LANG_TO_LONGr  r  r  )rZ  linesru  datagender_to_idline_id	file_namers  rr  r>  ro  rt  
lang_groupr   r   r   	_get_data   s6   

	zFleurs._get_datac              	   c   s"   t |t |  krt |ksJ  J d}| jjdkr t}n| jjg}t||||D ]b\}}}}	t|dd}
|
 }| ||	}W d    n1 sMw   Y  |D ]9\}}|dd }||	 vrfqT|| }|d urut
j||nd }||d< || d|d	< ||fV  |d
7 }qTq,d S )Nr   rN  r  )encoding/r?  rp  )rp  r  rq  r  )r  r  rQ  r  zipopen	readlinesr  rB  r  osrp  rA  read)rZ  r  r  r  keyr  archive	text_pathlocal_extracted_pathru  fr  r  
audio_path
audio_fileaudio_filenameresultextracted_audio_pathr   r   r   _generate_examples   s4   (


zFleurs._generate_examplesN)
r]  r^  r_  DEFAULT_WRITER_BATCH_SIZEr{  BUILDER_CONFIGSr  r  r  r  r   r   r   r   rf  Z   s    -!rf  )!r  collectionsr   rV  logging
get_loggerr]  logger_FLEURS_LANG_TO_IDr  r@  sortedr  rL  r  r  _FLEURS_LONG_TO_GROUPr  	_ALL_LANGr{  r  appendrb  rc  rd  
_BASE_PATHr  r  BuilderConfigrP  re  GeneratorBasedBuilderrf  r   r   r   r   <module>   sF   	
	