o
    Ni4                     @   sf  d Z ddlmZ ddlmZ ddlmZ ddlZddlmZ ddlm	  m
Z ddlmZ dZdZG d	d
 d
Zdd eddddg ddeddddg ddeddddg ddeddddg d ded!d"d#d$g d%ded&d'd(d)g d*ded+d,d-d.g d/ded0d1d2d3g d4ded5d6d7d8g d9df	D ZG d:d; d;ejjZG d<d= d=ejjZd>d? ZdS )@zopus dataset.    )absolute_import)division)print_functionN)loggingz
@inproceedings{Tiedemann2012ParallelData,
  author = {Tiedemann, J},
  title = {Parallel Data, Tools and Interfaces in OPUS},
  booktitle = {LREC}
  year = {2012}}
aF  
OPUS is a collection of translated texts from the web.

Create your own config to choose which data / language pair to load.

```
config = tfds.translate.opus.OpusConfig(
    version=tfds.core.Version('0.1.0'),
    language_pair=("de", "en"),
    subsets=["GNOME", "EMEA"]
)
builder = tfds.builder("opus", config=config)
```
c                   @   s   e Zd ZdZdd ZdS )
SubDatasetz<Class to keep track of information on a sub-dataset of OPUS.c                 C   sd   || _ || _|| _|| _t|}g }t|D ]\}}	||d d D ]	}
||	|
f q"q|| _dS )a:  Sub-dataset of OPUS.

    Args:
      name: `string`, a unique dataset identifier.
      description: `string`, a description of the dataset.
      homepage: `string`, homepage of the dataset.
      url: `string`, download url for the dataset.
      languages: `<list>(string)`, a list of supported languages.
       N)namedescriptionhomepageurlsorted	enumerateappendlanguage_pairs)selfr   r	   r
   r   	languagessorted_languagesr   idxsourcetarget r   V/home/ubuntu/.local/lib/python3.10/site-packages/tensorflow_datasets/translate/opus.py__init__7   s   

zSubDataset.__init__N)__name__
__module____qualname____doc__r   r   r   r   r   r   4   s    r   c                 C   s   i | ]}|j |qS r   r   ).0dsr   r   r   
<dictcomp>N       r    EMEAzOA parallel corpus made out of PDF documents from the European Medicines Agency.zhttp://opus.nlpl.eu/EMEA.phpz1http://opus.nlpl.eu/download.php?f=EMEA/v3/moses/)bgcsdadeelenesetfifrhuitltlvmtnlplptroskslsv)r   r	   r
   r   r   
JRC-AcquiszA collection of legislative text of the European Union and currently comprises selected texts written between the 1950s and now.z"http://opus.nlpl.eu/JRC-Acquis.phpz.http://opus.nlpl.eu/download.php?f=JRC-Acquis/TanzilzBA collection of Quran translations compiled by the Tanzil project.zhttp://opus.nlpl.eu/Tanzil.phpz3http://opus.nlpl.eu/download.php?f=Tanzil/v1/moses/)*amarazr#   bnbsr$   r&   dvr(   r)   far,   hahiidr.   jakokumlmsr2   nor3   r4   r5   rusdsosqr8   swtatgthtrttuguruzzhGNOMEzMA parallel corpus of GNOME localization files. Source: https://l10n.gnome.orgzhttp://opus.nlpl.eu/GNOME.phpz2http://opus.nlpl.eu/download.php?f=GNOME/v1/moses/)afr;   anangr<   ar_TNaraasastr=   az_IRbalbebemr#   bg_BGr>   bn_INbobrbrxr?   cacatcrhr$   csbcyr%   da_DKr&   de_CHr@   dzr'   r(   en_AUen_CAen_GBen_NZen_USen_ZAeor)   es_ARes_CLes_COes_CRes_DOes_ECes_ESes_GTes_HNes_MXes_NIes_PAes_PEes_PRes_SVes_UYes_VEr*   eurA   fa_IRr+   fofoor,   furfygagdglgngrgugvrB   herC   hi_INhrr-   hyiarD   igioisr.   it_ITrE   jbokakgkkkmknrF   krksrG   kylalglilor/   r0   maimgmimkrH   mnmrrI   ms_MYr1   musmynbnb_NOndsnenhnr2   nnnn_NOrJ   no_nbnqonrnsoocorospar3   psr4   pt_BRpt_PTquzr5   rK   rwsir6   r7   rM   rN   srsr_MEstr8   rO   szlrP   terQ   tg_TJrR   tktltl_PHtmprS   tr_TRtsrT   rU   ukrV   ur_PKrW   vivi_VNwaxhyiyozh_CNzh_HKzh_TWzuKDE4z3A parallel corpus of KDE4 localization files (v.2).zhttp://opus.nlpl.eu/KDE4.phpz1http://opus.nlpl.eu/download.php?f=KDE4/v2/moses/)\rZ   r<   r_   r`   rc   r#   r>   rf   rh   rj   rl   r$   rm   rn   r%   r&   r'   r(   rt   rx   r)   r*   r   rA   r+   r,   r   r   r   r   rB   r   rC   hner   hsbr-   r   rD   r   r.   rE   r   r   r   r   rF   rG   lbr/   r0   r   r   rH   r   rI   r1   r   r   r   r2   r   r   r   r   r   r3   r   r4   r   r5   rK   r   ser   r6   r7   r   r8   rP   r   rQ   rR   rS   r   rW   r   r   r   r   r   r   PHPzPA parallel corpus originally extracted from http://se.php.net/download-docs.php.zhttp://opus.nlpl.eu/PHP.phpz0http://opus.nlpl.eu/download.php?f=PHP/v1/moses/)r$   r&   r(   r)   r+   r,   r   r-   r.   rE   rF   r2   r3   r   r5   rK   r6   r7   r8   rS   twrX   r   UbuntuzZA parallel corpus of Ubuntu localization files. Source: https://translations.launchpad.netzhttp://opus.nlpl.eu/Ubuntu.phpz7http://opus.nlpl.eu/download.php?f=Ubuntu/v14.10/moses/)acerZ   akr;   r[   r\   r<   ar_SYaryr_   r`   r=   barb   rc   rd   berr#   bhor>   rf   rg   rh   ri   r?   buabynrj   cecebchrckbcorl   r$   rm   cvrn   r%   r&   de_ATde_DEdsbr@   rq   r'   r(   rr   rs   rt   ru   rv   rx   r)   ry   rz   r{   r|   r}   r~   r   r   r   r   r   r   r   r   r   r   r   r*   r   rA   fa_AFffr+   filr   r,   fr_CAfr_FRfrmfrpr   r   r   r   r   r   grcr   gucr   rB   hawr   rC   hilr   r   r   htr-   r   r   rD   r   r   r   r.   iurE   r   jvr   kabr   r   klr   r   rF   kokr   kshrG   kwr   r   r   r   r   lijlldlnr   r/   ltgr0   r   r   mhmhrr   miqr   rH   r   mor   rI   r1   r   r   nannapr   r   r   r   r2   nl_NLr   rJ   r   nyr   ojomr   r   r   pampapr3   pmspmyr   r4   r   r   qurmr5   romrK   r   sascscorL   r   shnshsr   r6   r7   smsmlsnrM   sonrN   r   r   r8   rO   syrr   rP   ta_LKr   tetrQ   rR   tir   r   tlhrS   trvr   rT   rU   r   rV   rW   vevecr   r   waewoxalr   r   r   rX   r   r   r   r   zza
OpenOfficez:A collection of documents from http://www.openoffice.org/.z%http://opus.nlpl.eu/OpenOffice-v2.phpz7http://opus.nlpl.eu/download.php?f=OpenOffice/v2/moses/)r&   r(   r)   r,   jpr8   OpenSubtitleszQA new collection of translated movie subtitles from http://www.opensubtitles.org/z+http://opus.nlpl.eu/OpenSubtitles-v2018.phpz=http://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/moses/)>rZ   r<   r#   r>   rh   r?   rj   r$   r%   r&   r'   r(   rx   r)   r*   r   rA   r+   r,   r   r   rC   r   r-   r   rD   r   r.   rE   r   r   rF   r/   r0   r   rH   rI   r2   rJ   r3   r4   pt_brr5   rK   r   r6   r7   rN   r   r8   rP   r   rR   r   rS   r   rV   r   ze_enze_zhzh_cnzh_twc                       s(   e Zd ZdZejj fddZ  ZS )
OpusConfigzBuilderConfig for Opus.c              	      sf   t |}|dd|d |d d|f }|d }tt| jd
d|it||d || _|| _d	S )a/  BuilderConfig for Opus.

    Args:
      language_pair: `(string, string)`, pair of languages used for translation.
        Should contain 2 letter coded strings (e.g. "de", "en")
      subsets: `<list>(string)`, list of the subdatasets to use.
      **kwargs: keyword arguments forwarded to super.
    r   z%s-%s for %sr   r   z, z
 documentsr	   r   Nr   )	r   getjoinsuperrB  r   dictlanguage_pairsubsets)r   rG  rH  kwargssorted_language_pairr   r	   	__class__r   r   r      s   


zOpusConfig.__init__)	r   r   r   r   tfdscoredisallow_positional_argsr   __classcell__r   r   rK  r   rB     s    rB  c                   @   sr   e Zd ZdZddgfddgfddgfdg d	fd
dgfgZ	 dd eD Zedd Zdd Zdd Z	dd Z
dS )Opusz6OPUS is a collection of translated texts from the web.medicalr"   lawr9   koranr:   IT)rY   r   r   r   r:  	subtitlesr<  c                 C   s(   g | ]\}}t tjd d||dqS )z0.1.0)r&   r(   )versionrG  rH  r   )rB  rM  rN  Version)r   r   rH  r   r   r   
<listcomp>   s    
zOpus.<listcomp>c                 C   sD   | j j\}}g }dd | j jD D ]}||f|jv r|| q|S )Nc                 S   s   g | ]}t | qS r   )DATASET_MAP)r   r   r   r   r   rY     r!   z Opus.subsets.<locals>.<listcomp>)builder_configrG  rH  r   r   )r   r   r   filtered_subsetsdatasetr   r   r   rH     s   
zOpus.subsetsc                 C   sB   | j j\}}tjj| td | j j tjj| j jd||fdt	dS )N
)r   zhttp://opus.nlpl.eu/)builderr	   featuressupervised_keysr
   citation)
r[  rG  rM  rN  DatasetInfo_DESCRIPTIONr	   r`  Translation	_CITATION)r   srcr   r   r   r   _info   s   z
Opus._infoc           
      C   s   | j j\}}d||f }g }| jD ]3}|tj|jd| }tj|d|j||f }tj|d|j||f }	|	|j||	d qt
jjt
jjd|idgS )Nz%s-%sz
%s.txt.zipz%s.%s.%s)r   source_filetarget_filerH  )r   
gen_kwargs)r[  rG  rH  download_and_extractr   pathrD  r   r   r   rM  rN  SplitGeneratorSplitTRAIN)
r   
dl_managerr   r   file_extrH  itemdl_dirri  rj  r   r   r   _split_generators   s.   

zOpus._split_generatorsc                 c   s    | j j\}}|D ]@}td|d  |d }|d }t|t|g}tt| D ]\}\}	}
||	||
i}t| rHd|d |f }||fV  q)q	d S )NzGenerating examples from: %sr   ri  rj  z%s/%d)	r[  rG  r   info	_gen_liner   zipallvalues)r   rH  r   r   rs  ri  rj  gensr   source_senttarget_sentresultkeyr   r   r   _generate_examples   s   
zOpus._generate_examplesN)r   r   r   r   _KK_SUBSETSBUILDER_CONFIGSpropertyrH  rh  ru  r  r   r   r   r   rQ     s"    
	

rQ  c                 c   sF    t jj| }|D ]}|V  qW d   dS 1 sw   Y  dS )z)Returns sentences from an OPUS data file.N)tfr   gfileGFile)filenamefliner   r   r   rw    s   "rw  )r   
__future__r   r   r   r   abslr   tensorflow.compat.v2compatv2r  tensorflow_datasets.public_api
public_apirM  rf  rd  r   rZ  rN  BuilderConfigrB  GeneratorBasedBuilderrQ  rw  r   r   r   r   <module>   s   DW