o
    Ni                     @   s  d Z ddlmZ ddlmZ ddlmZ ddlZddlZddlZddlZddl	Z	ddl
Z
ddlm  mZ ddlmZ ddlZddlm  mZ ddlmZ dZg dZG d	d
 d
eZg eddh ddddedddhdddedddhddd edD ddedddhdd gd!ded"ddhd#d$d edD dded%ddhd#d&d edD dded'dd(hd)d*ded+dh d,d-d.ded/ddhd0dded1dd2d3hd4d5ded6dd3d7hd8d9ded:dh d;d<d!ded=ddhd>d?ded@ddAhdBdCgd!dedDdd(hdEdFdedGddHdhdIdJdedKdh dLdMdNdedOdh dLdPdQdedRdh dSdTdUdedVdh dWdXdYdedZdh dWd[d\ded]dh d^d_d!ded`ddhdad!dedbdd(hdcdddededh dfdgdhdediddjhdkdldedmdh d;dnd!dedoddhdpddedqdh drdsdtdedudd3dvhdwdxdedyddhdzd{ded|dd7d}hd~d!dedddjdhdddeddd3hdddedddAhdddedddjhdddeddh ddd!dedddjhddgdddd eD  ZedddhdddedddAhdddeddd3hdddeddd7d}hdddeddd7d}hdddeddd(dhdddeddd2hdddeddh ddddedddhdddedddhdddeddh d,dddeddh ddddeddh d,dddeddh d,dddeddh d,dddeddh ddddeddh ddddeddh ddddeddh ddddedddhdddeddh ddddeddd3hdddeddh d£dddeddd3hdddeddh dǣdddgZddʄ ee D Z edddhdddZ!G ddτ dej"j#Z$G ddф dej"j%Z&ddӄ Z'ddՄ Z(ddׄ Z)dddلZ*ddۄ Z+dd݄ Z,dd߄ Z-dS )zWMT: Translate dataset.    )absolute_import)division)print_functionN)loggingaA  Translate dataset based on the data from statmt.org.

Versions exists for the different years using a combination of multiple data
sources. The base `wmt_translate` allows you to create your own config to choose
your own data/language pair by creating a custom `tfds.translate.wmt.WmtConfig`.

```
config = tfds.translate.wmt.WmtConfig(
    version="0.0.1",
    language_pair=("fr", "de"),
    subsets={
        tfds.Split.TRAIN: ["commoncrawl_frde"],
        tfds.Split.VALIDATION: ["euelections_dev2019"],
    },
)
builder = tfds.builder("wmt_translate", config=config)
```

)	casia2015
casict2011
casict2015	datum2015	datum2017neu2017c                   @   s:   e Zd ZdZdddZdd Zdd Zd	d
 Zdd ZdS )
SubDatasetz;Class to keep track of information on a sub-dataset of WMT.Nc                 C   sX   t |tjr	|fn|| _t |tjr|fn|| _|r|ng | _|| _|| _t|| _	dS )au  Sub-dataset of WMT.

    Args:
      name: `string`, a unique dataset identifier.
      target: `string`, the target language code.
      sources: `set<string>`, the set of source language codes.
      url: `string` or `(string, string)`, URL(s) or URL template(s) specifying
        where to download the raw data from. If two strings are provided, the
        first is used for the source language and the second for the target.
        Template strings can either contain '{src}' placeholders that will be
        filled in with the source language code, '{0}' and '{1}' placeholders
        that will be filled in with the source and target language codes in
        alphabetical order, or all 3.
      path: `string` or `(string, string)`, path(s) or path template(s)
        specifing the path to the raw data relative to the root of the
        downloaded archive. If two strings are provided, the dataset is assumed
        to be made up of parallel text files, the first being the source and the
        second the target. If one string is provided, both languages are assumed
        to be stored within the same file and the extension is used to determine
        how to parse it. Template strings should be formatted the same as in
        `url`.
      manual_dl_files: `<list>(string)` (optional), the list of files that must
        be manually downloaded to the data directory.
    N)

isinstancesixstring_types_paths_urls_manual_dl_filesnametargetsetsources)selfr   r   r   urlpathmanual_dl_files r   U/home/ubuntu/.local/lib/python3.10/site-packages/tensorflow_datasets/translate/wmt.py__init__B   s   zSubDataset.__init__c                    s<   j vrtdjfdd  fdd|D S )z6Injects languages into (potentially) template strings.zInvalid source for '{0}': {1}c                    sf   d| v rd| v r| j t jgdiS d| v r'd| v r'| j t jg S d| v r1| j dS | S )Nz{0}z{1}z{src}src)r   )formatsortedr   )sr   r   r   r   _format_stringf   s   z3SubDataset._inject_language.<locals>._format_stringc                    s   g | ]} |qS r   r   ).0r!   )r#   r   r   
<listcomp>o       z/SubDataset._inject_language.<locals>.<listcomp>)r   
ValueErrorr   r   )r   r   stringsr   )r#   r   r   r   _inject_languageb   s   
	zSubDataset._inject_languagec                 C      |  || jS N)r)   r   r"   r   r   r   get_urlq      zSubDataset.get_urlc                 C   r*   r+   )r)   r   r"   r   r   r   get_manual_dl_filest   r-   zSubDataset.get_manual_dl_filesc                 C   r*   r+   )r)   r   r"   r   r   r   get_pathw   r-   zSubDataset.get_pathr+   )	__name__
__module____qualname____doc__r   r)   r,   r.   r/   r   r   r   r   r   ?   s    
 r   commoncrawlen>   csdeesfrruz=http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz)zcommoncrawl.{src}-en.{src}zcommoncrawl.{src}-en.enr   r   r   r   r   commoncrawl_frder7   r9   )zMhttp://data.statmt.org/wmt19/translation-task/fr-de/bitexts/commoncrawl.fr.gzzMhttp://data.statmt.org/wmt19/translation-task/fr-de/bitexts/commoncrawl.de.gz) r=   czeng_10r6   z%http://ufal.mff.cuni.cz/czeng/czeng10c                 C      g | ]}d | qS zdata-plaintext-format.%d.tarr   r$   ir   r   r   r%      r&   r%   
   )
 data.plaintext-format/??train.gzrD   rD   rD   rD   rD   rD   rD   rD   rD   )r   r   r   r   r   r   czeng_16prez(http://ufal.mff.cuni.cz/czeng/czeng16prez+czeng16pre.deduped-ignoring-sections.txt.gzr=   czeng_16zhttp://ufal.mff.cuni.cz/czengc                 C   r?   r@   r   rA   r   r   r   r%      r&   czeng_17c                 C   r?   r@   r   rA   r   r   r   r%      r&   dcep_v1lvz?http://data.statmt.org/wmt17/translation-task/dcep.lv-en.v1.tgz)zdcep.en-lv/dcep.lvzdcep.en-lv/dcep.eneuroparl_v7>   r6   r7   r8   r9   z=http://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz)z#training/europarl-v7.{src}-en.{src}z training/europarl-v7.{src}-en.eneuroparl_v7_frde)zMhttp://data.statmt.org/wmt19/translation-task/fr-de/bitexts/europarl-v7.fr.gzzMhttp://data.statmt.org/wmt19/translation-task/fr-de/bitexts/europarl-v7.de.gzeuroparl_v8_18etfizIhttp://data.statmt.org/wmt18/translation-task/training-parallel-ep-v8.tgz)z#training/europarl-v8.{src}-en.{src}z training/europarl-v8.{src}-en.eneuroparl_v8_16rozIhttp://data.statmt.org/wmt16/translation-task/training-parallel-ep-v8.tgz)z2training-parallel-ep-v8/europarl-v8.{src}-en.{src}z/training-parallel-ep-v8/europarl-v8.{src}-en.eneuroparl_v9>   r6   r7   rN   ltzFhttp://www.statmt.org/europarl/v9/training/europarl-v9.{src}-en.tsv.gzgigafrenz2http://www.statmt.org/wmt10/training-giga-fren.tar)zgiga-fren.release2.fixed.fr.gzzgiga-fren.release2.fixed.en.gzhindencorp_01hiz/http://ufallab.ms.mff.cuni.cz/~bojar/hindencorpzhindencorp0.1.gzleta_v1z9http://data.statmt.org/wmt17/translation-task/leta.v1.tgz)zLETA-lv-en/leta.lvzLETA-lv-en/leta.enmultiunr8   z4http://www.statmt.org/wmt13/training-parallel-un.tgz)zun/undoc.2000.{src}-en.{src}zun/undoc.2000.{src}-en.ennewscommentary_v9>   r6   r7   r9   r:   z7http://www.statmt.org/wmt14/training-parallel-nc-v9.tgz)z*training/news-commentary-v9.{src}-en.{src}z'training/news-commentary-v9.{src}-en.ennewscommentary_v10z8http://www.statmt.org/wmt15/training-parallel-nc-v10.tgz)z"news-commentary-v10.{src}-en.{src}znews-commentary-v10.{src}-en.ennewscommentary_v11>   r6   r7   r:   zJhttp://data.statmt.org/wmt16/translation-task/training-parallel-nc-v11.tgz)z;training-parallel-nc-v11/news-commentary-v11.{src}-en.{src}z8training-parallel-nc-v11/news-commentary-v11.{src}-en.ennewscommentary_v12>   r6   r7   r:   zhzJhttp://data.statmt.org/wmt17/translation-task/training-parallel-nc-v12.tgz)z+training/news-commentary-v12.{src}-en.{src}z(training/news-commentary-v12.{src}-en.ennewscommentary_v13zJhttp://data.statmt.org/wmt18/translation-task/training-parallel-nc-v13.tgz)z;training-parallel-nc-v13/news-commentary-v13.{src}-en.{src}z8training-parallel-nc-v13/news-commentary-v13.{src}-en.ennewscommentary_v14>   r6   r7   kkr:   r\   zVhttp://data.statmt.org/news-commentary/v14/training/news-commentary-v14.{0}-{1}.tsv.gznewscommentary_v14_frdezThttp://data.statmt.org/news-commentary/v14/training/news-commentary-v14.de-fr.tsv.gzonlinebooks_v1z@http://data.statmt.org/wmt17/translation-task/books.lv-en.v1.tgz)zfarewell/farewell.lvzfarewell/farewell.enparacrawl_v1>   r6   r7   rM   rN   r:   zuhttps://s3.amazonaws.com/web-language-models/paracrawl/release1/paracrawl-release1.en-{src}.zipporah0-dedup-clean.tgz)z7paracrawl-release1.en-{src}.zipporah0-dedup-clean.{src}z4paracrawl-release1.en-{src}.zipporah0-dedup-clean.enparacrawl_v1_rur:   zrhttps://s3.amazonaws.com/web-language-models/paracrawl/release1/paracrawl-release1.en-ru.zipporah0-dedup-clean.tgz)z1paracrawl-release1.en-ru.zipporah0-dedup-clean.ruz1paracrawl-release1.en-ru.zipporah0-dedup-clean.enparacrawl_v3z[https://s3.amazonaws.com/web-language-models/paracrawl/release3/en-{src}.bicleaner07.tmx.gzparacrawl_v3_frde)zShttp://data.statmt.org/wmt19/translation-task/fr-de/bitexts/de-fr.bicleaner07.de.gzzShttp://data.statmt.org/wmt19/translation-task/fr-de/bitexts/de-fr.bicleaner07.fr.gz
rapid_2016>   r7   rM   rN   z;http://data.statmt.org/wmt18/translation-task/rapid2016.tgz)zrapid2016.{0}-{1}.{src}zrapid2016.{0}-{1}.enrapid_2016_ltfirR   zIhttps://tilde-model.s3-eu-west-1.amazonaws.com/rapid2016.en-{src}.tmx.zipzrapid2016.en-{src}.tmx
rapid_2019zBhttps://s3-eu-west-1.amazonaws.com/tilde-model/rapid2019.de-en.zip)zrapid2019.de-en.dezrapid2019.de-en.en	setimes_2trzAhttp://opus.nlpl.eu/download.php?f=SETIMES/v2/tmx/en-{src}.tmx.gzuncorpus_v1r\   zRhttps://storage.googleapis.com/tfds-data/downloads/uncorpus/UNv1.0.en-{src}.tar.gz)zen-{src}/UNv1.0.en-{src}.{src}zen-{src}/UNv1.0.en-{src}.enwikiheadlines_fiz+http://www.statmt.org/wmt15/wiki-titles.tgzzwiki/fi-en/titles.fi-enwikiheadlines_hiz+http://www.statmt.org/wmt14/wiki-titles.tgzzwiki/hi-en/wiki-titles.hi-enwikiheadlines_ruzwiki/ru-en/wiki.ru-enwikititles_v1>   r6   r7   rN   gur_   rR   r:   r\   zBhttp://data.statmt.org/wikititles/v1/wikititles-v1.{src}-en.tsv.gzyandexcorpusz*https://translate.yandex.ru/corpus?lang=enz1mcorpus.zip)zcorpus.en_ru.1m.ruzcorpus.en_ru.1m.enc              
   C   s0   g | ]}t |d dhd| d| d| fdqS )r5   r\   z6ftp://cwmt-wmt:cwmt-wmt@nlp.nju.edu.cn/parallel/%s.zipz%s/*_c[hn].txtz%s/*_en.txtr;   )r   )r$   ssr   r   r   r%     s    euelections_dev2019z5http://data.statmt.org/wmt19/translation-task/dev.tgz)z$dev/euelections_dev2019.fr-de.src.frz$dev/euelections_dev2019.fr-de.tgt.denewsdev2014)zdev/newsdev2014.hizdev/newsdev2014.ennewsdev2015)z"dev/newsdev2015-fien-src.{src}.sgmzdev/newsdev2015-fien-ref.en.sgmnewsdiscussdev2015)z,dev/newsdiscussdev2015-{src}en-src.{src}.sgmz)dev/newsdiscussdev2015-{src}en-ref.en.sgmnewsdev2016)z%dev/newsdev2016-{src}en-src.{src}.sgmz"dev/newsdev2016-{src}en-ref.en.sgmnewsdev2017)z%dev/newsdev2017-{src}en-src.{src}.sgmz"dev/newsdev2017-{src}en-ref.en.sgmnewsdev2018)z%dev/newsdev2018-{src}en-src.{src}.sgmz"dev/newsdev2018-{src}en-ref.en.sgmnewsdev2019>   rp   r_   rR   )z%dev/newsdev2019-{src}en-src.{src}.sgmz"dev/newsdev2019-{src}en-ref.en.sgmnewsdiscusstest2015)z-dev/newsdiscusstest2015-{src}en-src.{src}.sgmz*dev/newsdiscusstest2015-{src}en-ref.en.sgmnewssyscomb2009)zdev/newssyscomb2009.{src}zdev/newssyscomb2009.ennewstest2008>   r6   r7   r8   r9   hu)zdev/news-test2008.{src}zdev/news-test2008.ennewstest2009)zdev/newstest2009.{src}zdev/newstest2009.ennewstest2010)zdev/newstest2010.{src}zdev/newstest2010.ennewstest2011)zdev/newstest2011.{src}zdev/newstest2011.ennewstest2012)zdev/newstest2012.{src}zdev/newstest2012.ennewstest2013)zdev/newstest2013.{src}zdev/newstest2013.ennewstest2014>   r6   r7   r8   r9   rU   r:   )z&dev/newstest2014-{src}en-src.{src}.sgmz#dev/newstest2014-{src}en-ref.en.sgmnewstest2015>   r6   r7   rN   r:   )z&dev/newstest2015-{src}en-src.{src}.sgmz#dev/newstest2015-{src}en-ref.en.sgmnewstest2016>   r6   r7   rN   rP   r:   rj   )z&dev/newstest2016-{src}en-src.{src}.sgmz#dev/newstest2016-{src}en-ref.en.sgmnewstestB2016)z$dev/newstestB2016-enfi-ref.{src}.sgmz!dev/newstestB2016-enfi-src.en.sgmnewstest2017>   r6   r7   rN   rI   r:   rj   r\   )z&dev/newstest2017-{src}en-src.{src}.sgmz#dev/newstest2017-{src}en-ref.en.sgmnewstestB2017)z!dev/newstestB2017-fien-src.fi.sgmz!dev/newstestB2017-fien-ref.en.sgmnewstest2018>   r6   r7   rM   rN   r:   rj   r\   )z&dev/newstest2018-{src}en-src.{src}.sgmz#dev/newstest2018-{src}en-ref.en.sgmc                 C   s   i | ]}|j |qS r   )r   )r$   dsr   r   r   
<dictcomp>5  r&   r   czeng17_filterzIhttp://ufal.mff.cuni.cz/czeng/download.php?f=convert_czeng16_to_17.pl.zipzconvert_czeng16_to_17.plc                       s6   e Zd ZdZejj						d fdd	Z  ZS )	WmtConfigzBuilderConfig for WMT.NNNc           	         s   d|d |d f }|r|d|j  7 }d|v r |d|d 7 }tt| jd	||d| |p1d| _|| _|| _|| _|| _	dS )
a  BuilderConfig for WMT.

    Args:
      url: The reference URL for the dataset.
      citation: The paper citation for the dataset.
      description: The description of the dataset.
      language_pair: pair of languages that will be used for translation. Should
                 contain 2 letter coded strings. For example: ("en", "de").
      text_encoder_config: `tfds.features.text.TextEncoderConfig` (optional),
        configuration for the `tfds.features.text.TextEncoder` used for the
        `tfds.features.text.Translation` features.
      subsets: Dict[split, list[str]]. List of the subset to use for each of the
        split. Note that WMT subclasses overwrite this parameter.
      **kwargs: keyword arguments forwarded to super.
    z%s-%sr      .r   )r   descriptionzhttp://www.statmt.orgNr   )
r   popsuperr   r   r   citationlanguage_pairtext_encoder_configsubsets)	r   r   r   r   r   r   r   kwargsr   	__class__r   r   r   C  s   

zWmtConfig.__init__)NNNr   NN)	r0   r1   r2   r3   tfdscoredisallow_positional_argsr   __classcell__r   r   r   r   r   @  s    r   c                       s\   e Zd ZdZdZ fddZedd Zedd Zd	d
 Z	dd Z
dd Zdd Z  ZS )WmtTranslatezWMT translation dataset.z  Some of the wmt configs here, require a manual download.
  Please look into wmt.py to see the exact path (and file name) that has to
  be downloaded.
  c                    s6   t | tkrd|vrtdtt| j|i | d S )NconfigzThe raw `wmt_translate` can only be instantiated with the config kwargs. You may want to use one of the `wmtYY_translate` implementation instead to get the WMT dataset for a specific year.)typer   r'   r   r   )r   argsr   r   r   r   r   t  s
   zWmtTranslate.__init__c                 C   s   | j jS )z/Subsets that make up each split of the dataset.)builder_configr   )r   r   r   r   _subsets}  s   zWmtTranslate._subsetsc                 C   s~   | j j\}}i }| j D ])\}}g ||< |D ]}t| }|j|ks'||jvr.td| q|| 	| qqtd| |S )zESubsets that make up each split of the dataset for the language pair.z<Skipping sub-dataset that does not include language pair: %szUsing sub-datasets: %s)
r   r   r   itemsDATASET_MAPr   r   r   infoappend)r   sourcer   filtered_subsetssplitss_namesss_namer   r   r   r   r     s   zWmtTranslate.subsetsc                 C   sD   | j j\}}tjj| ttjj| j j| j jd||f| j j	| j j
dS )N)	languagesencoder_config)builderr   featuressupervised_keyshomepager   )r   r   r   r   DatasetInfo_DESCRIPTIONr   Translationr   r   r   )r   r   r   r   r   r   _info  s   zWmtTranslate._infoc                 c   s&    |  ||D ]	\}}|| V  qd S r+   )_generate_examples)r   split_subsetsextraction_maplanguage_exr   r   r   _vocab_text_gen  s   zWmtTranslate._vocab_text_genc                    s   | j j\} fdd}i }i }tj| j D ]%}|dkr(t|tj	< t
| }|r8||||< q|||< q |} |}	t|fi |	| j jD ]}
| jj|
 | | jtjj |
 qVfdd| j D S )Nc              
      sh   |  }g }|D ](}tj j|}tjj|s,t	d
| j|  jd||| q	|S )zCVerifies the manual files are downloaded for the given sub-dataset.z]For {0}, you must manually download the following file(s) from {1} and place them in {2}: {3}z, )r.   osr   join
manual_dirtfiogfileexistsAssertionErrorr   r   r,   r   )r   r   manual_pathsfnamemanual_path)
dl_managerr   r   r   _check_manual_files  s   
z;WmtTranslate._split_generators.<locals>._check_manual_filesrG   c                    s&   g | ]\}}t jj|| d dqS ))r   r   )r   
gen_kwargs)r   r   SplitGenerator)r$   r   r   )r   r   r   r%     s    z2WmtTranslate._split_generators.<locals>.<listcomp>)r   r   	itertoolschainfrom_iterabler   values_CZENG17_FILTERr,   r   r   r.   download_and_extractextractdictr   r   maybe_build_from_corpusr   r   SplitTRAINr   )r   r   r   r   r   urls_to_downloadr   r   downloaded_filesmanual_filesr   r   )r   r   r   r   _split_generators  s.   



zWmtTranslate._split_generatorsc                 #   s   | j j\ } fdd}|D ]}td| t| }|| }|||}|drP|dr6tjt	dd}	no|drM|t
|t
j d	 }
tjt|
d
}	nXt}	nU|dkrWt}	nNt|dkrh|dret}	n@t}	n=t|dkr|d	 }d|v ryt	}	n,|drtjt	| j jd}	nd|v rt}	n|drt}	ntd| tdt| |	| D ]\}}t| sqd||}||fV  qqdS )z,Returns the examples in the raw (text) form.c                    s6   |   }t|dkr|t| }dd t||D S )Nr   c                 S   s&   g | ]\}}|rt j||n|qS r   )r   r   r   )r$   ex_dirrel_pathr   r   r   r%     s    zMWmtTranslate._generate_examples.<locals>._get_local_paths.<locals>.<listcomp>)r/   lenzip)r   extract_dirs	rel_pathsr   r   r   _get_local_paths  s   
z9WmtTranslate._generate_examples.<locals>._get_local_pathszGenerating examples from: %sczeng16pre)r5   r6   )r   17r   )filter_pathrT      _frder   z.tsvr^   tmxwikiheadlineszUnsupported file format: %szInvalid number of files: %d{}/{}N)r   r   r   r   r   
startswithendswith	functoolspartial
_parse_tsvr   r   _parse_czeng_parse_hindencorpr   _parse_frde_bitext_parse_parallel_sentences
_parse_tmx_parse_wikiheadlinesr'   allr   r   )r   r   r   r   r   r   r   r   filessub_generatorr   r   sub_keyr   keyr   r   r   r     sd   







zWmtTranslate._generate_examples)r0   r1   r2   r3   MANUAL_DOWNLOAD_INSTRUCTIONSr   propertyr   r   r   r   r   r   r   r   r   r   r   r   k  s    	

5r   c                 c   s*   dd }dd }|  dr|n|}tjj| }tjj|}|r$|s,J d| |f t|t|ksBJ dt|t|| |f ttt|t|D ]E\}\}}	||\}
}||	\}}t|
t|ksuJ dt|
t|||	f tt|
|D ]\}\}}d		||}|||||ifV  q|qMd
S )zHReturns examples from parallel SGML or text files, which may be gzipped.c              	   S   s  |  d}|d dkrO|d }tjj| d0}tj|d}| d	 |fW  d   W  d   S 1 s;w   Y  W d   n1 sJw   Y  |d d	krg|d  d
d }|dv rddn|}n|d }tjj| }| 	 |fW  d   S 1 sw   Y  dS )zDReturns the sentences from a single text file, which may be gzipped.r   gzrbfileobjutf-8Ntxtr   )chcnr\   )
r   r   r   r   GFilegzipGzipFilereaddecode
splitlines)r   
split_pathlangfgr   r   r   _parse_text  s   
 L$z._parse_parallel_sentences.<locals>._parse_textc                 S   s   |  dd }g }td}tjj| +}|D ]}t||}|r5t|	 dks,J |
|	 d  qW d   ||fS 1 sCw   Y  ||fS )z*Returns sentences from a single SGML file.r   r  z<seg id=\"\d+\">(.*)</seg>r   r   N)r   recompiler   r   r   r  matchr   groupsr   )r   r  	sentencesseg_rer  line	seg_matchr   r   r   
_parse_sgm+  s   

z-_parse_parallel_sentences.<locals>._parse_sgmz.sgmz No matching files found: %s, %s.z4Number of files do not match: %d vs %d for %s vs %s.*Sizes do not match: %d vs %d for %s vs %s.r   N)
r   r   r   r   globr   	enumerater   r    r   )f1f2r  r  
parse_filef1_filesf2_filesf_idf1_if2_il1_sentencesl1l2_sentencesl2line_ids1s2r   r   r   r   r     s8   "
r   c                 c   s    t jj| }|  }W d    n1 sw   Y  t jj|}|  }W d    n1 s6w   Y  t|t|ksQJ dt|t|| |f tt||D ]\}\}}|||dfV  qXd S )Nr  )r9   r7   )	r   r   r   r  r  r  r   r!  r   )fr_pathde_pathr  fr_sentencesde_sentencesr.  r/  r0  r   r   r   r   V  s$   r   c                 #   s    dd  dd t jj| d=}tjrtd|}n|}tt	
|D ]\}\}}|jdkrE| fdd	|d
D fV  |  q&W d   dS 1 sQw   Y  dS )z!Generates examples from TMX file.c                 S   s,   |   D ]\}}|dr|  S qtd)Nz}langz'Language not found in `tuv` attributes.)r   r   r   )tuvkvr   r   r   _get_tuv_langg  s
   
z!_parse_tmx.<locals>._get_tuv_langc                 S   s0   |  d}t|dksJ dt| |d jS )Nsegr   zInvalid number of segments: %dr   )findallr   text)r5  segsr   r   r   _get_tuv_segm  s   

z _parse_tmx.<locals>._get_tuv_segr  r  tuc                    s   i | ]	} ||qS r   r   )r$   r5  r8  r=  r   r   r   z  s
    z_parse_tmx.<locals>.<dictcomp>r5  N)r   r   r   r  r   PY3codecs	getreaderr!  ElementTree	iterparsetagiterfindclear)r   r  utf_fr.  r   elemr   r?  r   r   e  s    

"r   c              	   c   s    |du rt d| }|dusJ d|  | \}}n|\}}tjj| 9}t|D ]+\}}|d}t	|dkrFt
d|| t	| q,|\}	}
|||	 ||
 ifV  q,W d   dS 1 scw   Y  dS )z!Generates examples from TSV file.Nz".*\.([a-z][a-z])-([a-z][a-z])\.tsvzInvalid TSV filename: %s	r   z2Skipping line %d in TSV (%s) with %d != 2 columns.)r  r  r  r   r   r   r  r!  r   r   r   warningstrip)r   r   
lang_matchr+  r-  r  jr  colsr/  r0  r   r   r   r     s,   


"r   c           	      c   s    t d| }|dusJ d|  | \}}tjj| &}t|D ]\}}|d\}}|||	 ||	 ifV  q#W d   dS 1 sGw   Y  dS )z3Generates examples from Wikiheadlines dataset file.z.*\.([a-z][a-z])-([a-z][a-z])$Nz"Invalid Wikiheadlines filename: %sz|||)
r  r  r  r   r   r   r  r!  r   rL  )	r   rM  r+  r-  r  r.  r  r/  r0  r   r   r   r     s   
"r   c                  o   s   | dd}|r=td}tjj|}ttd|	 
 d  }W d   n1 s0w   Y  tdt| | D ]}tjj|D ]w}tjj|de}tj|dO}tj|}	t|D ]<\}
}|d	}| srqd|d
\}}}}|rt||}|r|
 d |v rqdd|	|
}|| | dfV  qdW d   n1 sw   Y  W d   n1 sw   Y  qHq?dS )zEGenerates examples from CzEng v1.6, with optional filtering for v1.7.r   Nz^[^-]+-b(\d+)-\d\d[tde]zqw{([\s\d]*)}r   z<Loaded %d bad blocks to filter from CzEng v1.6 to make v1.7.r  r  r  rJ  r   )r6   r5   )getr  r  r   r   r   r  r   searchr  r  r   r   r   r   r   r  r  r   r   basenamer!  r  rL  r  r   )pathsr   r   re_blockr  
bad_blocksr   gz_pathr  filenamer.  r  id_unused_scorer6   r5   block_matchr   r   r   r   r     sL   
 
 r   c                 c   s    t jj| 4}t|D ]&\}}|d}t|dkr#td| q||d 	 |d 	 dfV  qW d    d S 1 s?w   Y  d S )NrJ     z$Skipping invalid HindEnCorp line: %s      )r5   rU   )
r   r   r   r  r!  r   r   r   rK  rL  )r   r  r.  r  
split_liner   r   r   r     s   


"r   r+   ).r3   
__future__r   r   r   rA  r   r  r   r   r  xml.etree.cElementTreeetreecElementTreerC  abslr   r   tensorflow.compat.v2compatv2r   tensorflow_datasets.public_api
public_apir   r   CWMT_SUBSET_NAMESobjectr   range_TRAIN_SUBSETS_DEV_SUBSETSr   r   r   BuilderConfigr   GeneratorBasedBuilderr   r   r   r   r   r   r   r   r   r   r   r   <module>   s~  =(39@GNU[biou|           %  +  2  9  ?  F  L  R  X  ^  d  j  p  w  }         ,	+ -?
 