o
    Ni                     @   s   d Z ddlmZ ddlmZ ddlmZ ddlZddlm  mZ	 ddl
mZ ddlmZ dZdZd	Zd
Ze dd ZG dd dejjZG dd dejjZdS )zHParaCrawl (Bitextor) parallel open-source machine translation benchmark.    )absolute_import)division)print_functionN)utilsz;Web-Scale Parallel Corpora for Official European Languages.z"https://paracrawl.eu/releases.htmlzu@misc {paracrawl,
    title  = "ParaCrawl",
    year   = "2018",
    url    = "http://paracrawl.eu/download.html."
}
zchttps://s3.amazonaws.com/web-language-models/paracrawl/release4/en-{target_lang}.bicleaner07.txt.gzc                  C   s   i ddddddddd	d
dddddddddddddddddddddd d!d"d#d$d%d&d'd(d)} t t|  S )*zCreate the sorted dictionary of language codes, and language names.

  Returns:
    The sorted dictionary as an instance of `collections.OrderedDict`.
  bg	BulgariancsCzechdaDanishdeGermanelGreekesSpanishetEstonianfiFinnishfrFrenchgaIrishhrCroatianhu	HungarianitItalianlt
LithuanianlvLatvianmtMaltesenlDutchPolish
PortugueseRomanianSlovak	SlovenianSwedish)plptroskslsv)collectionsOrderedDictsorteditems)langs r9   \/home/ubuntu/.local/lib/python3.10/site-packages/tensorflow_datasets/translate/para_crawl.py_target_languages-   sV   	
r;   c                       s*   e Zd ZdZejjd fdd	Z  ZS )ParaCrawlConfigzBuilderConfig for ParaCrawl.Nc                    s   |t  vrtd| |r|jnd}d||f }d||f }tt| jd||d| |p3tjj	 | _
|| _tj|d| _dS )	a  BuilderConfig for ParaCrawl.

    Args:
      text_encoder_config: `tfds.features.text.TextEncoderConfig`, configuration
        for the `tfds.features.text.TextEncoder` used for the features feature.
      target_language: Target language that will be used to translate to from
        English which is always the source language. It has to contain 2-letter
        coded strings. For example: "se", "hu".
      **kwargs: Keyword arguments forwarded to super.
    zInvalid target language: %s 
plain_textzen%s_%sz8Translation dataset from English to %s, uses encoder %s.)namedescription)target_langNr9   )r;   
ValueErrorr>   superr<   __init__tfdsfeaturestextTextEncoderConfigtext_encoder_configtarget_language_BASE_DATA_URL_FORMAT_STRformatdata_url)selfrH   rI   kwargsencoder_namer>   r?   	__class__r9   r:   rC   S   s&   
zParaCrawlConfig.__init__)NN)	__name__
__module____qualname____doc__rD   coredisallow_positional_argsrC   __classcell__r9   r9   rP   r:   r<   P   s    r<   c                   @   s@   e Zd ZdZdd e D Zdd Zdd Zdd	 Zd
d Z	dS )	ParaCrawlz&ParaCrawl machine translation dataset.c                 C   s    g | ]}t |tjd dqS )z1.0.0)rI   version)r<   rD   rV   Version).0rI   r9   r9   r:   
<listcomp>{   s    
zParaCrawl.<listcomp>c                 C   s8   | j j}tjj| ttjjd|f| j jdd|ft	t
dS )Nen)	languagesencoder_config)builderr?   rE   supervised_keyshomepagecitation)builder_configrI   rD   rV   DatasetInfo_DESCRIPTIONrE   TranslationrH   _BENCHMARK_URL	_CITATION)rM   rI   r9   r9   r:   _info   s   zParaCrawl._infoc                 c   s*    | j di |D ]	\}}|| V  q	d S )Nr9   )_generate_examples)rM   fileslanguage_exr9   r9   r:   _vocab_text_gen   s   zParaCrawl._vocab_text_genc                 C   s(   | d| jji}tjjtjj|dgS )N	data_file)r>   
gen_kwargs)download_and_extractre   rL   rD   rV   SplitGeneratorSplitTRAIN)rM   
dl_managerrr   r9   r9   r:   _split_generators   s
   
zParaCrawl._split_generatorsc           
      c   s    | j j}tjj|?}t|D ]1\}}| d}t	|dkr,d
||}t||d  |d  }}	|d|||	ifV  qW d   dS 1 sNw   Y  dS )z:This function returns the examples in the raw (text) form.	   zPWrong data format in line {}. The line '{}' does not have exactly one delimiter.r      r^   N)re   rI   tfiogfileGFile	enumeratestripsplitlenrK   rA   )
rM   rr   rI   fidxline
line_partsmsgsourcetargetr9   r9   r:   rl      s   "zParaCrawl._generate_examplesN)
rR   rS   rT   rU   r;   BUILDER_CONFIGSrk   rq   ry   rl   r9   r9   r9   r:   rY   u   s    

rY   )rU   
__future__r   r   r   r4   tensorflow.compat.v2compatv2r}   tensorflow_datasets.corer   tensorflow_datasets.public_api
public_apirD   rg   ri   rj   rJ   memoizer;   rV   BuilderConfigr<   GeneratorBasedBuilderrY   r9   r9   r9   r:   <module>   s    
"%