o
    Ni%                     @   s   d Z ddlmZ ddlmZ ddlmZ ddlZddlm  mZ	 ddl
mZ dZdZdZed	d
dgZG dd dejjZG dd dejjZdS )zEFacebook Low Resource (FLoRes) machine translation benchmark dataset.    )absolute_import)division)print_functionNz^Evaluation datasets for low-resource machine translation: Nepali-English and Sinhala-English.
a  @misc{guzmn2019new,
    title={Two New Evaluation Datasets for Low-Resource Machine Translation: Nepali-English and Sinhala-English},
    author={Francisco Guzman and Peng-Jen Chen and Myle Ott and Juan Pino and Guillaume Lample and Philipp Koehn and Vishrav Chaudhary and Marc'Aurelio Ranzato},
    year={2019},
    eprint={1902.01382},
    archivePrefix={arXiv},
    primaryClass={cs.CL}
}
z[https://github.com/facebookresearch/flores/raw/master/data/wikipedia_en_ne_si_test_sets.tgzTranslateDataurllanguage_to_filec                       s.   e Zd ZdZejj		d fdd	Z  ZS )FloresConfigzBuilderConfig for FLoRes.NNNc           
         s   |r|j nd}d|d |d |f }d|d |d |f }tt| jd||tjddd| |p8tjj	 | _
d	|v sDJ d
|f|\}}|d	krN|n|}	|	dv sZJ d|	f|| _dS )a  BuilderConfig for FLoRes.

    Args:
      text_encoder_config: `tfds.features.text.TextEncoderConfig`, configuration
        for the `tfds.features.text.TextEncoder` used for the features feature.
      language_pair: pair of languages that will be used for translation. Should
        contain 2-letter coded strings. First will be used at source and second
        as target in supervised mode. For example: ("se", "en").
      **kwargs: keyword arguments forwarded to super.
    
plain_textz%s%s_%sr      z3Translation dataset from %s to %s, uses encoder %s.z1.1.0z6New split API (https://tensorflow.org/datasets/splits))namedescriptionversionenz/Config language pair must contain `en`, got: %s)nesiz#Invalid non-en language in pair: %sN )r   superr   __init__tfdscoreVersionfeaturestextTextEncoderConfigtext_encoder_configlanguage_pair)
selfr   r   kwargsencoder_namer   r   sourcetargetnon_en	__class__r   X/home/ubuntu/.local/lib/python3.10/site-packages/tensorflow_datasets/translate/flores.pyr   7   s:   


zFloresConfig.__init__)Nr	   )	__name__
__module____qualname____doc__r   r   disallow_positional_argsr   __classcell__r   r   r#   r%   r   4   s    r   c                   @   sD   e Zd ZdZeddeddgZdd Zdd Zd	d
 Zdd Z	dS )Floresz#FLoRes machine translation dataset.)r   r   )r   )r   r   c                 C   s<   | j j\}}tjj| ttjj| j j| j jd||fdt	dS )N)	languagesencoder_configz+https://github.com/facebookresearch/flores/)builderr   r   supervised_keyshomepagecitation)
builder_configr   r   r   DatasetInfo_DESCRIPTIONr   Translationr   	_CITATION)r   r    r!   r   r   r%   _infoo   s   zFlores._infoc                 c   s*    | j di |D ]	\}}|| V  q	d S )Nr   )_generate_examples)r   fileslanguage_exr   r   r%   _vocab_text_gen|   s   zFlores._vocab_text_genc           
      C   s   | t}| jj\}}|dkr|n|}d}i }dD ]}|j||||d|j||||dd||< q| jjD ]}	| jj| | |d |	 q5t	j
jt	jj|d dt	j
jt	jj|d dgS )	Nr   zJ{dl_dir}/wikipedia_en_ne_si_test_sets/wikipedia.{split}.{non_en}-en.{lang})devdevtest)dl_dirsplitr"   lang)source_filetarget_filer?   )r   
gen_kwargsr@   )download_and_extract	_DATA_URLr3   r   formatinfor   maybe_build_from_corpusr>   r   r   SplitGeneratorSplit
VALIDATIONTEST)
r   
dl_managerrA   r    r!   r"   	path_tmplr:   rB   r;   r   r   r%   _split_generators   s6   
zFlores._split_generatorsc                 c   s    t jj|}| d}W d   n1 sw   Y  t jj|}| d}W d   n1 s8w   Y  t|t|ksSJ dt|t|||f | jj\}}t	t
||D ]\}\}	}
||	||
i}t| rw||fV  q`dS )z:This function returns the examples in the raw (text) form.
Nz*Sizes do not match: %d vs %d for %s vs %s.)tfiogfileGFilereadrB   lenr3   r   	enumeratezipallvalues)r   rD   rE   fsource_sentencestarget_sentencesr    r!   idxl1l2resultr   r   r%   r9      s0   
zFlores._generate_examplesN)
r&   r'   r(   r)   r   BUILDER_CONFIGSr8   r>   rR   r9   r   r   r   r%   r,   c   s    	 r,   )r)   
__future__r   r   r   collectionstensorflow.compat.v2compatv2rT   tensorflow_datasets.public_api
public_apir   r5   r7   rH   
namedtupler   r   BuilderConfigr   GeneratorBasedBuilderr,   r   r   r   r%   <module>   s   /