o
    Ni]                     @   s   d Z ddlmZ ddlmZ ddlmZ ddlZddlm  mZ	 ddl
mZ dZdZdZd	ZG d
d dejjZG dd dejjZdS )zITED talk high/low-resource paired language data set from Qi, et al. 2018.    )absolute_import)division)print_functionNzData sets derived from TED talk transcripts for comparing similar language pairs
where one is high resource and the other is low resource.
a%  @inproceedings{Ye2018WordEmbeddings,
  author  = {Ye, Qi and Devendra, Sachan and Matthieu, Felix and Sarguna, Padmanabhan and Graham, Neubig},
  title   = {When and Why are pre-trained word embeddings useful for Neural Machine Translation},
  booktitle = {HLT-NAACL},
  year    = {2018},
  }
z5http://www.phontron.com/data/qi18naacl-dataset.tar.gz))azen)az_trr   )ber   )be_rur   )espt)frr   )glr   )gl_ptr   )her   )itr   )r   r   )rur   )r   r   )trr   c                       s*   e Zd ZdZejjd fdd	Z  ZS )TedHrlrConfigzFBuilderConfig for TED talk data comparing high/low resource languages.NNc                    sj   d|d  dd|d f }d|d |d f }tt| jd
||d| |tv s0J d| || _d	S )a9  BuilderConfig for TED talk data comparing high/low resource languages.

    The first language in `language_pair` should either be a 2-letter coded
    string or two such strings joined by an underscore (e.g., "az" or "az_tr").
    In cases where it contains two languages, the train data set will contain an
    (unlabelled) mix of the two languages and the validation and test sets
    will contain only the first language. This dataset will refer to the
    source language by the 5-letter string with the underscore. The second
    language in `language_pair` must be a 2-letter coded string.

    For example, to get pairings between Russian and English, specify
    `("ru", "en")` as `language_pair`. To get a mix of Belarusian and Russian in
    the training set and purely Belarusian in the validation and test sets,
    specify `("be_ru", "en")`.

    Args:
      language_pair: pair of languages that will be used for translation. The
        first will be used as source and second as target in supervised mode.
      **kwargs: keyword arguments forwarded to super.
    %s_to_%sr   _    z0Translation dataset from %s to %s in plain text.)namedescriptionz+Config language pair (%s, %s) not supportedN )replacesuperr   __init___VALID_LANGUAGE_PAIRSlanguage_pair)selfr    kwargsr   r   	__class__r   Z/home/ubuntu/.local/lib/python3.10/site-packages/tensorflow_datasets/translate/ted_hrlr.pyr   A   s   

zTedHrlrConfig.__init__)r   )	__name__
__module____qualname____doc__tfdscoredisallow_positional_argsr   __classcell__r   r   r#   r%   r   >   s    r   c                   @   s6   e Zd ZdZdd eD Zdd Zdd Zdd	 Zd
S )TedHrlrTranslatez@TED talk data set for comparing high and low resource languages.c                 C   s"   g | ]}t |tjd ddqS )z1.0.0z6New split API (https://tensorflow.org/datasets/splits))r    version)r   r*   r+   Version).0pairr   r   r%   
<listcomp>i   s    zTedHrlrTranslate.<listcomp>c                 C   s*   t jj| tt jj| jjdd| jjtdS )N)	languagesz1https://github.com/neulab/word-embeddings-for-nmt)builderr   featureshomepagesupervised_keyscitation)	r*   r+   DatasetInfo_DESCRIPTIONr6   Translationbuilder_configr    	_CITATION)r!   r   r   r%   _infor   s   zTedHrlrTranslate._infoc                 C   s   | t}| jj\}}tj|dd||f }tjj	tj
jtj|d|ddtj|d|ddtjj	tj
jtj|d|dd	 tj|d|ddtjj	tj
jtj|d
|dd	 tj|d
|ddgS )Ndatasetsr   z{}.trainr   -)source_filetarget_file)r   
gen_kwargsz{}.devr   z{}.test)download_and_extract	_DATA_URLr=   r    ospathjoinr*   r+   SplitGeneratorSplitTRAINformatr   
VALIDATIONsplitTEST)r!   
dl_managerdl_dirsourcetargetdata_dirr   r   r%   _split_generators}   s8   

		z"TedHrlrTranslate._split_generatorsc                 c   s    t jj|}| d}W d   n1 sw   Y  t jj|}| d}W d   n1 s8w   Y  t|t|ksSJ dt|t|||f | jj\}}t	t
||D ]\}\}	}
||	||
i}t| rw||fV  q`dS )z:This function returns the examples in the raw (text) form.
Nz*Sizes do not match: %d vs %d for %s vs %s.)tfiogfileGFilereadrO   lenr=   r    	enumeratezipallvalues)r!   rB   rC   fsource_sentencestarget_sentencesrS   rT   idxl1l2resultr   r   r%   _generate_examples   s2   

z#TedHrlrTranslate._generate_examplesN)	r&   r'   r(   r)   r   BUILDER_CONFIGSr?   rV   ri   r   r   r   r%   r.   f   s    	$r.   )r)   
__future__r   r   r   rG   tensorflow.compat.v2compatv2rX   tensorflow_datasets.public_api
public_apir*   r;   r>   rF   r   r+   BuilderConfigr   GeneratorBasedBuilderr.   r   r   r   r%   <module>   s   	(