o
    Ni'                     @   s   d Z ddlmZ ddlmZ ddlmZ ddlZddlmZ ddlm	  m
Z ddlmZ dZdZd	Zd
ZejdZg dZG dd dejjZejdZG dd dejjZdS )z5Wiki40B: A clean Wikipedia dataset for 40+ languages.    )absolute_import)division)print_functionN)loggingz
@inproceedings{49029,
title = {Wiki-40B: Multilingual Language Model Dataset},
author = {Mandy Guo and Zihang Dai and Denny Vrandecic and Rami Al-Rfou},
year = {2020},
booktitle	= {LREC 2020}
}
a]  
Clean-up text for 40+ Wikipedia languages editions of pages
correspond to entities. The datasets have train/dev/test splits per language.
The dataset is cleaned up by page filtering to remove disambiguation pages,
redirect pages, deleted pages, and non-entity pages. Each example contains the
wikidata id of the entity, and the full Wikipedia article after page processing
that removes non-content sections and structured objects. The language models
trained on this corpus - including 41 monolingual models, and 2 multilingual
models - can be found at https://tfhub.dev/google/collections/wiki40b-lm/1.
a  
This work is licensed under the Creative Commons Attribution-ShareAlike
3.0 Unported License. To view a copy of this license, visit
http://creativecommons.org/licenses/by-sa/3.0/ or send a letter to
Creative Commons, PO Box 1866, Mountain View, CA 94042, USA.
z&https://research.google/pubs/pub49029/zdownloads/wiki40b/tfrecord_prod))enarzzh-cnzzh-twnlfrdeitjakoplptruesthtrbgcacsdaeletfafihehihrhuidltlvmsnoroskslsrsvtlukvic                       s*   e Zd ZdZejjd fdd	Z  ZS )Wiki40bConfigzBuilderConfig for Wiki40B.Nc                    s,   t t| jd|d|d| || _dS )zBuilderConfig for Wiki40B.

    Args:
      language: string, the language code for the Wiki40B dataset to use.
      **kwargs: keyword arguments forwarded to super.
    zWiki40B dataset for {}.)namedescriptionN )superr-   __init__formatlanguage)selfr4   kwargs	__class__r0   T/home/ubuntu/.local/lib/python3.10/site-packages/tensorflow_datasets/text/wiki40b.pyr2   G   s   
zWiki40bConfig.__init__)N)	__name__
__module____qualname____doc__tfdscoredisallow_positional_argsr2   __classcell__r0   r0   r7   r9   r-   D   s    r-   z1.3.0c                   @   s6   e Zd ZdZdd eD Zdd Zdd Zdd	 Zd
S )Wiki40bzFWiki40B: A Clean Wikipedia Dataset for Mutlilingual Language Modeling.c                 C   s   g | ]}t t|d qS ))versionr4   )r-   _VERSION).0langr0   r0   r9   
<listcomp>\   s    zWiki40b.<listcomp>c              	   C   s@   t jj| tt jt j t j t j dd ttdt	idS )Nwikidata_idtext
version_idlicense)builderr/   featuressupervised_keyshomepagecitationredistribution_info)
r>   r?   DatasetInfo_DESCRIPTIONrN   FeaturesDictText_URL	_CITATION_LICENSE)r5   r0   r0   r9   _infoc   s   zWiki40b._infoc                 C   s   ~| j j}tjjtjjdtj	t
dd|idtjjtjjdtj	t
dd|idtjjtjjdtj	t
dd|idgS )zReturns SplitGenerators.	filepathstrainz{}_examples-*)r.   
gen_kwargsdevtest)_builder_configr4   r>   r?   SplitGeneratorSplitTRAINospathjoin_DATA_DIRECTORYr3   
VALIDATIONTEST)r5   
dl_managerrF   r0   r0   r9   _split_generatorsu   s0   zWiki40b._split_generatorsc                 C   sH   t jjj}td| dd }||jj||j	t
jjdB ||B S )zBuild PCollection of examples.zgenerating examples from = %sc                 s   sj    | j jd jjd d}| j jd jjd d}| j jd jjd d}|| |||dfV  dS )z"Extracts content from a TFExample.rI   r   zutf-8rJ   rK   rH   N)rN   feature
bytes_listvaluedecode)examplerI   rJ   rK   r0   r0   r9   _extract_content   s4   z4Wiki40b._build_pcollection.<locals>._extract_content)coder)r>   r?   lazy_importsapache_beamr   infoioReadFromTFRecordcoders
ProtoCodertfr\   ExampleFlatMap)r5   pipeliner[   beamrq   r0   r0   r9   _build_pcollection   s   
zWiki40b._build_pcollectionN)	r:   r;   r<   r=   WIKIPEDIA_LANGUAGESBUILDER_CONFIGSrZ   rk   r   r0   r0   r0   r9   rB   Y   s    rB   )r=   
__future__r   r   r   rd   abslr   tensorflow.compat.v2compatv2rz   tensorflow_datasets.public_api
public_apir>   rX   rT   rY   rW   r?   gcs_pathrg   r   BuilderConfigr-   VersionrD   BeamBasedBuilderrB   r0   r0   r0   r9   <module>   s"   	