o
    Ni).                     @   s   d Z ddlmZ ddlmZ ddlmZ ddlZddlZddlZddlm	  m
Z	 ddlmZ ddlZddlm  mZ ddlmZ ejrJddlZnddlZdZdZd	Zg d
ZdZdZG dd dejjZ ej!ddZ"G dd dejj#Z$dd Z%dS )z?Wikipedia dataset containing cleaned articles of all languages.    )absolute_import)division)print_functionN)loggingz@ONLINE {wikidump,
    author = "Wikimedia Foundation",
    title  = "Wikimedia Downloads",
    url    = "https://dumps.wikimedia.org"
}
a5  Wikipedia dataset containing cleaned articles of all languages.
The datasets are built from the Wikipedia dump
(https://dumps.wikimedia.org/) with one split per language. Each example
contains the content of one full Wikipedia article with cleaning to strip
markdown and unwanted sections (references, etc.).
a  This work is licensed under the Creative Commons Attribution-ShareAlike 3.0 Unported License. To view a copy of this license, visit http://creativecommons.org/licenses/by-sa/3.0/ or send a letter to Creative Commons, PO Box 1866, Mountain View, CA 94042, USA.(,  aaabaceadyafakalsamanangararcarzasastatjavayazazbbabarzbat-smgbclbezbe-x-oldbgbhbibjnbmbnbobpybrbsbugbxrcazcbk-zamcdocecebchchochrchyckbcocrcrhcscsbcucvcydadedindiqdsbdtydvdzeeelemleneoeseteuextfafffizfiu-vrofjfofrfrpfrrfurfygagaggangdglglkgngomgorgotgugvhahakhawhehihifhohrhsbhthuhyiaidieigiiikiloinhioisitiujajamjbojvkakaakabkbdkbpkgkikjkkklkmknkokoikrckskshkukvkwkylaladlblbelezlfnlglilijlmolnlolrcltltglvmaizmap-bmsmdfmgmhmhrmiminmkmlmnmrmrjmsmtmusmwlmymyvmznnanahnapndsznds-nlnenewngnlnnnonovnrmnsonvnyocoloomorospapagpampappcdpdcpflpipihplpmspnbpntpsptqurmrmyrnrozroa-rupzroa-tararuruerwsasahsatscscnscosdsesgshsisimpleskslsmsnsosqsrsrnssststqsusvswszltatcytetettgthtitktltntotpitrtstttumtwtytyvudmugukuruzvevecvepvivlsvowawarwowuuxalxhxmfyiyozazeazhzzh-classicalz
zh-min-nanzzh-yuezuz3https://dumps.wikimedia.your.org/{lang}wiki/{date}/zdumpstatus.jsonc                       s*   e Zd ZdZejjd fdd	Z  ZS )WikipediaConfigzBuilderConfig for Wikipedia.Nc                    s<   t t| jdd||d||d| || _|| _dS )aE  BuilderConfig for Wikipedia.

    Args:
      language: string, the language code for the Wikipedia dump to use.
      date: string, date of the Wikipedia dump in YYYYMMDD format. A list of
        available dates can be found at https://dumps.wikimedia.org/enwiki/.
      **kwargs: keyword arguments forwarded to super.
    z{0}.{1}z0Wikipedia dataset for {0}, parsed from {1} dump.)namedescriptionN )superr'  __init__formatdatelanguage)selfr/  r.  kwargs	__class__r*  V/home/ubuntu/.local/lib/python3.10/site-packages/tensorflow_datasets/text/wikipedia.pyr,  c   s   


zWikipediaConfig.__init__)NN)	__name__
__module____qualname____doc__tfdscoredisallow_positional_argsr,  __classcell__r*  r*  r2  r4  r'  `   s    r'  z1.0.0z6New split API (https://tensorflow.org/datasets/splits)c                   @   sD   e Zd ZdZdd eD dd eD  Zdd Zdd Zd	d
 ZdS )	WikipediazWikipedia dataset.c                 C      g | ]	}t t|d dqS )20200301versionr/  r.  r'  _VERSION.0langr*  r*  r4  
<listcomp>~   s    zWikipedia.<listcomp>c                 C   r>  )20190301r@  rB  rD  r*  r*  r4  rG     s    c              	   C   s8   t jj| tt jt j t j dd dtdtidS )Ntitletextzhttps://dumps.wikimedia.orglicense)builderr)  featuressupervised_keyshomepagecitationredistribution_info)	r9  r:  DatasetInfo_DESCRIPTIONrN  FeaturesDictText	_CITATION_LICENSEr0  r*  r*  r4  _info   s   zWikipedia._infoc                    s   fdd} j j}||t }|d|i}g }d}tjj|d }t	|}	W d    n1 s4w   Y  |	d d }
|
d dksQJ d	|||
d f |
d
 
 D ]\}}d|vr`qW||d 7 }||||  qW|d|i}tjjtjj|d |ddgS )Nc                    s   t j| dd jjdS )N-_)rF  r.  )_BASE_URL_TMPLr-  replace_builder_configr.  )rF  rY  r*  r4  	_base_url   s   z.Wikipedia._split_generators.<locals>._base_urlinfor   jobsarticlesmultistreamdumpstatusdonez8Specified dump (%s) multistream status is not 'done': %sfilesz.xmlsizexml)	filepathsr/  )r(  
gen_kwargs)r_  r/  
_INFO_FILEdownload_and_extracttfrv   gfileGFilejsonloaditemsappenddownloadr9  r:  SplitGeneratorSplitTRAIN)r0  
dl_managerr`  rF  info_urldownloaded_filesxml_urlstotal_bytesf	dump_infomultistream_dump_infofnamera  r*  rY  r4  _split_generators   s4   zWikipedia._split_generatorsc                    sR   t jjj  fdd} fdd}| |B  |B  j B  |B S )z5Build PCollection of examples in the raw (text) form.c                 3   sd   t d|  tjj| d}tj|d}tj	r!t
d|}n|}tj|dd}t|}t|\}}|D ]i\}}|jdsAq6|jdd	 }|d
|j}|d|j}	|d|j}
|	dkrl|  q6|d|j}|  |du s| dr jjd  q6 jjd  |
||fV  q6W d   dS 1 sw   Y  dS )z:Extracts article content from a single WikiMedia XML file.zgenerating examples from = %srb)filenamezutf-8)end)eventspageNz
./{0}titlez./{0}nsz./{0}id0z./{0}revision/{0}textz	#redirectzfiltered-redirectszextracted-examples)r   ra  rm  rv   rn  ro  bz2BZ2FilesixPY3codecs	getreaderetree	iterparseiternexttagendswithfindr-  rK  clearlower
startswithmetricsMetricscounterinc)filepathr}  utf_fcontextunused_eventrootelem	namespacerJ  nsid_raw_contentbeamr/  r*  r4  _extract_content   sB   "z6Wikipedia._build_pcollection.<locals>._extract_contentc              
   3   s    | \}}}zt |}W n( tjjjjjy4 } z jj	d
  td| W Y d}~dS d}~ww |sC jj	d
  dS  jj	d
  |||dfV  dS )z$Cleans raw wikicode to extract text.zparser-errorzmwparserfromhell ParseError: %sNzempty-clean-exampleszcleaned-examplesrI  )_parse_and_clean_wikicoder9  r:  lazy_importsmwparserfromhellparserParserErrorr  r  r  r  r   error)inputsr  rJ  r  rK  er  r*  r4  _clean_content   s(   

z4Wikipedia._build_pcollection.<locals>._clean_content)r9  r:  r  apache_beamCreateFlatMap
transforms	Reshuffle)r0  pipelineri  r/  r  r  r*  r  r4  _build_pcollection   s   
)zWikipedia._build_pcollectionN)	r5  r6  r7  r8  WIKIPEDIA_LANGUAGESBUILDER_CONFIGSrZ  r  r  r*  r*  r*  r4  r=  z   s    #r=  c           	         s   t jjj| }tjdtjtjB d  fdd}dd }dd }d	d
 }g }|j	ddddD ]8}|j
|ddD ]}||| q8|j|ddD ]}||| qG|j|ddD ]}||| qV||   q/d|S )z>Strips formatting and unwanted sections from raw page content.z^(?:File|Image|Media):)flagsc                    s   t  t| jS N)boolmatchr  	text_typerJ  objre_rm_wikilinkr*  r4  rm_wikilink  s   z._parse_and_clean_wikicode.<locals>.rm_wikilinkc                 S   s   t | jdv S )N>   reftable)r  r  r  r  r*  r*  r4  rm_tag  s   z)_parse_and_clean_wikicode.<locals>.rm_tagc                 S   s   | j  dv S )N>   notelist-lgnotelist-lrnotelist-uanotelist-urreflistnotelist)r(  r  r  r*  r*  r4  rm_template  s   z._parse_and_clean_wikicode.<locals>.rm_templatec                 S   s&   z| |  W d S  ty   Y d S w r  )remove
ValueError)r  sectionr*  r*  r4  try_remove_obj  s
   z1_parse_and_clean_wikicode.<locals>.try_remove_objT)flatinclude_leadinclude_headings)matches	recursivez

)r9  r:  r  r  parserecompile
IGNORECASEUNICODEget_sectionsifilter_wikilinksifilter_templatesifilter_tagsrs  
strip_codestripjoin)	r  wikicoder  r  r  r  section_textr  r  r*  r  r4  r    s(   

r  )&r8  
__future__r   r   r   r  rp  r  xml.etree.cElementTreer  cElementTreeabslr   r  tensorflow.compat.v2compatv2rm  tensorflow_datasets.public_api
public_apir9  r  r  bz2filerW  rT  rX  r  r]  rk  r:  BuilderConfigr'  VersionrC  BeamBasedBuilderr=  r  r*  r*  r*  r4  <module>   s:   
	 