o
    Ni                     @   s   d Z ddlmZ ddlmZ ddlmZ ddlZddlmZ ddlm	  m
Z ddlmZ dZdZd	Zd
ZejeddZejeddZG dd dejjZdd Zdd ZG dd dejjZdS )z%The Language Model 1 Billion dataset.    )absolute_import)division)print_functionN)logginga  @article{DBLP:journals/corr/ChelbaMSGBK13,
  author    = {Ciprian Chelba and
               Tomas Mikolov and
               Mike Schuster and
               Qi Ge and
               Thorsten Brants and
               Phillipp Koehn},
  title     = {One Billion Word Benchmark for Measuring Progress in Statistical Language
               Modeling},
  journal   = {CoRR},
  volume    = {abs/1312.3005},
  year      = {2013},
  url       = {http://arxiv.org/abs/1312.3005},
  archivePrefix = {arXiv},
  eprint    = {1312.3005},
  timestamp = {Mon, 13 Aug 2018 16:46:16 +0200},
  biburl    = {https://dblp.org/rec/bib/journals/corr/ChelbaMSGBK13},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}
zA benchmark corpus to be used for measuring progress in statistical language modeling. This has almost one billion words in the training data.
z^http://www.statmt.org/lm-benchmark/1-billion-word-language-modeling-benchmark-r13output.tar.gzz41-billion-word-language-modeling-benchmark-r13outputz'training-monolingual.tokenized.shuffledz	news.en-*z&heldout-monolingual.tokenized.shuffledznews.en.heldout-*c                       s*   e Zd ZdZejjd fdd	Z  ZS )
Lm1bConfigzBuilderConfig for Lm1b.Nc                    s:   t t| jddtjddi| |ptjj | _	dS )a  BuilderConfig for Lm1b.

    Args:
      text_encoder_config: `tfds.features.text.TextEncoderConfig`, configuration
        for the `tfds.features.text.TextEncoder` used for the Lm1b `"text"`
        feature.
      **kwargs: keyword arguments forwarded to super.
    versionz1.0.0z6New split API (https://tensorflow.org/datasets/splits)N )
superr   __init__tfdscoreVersionfeaturestextTextEncoderConfigtext_encoder_config)selfr   kwargs	__class__r   Q/home/ubuntu/.local/lib/python3.10/site-packages/tensorflow_datasets/text/lm1b.pyr
   F   s   
zLm1bConfig.__init__N)	__name__
__module____qualname____doc__r   r   disallow_positional_argsr
   __classcell__r   r   r   r   r   C   s    r   c                 C      t jjtj| tS r   )tfiogfileglobospathjoin_TRAIN_FILE_FORMATtmp_dirr   r   r   _train_data_filenamesY      r)   c                 C   r   r   )r   r    r!   r"   r#   r$   r%   _HELDOUT_FILE_FORMATr'   r   r   r   _test_data_filenames]   r*   r,   c                
   @   s   e Zd ZdZedddeddejjjejj	 dded	d
ejjjejjj
dddeddejjjejjj
dddgZdd Zdd Zdd Zdd ZdS )Lm1bz01 Billion Word Language Model Benchmark dataset.
plain_textz
Plain text)namedescriptionbyteszGUses byte-level text encoding with `tfds.features.text.ByteTextEncoder`)encoder)r/   r0   r   
subwords8kz?Uses `tfds.features.text.SubwordTextEncoder` with 8k vocab sizei    )encoder_cls
vocab_sizesubwords32kz@Uses `tfds.features.text.SubwordTextEncoder` with 32k vocab sizei   c              	   C   s2   t jj| tt jdt jj| jjdiddt	dS )Nr   )encoder_config)r   r   z#http://www.statmt.org/lm-benchmark/)builderr0   r   supervised_keyshomepagecitation)
r   r   DatasetInfo_DESCRIPTIONr   FeaturesDictTextbuilder_configr   	_CITATION)r   r   r   r   _info   s   z
Lm1b._infoc                 c   s$    |  |D ]	\}}|d V  qd S )Nr   )_generate_examples)r   training_files_exr   r   r   _vocab_text_gen   s   zLm1b._vocab_text_genc                 C   sb   | t}t|}t|}| jjd | | tj	j
tjjd|idtj	j
tjjd|idgS )Nr   files)r/   
gen_kwargs)download_and_extract_DOWNLOAD_URLr)   r,   infor   maybe_build_from_corpusrG   r   r   SplitGeneratorSplitTRAINTEST)r   
dl_manager	lm1b_pathtrain_files
test_filesr   r   r   _split_generators   s   
zLm1b._split_generatorsc              	   c   s~    |D ]9}t d| tjj|"}t|D ]\}}dtj	||f d|
 ifV  qW d    n1 s7w   Y  qd S )Nzgenerating examples from = %sz%s_%dr   )r   rL   r   r    r!   GFile	enumerater#   r$   basenamestrip)r   rH   filepathfidxliner   r   r   rC      s   
zLm1b._generate_examplesN)r   r   r   r   r   r   r   r   r   ByteTextEncoderSubwordTextEncoderBUILDER_CONFIGSrB   rG   rV   rC   r   r   r   r   r-   a   sD    
r-   )r   
__future__r   r   r   r#   abslr   tensorflow.compat.v2compatv2r   tensorflow_datasets.public_api
public_apir   rA   r=   rK   _TOP_LEVEL_DIRr$   r%   r&   r+   r   BuilderConfigr   r)   r,   GeneratorBasedBuilderr-   r   r   r   r   <module>   s0   