o
    NiD                     @   sr   d Z ddlmZ ddlmZ ddlmZ ddlZddlm  mZ	 ddl
mZ dZdZdZG d	d
 d
ejjZdS )z PG-19 language modeling dataset.    )absolute_import)division)print_functionNa7  
@article{raecompressive2019,
author = {Rae, Jack W and Potapenko, Anna and Jayakumar, Siddhant M and
          Hillier, Chloe and Lillicrap, Timothy P},
title = {Compressive Transformers for Long-Range Sequence Modelling},
journal = {arXiv preprint},
url = {https://arxiv.org/abs/1911.05507},
year = {2019},
}
aj  
This dataset contains the PG-19 language modeling benchmark. It includes a set
of books extracted from the Project Gutenberg books project
(https://www.gutenberg.org), that were published before 1919. It also contains
metadata of book titles and publication dates.
PG-19 is over double the size of the Billion Word benchmark and contains
documents that are 20X longer, on average, than the WikiText long-range
language modelling benchmark.

Books are partitioned into a train, validation, and test set. Books metadata is
stored in metadata.csv which contains
(book_id, short_book_title, publication_date, book_link).
zgs://deepmind-gutenbergc                   @   s4   e Zd ZdZejdZdd Zdd Z	dd Z
d	S )
Pg19z<This dataset contains the PG-19 language modeling benchmark.z0.1.1c                 C   s:   t jj| tt jt j tjtj	tj	tj	dd dt
dS )N	book_textbook_id
book_titlepublication_date	book_linkz https://github.com/deepmind/pg19)builderdescriptionfeaturessupervised_keyshomepagecitation)tfdscoreDatasetInfo_DESCRIPTIONr   FeaturesDictTexttfint32string	_CITATION)self r   Q/home/ubuntu/.local/lib/python3.10/site-packages/tensorflow_datasets/text/pg19.py_info<   s   z
Pg19._infoc              	   C   s   ~t  }tjtd}tjj|	 
 }|D ]}|d}|dd |t|d < qtjjtjj|tjtdddtjjtjj|tjtd	ddtjjtjj|tjtd
ddgS )zReturns SplitGenerators.zmetadata.csv,   Nr   train)metadatafilepath)name
gen_kwargs
validationtest)dictospathjoin	_DATA_DIRr   iogfileGFileread
splitlinessplitintr   r   SplitGeneratorSplitTRAIN
VALIDATIONTEST)r   
dl_managermetadata_dictmetadata_pathr#   row	row_splitr   r   r   _split_generatorsM   s4   
zPg19._split_generatorsc           	   	   c   s    t jj|D ]C}t|d}|| }tj||}t jj	|d}|
  }||||d |d |d dfV  W d   n1 sFw   Y  qdS )zYields examples.z.txtrr   r!      r   N)r   r.   r/   listdirr4   rstripr*   r+   r,   r0   r1   strip)	r   r$   r#   filer   	book_datar+   ftextr   r   r   _generate_exampleso   s"   zPg19._generate_examplesN)__name__
__module____qualname____doc__r   r   VersionVERSIONr   r?   rI   r   r   r   r   r   7   s    "r   )rM   
__future__r   r   r   r*   tensorflow.compat.v2compatv2r   tensorflow_datasets.public_api
public_apir   r   r   r-   r   GeneratorBasedBuilderr   r   r   r   r   <module>   s   