o
    Ni\                     @   s   d Z ddlmZ ddlmZ ddlmZ ddlZddlZddlm  m	Z
 ddlmZ dZdZdZd	Zd
Zg dZG dd dejjZdS )z'Reddit dataset using tldr as summaries.    )absolute_import)division)print_functionNa  
@inproceedings{volske-etal-2017-tl,
    title = "{TL};{DR}: Mining {R}eddit to Learn Automatic Summarization",
    author = {V{"o}lske, Michael  and
      Potthast, Martin  and
      Syed, Shahbaz  and
      Stein, Benno},
    booktitle = "Proceedings of the Workshop on New Frontiers in Summarization",
    month = sep,
    year = "2017",
    address = "Copenhagen, Denmark",
    publisher = "Association for Computational Linguistics",
    url = "https://www.aclweb.org/anthology/W17-4508",
    doi = "10.18653/v1/W17-4508",
    pages = "59--63",
    abstract = "Recent advances in automatic text summarization have used deep neural networks to generate high-quality abstractive summaries, but the performance of these models strongly depends on large amounts of suitable training data. We propose a new method for mining social media for author-provided summaries, taking advantage of the common practice of appending a {``}TL;DR{''} to long posts. A case study using a large Reddit crawl yields the Webis-TLDR-17 dataset, complementing existing corpora primarily from the news genre. Our technique is likely applicable to other social media sites and general web crawls.",
}
aZ  
This corpus contains preprocessed posts from the Reddit dataset.
The dataset consists of 3,848,330 posts with an average length of 270 words for content,
and 28 words for the summary.

Features includes strings: author, body, normalizedBody, content, summary, subreddit, subreddit_id.
Content is used as document and summary is used as summary.
zKhttps://zenodo.org/record/1043504/files/corpus-webis-tldr-17.zip?download=1contentsummary)authorbodynormalizedBody	subredditsubreddit_ididc                   @   s6   e Zd ZdZejdZdd Zdd Z	d
dd	Z
dS )RedditzReddit Dataset.z1.0.0c              	   C   s6   t jj| tt jdd tttg D ttfdt	dS )Nc                 S   s   i | ]}|t jqS  )tfstring.0kr   r   \/home/ubuntu/.local/lib/python3.10/site-packages/tensorflow_datasets/summarization/reddit.py
<dictcomp>K   s    z Reddit._info.<locals>.<dictcomp>z0https://github.com/webis-de/webis-tldr-17-corpus)builderdescriptionfeaturessupervised_keyshomepagecitation)
tfdscoreDatasetInfo_DESCRIPTIONr   FeaturesDict_ADDITIONAL_FEATURES	_DOCUMENT_SUMMARY	_CITATION)selfr   r   r   _infoG   s   
zReddit._infoc                 C   s.   | t}tjjtjjdtj	|didgS )zReturns SplitGenerators.pathzcorpus-webis-tldr-17.json)name
gen_kwargs)
download_and_extract_URLr   r   SplitGeneratorSplitTRAINosr'   join)r%   
dl_managerdl_pathr   r   r   _split_generatorsS   s   
zReddit._split_generatorsNc                 #   s    t jj|d/}t|D ]!\}}t| t v r/t v r/| fddt	ttg D fV  qW d   dS 1 s;w   Y  dS )zYields examples.rbc                    s   i | ]	}|  |d qS ) )getr   dr   r   r   q   s    z-Reddit._generate_examples.<locals>.<dictcomp>N)
r   iogfileGFile	enumeratejsonloadsr#   r"   r!   )r%   r'   filiner   r7   r   _generate_examples_   s   


"zReddit._generate_examples)N)__name__
__module____qualname____doc__r   r   VersionVERSIONr&   r3   rB   r   r   r   r   r   B   s    r   )rF   
__future__r   r   r   r=   r/   tensorflow.compat.v2compatv2r   tensorflow_datasets.public_api
public_apir   r$   r   r+   r"   r#   r!   r   GeneratorBasedBuilderr   r   r   r   r   <module>   s   	