o
    Ni                     @   s   d Z ddlmZ ddlmZ ddlmZ ddlZddlZddlm  m	Z
 ddlmZ dZdZdZd	Zg d
Zg dZG dd dejjZdS )NEWSROOM Dataset.    )absolute_import)division)print_functionNa  
@article{Grusky_2018,
   title={Newsroom: A Dataset of 1.3 Million Summaries with Diverse Extractive Strategies},
   url={http://dx.doi.org/10.18653/v1/n18-1065},
   DOI={10.18653/v1/n18-1065},
   journal={Proceedings of the 2018 Conference of the North American Chapter of
          the Association for Computational Linguistics: Human Language
          Technologies, Volume 1 (Long Papers)},
   publisher={Association for Computational Linguistics},
   author={Grusky, Max and Naaman, Mor and Artzi, Yoav},
   year={2018}
}

a  
NEWSROOM is a large dataset for training and evaluating summarization systems.
It contains 1.3 million articles and summaries written by authors and
editors in the newsrooms of 38 major publications.

Dataset features includes:
  - text: Input news text.
  - summary: Summary for the news.
And additional features:
  - title: news title.
  - url: url of the news.
  - date: date of the article.
  - density: extractive density.
  - coverage: extractive coverage.
  - compression: compression ratio.
  - density_bin: low, medium, high.
  - coverage_bin: extractive, abstractive.
  - compression_bin: low, medium, high.

This dataset can be downloaded upon requests. Unzip all the contents
"train.jsonl, dev.josnl, test.jsonl" to the tfds folder.

textsummary)titleurldatedensity_bincoverage_bincompression_bin)densitycoveragecompressionc                   @   s:   e Zd ZdZejdZdZdd Z	dd Z
dd	d
ZdS )Newsroomr   z1.0.0z  You should download the dataset from https://summari.es/download/
  The webpage requires registration.
  After downloading, please put dev.jsonl, test.jsonl and train.jsonl
  files in the manual_dir.
  c                 C   sN   dd t tgt D }|dd tD  tjj| ttj	
|t tfdtdS )Nc                 S   s   i | ]}|t j qS  )tfdsfeaturesText.0kr   r   ^/home/ubuntu/.local/lib/python3.10/site-packages/tensorflow_datasets/summarization/newsroom.py
<dictcomp>\   s    
z"Newsroom._info.<locals>.<dictcomp>c                 S   s    i | ]}|t jjg tjd qS ))shapedtype)r   r   Tensortffloat32r   r   r   r   r   `   s    zhttps://summari.es)builderdescriptionr   supervised_keyshomepagecitation)	_DOCUMENT_SUMMARY_ADDITIONAL_TEXT_FEATURESupdate_ADDITIONAL_FLOAT_FEATURESr   coreDatasetInfo_DESCRIPTIONr   FeaturesDict	_CITATION)selfr   r   r   r   _info[   s   


zNewsroom._infoc              	   C   sj   t jjt jjdtj|jdidt jjt jj	dtj|jdidt jjt jj
dtj|jdidgS )zReturns SplitGenerators.
input_fileztrain.jsonl)name
gen_kwargsz	dev.jsonlz
test.jsonl)r   r*   SplitGeneratorSplitTRAINospathjoin
manual_dir
VALIDATIONTEST)r/   
dl_managerr   r   r   _split_generatorsm   s    zNewsroom._split_generatorsNc                 #   sv    t jj|)}t|D ]\}}t| | fddttgt	 t
 D fV  qW d   dS 1 s4w   Y  dS )zYields examples.c                    s   i | ]}| | qS r   r   r   dr   r   r      s    
z/Newsroom._generate_examples.<locals>.<dictcomp>N)r   iogfileGFile	enumeratejsonloadsr%   r&   r'   r)   )r/   r1   filiner   r?   r   _generate_examples   s   

"zNewsroom._generate_examples)N)__name__
__module____qualname____doc__r   r*   VersionVERSIONMANUAL_DOWNLOAD_INSTRUCTIONSr0   r>   rJ   r   r   r   r   r   P   s    r   )rN   
__future__r   r   r   rE   r7   tensorflow.compat.v2compatv2r   tensorflow_datasets.public_api
public_apir   r.   r,   r%   r&   r'   r)   r*   GeneratorBasedBuilderr   r   r   r   r   <module>   s   