o
    Ni3                     @   s   d Z ddlmZ ddlmZ ddlmZ ddlZddlZddlmZ ddl	m
  mZ ddlmZ dZdZd	Zd
ZdZeg dZG dd dejjZdS )zXSum dataset.    )absolute_import)division)print_functionN)logginga  
@article{Narayan2018DontGM,
  title={Don't Give Me the Details, Just the Summary! Topic-Aware Convolutional Neural Networks for Extreme Summarization},
  author={Shashi Narayan and Shay B. Cohen and Mirella Lapata},
  journal={ArXiv},
  year={2018},
  volume={abs/1808.08745}
}
a  
Extreme Summarization (XSum) Dataset.

There are two features:
  - document: Input news article.
  - summary: One sentence summary of the article.

This data need to manaully downloaded and extracted as described in
https://github.com/EdinburghNLP/XSum/blob/master/XSum-Dataset/README.md.
The folder 'xsum-extracts-from-downloads' need to be compressed as
'xsum-extracts-from-downloads.tar.gz' and put in manually downloaded folder.
zphttps://raw.githubusercontent.com/EdinburghNLP/XSum/master/XSum-Dataset/XSum-TRAINING-DEV-TEST-SPLIT-90-5-5.jsondocumentsummary)zShare this with
zEmail
z	Facebook
z
Messenger
zTwitter
z
Pinterest
z	WhatsApp
z	Linkedin
z	LinkedIn
zCopy this link
z7These are external links and will open in a new window
c                   @   sJ   e Zd ZdZejdZejddgZdZ	dd Z
dd	 ZdddZd
S )Xsumz%Extreme Summarization (XSum) Dataset.z1.1.0z1.0.0zDataset without cleaning.a"    Detailed download instructions (which require running a custom script) are
  here:
  https://github.com/EdinburghNLP/XSum/blob/master/XSum-Dataset/README.md#running-the-download-and-extraction-scripts
  Afterwards, please put xsum-extracts-from-downloads.tar.gz file in the manual_dir.
  c              
   C   s8   t jj| tt jtt j tt j ittfdt	dS )Nz=https://github.com/EdinburghNLP/XSum/tree/master/XSum-Dataset)builderdescriptionfeaturessupervised_keyshomepagecitation)
tfdscoreDatasetInfo_DESCRIPTIONr   FeaturesDict	_DOCUMENTText_SUMMARY	_CITATION)self r   Z/home/ubuntu/.local/lib/python3.10/site-packages/tensorflow_datasets/summarization/xsum.py_infoO   s   

z
Xsum._infoc              	   C   s   | t}tjj|d}t|}W d   n1 sw   Y  d}tj	
|tj	
|j|d |}tjjtjj|d |ddtjjtjj|d |ddtjjtjj|d	 |ddgS )
zReturns SplitGenerators.rNzxsum-extracts-from-downloadsz.tar.gztrain)	split_idspath)name
gen_kwargs
validationtest)download_URLtfiogfileGFilejsonloadosr   joinextract
manual_dirr   r   SplitGeneratorSplitTRAIN
VALIDATIONTEST)r   
dl_managerdl_path	json_filer   folder_nameextract_pathr   r   r   _split_generators]   s<   
zXsum._split_generatorsNc           
   	   c   s    d}t |}|D ]V}tj||d }tjj|rUtjj|*}ddd |	 D }|
d}	|t|	d  t|	d  ifV  W d	   n1 sOw   Y  q	|d
7 }td| q	|rktd|| d	S d	S )zYields examples.r   z.data c                 S   s    g | ]}|t vr| r|qS r   )_REMOVE_LINESstrip).0liner   r   r   
<listcomp>   s
    z+Xsum._generate_examples.<locals>.<listcomp>z[XSUM]      N   zid %s missing.z"%d out of %d examples are missing.)lenr,   r   r-   r&   r'   r(   existsr)   	readlinessplitr   r=   r   r   infowarning)
r   r   r   missing	total_numifilenameftextsegsr   r   r   _generate_examples   s&   

$zXsum._generate_examples)NN)__name__
__module____qualname____doc__r   r   VersionVERSIONSUPPORTED_VERSIONSMANUAL_DOWNLOAD_INSTRUCTIONSr   r:   rQ   r   r   r   r   r   A   s    "r   )rU   
__future__r   r   r   r*   r,   abslr   tensorflow.compat.v2compatv2r&   tensorflow_datasets.public_api
public_apir   r   r   r%   r   r   setr<   r   GeneratorBasedBuilderr   r   r   r   r   <module>   s    
