o
    Ni                     @   sz   d Z ddlmZ ddlmZ ddlmZ ddlZddlm  mZ	 ddl
mZ dZdZdZd	Zd
ZG dd dejjZdS )Gigaword summarization dataset.    )absolute_import)division)print_functionNa  
@article{graff2003english,
  title={English gigaword},
  author={Graff, David and Kong, Junbo and Chen, Ke and Maeda, Kazuaki},
  journal={Linguistic Data Consortium, Philadelphia},
  volume={4},
  number={1},
  pages={34},
  year={2003}
}

@article{Rush_2015,
   title={A Neural Attention Model for Abstractive Sentence Summarization},
   url={http://dx.doi.org/10.18653/v1/D15-1044},
   DOI={10.18653/v1/d15-1044},
   journal={Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing},
   publisher={Association for Computational Linguistics},
   author={Rush, Alexander M. and Chopra, Sumit and Weston, Jason},
   year={2015}
}
aM  
Headline-generation on a corpus of article pairs from Gigaword consisting of
around 4 million articles. Use the 'org_data' provided by
https://github.com/microsoft/unilm/ which is identical to
https://github.com/harvardnlp/sent-summary but with better format.

There are two features:
  - document: article.
  - summary: headline.

zPhttps://drive.google.com/uc?export=download&id=1USoQ8lJgN8kAWnUnRrupMGrPMLlDVqlVdocumentsummaryc                   @   s6   e Zd ZdZejdZdd Zdd Z	d
dd	Z
dS )Gigawordr   z1.2.0c              
   C   s8   t jj| tt jtt j tt j ittfdt	dS )Nz*https://github.com/harvardnlp/sent-summary)builderdescriptionfeaturessupervised_keyshomepagecitation)
tfdscoreDatasetInfo_DESCRIPTIONr   FeaturesDict	_DOCUMENTText_SUMMARY	_CITATION)self r   ^/home/ubuntu/.local/lib/python3.10/site-packages/tensorflow_datasets/summarization/gigaword.py_infoL   s   

zGigaword._infoc                 C   s   | t}tj|dd}tjjtjj	|d |d dddtjjtjj
|d |d	 dddtjjtjj|d
 |d dddgS )zReturns SplitGenerators.org_dataz	%s.%s.txt)trainsrc)r   tgtT)src_pathtgt_pathreplace_unk)name
gen_kwargs)devr   )r%   r   )testr   )r&   r   F)download_and_extract_URLospathjoinr   r   SplitGeneratorSplitTRAIN
VALIDATIONTEST)r   
dl_managerdl_pathpatternr   r   r   _split_generatorsY   s0   
zGigaword._split_generatorsNc           	      c   s    t jj|[}t jj|;}tt||D ]+\}\}}|r6|t| ddt	| ddifV  q|t| t	| ifV  qW d   n1 sNw   Y  W d   dS W d   dS 1 sfw   Y  dS )zYields examples.z<unk>UNKN)
tfiogfileGFile	enumeratezipr   stripreplacer   )	r   r    r!   r"   f_df_sidoc_textsum_textr   r   r   _generate_examplesx   s    
PzGigaword._generate_examples)NNN)__name__
__module____qualname____doc__r   r   VersionVERSIONr   r4   rC   r   r   r   r   r   D   s    r   )rG   
__future__r   r   r   r)   tensorflow.compat.v2compatv2r6   tensorflow_datasets.public_api
public_apir   r   r   r(   r   r   r   GeneratorBasedBuilderr   r   r   r   r   <module>   s   