o
    Ni                     @   s   d Z ddlmZ ddlmZ ddlmZ ddlZddlZddlm  m	Z
 ddlmZ dZdZdZd	Zd
ddZG dd dejjZG dd dejjZdS )zScientific Papers Dataset.    )absolute_import)division)print_functionNap  
@article{Cohan_2018,
   title={A Discourse-Aware Attention Model for Abstractive Summarization of
            Long Documents},
   url={http://dx.doi.org/10.18653/v1/n18-2097},
   DOI={10.18653/v1/n18-2097},
   journal={Proceedings of the 2018 Conference of the North American Chapter of
          the Association for Computational Linguistics: Human Language
          Technologies, Volume 2 (Short Papers)},
   publisher={Association for Computational Linguistics},
   author={Cohan, Arman and Dernoncourt, Franck and Kim, Doo Soon and Bui, Trung and Kim, Seokhwan and Chang, Walter and Goharian, Nazli},
   year={2018}
}
a  
Scientific papers datasets contains two sets of long and structured documents.
The datasets are obtained from ArXiv and PubMed OpenAccess repositories.

Both "arxiv" and "pubmed" have two features:

  - article: the body of the document, pagragraphs seperated by "/n".
  - abstract: the abstract of the document, pagragraphs seperated by "/n".
  - section_names: titles of sections, seperated by "/n".

articleabstractzPhttps://drive.google.com/uc?id=1b3rmCSIoh6VhD4HKWjI4HOW-cSwcwbeC&export=downloadzPhttps://drive.google.com/uc?id=1lvsqvsFi3W-pE1SqNZI0s8NR9rC1tsja&export=download)arxivpubmedc                       s*   e Zd ZdZejjd fdd	Z  ZS )ScientificPapersConfigz$BuilderConfig for Scientific Papers.Nc                    s8   t t| jdtjdtjdgd| || _dS )zBuilderConfig for Wikihow.

    Args:
      filename: filename of different configs for the dataset.
      **kwargs: keyword arguments forwarded to super.
    z1.1.1z1.1.0)versionsupported_versionsN )superr	   __init__tfdscoreVersionfilename)selfr   kwargs	__class__r   g/home/ubuntu/.local/lib/python3.10/site-packages/tensorflow_datasets/summarization/scientific_papers.pyr   F   s   	

zScientificPapersConfig.__init__N)	__name__
__module____qualname____doc__r   r   disallow_positional_argsr   __classcell__r   r   r   r   r	   C   s    r	   c                   @   sB   e Zd ZdZedddedddgZdd Zd	d
 ZdddZdS )ScientificPaperszScientific Papers.r   z Documents from ArXiv repository.)namedescriptionr   z!Documents from PubMed repository.c                 C   sB   t jj| tt jtt j tt j dt j ittfdt	dS )Nsection_namesz0https://github.com/armancohan/long-summarization)builderr!   featuressupervised_keyshomepagecitation)
r   r   DatasetInfo_DESCRIPTIONr$   FeaturesDict	_DOCUMENTText_SUMMARY	_CITATION)r   r   r   r   _info`   s   


zScientificPapers._infoc              	   C   s   | t}tj|| jj | jjd }tjj	tj
jdtj|didtjj	tj
jdtj|didtjj	tj
jdtj|didgS )zReturns SplitGenerators.z-datasetpathz	train.txt)r    
gen_kwargszval.txtztest.txt)download_and_extract_URLSosr0   joinbuilder_configr    r   r   SplitGeneratorSplitTRAIN
VALIDATIONTEST)r   
dl_managerdl_pathsr0   r   r   r   _split_generatorsn   s"   

z"ScientificPapers._split_generatorsNc                 c   s    t jj|;}|D ]/}t|}d|d }|dddd}|d td|d t	|dd|d ifV  qW d	   d	S 1 sFw   Y  d	S )
zYields examples.
abstract_textz<S> z</S>
article_idarticle_textr"   N)
tfiogfileGFilejsonloadsr5   replacer+   r-   )r   r0   flinedsummaryr   r   r   _generate_examples   s   

"z#ScientificPapers._generate_examplesr   )	r   r   r   r   r	   BUILDER_CONFIGSr/   r>   rO   r   r   r   r   r   V   s    r   )r   
__future__r   r   r   rH   r4   tensorflow.compat.v2compatv2rD   tensorflow_datasets.public_api
public_apir   r.   r)   r+   r-   r3   r   BuilderConfigr	   GeneratorBasedBuilderr   r   r   r   r   <module>   s"   