o
    Niu                  	   @   s   d Z ddlmZ ddlmZ ddlmZ ddlZddlZddlmZm	Z	m
Z
mZmZmZ ddlm  mZ ddlmZ dZdZd	Zd
ZdZdZdZdZeddddddddg	ZG dd dejjZ de	eef de!fddZ"dS )z/Summarizing abstract from covid19 publications.    )absolute_import)division)print_functionN)AnyDictIteratorListTextTupleaD  
@ONLINE {CORD-19-research-challenge,
    author = "An AI challenge with AI2, CZI, MSR, Georgetown, NIH & The White House",
    title  = "COVID-19 Open Research Dataset Challenge (CORD-19)",
    month  = "april",
    year   = "2020",
    url    = "https://www.kaggle.com/allen-institute-for-ai/CORD-19-research-challenge"
}
zHhttps://www.kaggle.com/allen-institute-for-ai/CORD-19-research-challengea4  
CORD-19 is a resource of over 45,000 scholarly articles, including over 33,000
with full text, about COVID-19, SARS-CoV-2, and related coronaviruses.

To help organizing information in scientific literatures of COVID-19 through
abstractive summarization. This dataset parse those articles to pairs of
document and summaries of full_text-abstract or introduction-abstract.

Features includes strings of: abstract, full_text, sha (hash of pdf),
source_x (source of publication), title, doi (digital object identifier),
license, authors, publish_time, journal, url.
abstract	body_textsectiontextshasource_xtitledoilicenseauthorspublish_timejournalurlc                
   @   s   e Zd ZdZdZejdZdejj	fddZ
dejjdeejj fdd	Z	
ddeeeef  deeeeeef f  fddZd
S )
Covid19sumzCovid19sum Dataset.z
    This dataset need to be manually downloaded through kaggle api:
    `kaggle datasets download allen-institute-for-ai/CORD-19-research-challenge`
    Place the downloaded zip file in the manual folder.
    z1.0.0returnc                 C   s\   dd t tg D }tjtjttjt	tji|t
< tjj| ttj|t
tfttdS )Nc                 S   s   i | ]}|t jqS  )tfstring.0kr   r   `/home/ubuntu/.local/lib/python3.10/site-packages/tensorflow_datasets/summarization/covid19sum.py
<dictcomp>N   s    z$Covid19sum._info.<locals>.<dictcomp>)builderdescriptionfeaturessupervised_keyshomepagecitation)_ADDITIONAL_FEATURES	_ABSTRACTtfdsr$   SequenceFeaturesDict_SECTIONr   r   _TEXT
_BODY_TEXTcoreDatasetInfo_DESCRIPTION	_HOMEPAGE	_CITATION)selfr$   r   r   r    _infoM   s   
zCovid19sum._info
dl_managerc           	         s   | tj|jd}tjjj}|	tj|d
d}g }| D ]2\}  d } d rUt rU|rU fddttg D }tj||| t d |d	< || q#tjjtjjd
|idgS )zReturns SplitGenerators.zCORD-19-research-challenge.zipzmetadata.csv full_text_filehas_full_textc                       i | ]}| | qS r   r   r   rowr   r    r!   i       z0Covid19sum._split_generators.<locals>.<dictcomp>z.jsonpath
data_paths)name
gen_kwargs)extractosr?   join
manual_dirr*   r0   lazy_importspandasread_csvfillnaiterrows_has_abstractr(   r)   _SHAappendSplitGeneratorSplitTRAIN)	r5   r7   extracted_pathpddfr@   _file_dirdr   r<   r    _split_generators]   s(   


zCovid19sum._split_generatorsNr@   c              	   c   s    |D ]C}| d}tjj|rFtjj|d%}t|}|t	g }|r7dd |D |t	< |t
 |fV  W d   n1 sAw   Y  qdS )zYields examples.r?   rbc                    s"   g | ]  fd dt tfD qS )c                    r;   r   r   r   sr   r    r!      r>   z<Covid19sum._generate_examples.<locals>.<listcomp>.<dictcomp>)r-   r.   )r   r   rZ   r    
<listcomp>   s    z1Covid19sum._generate_examples.<locals>.<listcomp>N)popr   iogfileexistsGFilejsonloadgetr/   rM   )r5   r@   rW   r?   f	data_dictr   r   r   r    _generate_examplest   s    


zCovid19sum._generate_examples)N)__name__
__module____qualname____doc__MANUAL_DOWNLOAD_INSTRUCTIONSr*   r0   VersionVERSIONr1   r6   downloadDownloadManagerr   rO   rX   r   r	   r   r   r
   rg   r   r   r   r    r   B   s     

r   exampler   c                 C   s   | t  }|o| dkS )Nunknown)r)   lower)rq   r   r   r   r    rL      s   rL   )#rk   
__future__r   r   r   rb   rD   typingr   r   r   r   r	   r
   tensorflow.compat.v2compatv2r   tensorflow_datasets.public_api
public_apir*   r4   r3   r2   r)   r/   r-   r.   rM   r(   r0   GeneratorBasedBuilderr   boolrL   r   r   r   r    <module>   s,    
D