o
    Ni                     @   s   d Z ddlmZ ddlmZ ddlmZ ddlZddlZddlZddlm	  m
Z ddlmZ dZdZdZd	Zd
dddZG dd dejjZG dd dejjZdd ZdS )zWikiHow Datasets.    )absolute_import)division)print_functionNz
@misc{koupaee2018wikihow,
    title={WikiHow: A Large Scale Text Summarization Dataset},
    author={Mahnaz Koupaee and William Yang Wang},
    year={2018},
    eprint={1810.09305},
    archivePrefix={arXiv},
    primaryClass={cs.CL}
}
a+  
WikiHow is a new large-scale dataset using the online WikiHow
(http://www.wikihow.com/) knowledge base.

There are two features:
  - text: wikihow answers texts.
  - headline: bold lines as summary.

There are two separate versions:
  - all: consisting of the concatenation of all paragraphs as the articles and
         the bold lines as the reference summaries.
  - sep: consisting of each paragraph and its summary.

Download "wikihowAll.csv" and "wikihowSep.csv" from
https://github.com/mahnazkoupaee/WikiHow-Dataset and place them in manual folder
https://www.tensorflow.org/datasets/api_docs/python/tfds/download/DownloadConfig.
Train/validation/test splits are provided by the authors.
Preprocessing is applied to remove short articles
(abstract length < 0.75 article length) and clean up extra commas.
textheadlinezThttps://raw.githubusercontent.com/mahnazkoupaee/WikiHow-Dataset/master/all_train.txtzRhttps://raw.githubusercontent.com/mahnazkoupaee/WikiHow-Dataset/master/all_val.txtzShttps://raw.githubusercontent.com/mahnazkoupaee/WikiHow-Dataset/master/all_test.txt)train
validationtestc                       s*   e Zd ZdZejjd fdd	Z  ZS )WikihowConfigzBuilderConfig for Wikihow.Nc                    s,   t t| jddtjdi| || _dS )zBuilderConfig for Wikihow.

    Args:
      filename: filename of different configs for the dataset.
      **kwargs: keyword arguments forwarded to super.
    versionz1.2.0N )superr
   __init__tfdscoreVersionfilename)selfr   kwargs	__class__r   ]/home/ubuntu/.local/lib/python3.10/site-packages/tensorflow_datasets/summarization/wikihow.pyr   N   s   


zWikihowConfig.__init__)N)	__name__
__module____qualname____doc__r   r   disallow_positional_argsr   __classcell__r   r   r   r   r
   K   s    r
   c                   @   sJ   e Zd ZdZdZeddddeddd	dgZd
d Zdd ZdddZ	dS )Wikihowz2WikiHow: A Large Scale Text Summarization Dataset.z  Links to files can be found on https://github.com/mahnazkoupaee/WikiHow-Dataset
  Please download both wikihowAll.csv and wikihowSep.csv.
  allzwikihowAll.csvzeUse the concatenation of all paragraphs as the articles and the bold lines as the reference summaries)namer   descriptionsepzwikihowSep.csvz#use each paragraph and its summary.c                 C   sR   t tdg}| jjdkr|ddg tjj| ttj	
dd |D t tfdtdS )	Ntitler"   overviewsectionLabelc                 S   s   i | ]}|t j qS r   )r   featuresText.0kr   r   r   
<dictcomp>y   s    z!Wikihow._info.<locals>.<dictcomp>z0https://github.com/mahnazkoupaee/WikiHow-Dataset)builderr!   r&   supervised_keyshomepagecitation)	_DOCUMENT_SUMMARYbuilder_configr    extendr   r   DatasetInfo_DESCRIPTIONr&   FeaturesDict	_CITATION)r   feature_namesr   r   r   _infoq   s   
zWikihow._infoc              	   C   s   | t}dd |D }| D ])\}}tjj|}|D ]}|| |  qW d   n1 s4w   Y  qt	j
jt	jjtj|j| jj|d ddt	j
jt	jjtj|j| jj|d ddt	j
jt	jjtj|j| jj|d ddgS )	zReturns SplitGenerators.c                 S   s   i | ]}|t  qS r   )setr(   r   r   r   r+      s    z-Wikihow._split_generators.<locals>.<dictcomp>Nr   )path	title_set)r    
gen_kwargsr   r	   )download_URLSitemstfiogfileGFileaddstripr   r   SplitGeneratorSplitTRAINosr;   join
manual_dirr2   r   
VALIDATIONTEST)r   
dl_managerdl_pathtitlesr*   r;   fliner   r   r   _split_generators   sD   





zWikihow._split_generatorsNc                 #   s>   t jj|}t|}t|}| jjdkr"|g dkr"t	d| jjdkr2|g dkr2t	ddd t
|D }t
|D ]M\} t t|kr |t   } |t   }	t||	\}}	|r|	r |d	   d
d|v r fdd| D }
|	|
t< ||
t< ||
fV  q?W d   dS 1 sw   Y  dS )zYields examples.r   )r   r#   r   z Mismatched header in WikiAll.txtr"   )r$   r   r   r%   r#   z Mismatched header in WikiSep.txtc                 S   s   i | ]\}}||qS r   r   )r)   ikeyr   r   r   r+      s    z.Wikihow._generate_examples.<locals>.<dictcomp>r#     c                    s*   i | ]\}}|t tfvr| |  qS r   )r1   r0   rF   )r)   r*   vrS   r   r   r+      s
    N)rA   rB   rC   rD   csvreadernextr2   r    
ValueError	enumeratelenr1   rF   r0   _filter_and_cleanreplacer@   )r   r;   r<   rR   r\   headerskey2idrU   summarydocumentdr   rZ   r   _generate_examples   s2   


"zWikihow._generate_examples)NN)
r   r   r   r   MANUAL_DOWNLOAD_INSTRUCTIONSr
   BUILDER_CONFIGSr9   rT   rh   r   r   r   r   r   ]   s"    *r   c                 C   s:   t | dt | k r| dd} tdd|}| |fS dS )zARemove short article and clean up commas in abstract and article.g      ?z.,.z[.]+[\n]+[,]z.
)rX   rX   )r`   rb   resub)abstractarticler   r   r   ra      s
   ra   )r   
__future__r   r   r   r[   rJ   rl   tensorflow.compat.v2compatv2rA   tensorflow_datasets.public_api
public_apir   r7   r5   r0   r1   r?   r   BuilderConfigr
   GeneratorBasedBuilderr   ra   r   r   r   r   <module>   s(   
n