o
    Ni(                  
   @   s   d Z ddlmZ ddlmZ ddlmZ ddlZddlZddlmZ ddl	m
  mZ ddlmZ dZdZd	d
ddddZdZdZejddejddgZejddZG dd dejjZdd Zdd Zdd ZdZd Zd!d"d#d$d%d&d'eed(g
Z d)d* Z!d+d, Z"G d-d. d.ejj#Z$dS )/z<CNN/DailyMail Summarization dataset, non-anonymized version.    )absolute_import)division)print_functionN)logginga	  CNN/DailyMail non-anonymized summarization dataset.

There are two features:
  - article: text of news article, used as the document to be summarized
  - highlights: joined text of highlights with <s> and </s> around each
    highlight, which is the target summary
a  @article{DBLP:journals/corr/SeeLM17,
  author    = {Abigail See and
               Peter J. Liu and
               Christopher D. Manning},
  title     = {Get To The Point: Summarization with Pointer-Generator Networks},
  journal   = {CoRR},
  volume    = {abs/1704.04368},
  year      = {2017},
  url       = {http://arxiv.org/abs/1704.04368},
  archivePrefix = {arXiv},
  eprint    = {1704.04368},
  timestamp = {Mon, 13 Aug 2018 16:46:08 +0200},
  biburl    = {https://dblp.org/rec/bib/journals/corr/SeeLM17},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

@inproceedings{hermann2015teaching,
  title={Teaching machines to read and comprehend},
  author={Hermann, Karl Moritz and Kocisky, Tomas and Grefenstette, Edward and Espeholt, Lasse and Kay, Will and Suleyman, Mustafa and Blunsom, Phil},
  booktitle={Advances in neural information processing systems},
  pages={1693--1701},
  year={2015}
}
zKhttps://drive.google.com/uc?export=download&id=0BwmD_VLjROrfTHk4NFg2SndKcjQzKhttps://drive.google.com/uc?export=download&id=0BwmD_VLjROrfM1BxdkxVaTY2bWszThttps://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/all_test.txtzUhttps://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/all_train.txtzShttps://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/all_val.txt)cnn_stories
dm_stories	test_urls
train_urlsval_urls
highlightsarticlez1.0.0z6New split API (https://tensorflow.org/datasets/splits)2.0.0z'Separate target sentences with newline.z3.0.0zUsing cased version.c                       s*   e Zd ZdZejjd fdd	Z  ZS )CnnDailymailConfigzBuilderConfig for CnnDailymail.Nc                    s2   t t| jdttd| |ptjj | _	dS )a  BuilderConfig for CnnDailymail.

    Args:
      text_encoder_config: `tfds.features.text.TextEncoderConfig`, configuration
        for the `tfds.features.text.TextEncoder` used for the CnnDailymail
        (text) features
      **kwargs: keyword arguments forwarded to super.
    )versionsupported_versionsN )
superr   __init___DEFAULT_VERSION_SUPPORTED_VERSIONStfdsfeaturestextTextEncoderConfigtext_encoder_config)selfr   kwargs	__class__r   c/home/ubuntu/.local/lib/python3.10/site-packages/tensorflow_datasets/summarization/cnn_dailymail.pyr   b   s   
zCnnDailymailConfig.__init__N)	__name__
__module____qualname____doc__r   coredisallow_positional_argsr   __classcell__r   r   r   r   r   _   s    r   c                    s"   t | }dd   fdd|D S )zGet hashes of urls in file.c                 S   sH   t  }z| d} W n ty   td|  Y nw ||  | S )Nzutf-8zCannot hash url: %s)hashlibsha1encodeUnicodeDecodeErrorr   errorupdate	hexdigest)uhr   r   r   url_hashx   s   
z!_get_url_hashes.<locals>.url_hashc                    s   i | ]} |d qS )Tr   ).0r/   r1   r   r   
<dictcomp>   s    z#_get_url_hashes.<locals>.<dictcomp>)_read_text_file)pathurlsr   r3   r   _get_url_hashest   s   	r8   c                 C   s   |dkrt j| d dd}n|dkrt j| d dd}ntd| tjj|}g }|D ]}t j	|}|d|
d	 |v rL|t j|| q/|S )
z!Find files corresponding to urls.cnnr   storiesdmr   	dailymailzUnsupported publisher: %sr   z.story)osr6   joinr   fataltfiogfilelistdirbasenamefindappend)dl_paths	publisherurl_dicttop_dirfiles	ret_filesprD   r   r   r   _find_files   s   rN   c                 C   s   t | ts	J | |tjjkrt| d }n |tjjkr#t| d }n|tjjkr0t| d }nt	d| t
| d|}t
| d|}|| S )z%Get filenames for a particular split.r	   r
   r   zUnsupported split: %sr9   r;   )
isinstancedictr   SplitTRAINr8   
VALIDATIONTESTr   r?   rN   )rG   splitr7   r9   r;   r   r   r   _subset_filenames   s   rV   u   ’u   ”.!?z...'`")c                 C   sR   g }t jj| d}|D ]	}||  qW d    |S 1 s"w   Y  |S )Nr)r@   rA   rB   GFilerF   strip)	text_filelinesfliner   r   r   r5      s   
r5   c           	         s   t | }dd   fdd|D }g }g }d}|D ]}|sq|dr&d}q|r.|| q|| qd|}|d	krFd
|}||fS d|}||fS )z=Get abstract (highlights) and article from a story file path.c                 S   s,   d| v r| S | s
| S | d t v r| S | d S )z1Adds a period to a line that is missing a period.
@highlightz .)
END_TOKENS)rd   r   r   r   fix_missing_period   s   z(_get_art_abs.<locals>.fix_missing_periodc                    s   g | ]} |qS r   r   )r2   rd   rh   r   r   
<listcomp>   s    z _get_art_abs.<locals>.<listcomp>Fre   T r   
)r5   
startswithrF   r>   )	
story_filetfds_versionrb   article_linesr   next_is_highlightrd   r   abstractr   ri   r   _get_art_abs   s(   




rs   c                	   @   s~   e Zd ZdZedddeddejjjejj	 dded	d
ejjjejjj
dddgZdd Zdd Zdd Zdd ZdS )CnnDailymailz3CNN/DailyMail non-anonymized summarization dataset.
plain_textz
Plain text)namedescriptionbyteszGUses byte-level text encoding with `tfds.features.text.ByteTextEncoder`)encoder)rv   rw   r   subwords32kz@Uses `tfds.features.text.SubwordTextEncoder` with 32k vocab sizei   )encoder_cls
vocab_sizec                 C   sH   t jj| tt jtt jj| jj	dt
t jj| jj	ditt
fdtdS )N)encoder_configz'https://github.com/abisee/cnn-dailymail)builderrw   r   supervised_keyshomepagecitation)r   r%   DatasetInfo_DESCRIPTIONr   FeaturesDict_ARTICLETextbuilder_configr   _HIGHLIGHTS	_CITATION)r   r   r   r   _info  s"   zCnnDailymail._infoc                 c   s2    |  |D ]\}}d|t |t gV  qd S )Nrk   )_generate_examplesr>   r   r   )r   paths_exr   r   r   _vocab_text_gen  s   zCnnDailymail._vocab_text_genc                 C   s   | t}t|tjj}| jjt 	| 
| | jjt j}| jjt | tjjtjjd|idtjjtjjdt|tjjidtjjtjjdt|tjjidgS )NrK   )rv   
gen_kwargs)download_and_extract_DL_URLSrV   r   rQ   rR   infor   r   maybe_build_from_corpusr   ry   r   maybe_set_encoderr%   SplitGeneratorrS   rT   )r   
dl_managerrG   train_filesry   r   r   r   _split_generators  s*   
zCnnDailymail._split_generatorsc                 c   sH    |D ]}t || j\}}|r|sqtj|}|t|t|ifV  qd S r    )rs   r   r=   r6   rD   r   r   )r   rK   rM   r   r   fnamer   r   r   r   2  s   zCnnDailymail._generate_examplesN)r!   r"   r#   r$   r   r   r   r   r   ByteTextEncoderSubwordTextEncoderBUILDER_CONFIGSr   r   r   r   r   r   r   r   rt      s4    
rt   )%r$   
__future__r   r   r   r(   r=   abslr   tensorflow.compat.v2compatv2r@   tensorflow_datasets.public_api
public_apir   r   r   r   r   r   r%   Versionr   r   BuilderConfigr   r8   rN   rV   DM_SINGLE_CLOSE_QUOTEDM_DOUBLE_CLOSE_QUOTErg   r5   rs   GeneratorBasedBuilderrt   r   r   r   r   <module>   sJ   
2