o
    Ni                     @   s   d Z ddlmZ ddlmZ ddlmZ ddlZddlm  mZ	 ddl
mZ dZdZdZd	Zd
ZG dd dejjZdd ZdS ),Annotated Enron Subject Line Corpus Dataset.    )absolute_import)division)print_functionNa  
@misc{zhang2019email,
    title={This Email Could Save Your Life: Introducing the Task of Email Subject Line Generation},
    author={Rui Zhang and Joel Tetreault},
    year={2019},
    eprint={1906.03497},
    archivePrefix={arXiv},
    primaryClass={cs.CL}
}
z
A collection of email messages of employees in the Enron Corporation.

There are two features:
  - email_body: email body text.
  - subject_line: email subject text.
z7https://github.com/ryanzhumich/AESLC/archive/master.zip
email_bodysubject_linec                   @   s6   e Zd ZdZejdZdd Zdd Z	d
dd	Z
dS )Aeslcr   z1.0.0c              
   C   s8   t jj| tt jtt j tt j ittfdt	dS )Nz$https://github.com/ryanzhumich/AESLC)builderdescriptionfeaturessupervised_keyshomepagecitation)
tfdscoreDatasetInfo_DESCRIPTIONr   FeaturesDict	_DOCUMENTText_SUMMARY	_CITATION)self r   [/home/ubuntu/.local/lib/python3.10/site-packages/tensorflow_datasets/summarization/aeslc.py_info:   s   

zAeslc._infoc              
   C   s   | tjjttjjjd}tj	|dd}tj
jtjjdtj	|ddidtj
jtjjdtj	|ddidtj
jtjjdtj	|d	didgS )
zReturns SplitGenerators.)urlextract_methodzAESLC-masterenron_subject_linepatterntrainz	*.subject)name
gen_kwargsdevtest)download_and_extractr   downloadResource_URLExtractMethodZIPospathjoinr   SplitGeneratorSplitTRAIN
VALIDATIONTEST)r   
dl_managerdl_path
input_pathr   r   r   _split_generatorsG   s*   
zAeslc._split_generatorsNc                 c   sJ    t jj|D ]}t|\}}tj|d}|t	|t
|ifV  qdS )zYields examples.z.subjectN)tfiogfileglob_parse_email_filer+   r,   basenamerstripr   r   )r   r   filenamer   r   keyr   r   r   _generate_examplesb   s   zAeslc._generate_examples)N)__name__
__module____qualname____doc__r   r   VersionVERSIONr   r6   r@   r   r   r   r   r   5   s    r   c                 C   s   t jj| ;}d}|D ]}|dkr n||7 }qt|}d}|D ]}|dkr) n||7 }q!W d   ||fS W d   ||fS 1 sEw   Y  ||fS )z1Parse email file text for email body and subject. 
N)r7   r8   r9   GFilenext)r>   fr   linesubjectr   r   r   r;   j   s(   



r;   )rD   
__future__r   r   r   r+   tensorflow.compat.v2compatv2r7   tensorflow_datasets.public_api
public_apir   r   r   r(   r   r   r   GeneratorBasedBuilderr   r;   r   r   r   r   <module>   s   5