o
    Ni                     @   s   d Z ddlmZ ddlmZ ddlmZ ddlZddlZddlZddlZddl	m
  mZ ddlmZ dZdZdZd	Zd
ZdZdZdZdZdd Zdd ZG dd dejjZdS )zreddit_disentanglement dataset.    )absolute_import)division)print_functionNa  
@article{zhu2019did,
  title={Who did They Respond to? Conversation Structure Modeling using Masked Hierarchical Transformer},
  author={Zhu, Henghui and Nan, Feng and Wang, Zhiguo and Nallapati, Ramesh and Xiang, Bing},
  journal={arXiv preprint arXiv:1911.10666},
  year={2019}
}
a  
This dataset contains ~3M messages from reddit.
Every message is labeled with metadata. The task is to predict the id of its
parent message in the corresponding thread.
Each record contains a list of messages from one thread.
Duplicated and broken records are removed from the dataset.


Features are:
  - id - message id
  - text - message text
  - author - message author
  - created_utc - message UTC timestamp
  - link_id - id of the post that the comment relates to
Target:
  - parent_id - id of the parent message in the current thread
threadidtextcreated_utcauthorlink_id	parent_idc                 c   sX    t jj| }t|}|D ]	}|d r|V  qW d    d S 1 s%w   Y  d S Nr   )tfiogfileGFilecsv
DictReader)pathfreaderrow r   c/home/ubuntu/.local/lib/python3.10/site-packages/tensorflow_datasets/text/reddit_disentanglement.py	_read_csvB   s   
"r   c                    s   t dd | D }tdd | D   fdd| D } fdd| D }t|dd d	}t|d
d D ]:\}ttfddD rR|	d  q7dd D }t
|dkrjtd|d d |	|d  q7t|dd d	S )zRemove duplicated records.c                 s   s    | ]}|d  V  qdS )r   Nr   .0r   r   r   r   	<genexpr>L   s    z_deduplicate.<locals>.<genexpr>c                 s   s     | ]\}}|d kr|V  qdS )   Nr   )r   r   countr   r   r   r   M   s    c                    s   g | ]
}|d   v r|qS r   r   r   nonuniq_idsr   r   
<listcomp>N       z _deduplicate.<locals>.<listcomp>c                    s   g | ]
}|d   vr|qS r   r   r   r    r   r   r"   P   r#   c                 S      | d S r   r   r   r   r   r   <lambda>R       z_deduplicate.<locals>.<lambda>)keyc                 S   r$   r   r   r%   r   r   r   r&   S   r'   c                 3   s    | ]	} d  |kV  qdS )r   Nr   )r   x)same_id_datar   r   r   U   s    r   c                 S   s   g | ]
}|d  dkr|qS )r	   z	[deleted]r   r   r   r   r   r"   X   s    r   z5Found several message with id {} in the original datar   c                 S   s   | d | d fS )Nr
   r   r   r%   r   r   r   r&   `   s    )collectionsCountersetitemssorted	itertoolsgroupbylistallappendlen
ValueErrorformat)datacntnonuniq_dataunique_data_non_deleted_same_id_datar   )r!   r*   r   _deduplicateJ   s$   r>   c                   @   s8   e Zd ZdZejdZdZdd Z	dd Z
dd	 Zd
S )RedditDisentanglementzReddit Disentanglement dataset.z2.0.0z  Download https://github.com/henghuiz/MaskedHierarchicalTransformer, decompress
  raw_data.zip and run generate_dataset.py with your reddit api credentials.
  Then put train.csv, val.csv and test.csv from the output directory into the
  manual folder.
  c                 C   sn   t jj| tt jtt jt jtt j	 t
t j	 tt j	 tt j	 tt j	 tt j	 iidtdS )Nz9https://github.com/henghuiz/MaskedHierarchicalTransformer)builderdescriptionfeatureshomepagecitation)tfdscoreDatasetInfo_DESCRIPTIONrB   FeaturesDict_THREAD_KEYSequence_MESSAGE_IDText_MESSAGE_TEXT_MESSAGE_TIMESTAMP_MESSAGE_AUTHOR_MESSAGE_LINK_ID_MESSAGE_PARENT_ID	_CITATION)selfr   r   r   _infon   s$   






zRedditDisentanglement._infoc              	   C   sj   t jjt jjdtj|jdidt jjt jj	dtj|jdidt jjt jj
dtj|jdidgS )zReturns SplitGenerators.r   z	train.csv)name
gen_kwargszval.csvztest.csv)rE   rF   SplitGeneratorSplitTRAINosr   join
manual_dir
VALIDATIONTEST)rT   
dl_managerr   r   r   _split_generators   s&   z'RedditDisentanglement._split_generatorsc                 c   sb    t t|}t|}t|dd D ]\}}t |}|D ]	}|d|d< q|t|ifV  qdS )zYields examples.c                 S   r$   )Nr
   r   r%   r   r   r   r&      r'   z:RedditDisentanglement._generate_examples.<locals>.<lambda>bodyr   N)r2   r   r>   r0   r1   poprJ   )rT   r   r8   r
   one_topic_datar   r   r   r   _generate_examples   s   z(RedditDisentanglement._generate_examplesN)__name__
__module____qualname____doc__rE   rF   VersionVERSIONMANUAL_DOWNLOAD_INSTRUCTIONSrU   ra   re   r   r   r   r   r?   c   s    r?   )ri   
__future__r   r   r   r+   r   r0   r[   tensorflow.compat.v2compatv2r   tensorflow_datasets.public_api
public_apirE   rS   rH   rJ   rL   rN   rO   rP   rQ   rR   r   r>   rF   GeneratorBasedBuilderr?   r   r   r   r   <module>   s,   	