o
    Ni                     @   s   d Z ddlmZ ddlmZ ddlmZ ddlZddlZddlmZ ddl	m
  mZ ddlmZ dZdZd	Zejd
dZdZdZdZdZdZdd Zdd Zdd Zdd Zdd Zdd Z G dd dej!j"Z#dS )zirc_disentanglement dataset.    )absolute_import)division)print_functionN)Lista  
@InProceedings{acl19disentangle,
  author    = {Jonathan K. Kummerfeld and Sai R. Gouravajhala and Joseph Peper and Vignesh Athreya and Chulaka Gunasekara and Jatin Ganhotra and Siva Sankalp Patel and Lazaros Polymenakos and Walter S. Lasecki},
  title     = {A Large-Scale Corpus for Conversation Disentanglement},
  booktitle = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics},
  location  = {Florence, Italy},
  month     = {July},
  year      = {2019},
  doi       = {10.18653/v1/P19-1374},
  pages     = {3846--3856},
  url       = {https://aclweb.org/anthology/papers/P/P19/P19-1374/},
  arxiv     = {https://arxiv.org/abs/1810.11118},
  software  = {https://jkk.name/irc-disentanglement},
  data      = {https://jkk.name/irc-disentanglement},
}
a  
IRC Disentanglement dataset contains over 77,563 messages from Ubuntu IRC
channel.

Features include message id, message text and timestamp.
Target is list of messages that current message replies to.
Each record contains a list of messages from one day of IRC chat.
zChttps://github.com/jkkummerfeld/irc-disentanglement/zipball/fd379e9z(jkkummerfeld-irc-disentanglement-fd379e9datadayidtext	timestampparentsc                 C   sh   t t}tjj| D ]%}tj	| |}|dt
d }d|v r'||| d< d|v r1||| d< q|S )au  Prepares paths to files with raw chat messages and replies annotations.

  Args:
    data_dir: directory containing files with data. directory can be

  Returns:
    day_to_paths: dict formatted date -> dict with paths
      day_to_paths[day_str]["text"] - path to file with raw chat messages
      day_to_paths[day_str]["annot"] - path to file with replies annotations.
  Nz
YYYY-MM-DDrawr	   
annotationannot)collectionsdefaultdictdicttfiogfilelistdirospathjoinlen)data_dirday_to_pathsfilenamefilepathday_str r   `/home/ubuntu/.local/lib/python3.10/site-packages/tensorflow_datasets/text/irc_disentanglement.py_get_day_to_pathsB   s   
r!   c                 C   sB   t jj| d}dd |D W  d    S 1 sw   Y  d S )Nrc                 S   s   g | ]}|  qS r   )strip).0liner   r   r    
<listcomp>\   s    z$_read_texts_file.<locals>.<listcomp>)r   r   r   GFiler   fr   r   r    _read_texts_fileZ   s   $r*   c                 C   sJ   t jj| d}dd ttj|D W  d   S 1 sw   Y  dS )z#Reads file with replies annotation.r"   c                 S   s"   g | ]\}}}t |t |fqS r   )int)r$   firstsecond_r   r   r    r&   b   s    z$_read_annot_file.<locals>.<listcomp>N)r   r   r   r'   mapstrsplitr(   r   r   r    _read_annot_file_   s
   
$r2   c                 C   s   d}g g }}| D ]<}| dr;t|dd }t|dd }||k r(|d7 }|}|d||| |d	d
 }n|d || q	||fS )aj  Parsing timestamps from IRC chat messages.

  Similar logic is implemented here.
  https://github.com/jkkummerfeld/irc-disentanglement/blob/master/src/disentangle.py#L174

  Args:
    raw_texts: list of raw chat messages.
    day_str: formatted date string.

  Returns:
    texts: list of texts without timestamps.
    timestamps: list of formatted timestamps
  r   [               z{}_{:02}_{:02}   N )
startswithr+   appendformat)	raw_textsr   
prev_hours
timestampstextsraw_texthoursminsr   r   r    _parse_out_timestampsf   s   


rE   c                 C   s   d | |S )Nz{}_{:05})r=   )r   line_numr   r   r    _get_msg_id   s   rG   c              	   #   s    t | }t|}dd tt|D }|D ]\}}|| | qt| \}}	| D ]\}
} fdd|D }tt |
t	||
 t
|	|
 t|iV  q-dS )zPrepares examples for 1 day.c                 S   s   i | ]}|g qS r   r   )r$   idxr   r   r    
<dictcomp>   s    z%_prepare_examples.<locals>.<dictcomp>c                    s   g | ]}t  |qS r   )rG   )r$   parentr   r   r    r&      s    z%_prepare_examples.<locals>.<listcomp>N)r*   r2   ranger   r<   rE   items_MESSAGE_IDrG   _MESSAGE_TEXT_MESSAGE_TIMESTAMP_MESSAGE_PARENTS_IDS)texts_file_pathannot_file_pathr   r>   annotationsidx_to_parentsparent_msg_idxmsg_idxrA   r@   line_idxr   parents_idsr   rK   r    _prepare_examples   s   
rZ   c                   @   sT   e Zd ZdZejdZdejjfddZ	dej
jdeejj fddZd	d
 ZdS )IrcDisentanglementzIRC Disentanglement dataset.z2.0.0returnc                 C   sb   t jj| tt jtt jt jtt j	 t
t j	 tt j	 tt jt j	 iidtdS )Nz$https://jkk.name/irc-disentanglement)builderdescriptionfeatureshomepagecitation)tfdscoreDatasetInfo_DESCRIPTIONr_   FeaturesDict_IRC_DAY_KEYSequencerN   TextrO   rP   rQ   	_CITATION)selfr   r   r    _info   s$   


	zIrcDisentanglement._info
dl_managerc              
   C   s   | tjjttjjjd}tj	|t
}tjjtjjdttj	|didtjjtjjdttj	|didtjjtjjdttj	|didgS )zReturns SplitGenerators.)urlextract_methodr   train)name
gen_kwargsdevtest)download_and_extractrb   downloadResource_DOWNLOAD_URLExtractMethodZIPr   r   r   _DOWNLOAD_ARCHIVE_SUBDIRrc   SplitGeneratorSplitTRAINr!   
VALIDATIONTEST)rk   rm   base_dirr   r   r   r    _split_generators   s4   z$IrcDisentanglement._split_generatorsc                 c   s:    |  D ]\}}|ttt|d |d |ifV  qdS )zYields examples.r	   r   N)rM   rg   listrZ   )rk   r   r   pathsr   r   r    _generate_examples   s   z%IrcDisentanglement._generate_examplesN)__name__
__module____qualname____doc__rb   rc   VersionVERSIONrd   rl   rv   DownloadManagerr   r|   r   r   r   r   r   r    r[      s    


r[   )$r   
__future__r   r   r   r   r   typingr   tensorflow.compat.v2compatv2r   tensorflow_datasets.public_api
public_apirb   rj   re   rx   r   r   r{   rg   rN   rO   rP   rQ   r!   r*   r2   rE   rG   rZ   rc   GeneratorBasedBuilderr[   r   r   r   r    <module>   s8   	$