o
    	Ti                     @   s   d dl mZmZ d dlmZ d dlmZ d dlmZ d dl	m
Z
 eG dd dZdd	 Zed
Zedkrie
eZe d  ZddddZededZejeejg ddZejrkeej ejejdd dS dS dS )    )	dataclassfield)Optional)load_dataset)	ModelCard)HfArgumentParserc                   @   s^   e Zd ZU dZedddidZeed< edddidZe	ed	< ed
ddidZ
ee ed< d
S )ScriptArgumentsa  
    Arguments for the script.

    Args:
        push_to_hub (`bool`, *optional*, defaults to `False`):
            Whether to push the dataset to the Hugging Face Hub.
        repo_id (`str`, *optional*, defaults to `"trl-lib/tldr"`):
            Hugging Face repository ID to push the dataset to.
        dataset_num_proc (`int` or `None`, *optional*, defaults to `None`):
            Number of workers to use for dataset processing.
    Fhelpz4Whether to push the dataset to the Hugging Face Hub.)defaultmetadatapush_to_hubztrl-lib/tldrz2Hugging Face repository ID to push the dataset to.repo_idNz0Number of workers to use for dataset processing.dataset_num_proc)__name__
__module____qualname____doc__r   r   bool__annotations__r   strr   r   int r   r   J/home/ubuntu/.local/lib/python3.10/site-packages/examples/datasets/tldr.pyr      s   
 r   c                 C   s6   d}|j | d | d | d d}d| d  }||dS )	Nz>SUBREDDIT: r/{subreddit}

TITLE: {title}

POST: {post}

TL;DR:	subreddittitlepost)r   r   r    summary)prompt
completion)format)exampletldr_format_strr   r   r   r   r   to_prompt_completion3   s   
r#   a3  
---
tags: [trl]
---

# TL;DR Dataset

## Summary

The TL;DR dataset is a processed version of Reddit posts, specifically curated to train models using the [TRL library](https://github.com/huggingface/trl) for summarization tasks. It leverages the common practice on Reddit where users append "TL;DR" (Too Long; Didn't Read) summaries to lengthy posts, providing a rich source of paired text data for training summarization models.

## Data Structure

- **Format**: [Standard](https://huggingface.co/docs/trl/main/dataset_formats#standard)
- **Type**: [Prompt-completion](https://huggingface.co/docs/trl/main/dataset_formats#prompt-completion)

Columns:
- `"prompt"`: The unabridged Reddit post.
- `"completion"`: The concise "TL;DR" summary appended by the author.

This structure enables models to learn the relationship between detailed content and its abbreviated form, enhancing their summarization capabilities.

## Generation script

The script used to generate this dataset can be found [here](https://github.com/huggingface/trl/blob/main/examples/datasets/tldr.py).
__main__zghttps://openaipublic.blob.core.windows.net/summarize-from-feedback/datasets/tldr_3_filtered/train.jsonlzghttps://openaipublic.blob.core.windows.net/summarize-from-feedback/datasets/tldr_3_filtered/valid.jsonlzfhttps://openaipublic.blob.core.windows.net/summarize-from-feedback/datasets/tldr_3_filtered/test.jsonl)train
validationtestjson)
data_files)idr   r   r   r   )num_procremove_columnsdataset)	repo_typeN)dataclassesr   r   typingr   datasetsr   huggingface_hubr   transformersr   r   r#   
model_cardr   parserparse_args_into_dataclassesscript_argsr)   r-   mapr   r   r   r   r   r   r   <module>   s6   