o
    	Tij                     @   s   d dl mZmZ d dlmZ d dlmZ d dlmZ d dl	m
Z
 eG dd dZdd	 Zed
Zedkrbe
eZe d  ZeddZejeejg ddZejrdeej ejejdd dS dS dS )    )	dataclassfield)Optional)load_dataset)	ModelCard)HfArgumentParserc                   @   s^   e Zd ZU dZedddidZeed< edddidZe	ed	< ed
ddidZ
ee ed< d
S )ScriptArgumentsa  
    Arguments for the script.

    Args:
        push_to_hub (`bool`, *optional*, defaults to `False`):
            Whether to push the dataset to the Hugging Face Hub.
        repo_id (`str`, *optional*, defaults to `"trl-lib/tldr-preference"`):
            Hugging Face repository ID to push the dataset to.
        dataset_num_proc (`int` or `None`, *optional*, defaults to `None`):
            Number of workers to use for dataset processing.
    Fhelpz4Whether to push the dataset to the Hugging Face Hub.)defaultmetadatapush_to_hubztrl-lib/tldr-preferencez2Hugging Face repository ID to push the dataset to.repo_idNz0Number of workers to use for dataset processing.dataset_num_proc)__name__
__module____qualname____doc__r   r   bool__annotations__r   strr   r   int r   r   U/home/ubuntu/.local/lib/python3.10/site-packages/examples/datasets/tldr_preference.pyr      s   
 r   c           	      C   s   | d }| d dv r|d  dd}d|d  d| d	}n3| d d
d tddD dg v rH|d  dd}d|d  d|d  d| d	}n	td| d  | d }d| }| d | d }| d | d }|||dS )Ninfobatch)batch0_cnndmcnndm0cnndm2articlez


zTITLE: titlez

TL;DR:c                 S   s   g | ]}d | qS )r   r   ).0ir   r   r   
<listcomp>8   s    z!to_preference.<locals>.<listcomp>      edit_b2_eval_testpostzSUBREDDIT: r/	subredditz	

TITLE: z

POST: zUnknown batch: choice   	summariestext)promptchosenrejected)replacerange
ValueError)	exampler   r   r-   r'   
chosen_idxrejected_idxr.   r/   r   r   r   to_preference3   s   ""r6   a  
---
tags: [trl]
---

# TL;DR Dataset for Preference Learning

## Summary

The TL;DR dataset is a processed version of Reddit posts, specifically curated to train models using the [TRL library](https://github.com/huggingface/trl) for preference learning and Reinforcement Learning from Human Feedback (RLHF) tasks. It leverages the common practice on Reddit where users append "TL;DR" (Too Long; Didn't Read) summaries to lengthy posts, providing a rich source of paired text data for training models to understand and generate concise summaries.

## Data Structure

- **Format**: [Standard](https://huggingface.co/docs/trl/main/dataset_formats#standard)
- **Type**: [Preference](https://huggingface.co/docs/trl/main/dataset_formats#preference)

Columns:
- `"prompt"`: The unabridged Reddit post.
- `"chosen"`: The concise "TL;DR" summary appended by the author.
- `"rejected"`: An alternative summary or response that was not selected.

This structure enables models to learn the relationship between detailed content and its abbreviated form, enhancing their summarization capabilities.

## Generation script

The script used to generate this dataset can be found [here](https://github.com/huggingface/trl/blob/main/examples/datasets/tldr_preference.py).
__main__zopenai/summarize_from_feedbackcomparisons)r   r+   r)   workerr   splitextra)num_procremove_columnsdataset)	repo_typeN)dataclassesr   r   typingr   datasetsr   huggingface_hubr   transformersr   r   r6   
model_cardr   parserparse_args_into_dataclassesscript_argsr>   mapr   r   r   r   r   r   r   <module>   s.   
