o
    ̳i                       @   s   d dl mZmZmZmZmZ d dlZd dlm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZ G dd	 d	eZG d
d deZdS )    )AnyCallableDictMappingOptionalN)load_dataset)Dataset)CROSS_ENTROPY_IGNORE_IDX)validate_messages)	Transformc                   @   sf   e Zd ZdZdddedededee deee	f d	dfd
dZ
dd Zded	eee	f fddZdS )
SFTDatasetaA  
    Primary class for creating any dataset for supervised fine-tuning either from
    Hugging Face Hub, local files, or remote files. This class supports instruct,
    chat, tool, or multimodal data for fine-tuning. At a high level, this class
    will load the data from source and apply the following pre-processing steps
    when a sample is retrieved:

    1. Dataset-specific transform. This is typically unique to each dataset and extracts
       the necessary columns into torchtune's :class:`~torchtune.data.Message` format,
       a standardized API for all model tokenizers.
    2. Model-specific transform or tokenization with optional prompt template


    All datasets are formatted into a list of :class:`~torchtune.data.Message`
    because for fine-tuning, datasets can be considered as "conversations" with the model,
    or AI assistant. Thus, we can standardize all text content as messages in a conversation assigned to
    a role:

    - ``"system"`` messages contain the system prompt
    - ``"user"`` messages contain the input prompt into the model
    - ``"assistant"`` messages are the response of the model and what you actually want
      to train for and compute loss directly against
    - ``"ipython"`` messages are the return from a tool call

    Chat datasets are multiple rounds of user-assistant messages. Instruct datasets
    are typically a single round involving a specific instruction and the model's response.
    Tool datasets are a type of chat dataset that includes ipython messages. Multimodal
    datasets are a type of chat dataset that incorporates media into the user messages.

    The :class:`~torchtune.data.Message` forms the core data unit that all tokenizer
    APIs expect. The key component of this class that ensures any dataset is transformed
    into this format is the ``message_transform``. This is a callable class that takes
    in a sample dictionary - typically a single row from the source dataset - that
    processes the sample in any configurable way to output a list of messages::

        [
            Message(
                role=<system|user|assistant|ipython>,
                content=<message>,
            ),
            ...
        ]

    For any custom dataset, use the ``message_transform`` to contain all pre-processing to
    return the list of messages.

    Any model-specific pre-processing that needs to happen can be configured with the ``model_transform``
    parameter. This is another callable class that contains any custom logic tied to the
    model you are fine-tuning and will carry over to inference. For example, text + image
    multimodal datasets requires processing the images in a way specific to the vision
    encoder being used by the model and is agnostic to the specific dataset.

    Tokenization is handled by the ``model_transform``. All
    :class:`~torchtune.modules.transforms.tokenizers.ModelTokenizer` can be treated as
    a ``model_transform`` since it uses the model-specific tokenizer to transform the
    list of messages outputted from the ``message_transform`` into tokens used by the
    model for training. Text-only datasets will simply pass the
    :class:`~torchtune.modules.transforms.tokenizers.ModelTokenizer` into ``model_transform``.
    Tokenizers handle prompt templating, if configured.

    Args:
        source (str): path to dataset repository on Hugging Face. For local datasets,
            define source as the data file type (e.g. "json", "csv", "text") and pass
            in the filepath in ``data_files``. See `Hugging Face's
            <https://huggingface.co/docs/datasets/en/package_reference/loading_methods#datasets.load_dataset.path>`_
            ``load_dataset`` for more details.
        message_transform (Transform): callable that keys into the desired fields in the sample
            and converts text content to a list of :class:`~torchtune.data.Message`. It is expected that the final list
            of messages are stored in the ``"messages"`` key. See :ref:`message_transform_usage_label` for details.
        model_transform (Transform): callable that applies model-specific pre-processing to the sample after the list of
            messages is created from ``message_transform``. This includes tokenization and any modality-specific
            transforms. It is expected to return at minimum ``"tokens"`` and ``"mask"`` keys.
        filter_fn (Optional[Callable]): callable used to filter the dataset prior to any pre-processing. See
            the Hugging Face `docs <https://huggingface.co/docs/datasets/v2.20.0/process#select-and-filter>`_ for more
            details.
        **load_dataset_kwargs (Dict[str, Any]): additional keyword arguments to pass to ``load_dataset``. See Hugging
            Face's `API ref <https://huggingface.co/docs/datasets/en/package_reference/loading_methods#datasets.load_dataset>`_
            for more details.
    N)	filter_fnsourcemessage_transformmodel_transformr   load_dataset_kwargsreturnc                K   sJ   || _ || _t|fi || _|d ur| j|| _t| j | jd| _d S )N)r   r   )_message_transform_model_transformr   _datafilterSFTTransform_prepare_sample)selfr   r   r   r   r    r   K/home/ubuntu/.local/lib/python3.10/site-packages/torchtune/datasets/_sft.py__init__d   s   	zSFTDataset.__init__c                 C   s
   t | jS N)lenr   )r   r   r   r   __len__y   s   
zSFTDataset.__len__indexc                 C   s   | j | }| |S r   )r   r   )r   r    sampler   r   r   __getitem__|   s   

zSFTDataset.__getitem__)__name__
__module____qualname____doc__strr   r   r   r   r   r   r   intr"   r   r   r   r   r      s$    V

r   c                   @   sN   e Zd Z		d
dee dee fddZdeeef de	eef fdd	Z
dS )r   Nr   r   c                 C   s(   |d u r|d u rt d|| _|| _d S )NzFAt least one of message_transform or model_transform must be provided.)
ValueErrorr   r   )r   r   r   r   r   r   r      s   
zSFTTransform.__init__r!   r   c                 C   s   | j d ur|  |}d|v rt|d  n|}| jd urY| |}d|v r)d|v s:d| }d| d}t|tt|d t	|d |d< t
|d t
|d ksWJ |S |}|S )Nmessagestokensmaskz, z-model_transform returned the following keys: z*. Must return 'tokens' and 'mask' as keys.labels)r   r
   r   joinkeysr)   listnpwherer	   r   )r   r!   transformed_sampletokenized_dictkeys_strerror_messager   r   r   __call__   s4   



zSFTTransform.__call__)NN)r#   r$   r%   r   r   r   r   r'   r   r   r7   r   r   r   r   r      s    
&r   )typingr   r   r   r   r   numpyr1   datasetsr   torch.utils.datar   torchtune.data._commonr	   torchtune.data._messagesr
   torchtune.modules.transformsr   r   r   r   r   r   r   <module>   s   n