o
    	Ti                  
   @   s<  d dl Z d dlmZmZmZ d dlZd dlmZmZ d dlm	Z	 d dl
mZ e	eje	dkrUd dlmZ eeddd	eddd	d
eddd	eddd	ddZneddd	eddd	d
geddd	eddd	ddZ	ddeded dee fddZdefddZ	ddededee dee fddZdS )    N)CallableLiteralOptional)DatasetValue)version)AutoTokenizerz4.0.0)Liststring)dtypeid)contentrole)
completionprompt)chatmlinstruction	tokenizermessages_field)messagesconversationstoolsc                    s    fdd}|S )z
    return a callable function that takes in a "messages" dataset and returns a formatted dataset, based on the
    tokenizer apply chat template to the dataset along with the schema of the list of functions in the tools list.
    c                    sb   t |   d tr'g }tt|   D ]}|j|   | dd q|S j|   ddS )Nr   F)tokenizer   
isinstancelistrangelenappendapply_chat_template)examplesoutput_textsir   r   r    Q/home/ubuntu/.local/lib/python3.10/site-packages/trl/extras/dataset_formatting.pyformat_dataset.   s   z9conversations_formatting_function.<locals>.format_datasetr$   )r   r   r   r&   r$   r#   r%   !conversations_formatting_function&   s   r'   c                    s    fdd}|S )z
    return a callable function that takes in an "instructions" dataset and returns a formatted dataset, based on the
    tokenizer apply chat template to the dataset
    c                    s   t | d tr2g }tt| d D ]}d| d | dd| d | dg}| j|dd q|S d| d dd| d dg} j|ddS )Nr   user)r   r   	assistantr   F)r   r   )r    r!   r"   converted_sampler   r$   r%   r&   B   s   z8instructions_formatting_function.<locals>.format_datasetr$   )r   r&   r$   r+   r%    instructions_formatting_function<   s   r,   datasetreturnc                 C   s   t | trId| jv r| jd td krtd t|d|S d| jv r9| jd td kr7td t|d|S dS | jtd krItd t|S dS )a  
    Finds the correct formatting function based on the dataset structure. Currently supported datasets are:
    - `ChatML` with [{"role": str, "content": str}]
    - `instruction` with [{"prompt": str, "completion": str}]

    Args:
        dataset (Dataset): User dataset
        tokenizer (AutoTokenizer): Tokenizer used for formatting

    Returns:
        Callable: Formatting function if the dataset format is supported else None
    r   r   z%Formatting dataset with chatml formatr   r   z*Formatting dataset with instruction formatN)r   r   featuresFORMAT_MAPPINGlogginginfor'   r,   )r-   r   r   r$   r$   r%    get_formatting_func_from_datasetV   s   





r3   )N)r1   typingr   r   r   datasetsr   r   	packagingr   transformersr   parse__version__r	   r0   r   r'   r,   r3   r$   r$   r$   r%   <module>   sB   
