o
    	Ti                     @   s   d dl Z d dlZd dlZd dlZd dlmZmZ d dlmZ d dl	m
Z
 d dlmZmZmZ d dlmZmZmZmZmZmZ d dlmZ deiZeG d	d
 d
eZdd Zdde jfddZedkrue Ze \Z Z!Z"ee e!e" dS dS )    N)	dataclassfield)Optional)load_dataset)AutoModelForCausalLM"AutoModelForSequenceClassificationAutoTokenizer)
GRPOConfigGRPOTrainerModelConfigScriptArguments	TrlParserget_peft_config)think_format_rewardr   c                   @   sN   e Zd ZU dZedddidZee ed< edddidZ	ee
e  ed< dS )	GRPOScriptArgumentsal  
    Script arguments for the GRPO training script.

    Args:
        reward_model_name_or_path (`str` or `None`, *optional*, defaults to `None`):
            Reward model id of a pretrained model hosted inside a model repo on huggingface.co or local path to a
            directory containing model weights saved using [`~transformers.PreTrainedModel.save_pretrained`].
        reward_funcs (`list[str]` or `None`, *optional*, defaults to `None`):
            Reward functions to use. It can be either one of `"think_format_reward"`; or a dotted import path " (e.g.,
            `'my_lib.rewards.custom_reward'`).
    NhelpzReward model id of a pretrained model hosted inside a model repo on huggingface.co or local path to a directory containing model weights saved using `PreTrainedModel.save_pretrained`.)defaultmetadatareward_model_name_or_pathzReward functions to use. It can be either one of  'think_format_reward'; or a dotted import path. (e.g., 'my_lib.rewards.custom_reward').reward_funcs)__name__
__module____qualname____doc__r   r   r   str__annotations__r   list r   r   D/home/ubuntu/.local/lib/python3.10/site-packages/trl/scripts/grpo.pyr   )   s   
 r   c              	   C   sP  t j|j|jd}tj|j|jd}g }| jr&tj| j|jdd}|| | jrn| jD ]A}|t	v r:|t	|  q,d|v r_|
dd\}}tjdt  t|}	t|	|}
||
 q,td| dtt	  dt| j| jd	}t||||| j |jd
kr|| j nd |t|d}|  ||j  |j!r|j!| jd d S d S )N)trust_remote_code   )r   
num_labels.r   z Could not load reward function 'z'. Expected one of z or a valid import path.)nameno)modelr   argstrain_dataseteval_datasetprocessing_classpeft_config)dataset_name)"r   from_pretrainedmodel_name_or_pathr   r   r   r   appendr   reward_funcs_registryrsplitsyspathinsertosgetcwd	importlibimport_modulegetattr
ValueErrorr   keysr   r+   dataset_configr
   dataset_train_spliteval_strategydataset_test_splitr   train
save_model
output_dirpush_to_hub)script_argstraining_args
model_argsr%   	tokenizerr   reward_model	func_namemodule_pathmodulereward_funcdatasettrainerr   r   r   mainG   sT   





rN   
subparsersc                 C   s2   t ttf}| d ur| jdd|d}|S t|}|S )NgrpozRun the GRPO training script)r   dataclass_types)r   r	   r   
add_parserr   )rO   rQ   parserr   r   r   make_parser   s   
rT   __main__)N)#argparser6   r4   r1   dataclassesr   r   typingr   datasetsr   transformersr   r   r   trlr	   r
   r   r   r   r   trl.rewardsr   r/   r   rN   _SubParsersActionrT   r   rS   parse_args_and_configrC   rD   rE   r   r   r   r   <module>   s*    8	