o
    5tiA                     @   sp   d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlmZ d dl	m
Z
mZmZmZmZ G dd deZdS )    N)partial)
SubCommand)MergeDictAction	SplitArgs_int_or_none_list_arg_typerequest_caching_arg_to_dicttry_parse_jsonc                       sJ   e Zd ZdZdejf fddZdddZed	ej	ddfd
dZ
  ZS )Runz.Command for running language model evaluation.
subparsersc                    sN   t  j|i | |jddddtdtjd| _|   | jj	| j
d d S )Nrunz-Run the evaluation harness on specified tasksz9Evaluate language models on various benchmarks and tasks.z`lm-eval run --model <model> --tasks <task> <task> --model_args <arg=value> <arg=value> [options]a=  
                examples:
                  # Basic evaluation with HuggingFace model
                  $ lm-eval run --model hf --model_args pretrained=gpt2 dtype=float32 --tasks hellaswag

                  # Evaluate on multiple tasks with few-shot examples
                  $ lm-eval run --model vllm --model_args pretrained=EleutherAI/gpt-j-6B --tasks arc_easy arc_challenge --num_fewshot 5

                  # Evaluation with custom generation parameters
                  $ lm-eval run --model hf --model_args pretrained=gpt2 --tasks lambada --gen_kwargs temperature=0.8 top_p=0.95 'stop=["\n\n"]'

                  # Use configuration file
                  $ lm-eval run --config my_config.yaml --tasks mmlu

                For more information, see: https://github.com/EleutherAI/lm-evaluation-harness
            )helpdescriptionusageepilogformatter_class)func)super__init__
add_parsertextwrapdedentargparseRawDescriptionHelpFormatter_parser	_add_argsset_defaults_execute)selfr
   argskwargs	__class__ D/home/ubuntu/.local/lib/python3.10/site-packages/lm_eval/_cli/run.pyr      s   zRun.__init__returnNc              
   C   sb  | j | _ | j d}|jddd tddd | j d}|jdd	d d
dttd d |jddtd ddd |jddd d
tddd |jdtddt	j
ddd |jddtd d d!d |jd"d#td dd$d | j d%}|jd&d'td d(d)d |jd*d+tt	j
d,td-d |jd.td d(d/d |jd0td d1d2d |jd3d d
tdtd4d | j d5}|jd6d7d tdd8d |jd9d:d;t	j
d<d= |jd>d?d td@dAd | j dB}|jdCtd g dDdEdF |jdGd;t	j
dHd= | j dI}|jdJtd dKdLd |jdMdNdO ddt	j
dPdQd | j dR}|jdStd ddTd | j dU}|jdVdWtjd dXdYd |jdZd[d;t	j
d\d= |jd]d;t	j
d^d= |jd_d d
td`dad |jdbd d
td`dcd |jddd d
td`ded | j df}	|	jdgdhd;t	j
did= dj}
|	jdkttdldm|
d dntdo|
 dp d |	jdqd;t	j
drd= |	jdsd;t	j
dtd= |	jdutjd dtdvd d S )wNconfigurationz--configz-Cz<path>z&Set initial arguments from YAML config)defaulttypemetavarr   zmodel and tasksz--tasksz-t+z<task>z
                Space (or comma-separated) list of task names or groupings.
                Use 'lm-eval list tasks' to see all available tasks.
            )r&   nargsr(   actionr   z--modelz-Mz<model>zModel name (default: hf))r'   r&   r(   r   z--model_argsz-az<arg>z?Model arguments as 'key=val,key2=val2' or `key=val` `key2=val2`)r&   r*   r+   r(   r   z--apply_chat_template?Tz
<template>z7Apply chat template to prompts (optional template name))r'   r*   constr&   r(   r   z--limitz-Lz<limit>z3Limit examples per task (integer count or fraction)z--use_cachez-cz8Path to cache model responses (skips repeated inference)zevaluation settingsz--num_fewshotz-fz<n>z&Number of examples in few-shot contextz--batch_sizez-bz<size>zIBatch size: 'auto', 'auto:N' (auto-tune N times), or integer (default: 1)z--max_batch_sizez/Maximum batch size when using --batch_size autoz--devicez<device>z+Device to use (e.g. cuda, cuda:0, cpu, mps)z--gen_kwargszGeneration arguments as `temperature=0,stop=["stop"]` or `key=val` `key2=val2`.Values should be parsable with ast.literal_eval.z#data and output (see also: --limit)z--output_pathz-oz1Output dir or json file for results (and samples)z--log_samplesz-s
store_truez:Save all model outputs and documents for post-hoc analysis)r+   r&   r   z	--samplesz-Ez<json>z`JSON mapping task names to sample indices, e.g. '{"task1": [0,1,2]}'. Incompatible with --limit.z/caching and performance (see also: --use_cache)z--cache_requests)truerefreshdeletez0Cache preprocessed prompts (true|refresh|delete))r'   r&   choicesr   z--check_integrityzRun task test suite validationz5instruct formatting (see also: --apply_chat_template)z--system_instructionz<text>zAdd custom system instruction.z--fewshot_as_multiturnc                 S   s   |   dv S )N)r/   1yes)lower)xr"   r"   r#   <lambda>   s    zRun._add_args.<locals>.<lambda>z<bool>zhUse fewshot as multi-turn conversation. Auto-enabled with --apply_chat_template. Use 'false' to disable.ztask managementz--include_pathz'Additional directory for external taskszlogging and trackingz--verbosityz-vz<level>z<(Deprecated) Log level. Use LMEVAL_LOG_LEVEL env var insteadz--write_outz-wz%Print prompts for first few documentsz--show_configz0Display full task configuration after evaluationz--wandb_argsz<args>z1Weights & Biases init arguments key=val key2=val2z--wandb_config_argsz3Weights & Biases config arguments key=val key2=val2z--hf_hub_log_argsz4Hugging Face Hub logging arguments key=val key2=val2zadvanced optionsz--predict_onlyz-xz.Save predictions only, skip metric computationz0,1234,1234,1234z--seed      z<seed>zG
                Random seeds for python,numpy,torch,fewshot (default: z).
                Use single integer for all, or comma-separated list of 4 values.
                Use 'None' to skip setting a seed. Example: --seed 42 or --seed 0,None,8,52
            z--trust_remote_codez1Allow executing remote code from Hugging Face Hubz--confirm_run_unsafe_codez4Confirm understanding of unsafe code execution risksz
--metadataz`key=val` `key2=val` args parsable by ast.literal_eval (merged with model_args),
                required for some tasks such as RULER)r   add_argument_groupadd_argumentstrr   r   r   stripr   r   SUPPRESSfloatintr   r   upperr   r   jsonloads)r   config_groupmodel_group
eval_group
data_groupcache_grouptemplate_group
task_grouplogging_groupadvanced_groupdefault_seed_stringr"   r"   r#   r   1   s
  
		


	

zRun._add_argsr   c                 C   s
  dt jd< ddlm} tt}|| }ddlm	} ddl
m}m} ddlm}m} |jr5||j|j}	|jr>|j|jd< t jd	d
rNt jd	|jd< |dCi |j}
||j}d|jv ri|jsi|d |jd
urw|d|j  |d|j  |dCi d|jd|jd|jd|jd|jd|j d|j!d|j"d|j#ddd|j#ddd|j#ddd|j$d|j%d|j&d|j'd |jd!|
d"|j(d#|j)d$|j*d%|j+d&|d'|j,d(|j-d)|j.r|j.d nd
d*|j.r|j.d+ nd
d,|j.r|j.d- nd
d.|j.r|j.d/ nd
d0|j/d1|j}|d
ur|jr4|0d}t1j2|d-|dd2}|j3rEt4| d35t6t7|d4 d5 }|jrz|	8| |	9  |jrh|	:| W n t;y } z|d6|  W Y d
}~nd
}~ww |
j<||jr|nd
d7 |jr|d8 = D ]\}}|
j>||| d9 q|
j?s|
j@r|
A  |j0d:d
 t4|j d;|j d<|j+ d=|j$ d>|j d?|j |rd;| d@ndA  t4|| dB|v rt4||dB |jr|	jBC  d
S d
S d
S )Dz8Runs the evaluation harness with the provided arguments.falseTOKENIZERS_PARALLELISMr   )EvaluatorConfig)simple_evaluate)EvaluationTrackerWandbLogger)handle_non_serializable
make_tableoutput_pathHF_TOKENNtokenpush_samples_to_hubz<Pushing samples to the Hub requires --log_samples to be set.zIncluding path: zSelected Tasks: model
model_argstasksnum_fewshot
batch_sizemax_batch_sizedevice	use_cachecache_requestsFrewrite_requests_cachedelete_requests_cachelimitsamplescheck_integrity	write_outlog_samplesevaluation_trackersystem_instructionapply_chat_templatefewshot_as_multiturn
gen_kwargstask_manager	verbositypredict_onlyrandom_seednumpy_random_seed   torch_random_seed   fewshot_random_seedr8   confirm_run_unsafe_codemetadata)indentr&   ensure_ascii,configbatch_sizeszLogging to W&B failed: )resultsrf   configs)	task_namerf   trust_remote_codez (z), gen_kwargs: (z
), limit: z, num_fewshot: z, batch_size: ) groupsr"   )Dosenvironlm_eval.config.evaluate_configrP   logging	getLogger__name__from_clilm_evalrQ   lm_eval.loggersrR   rS   lm_eval.utilsrT   rU   
wandb_argswandb_config_argsrV   hf_hub_log_argsgetprocess_tasksry   ri   warninginclude_pathinfor\   rZ   r[   r]   r^   r_   r`   ra   rb   re   rf   rg   rh   rk   rl   rm   rn   rp   rq   seedrx   poprB   dumpsshow_configprintjoinmapr<   	post_initlog_eval_resultlog_eval_samples	Exceptionsave_results_aggregateditemssave_results_samplespush_results_to_hubrY   recreate_metadata_cardr   finish)r   rP   eval_loggercfgrQ   rR   rS   rT   rU   wandb_loggerrj   ro   r   rf   dumpedr~   er   _r"   r"   r#   r   O  s  



	
 !"
&




2zRun._execute)r$   N)r   
__module____qualname____doc__r   _SubParsersActionr   r   staticmethod	Namespacer   __classcell__r"   r"   r    r#   r	      s    
   r	   )r   rB   r   r   r   	functoolsr   lm_eval._cli.subcommandr   lm_eval._cli.utilsr   r   r   r   r   r	   r"   r"   r"   r#   <module>   s    	