o
    5tiG                     @  s  d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlm	Z	 d dl
mZ d dlZd dlZd dlZd dlZd dlZd dlmZ d dlmZmZ d dlmZmZmZmZmZmZmZm Z  d dl!m"Z"m#Z#m$Z$ d d	l%m&Z&m'Z' d d
l(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0 erd dlm1Z1 d dlm2Z2 d dl3m4Z4 e5e6Z7e,ddddddddddddddddddddddddeeeeddfdJd=d>Z8e,											?	dKdLdDdEZ9dMdHdIZ:dS )N    )annotationsN)defaultdict)TYPE_CHECKING)delete_cache)DEFAULT_OTHER_SEEDDEFAULT_RANDOM_SEED)consolidate_group_resultsconsolidate_resultsget_sample_sizeget_subtask_listget_task_listprepare_print_tasksprint_writeoutrun_task_tests)add_env_infoadd_tokenizer_infoget_git_commit_hash)TaskManagerget_task_dict)handle_non_serializablehash_dict_imageshash_stringpositional_deprecatedset_torch_seedsetup_loggingsimple_parse_args_string	wrap_text)LM)Task)EvaluationTrackerF順 Tmodelstr | LM
model_args)str | dict[str, str | int | float] | Nonetaskslist[str | dict | Task] | Nonenum_fewshot
int | None
batch_sizeint | str | Nonemax_batch_sizedevice
str | None	use_cachecache_requestsboolrewrite_requests_cachedelete_requests_cachelimitint | float | Nonesamplesdict | Nonebootstrap_itersintcheck_integrity	write_outlog_samplesevaluation_trackerEvaluationTracker | Nonesystem_instructionapply_chat_template
bool | strfewshot_as_multiturn
gen_kwargsstr | dict | Nonetask_managerTaskManager | Nonepredict_onlyrandom_seednumpy_random_seedtorch_random_seedfewshot_random_seedconfirm_run_unsafe_codemetadatac           %        s  |dur	t |d t }|dur|durtdd ttr+tfdd D s=ttrTt fdd D rT|sTt	t
dttrMd	n d
 |
r^td t  g } |durq| d|  t| |dur| d|  tj| |dur| d|  t| dur| d  | rtd|  |du rg }t|dkrtdrttrtt	d d sdt| tr"du rt	d dttrtd|  d  tjj| |||d}!n9tt
d|  dt  tjj| |||d}!nt| tjjjs5tdt|  dtd | }!|durctd|d t|!j  d   tjj!|!|d t|!j  d }!|du rttrrtn	ttrzni |pi B }t"|d }t#||}"fd!d"|"}"|rt$|d# |dur|j%j&t| tr| nd$pd||r|!'|nd|d% t(|!|"||||	||rd&n||||||d'}#|durt |d |!j dkrUt| tr| }$nt)| d(rt)| j*d)r| j*j+}$nt| j,}$|$d*|#d(< t)|!d+r!|#d( -|!.  |#d( -|t)|!d,r3t/|!j0 ng |||||||d- t1 |#d.< ||#d/< t2|# t3|#|! |#S dS )0a  Instantiate and evaluate a model on a list of tasks.

    Args:
        model (str | LM): Name of model or LM object. See
            lm_eval.models.__init__.py for available aliases.
        model_args (str | dict | None): String or dict arguments for each model
            class, see LM.create_from_arg_string and LM.create_from_arg_object.
            Ignored if `model` argument is a LM object.
        tasks (list[str | dict | Task]): List of task names or Task objects.
            Task objects will be taken to have name task.EVAL_HARNESS_NAME if defined
            and type(task).__name__ otherwise.
        num_fewshot (int): Number of examples in few-shot context.
        batch_size (int | str | None): Batch size for model.
        max_batch_size (int | None): Maximal batch size to try with automatic
            batch size detection.
        device (str | None): PyTorch device (e.g. "cpu" or "cuda:0") for running
            models.
        use_cache (str | None): A path to a sqlite db file for caching model
            responses. `None` if not caching.
        cache_requests (bool): Speed up evaluation by caching the building of
            dataset requests. `None` if not caching.
        rewrite_requests_cache (bool): Rewrites all the request cache if set to
            `True`. `None` if not desired.
        delete_requests_cache (bool): Deletes all the request cache if set to
            `True`. `None` if not desired.
        limit (int | float | None): Limit the number of examples per task (only
            use this for testing). If <1, limit is a percentage of the total
            number of examples.
        samples (dict | None): Dictionary indicating which examples should be
            tested in each task, e.g.,
            {"mmlu_astronomy": [0, 3, 6], "mmlu_anatomy": [1, 4, 7, 10]}.
        bootstrap_iters (int): Number of iterations for bootstrap statistics, used
            when calculating stderrs. Set to 0 for no stderr calculations to be
            performed.
        check_integrity (bool): Whether to run the relevant part of the test suite
            for the tasks.
        write_out (bool): If True, write out an example document and model input
            for checking task integrity.
        log_samples (bool): If True, write out all model outputs and documents for
            per-sample measurement and post-hoc analysis.
        evaluation_tracker (EvaluationTracker | None): Tracker for logging
            experiment configuration and results.
        system_instruction (str | None): System instruction to be applied to the
            prompt.
        apply_chat_template (bool | str): Specifies whether to apply a chat
            template to the prompt. If set to True, the default chat template is
            applied. If set to a string, applies the specified chat template by
            name. Defaults to False (no chat template applied).
        fewshot_as_multiturn (bool): Whether to provide the fewshot examples as a
            multiturn conversation or a single user turn.
        gen_kwargs (dict | str | None): Arguments for model generation. Ignored
            for all tasks with loglikelihood output_type.
        task_manager (TaskManager | None): Task manager instance to use.
        verbosity (str | None): Verbosity level for logging.
        predict_only (bool): If True, only model outputs will be generated and
            returned. Metrics will not be evaluated.
        random_seed (int): Random seed for python's random module. If set to None,
            the seed will not be set.
        numpy_random_seed (int): Random seed for numpy. If set to None, the seed
            will not be set.
        torch_random_seed (int): Random seed for torch. If set to None, the seed
            will not be set.
        fewshot_random_seed (int): Random seed for fewshot sampler random generator.
            If set to None, the seed of generator will be set to None.
        confirm_run_unsafe_code (bool): Whether to confirm running tasks marked
            as unsafe.
        metadata (dict | None): Additional metadata to be added to the task
            manager. Will get passed to the download function of the task.

    Returns:
        dict | None: Dictionary of results, or None if not on rank 0.
    N)	verbosity@Either 'limit' or 'samples' must be None, but both are not None.)instchatc                 3  s    | ]	}|   v V  qd S N)lower.0kw)r#    E/home/ubuntu/.local/lib/python3.10/site-packages/lm_eval/evaluator.py	<genexpr>   s    z"simple_evaluate.<locals>.<genexpr>c                 3  s&    | ] t  fd dD V  qdS )c                 3  s     | ]}|t   v V  qd S rQ   )strrR   rS   vrV   rW   rX      s    z,simple_evaluate.<locals>.<genexpr>.<genexpr>N)any)rT   )_NEEDS_CHAT_TEMPLATErZ   rW   rX      s
    
zpretrained=
pretrainedz appears to be an
                instruct or chat variant but chat template is not applied.
                Recommend setting `apply_chat_template` (optionally `fewshot_as_multiturn`).zDeleting requests cache...zSetting random seed to zSetting numpy seed to zSetting torch manual seed to zSetting fewshot manual seed to z | r   zDNo tasks specified, or no tasks found. Please verify the task names.zgeneration_kwargs: z specified through cli, these settings will update set parameters in yaml tasks. Ensure 'do_sample=True' for non-greedy decoding!z)model_args not specified. Using defaults. zInitializing z model, with arguments: )r)   r+   r,   z=The value of `model` passed to simple_evaluate() was of type z, but is required to be a subclass of lm_eval.api.model.LM . This may be because you are passing an initialized Hugging Face PreTrainedModel without having wrapped it in `lm_eval.models.huggingface.HFLM(pretrained=my_model)` first.zUsing pre-initialized modelzUsing cache at _rankz.db)rL   c              	     s,  i }|   D ]\}}t|tri || |i}q|ddkr;d ur-|jddd t|jj d|jj	  rLtd| d |j
d	d
 d ury|d }dkrctd| d n&td| d| d  |jdd n|d }d u r|jddd |jd |||< q|S )Noutput_typegenerate_untilgeneration_kwargsT)keyvalueupdatez: Using gen_kwargs: zProcessing z5 in output-only mode. Metrics will not be calculated!bypass)metric_namer'   r   z"num_fewshot has been set to 0 for z5 in its config. Manual configuration will be ignored.z#Overwriting default num_fewshot of z from z to )rd   re   )seed)items
isinstancedict
get_config
set_configeval_loggerinfoconfigtaskrc   override_metricwarningset_fewshot_seed)	task_dictadjusted_task_dict	task_nametask_objdefault_num_fewshot)_adjust_configrJ   rB   r'   rF   rV   rW   r{   +  sJ   




z'simple_evaluate.<locals>._adjust_config)	task_listCUSTOM)model_sourcer#   r>   chat_templaterA   T)lmrv   r3   r5   r/   r1   r7   r:   r;   r>   r?   rA   rM   rK   rq   _name_or_path)r!   r#   get_model_infobatch_sizes)r)   r   r,   r.   r3   r7   rB   rG   
numpy_seed
torch_seedfewshot_seedgit_hashdate)4r   time
ValueErrorrk   rY   r\   rl   valuesro   rt   r   getrp   r   appendrandomri   npr   joinlenr   lm_evalapiregistry	get_modelcreate_from_arg_objcreate_from_arg_stringr!   r   	TypeErrortyperank	CachingLMr   r   r   general_config_trackerlog_experiment_argsr   evaluatehasattrrq   r   __name__rf   r   listr   r   r   r   )%r!   r#   r%   r'   r)   r+   r,   r.   r/   r1   r2   r3   r5   r7   r9   r:   r;   r<   r>   r?   rA   rB   rD   rM   rF   rG   rH   rI   rJ   rK   rL   
start_dateseed_messager   rv   results
model_namerV   )r]   r{   rJ   rB   r#   r'   rF   rW   simple_evaluate4   sb  j






	

"


3









r   INFOr   r   rM   rY   c           A        s  |dur|durt d|durtdt|   |
r#td tt}tt}t|}|s>t	dd |D s>t dg }|D ])}|j
}t|dd	rZt| dd	sZ||j qBt|d
d	rk|skt d|j dqBt|dkrt| dd	st d| d|}g }|D ]}|j
}t||}|| |j||dur||jdn|| j| j|||	t|
||
rt| ddnd|
rt| ddndd td|j dt|j  |rt| |jD ]}|j}|| | q| jdkr(ddl}|jt|j| jd}| j|   ! " }|j#dkrdn|j#}t$||| j  }||  |7  < q|% D ]d\}}td| d g }|D ]}|&|g|j'  q>| jdkrk|| dkrkt(|| D ]}|&|g|j'  q^t| ||} t)| |ddD ]\}!}|j*|! qy| jdkr| j+  q-| j}"| j}#t)||ddD ]\}}|j
}|,  tt}$|jD ]}|$|j- | q|$. D ]}%|%j/dd d  q|jd j0D ] |dur||jdnd}&|j1|"||#|&d!}'|'D ]\}(})|&r|&|( n|(}*|$|( }|2|) fd"d#|D }+|rY|3|)},|*|)|,d$d# |D d%d# |D  fd&d#|D  t|+ t4t5j6|d j7d't8d	d(t4|d j9d t4t:|,d)}-|-;|+ |j<|- |+% D ]\}.}/|j=|. f |/ q]qqӐq|#dkrddl}|D ]Y}|r|"dkrdg|# nd}0|j>j?|j<|0dd* |"dkrtt@jAB|0|_<|j=D ]+}+|"dkrdg|# nd}1|j>j?|j=|+ |1dd* |"dkrtt@jAB|1|j=|+< qq~|"dkr|D ]	}|jC|d+ qtD|\}2}}3}4}5}6t|2rtE|2|4|^}2}4}7}tF||2\}8}9tG|}:i };|:% D ]N\}<}=t|=dkr`|=D ];}|6| % D ]1\}>}?|>|;vr5|?|;|>< |>|;v rX|;|> durX|;|> |?krXtd,|> d-|< d. d|;|>< q(q |;|6|<< qd/tH|8% it|9|7@ rxd0tH|9% ini tHtI|:% tHtJ|3% tHtJ|4% tHtJ|5% tHtJ|6% d1d2 t)||ddD d3}@|rtKjLd4d5d6krtM| drtN|n|}tH||@d7< |@S dS )8a  Instantiate and evaluate a model on a list of tasks.

    Args:
        lm (LM): Language Model.
        task_dict (dict[str, Task]): Dictionary of tasks. Tasks will be taken to
            have name type(task).config.task.
        limit (int | None): Limit the number of examples per task (only use this
            for testing).
        samples (dict | None): Dictionary indicating which examples should be
            tested in each task, e.g.,
            {"mmlu_astronomy": [0, 3, 6], "mmlu_anatomy": [1, 4, 7, 10]}.
        cache_requests (bool): Speed up evaluation by caching the building of
            dataset requests.
        rewrite_requests_cache (bool): Rewrites all the request cache if set to
            `True`.
        bootstrap_iters (int | None): Number of iterations for bootstrap
            statistics, used when calculating stderr. Set to 0 for skipping all
            stderr calculations.
        write_out (bool): If True, write out an example document and model input
            for checking task integrity.
        log_samples (bool): If True, write out all model outputs and documents
            for per-sample measurement and post-hoc analysis.
        system_instruction (str | None): System instruction to be applied to the
            prompt.
        apply_chat_template (bool | str): Specifies whether to apply a chat
            template to the prompt. If set to True, the default chat template is
            applied. If set to a string, applies the specified chat template by
            name. Defaults to False (no chat template applied).
        fewshot_as_multiturn (bool): Whether to provide the fewshot examples as a
            multiturn conversation or a single user turn.
        verbosity (str): Verbosity level for logging. (no-op, deprecated)
        confirm_run_unsafe_code (bool): Whether to confirm running tasks marked
            as unsafe.

    Returns:
        dict | None: Dictionary of results, or None if not on rank 0.
    NrN   zEvaluating examples for tasks z~Chat template formatting change affects loglikelihood and multiple-choice tasks. See docs/chat-template-readme.md for details.c                 s  s"    | ]}d t |jdi vV  qdS )rg   _metric_fn_listN)getattrrr   )rT   task_outputrV   rV   rW   rX     s
    
zevaluate.<locals>.<genexpr>z7log_samples must be True for 'bypass' metric-only tasks
MULTIMODALFUNSAFE_CODEzAttempted to run task: zN which is marked as unsafe. Set confirm_run_unsafe_code=True to run this task.r   zAttempted to run tasks: z which require multimodal input, but the selected model type does not currently implement this. Multimodal support is currently restricted to the ['hf-multimodal', 'vllm-vlm'] model type.r?   tokenizer_namer_   )r3   r5   r   
world_sizer/   r1   r>   r?   rA   r   r   zTask: z#; number of requests on this rank:    )r,   multiple_choiceloglikelihoodzRunning z	 requestsT)strictc                 S  s   | j S rQ   )idx)xrV   rV   rW   <lambda>m  s    zevaluate.<locals>.<lambda>)rd   )r   r3   r   r5   c                      g | ]}|j   qS rV   filtered_respsrT   req
filter_keyrV   rW   
<listcomp>  s    zevaluate.<locals>.<listcomp>c                 S     g | ]}|j qS rV   )argsr   rV   rV   rW   r         c                 S  r   rV   )respsr   rV   rV   rW   r     r   c                   r   rV   r   r   r   rV   rW   r     s    
   )indentdefaultensure_ascii)doc_iddoctarget	argumentsr   r   filtermetricsdoc_hashprompt_hashtarget_hash)objobject_gather_listdst)r7   z#Higher_is_better values for metric z
 in group z( are not consistent. Defaulting to None.r   groupsc                 S  sD   i | ]\}}|j t|jjt|r|nt|jjt|jjd qS ))original	effective)rx   r   rr   	eval_docsmin)rT   r   r3   rV   rV   rW   
<dictcomp>  s    

zevaluate.<locals>.<dictcomp>)group_subtasksconfigsversionszn-shothigher_is_betterz	n-samplesLMEVAL_HASHMM10r5   )Or   ro   rp   r   keysrt   r   r8   r   allrr   r   r   rx   r   r
   build_all_requestsr   r   r   r0   debug	instancesr   request_typetorchtensor
_instancesr,   acceleratorgathercpudetachnumpytolistOUTPUT_TYPEmaxrj   extendrepeatsrangezipr   wait_for_everyoneapply_filtersr   r   sortr   doc_iteratorprocess_resultsdoc_to_targetr   jsondumpsr   r   r   rY   rf   logged_samplessample_metricsdistributedgather_object	itertoolschainfrom_iterablecalculate_aggregate_metricr	   r   r   r   rl   reversedsortedosenvironr   r   )Ar   rv   r3   r5   r/   r1   r7   r:   r;   r>   r?   rA   rM   rK   requestspadding_requests
eval_tasksincompatible_tasksr   rr   	limit_arglimitsinstancereqtyper   instances_rnkgathered_itemnumpadreqscloned_reqsr   _r   r   RANK
WORLD_SIZEinstances_by_doc_idr   indicesr   r   r   doc_id_truer   r   examplemetricre   full_samplesmetric_listr   r   r   r'   r   show_group_tableresults_agg	group_aggsubtask_list_higher_is_bettergroupr|   mhresults_dictrV   r   rW   r     s  7












/










r   returnrl   c                 C  s   | dv | dk| dkd}|S )N>   truerefreshr*  delete)r/   r1   r2   rV   )r/   request_caching_argsrV   rV   rW   request_caching_arg_to_dict  s
   r-  )<r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r(   r,   r-   r.   r-   r/   r0   r1   r0   r2   r0   r3   r4   r5   r6   r7   r8   r9   r0   r:   r0   r;   r0   r<   r=   r>   r-   r?   r@   rA   r0   rB   rC   rD   rE   rF   r0   rG   r8   rH   r8   rI   r8   rJ   r8   rK   r0   rL   r6   )NNFFr    FTNFFr   F)r   r   r3   r(   r5   r6   r/   r0   r1   r0   r7   r(   r:   r0   r;   r0   r>   r-   r?   r@   rA   r0   rM   rY   rK   r0   )r/   rY   r(  rl   );
__future__r   r   r   loggingr  r   r   collectionsr   typingr   r   r   lm_eval.api.metricsr   lm_eval.api.modellm_eval.api.registrylm_eval.api.tasklm_eval.caching.cacher   lm_eval.defaultsr   r   lm_eval.evaluator_utilsr   r	   r
   r   r   r   r   r   lm_eval.loggers.utilsr   r   r   lm_eval.tasksr   r   lm_eval.utilsr   r   r   r   r   r   r   r   r   r   lm_eval.loggersr   	getLoggerr   ro   r   r   r-  rV   rV   rV   rW   <module>   s    (
(
  y  j