o
    5tih                    @  s  d dl mZ d dlZd dlZd dlZd dlZd dlZd dlmZm	Z	m
Z
mZ d dlmZ d dlmZ d dlmZmZmZmZ d dlZd dlZd dlmZ d dlmZ d d	lmZ d d
lmZmZ d dl m!Z!m"Z"m#Z# d dl$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z* d dl+m,Z,m-Z-m.Z.m/Z/m0Z0 d dl1m2Z2m3Z3 d dl4m5Z5 d dl6m7Z7 d dl8m9Z9 erd dl4m:Z: g dZ;e<e=Z>e5Z5G dd dej?Z@G dd de@ZAG dd de@ZBG dd de@ZCdS )    )annotationsN)CallableIterableIteratorMapping)deepcopy)partial)TYPE_CHECKINGAnyLiteralcast)tqdm)utils)samplers)Instance
OutputType)bits_per_bytemeanweighted_perplexity)AGGREGATION_REGISTRYDEFAULT_METRIC_REGISTRYget_aggregation
get_metricget_metric_aggregationis_higher_better)Messageends_with_whitespacemaybe_delimitmultiturn_to_singleturnrequires_delimiter)load_from_cachesave_to_cache)
TaskConfig)build_filter_ensemble)
get_prompt)FewshotConfig)loglikelihoodmultiple_choiceloglikelihood_rollinggenerate_untilc                   @  s  e Zd ZU dZdZded< dZded< dZded< dZded	< 				ddddZ				ddddZ
edddZejdd Zejdd Zejdd Zdd d!Zdd"d#Zdd$d%Zdd&d'Zdd*d+Zedd-d.Zd/d0 Zd1d2 Zejd3d4 Zejd5d6 Zd7d8 Zd9d: Zd;d< Zddd=d>d?d?dd?d?dd@dAddSdTZejdUdV Z ejdWdX Z!ejdYdZ Z"ejd[d\ Z#dd_d`Z$e%dadb Z&e%dcdd Z'e(j)ddedfZ*ddhdiZ+ddjdkZ,dddndoZ-ddqdrZ.dddtduZ/eddwdxZ0d=dd>ddydd{d|Z1e2ddddZ3dS )Taskao  A task represents an entire benchmark including its dataset, problems,
    answers, and evaluation methods. See BoolQ for a simple example implementation

    A `doc` can be any python object which represents one instance of evaluation.
    This is usually a dictionary e.g.
        {"question": ..., "answer": ...} or
        {"question": ..., question, answer)
    Nzint | str | NoneVERSION
str | NoneDATASET_PATHDATASET_NAMEzOutputType | NoneOUTPUT_TYPEdata_dir	cache_dirdownload_modedatasets.DownloadMode | NoneconfigMapping | NonereturnNonec                 C  sV   |  ||| d| _d| _d| _|rti |nt | _tdddggg| _d| _dS )ag  
        :param data_dir: str
            Stores the path to a local folder containing the `Task`'s data files.
            Use this to specify the path to manually downloaded data (usually when
            the dataset is not publicly accessible).
        :param cache_dir: str
            The directory to read/write the `Task` dataset. This follows the
            HuggingFace `datasets` API with the default cache directory located at:
                `~/.cache/huggingface/datasets`
            NOTE: You can change the cache location globally for a given process
            to another directory:
                `export HF_DATASETS_CACHE="/path/to/another/directory"`
        :param download_mode: datasets.DownloadMode
            How to treat pre-existing `Task` downloads and data.
            - `datasets.DownloadMode.REUSE_DATASET_IF_EXISTS`
                Reuse download and reuse dataset.
            - `datasets.DownloadMode.REUSE_CACHE_IF_EXISTS`
                Reuse download with fresh dataset.
            - `datasets.DownloadMode.FORCE_REDOWNLOAD`
                Fresh download and fresh dataset.
        Nnone
take_first)	download_training_docs_fewshot_docs
_instancesr"   _configr#   _filtersfewshot_rnd)selfr0   r1   r2   r4    rB   D/home/ubuntu/.local/lib/python3.10/site-packages/lm_eval/api/task.py__init__T   s   zTask.__init__c                 C  s   t j| j| j|||d| _dS )a#  Downloads and returns the task dataset.
        Override this method to download the dataset from a custom API.

        :param data_dir: str
            Stores the path to a local folder containing the `Task`'s data files.
            Use this to specify the path to manually downloaded data (usually when
            the dataset is not publicly accessible).
        :param cache_dir: str
            The directory to read/write the `Task` dataset. This follows the
            HuggingFace `datasets` API with the default cache directory located at:
                `~/.cache/huggingface/datasets`
            NOTE: You can change the cache location globally for a given process
            by setting the shell environment variable, `HF_DATASETS_CACHE`,
            to another directory:
                `export HF_DATASETS_CACHE="/path/to/another/directory"`
        :param download_mode: datasets.DownloadMode
            How to treat pre-existing `Task` downloads and data.
            - `datasets.DownloadMode.REUSE_DATASET_IF_EXISTS`
                Reuse download and reuse dataset.
            - `datasets.DownloadMode.REUSE_CACHE_IF_EXISTS`
                Reuse download with fresh dataset.
            - `datasets.DownloadMode.FORCE_REDOWNLOAD`
                Fresh download and fresh dataset.
        )pathnamer0   r1   r2   N)datasetsload_datasetr-   r.   dataset)rA   r0   r1   r2   rB   rB   rC   r:   |   s   zTask.downloadr"   c                 C     | j S )z2Returns the TaskConfig associated with this class.)r>   rA   rB   rB   rC   r4      s   zTask.configc                 C     dS )z#Whether the task has a training setNrB   rK   rB   rB   rC   has_training_docs      zTask.has_training_docsc                 C  rL   )z%Whether the task has a validation setNrB   rK   rB   rB   rC   has_validation_docs   rN   zTask.has_validation_docsc                 C  rL   )zWhether the task has a test setNrB   rK   rB   rB   rC   has_test_docs   rN   zTask.has_test_docsr   c                 C     g S j
        :return: Iterable[obj]
            A iterable of any object, that doc_to_text can handle
        rB   rK   rB   rB   rC   training_docs      zTask.training_docsc                 C  rQ   rR   rB   rK   rB   rB   rC   validation_docs   rU   zTask.validation_docsc                 C  rQ   rR   rB   rK   rB   rB   rC   	test_docs   rU   zTask.test_docsc                 C  sP   |   r|  S |  r|  S | jdddkr$td| jj d | 	 S )rS   num_fewshotr   [Task: zs] has_training_docs and has_validation_docs are False, using test_docs as fewshot_docs but this is not recommended.)
rM   rT   rO   rV   r4   geteval_loggerwarningtaskrW   rK   rB   rB   rC   fewshot_docs   s   zTask.fewshot_docsdocdictc                 C     |S a8  
        Override this to process (detokenize, strip, replace, etc.) individual
        documents. This can be used in a map over documents of a data split.
        E.g. `map(self._process_doc, self.dataset["validation"])`

        :return: dict
            The processed version of the specified `doc`.
        rB   rA   r_   rB   rB   rC   _process_doc      	zTask._process_doclist[Instance]c                 C  rJ   )zAfter calling `task.build_all_requests()`, tasks
        maintain a list of the dataset instances which will be evaluated.
        )r=   rK   rB   rB   rC   	instances   s   zTask.instancesc                 C  s&   | j d u rt|  | _ || j |S N)r;   listrT   samplerA   krndrB   rB   rC   fewshot_examples   s   
zTask.fewshot_examplesc                 C  s   t d)NzSOverride doc_to_decontamination_query with document specific decontamination query.NotImplementedErrorrc   rB   rB   rC   doc_to_decontamination_query   s   z!Task.doc_to_decontamination_queryc                 C     d S rh   rB   rc   rB   rB   rC   doc_to_text      zTask.doc_to_textc                 C  rr   rh   rB   rc   rB   rB   rC   doc_to_target   rt   zTask.doc_to_targetc                 C     t rh   ro   rc   rB   rB   rC   doc_to_image     zTask.doc_to_imagec                 C  rv   rh   ro   rc   rB   rB   rC   doc_to_audio  rx   zTask.doc_to_audioc                 C  rL   N rB   rc   rB   rB   rC   doc_to_prefix  rx   zTask.doc_to_prefixr      Fr{   )limitsamplesrank
world_sizecache_requestsrewrite_requests_cachesystem_instructionapply_chat_templatefewshot_as_multiturnchat_templatetokenizer_namer~   
int | Noner   list[int] | Noner   intr   r   boolr   r   r   r   r   Callable | Noner   strc                C  s  |}d| j j d| jj d| d| }||rdnd7 }||	r!dnd7 }||dur1d	t| nd7 }|d
| 7 }t||d}|rY|rY|sY|d| }dd |D }|| _dS t	d| jj d| d g }|ru|ro|ru|durud}t
| j||||d}t|}t||dD ]<\}}| j|| jjdu rdn| jj|||	|
| |d}| j||| jd || jjf||
d}t|t
s|g}|| q|d| }dd |D }|| _t| jdkrtd|r|r|rt||d dS dS dS )zEBuild a set of Instances for a task, and store them in task.instancesz	requests--z	shot-rankz-world_sizez-chat_templater{   z-fewshot_as_multiturnNz-system_prompt_hashz
-tokenizer)	file_namecachec                 S     g | ]	}|D ]}|qqS rB   rB   .0instance_groupinstancerB   rB   rC   
<listcomp>.      z+Task.build_all_requests.<locals>.<listcomp>zBuilding contexts for z	 on rank z...)r   r~   r   r   )totalr   )rX   r   r   r   r   
gen_prefixr]   )r_   ctxmetadatar   r   c                 S  r   rB   rB   r   rB   rB   rC   r   n  r   z,task.build_requests() did not find any docs!)r   obj)r>   r]   r4   rX   r   hash_stringr    r=   r[   infori   doc_iteratorlenr   fewshot_contextr|   construct_requestsrepeats
isinstanceappend
ValueErrorr!   )rA   r~   r   r   r   r   r   r   r   r   r   r   og_limit	cache_keycached_instancesflattened_instancesrg   doc_id_docsnum_docsdoc_idr_   fewshot_ctxinstsliced_instancesrB   rB   rC   build_all_requests  s   $
zTask.build_all_requestsc                 K  rL   )a[  Uses RequestFactory to construct Requests and returns an iterable of
        Requests which will be sent to the LM.

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param ctx: str
            The context string, generated by fewshot_context. This includes the natural
            language description, as well as the few shot examples, and the question
            part of the document for `doc`.
        :param doc_idx: int
            The index of a document within `self.test_docs()` or `self.validation_docs()`,
            whichever is the main split used.
        :param repeats: int
        TODO: update this docstring
            The number of times each instance in a dataset is inferred on. Defaults to 1,
            can be increased for techniques like majority voting.
        NrB   rA   r_   r   kwargsrB   rB   rC   r   |  s   zTask.construct_requestsc                 C  rL   )a  Take a single document and the LM results and evaluates, returning a
        dict where keys are the names of submetrics and values are the values of
        the metric for that one document

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param results:
            The results of the requests created in construct_requests.
        NrB   )rA   r_   resultsrB   rB   rC   process_results  s   zTask.process_resultsc                 C  rL   )z
        :returns: {str: [metric_score] -> float}
            A dictionary where keys are the names of submetrics and values are
            functions that aggregate a list of metric scores
        NrB   rK   rB   rB   rC   aggregation     zTask.aggregationc                 C  rL   )z
        :returns: {str: bool}
            A dictionary where keys are the names of submetrics and values are
            whether a higher value of the submetric is better
        NrB   rK   rB   rB   rC   higher_is_better  r   zTask.higher_is_betterkeyr
   c                 C     t | j|d S rh   getattrr>   rA   r   rB   rB   rC   
get_config     zTask.get_configc                 C     t |dS )z?Used for byte-level perplexity metrics in rolling loglikelihoodutf-8r   encodeclsr_   rB   rB   rC   count_bytes  s   zTask.count_bytesc                 C     t td|S )zcDownstream loglikelihood_rolling perplexity tasks with custom word boundaries should override this!\s+r   resplitr   rB   rB   rC   count_words     zTask.count_wordsc           	        s   |du rj durj }ntd|r|nd}|dkrd}nD r*j||d}n*jdu r>t r8 n _|	j|d } fdd|D d| }d	
fd
d|D d	 } }|| | S )a  Returns a fewshot context string that is made up of a prepended description
        (if provided), the `num_fewshot` number of examples, and an appended prompt example.

        :param doc: str
            The document as returned from training_docs, validation_docs, or test_docs.
        :param num_fewshot: int
            The number of fewshot examples to provide in the returned context string.
        :param rnd: random.Random
            The pseudo-random number generator used to randomly sample examples.
            WARNING: This is currently a required arg although it's optionalized with a default `None`.
        :param description: str
            The task's description that will be prepended to the fewshot examples.
        :returns: str
            The fewshot context.
        Nz>A `random.Random` generator argument must be provided to `rnd`r{   r   )rl   rm   r}   c                   s   g | ]}| kr|qS rB   rB   )r   x)r_   rB   rC   r         z(Task.fewshot_context.<locals>.<listcomp>

c                   s    g | ]}  | | qS rB   )rs   ru   )r   r_   rK   rB   rC   r     s    )r@   r   rM   rn   r<   ri   rO   rV   rW   rj   joinrs   )	rA   r_   rX   rm   descriptionr   labeled_examples	fewshotexexamplerB   r_   rA   rC   r     s:   





zTask.fewshot_contextlist[Instance] | Nonec                 C  6   t | dr| jD ]}|| j qdS td | jS z;Iterates over FilterEnsembles and applies them to instancesr?   z,No filter defined, passing through instancesNhasattrr?   applyr=   r[   r\   rA   frB   rB   rC   apply_filters     


zTask.apply_filtersc                 C  s
   | j  S )z#Returns the config as a dictionary.)r4   to_dictrK   rB   rB   rC   dump_config  s   
zTask.dump_configvalueupdatec                 C  sh   |du rt d|r+t| j|i }t|ts$td| dt|j d|| dS t	| j|| dS )z0Set or update the configuration for a given key.NzKey must be provided.zExpected a dict for key 'z', got z	 instead.)
r   r   r>   r   r`   	TypeErrortype__name__r   setattr)rA   r   r   r   current_valuerB   rB   rC   
set_config  s   
zTask.set_configmetric_namec                   s   i i i i f\| _ | _| _| _t | j  < t | j < t | j < i | j < t| ts; fdd| _	 fdd| _
d ig| jd< d| jd< dS )z
        Override the default metrics used for evaluation with custom metrics.

        Parameters:
        - metric_name (str): The name of the custom metric to override. Should be registered in api.metrics.
        c                       t  iS rh   )r   )r   yr   rB   rC   <lambda>(  s    z&Task.override_metric.<locals>.<lambda>c                     r   rh   )r   rB   r   rB   rC   r   )  s   metricmetric_listr   N)_metric_fn_list_aggregation_list_metric_fn_kwargs_higher_is_betterr   r   r   r   ConfigurableTaskr   r   r>   )rA   r   rB   r   rC   override_metric  s   


zTask.override_metricseedc                 C  s*   t || _t| dr| j| d S d S )Nsampler)randomRandomr@   r   r   set_rnd)rA   r   rB   rB   rC   set_fewshot_seed/  s   
zTask.set_fewshot_seeddatasets.Dataset | list[dict]c                 C  s:   |   r|  S |  r|  S td| j d| j d)NzTask dataset (path=z, name=z) must have valid or test docs!)rP   rW   rO   rV   r   r-   r.   rK   rB   rB   rC   	eval_docs4  s   zTask.eval_docs)r   r~   r   r   Iterator[tuple[int, Any]]c                  s   rDt | j t fddD sJ d  dt| jj dt  d tjt	fddt	| jD t
|d t
|d}|S |rJt
|nd }tjt	| jt
||t
|d}|S )	Nc                 3  s    | ]}| k V  qd S rh   rB   )r   e)nrB   rC   	<genexpr>I      z$Task.doc_iterator.<locals>.<genexpr>zqElements of --samples should be in the interval [0,k-1] where k is the number of total examples. In this case, k=.z: Evaluating on z	 examplesc                 3  s     | ]\}}| v r|V  qd S rh   rB   )r   ir   )r   rB   rC   r  P  s    )r   r~   r   )r   r   allr[   r   r4   r]   r   create_iterator	enumerater   )rA   r   r~   r   r   r   rB   )r  r   rC   r   ?  s.   

zTask.doc_iteratordict[str, Any]fieldc                 C  s$   |r|| v r
| | S t || S d S rh   )r   apply_template)r_   r  rB   rB   rC   resolve_field_  s   zTask.resolve_fieldNNNN)
r0   r,   r1   r,   r2   r3   r4   r5   r6   r7   )NNN)r0   r,   r1   r,   r6   r7   )r6   r"   )r6   r   r_   r`   r6   r`   )r6   rf   )r~   r   r   r   r   r   r   r   r   r   r   r   r   r,   r   r   r   r   r   r   r   r   r6   r7   r   r   r6   r
   )NNr6   r   r6   r`   )F)r   r   r   r
   r   r   r6   r7   )r   r   r6   r7   rh   )r   r   r6   r7   )r6   r   )
r   r   r~   r   r   r   r   r   r6   r  )r_   r  r  r,   )4r   
__module____qualname____doc__r+   __annotations__r-   r.   r/   rD   r:   propertyr4   abcabstractmethodrM   rO   rP   rT   rV   rW   r^   rd   rg   rn   rq   rs   ru   rw   ry   r|   r   r   r   r   r   r   classmethodr   r   r   positional_deprecatedr   r   r   r   r   r   r   r   staticmethodr  rB   rB   rB   rC   r*   ?   s   
 	*&









q







:
	
 r*   c                      sx  e Zd ZdZdZdZ				dgdhddZdidjddZdkddZdkddZ	dkddZ
dlddZdlddZdlddZ fddZej					dmdnd*d+Zddddd,d-d.dod5d6Z		dpdqd:d;Zdrd=d>Zd?d@ ZdsdAdBZdtdCdDZdidEdFZdidudIdJZdidvdLdMZdidwdOdPZdidwdQdRZdSdT ZdxdWdXZdYdZ Zdyd[d\Z dyd]d^Z!dzdadbZ"e#d{dcddZ$dedf Z%  Z&S )|r   YamlNr4   dict | Noner6   r7   c              
     s<  | j | _| jd u rtd*i || _n|d ur| jj| | jd u r&tdt| jjt	r:d| jjv r:| jjd | _
| jjd ur[| jjtvrVtd| jj ddt d| jj| _td| jj| _| jjd urld| _| jjrsd| _| jjd	ur|d| _| jjd ur| jj| _| jjd ur| jj| _i | _i | _i | _i | _| jjd u rt | jj }|D ]}t!|| j|< i | j|< t"|| j|< t#|| j|< qn| jjD ]ĉd
vrtdd
 }fddD }dv od du }| jj$d urd | j|< i | j|< n#t%|r|j&}	|j'}|	| j|< || j|< nt!||| j|< || j|< dv rHd }
t|
t(r;t)|
| j|< n1t%|
rGd | j|< n$dd t*+ D }t"|}t,-d| jj. d| d||   || j|< dv ryd | j|< qt,-d| jj. d| dt#|  t#|| j|< q| /| jj0 d | _1d | _2| jj3d urg | _4| jj3D ]0}|d }|d }g }|D ]  fdd D }|5 d |g qt6||}| j45| qn| jdkrt,7d t6ddd ggg| _4| jj8d urt,9d| jj8  t:| jj8| j| j| _;nd | _;| <  }d urS| jjr'| jj=nd}t>|}t|t(r9t?@|}ntA|t?jBrC|}n	tCd tD| ||d d!| _=| jE| _Ft>| jFjGH | _Gd"| _Id"| _J| jFd" }| K|}| L|}| jjMd ur| M|}t|t>st,Nd# ntO|}t|tPrt,7d$ || _Ind }t|t>rt,7d% tO|| _Jnt|tPr|d ur|| }nt(|}|d ur|n|g}| jjMd ur|D ]A}tQ|d" R }| jjST | jjSk}|r|rt,7d&| jjS d'| d( q|s|st,7d&| jjS d'| d) qd S d S )+NzNMust pass a config to ConfigurableTask, either in cls.CONFIG or `config` kwargversionzGot invalid output_type 'z', must be in ','r%   TFr   zK'metric' key not provided for an entry in 'metric_list', must be specified!c                   s   i | ]}|d vr| | qS ))r   r   r   hf_evaluaterB   r   r   )metric_configrB   rC   
<dictcomp>  s    z-ConfigurableTask.__init__.<locals>.<dictcomp>r#  r   c                 S  s   i | ]\}}||qS rB   rB   )r   rl   vrB   rB   rC   r&        rY   z	] metric z? is defined, but aggregation is not. using default aggregation=r   zI is defined, but higher_is_better is not. using default higher_is_better=rF   filterc                   s   i | ]}|d kr| | qS functionrB   r$  r*  rB   rC   r&    s    r+  r)   zRNo custom filters defined. Using default 'take_first' filter for handling repeats.r8   r9   zloading prompt defaultzMfewshot_config.sampler should be a string or subclass of ContextSampler, not )rm   r   zdoc_to_choice must return listz6doc_to_text returned an int. Assuming multiple inputs.z9doc_to_target returned a list. Assuming multiple targets.zBoth target_delimiter "z" and target choice: "z" have whitespacezf" do not have whitespace, ignore if the language you are evaluating on does not require/use whitespacerB   )UCONFIGr>   r4   r"   __dict__r   r   r   r   r`   r+   output_typeALL_OUTPUT_TYPESr   r/   r   fewshot_configfewshot_cfgrw   
MULTIMODALry   unsafe_codeUNSAFE_CODEdataset_pathr-   dataset_namer.   r   r   r   r   r   r   r   r   r   r   callable__call__r   r   r   r   itemsr[   r\   r]   r:   dataset_kwargsr;   r<   filter_listr?   r   r#   debug
use_promptr   r$   promptr^   r   ri   r   get_sampler
issubclassContextSamplerr   r   r   	task_docsfeatureskeysmultiple_inputmultiple_targetrs   ru   doc_to_choiceerrorr   r   r   isspacetarget_delimiterrstrip)rA   r0   r1   r2   r4   _metric_listr   r   hf_evaluate_metric	metric_fnagg_nameINV_AGG_REGISTRY
metric_aggfilter_configfilter_namefilter_functions
componentsfilter_pipeline_fs_docsconfig_samplerr^   sampler_clstest_doc	test_texttest_targettest_choice
num_choicecheck_choiceschoicechoice_has_whitespacedelimiter_has_whitespacerB   )r+  r%  rC   rD   j  sj  


























zConfigurableTask.__init__r;  dict[str, Any] | Nonec                 K  s   ddl m} |r|tj|dkr|dd  t| jjtr@t	
| jj dd  | jjdi | jjp4i | jjp:i | _d S tjd| j| jd|d urO|ni | _d S )	Nr   )parsez4.0.0trust_remote_codez`: Custom kwargs can be passed to `--metadata` in console (as json string) or to the TaskManager.zX
For example --metadata='{"max_seq_lengths":[4096, 8192]}'. For details see task Readme.)rE   rF   rB   )packaging.versionre  rG   __version__popr   r4   custom_datasetr   r[   r\   r]   r   r;  rI   rH   r-   r.   )rA   r;  r   vparserB   rB   rC   r:   R  s(   


zConfigurableTask.downloadr   c                 C     | j jd uS rh   )r4   training_splitrK   rB   rB   rC   rM   f     z"ConfigurableTask.has_training_docsc                 C  rl  rh   )r4   validation_splitrK   rB   rB   rC   rO   i  rn  z$ConfigurableTask.has_validation_docsc                 C  rl  rh   )r4   
test_splitrK   rB   rB   rC   rP   l  rn  zConfigurableTask.has_test_docsdatasets.Datasetc                 C  <   |   r| jjd ur| j| j| jj S | j| jj S d S rh   )rM   r4   process_docsrI   rm  rK   rB   rB   rC   rT   o     zConfigurableTask.training_docsc                 C  rr  rh   )rO   r4   rs  rI   ro  rK   rB   rB   rC   rV   w  rt  z ConfigurableTask.validation_docsc                 C  rr  rh   )rP   r4   rs  rI   rp  rK   rB   rB   rC   rW     s
   zConfigurableTask.test_docsc                   s   | j j }d ur| j j }d ur|| j| S | j| S | jjd urB| j j }d urB| td r4  |S  }t|r=| S t	dd | jj
 }d urY|dkrYtd| jj d t  S )NrB   z`fewshot_config['samples']` was incorrectly defined in the configuration. It should either be `list[dict]`, or callable returning this list.r   rY   zF] num_fewshot > 0 but fewshot_split is None. using preconfigured rule.)r2  r   rs  rI   r4   r1  r   ri   r8  	ExceptionrX   r[   r\   r]   superr^   )rA   r   rs  r   fsamples_shots	__class__rB   rC   r^     s&   

zConfigurableTask.fewshot_docsFr_   r`   rX   r   r   r,   r   r   r   Callable[..., str] | Noner   str | list[str]c              
   C  s  g }|rt || dnd}| || jjpd}	t||	| jj}
|
r)|td|
 |dkr| jj	|| j
j| jjkr;|nddD ]G}| || j
j| j
jrT| || j
jnd| || j
j}}}| || j
j}| jrutd|| }d}|| j||||| j
j| j
jd7 }q?| || jjr| |nd| |}}}| jrt|tsJ d	| j||||r||d
S d|d
S || j|||| jjdd7 }|r|r|rdd |D nt|}||}|S ddd |D }|S )aj  Build the full prompt context including system prompt, few-shot examples, and eval doc.

        Constructs a complete prompt by:
        1. Adding system instruction + task description (if provided)
        2. Adding `num_fewshot` labeled examples from the fewshot split
        3. Adding the evaluation document (without its answer)

        Each component is built using `build_qa_turn()` and can be rendered as plain
        text or formatted via a chat template.

        Args:
            doc (dict): The evaluation document to build context for.
            num_fewshot (int): Number of few-shot examples to include.
            system_instruction (str | None): System instruction to prepend to the prompt.
            apply_chat_template (bool): If True, format output using the chat template.
            fewshot_as_multiturn (bool): If True, keep few-shot examples as separate
                user/assistant turns. If False, collapse into a single user message.
            chat_template (Callable | None): Renders a list of message dicts to a string.
            gen_prefix (str | None): Prefix to start the assistant's response (e.g., "Answer:").

        Returns:
            str | list[str]: The formatted prompt string, or a list of strings for
                multiple-input tasks (e.g., Winogrande where each choice becomes a
                separate context).
        )add_generation_promptNr{   systemr   )r  eval_docr   qcar   	tgt_delim	few_delimz,multiple inputs require choices to be a list)r   r   )r  r  r   r  r  c                 S     g | ]}|  qS rB   r   r   mrB   rB   rC   r         z4ConfigurableTask.fewshot_context.<locals>.<listcomp>c                 s      | ]}|  V  qd S rh   to_textr  rB   rB   rC   r    r  z3ConfigurableTask.fewshot_context.<locals>.<genexpr>)r   r  r4   r   r   fewshot_delimiterr   r   r   rj   r2  r   rp  rs   rH  ru   r   rF  r   build_qa_turnrK  r   ri   multiple_input_contextr   r   )rA   r_   rX   r   r   r   r   r   messagesr   system_promptfs_docr  r  r  _gen_prefixresrB   rB   rC   r     s   $





z ConfigurableTask.fewshot_context r   r  r  r  list[str] | Noner  str | int | list[str] | Nonelist[Message]c          
      C  s   t |tsJ d| |duo|dk}td||r|s|n
|r&t||r&|ndg}|r`|r7t |tr7|| n
t |tr@|d n|}	t |	tsNJ d| t||	dd}	|td	|	| |S |rj|td	| |S )
a&  Build a single Q&A turn as a list of Messages.

        Constructs a user message containing the question/context, and optionally
        an assistant message containing the answer. Used for building both few-shot
        examples and the final evaluation prompt. The returned Messages can be
        rendered as plain text (via to_text()) or converted to chat format
        (via to_dict()) depending on whether a chat template is applied.

        Args:
            q (str): The question or context text (required).
            c (list[str] | None): List of answer choices for multiple-choice tasks.
                When provided with an integer `a`, indexes into this list to get the answer.
            a (str | int | list[str] | None): The answer - can be a string, an index
                into `c`, or a list of strings (for multiple targets).
            gen_prefix (str | None): A prefix to prepend to generated text (e.g., "Answer:").
            tgt_delim (str): Delimiter between question and answer (default: " ").
            few_delim (str): Delimiter after assistant response for few-shot separation
                (default: "\n\n").

        Returns:
            list[Message]: [user_msg] or [user_msg, assistant_msg] depending on
                whether an answer or gen_prefix is provided.
        zContext is not a string! : Nr{   userr   zAnswer is not a string! : r  )	delimiter	assistant)r   r   r   r   r   ri   r   r   )
rA   r  r  r  r   r  r  
has_answermsgsanswer_textrB   rB   rC   r    s@   !

zConfigurableTask.build_qa_turnprev_contextlist[Message] | None	list[str]c           
        sz   g }rng  fdd|D }|D ]%}|r+|r"dd |D nt |}	||	}	n
ddd |D }	||	 q|S )a  Build separate prompt contexts for each input choice in multiple-input tasks.

        For tasks like Winogrande where each answer choice produces a different
        input context (e.g., filling a blank with different options), this method
        creates a separate full prompt for each choice. All prompts share the same
        fewshot prefix but differ in the final evaluation turn.

        Args:
            prev_context (list[Message] | None): Messages from system prompt and fewshot
                examples (shared across all choices).
            gen_prefix (str | None): Prefix to start the assistant's response (e.g., "Answer:").
            q (list[str]): List of input texts, one per choice.
            chat_template (Callable | None): Renders a list of message dicts to a string.
            fewshot_as_multiturn (bool): If True, keep messages as separate turns.

        Returns:
            list[str]: Formatted prompt strings, one per input choice.
        c                   s    g | ]}j | d d qS )r{   )r  r   r  )r  )r   r   r   r  rA   rB   rC   r   m  s    z;ConfigurableTask.multiple_input_context.<locals>.<listcomp>c                 S  r  rB   r  r  rB   rB   rC   r   y  r  r{   c                 s  r  rh   r  r  rB   rB   rC   r    r  z:ConfigurableTask.multiple_input_context.<locals>.<genexpr>)r   r   r   )
rA   r  r   r  r   r   res_contextsr  r  rB   r  rC   r  P  s   	
z'ConfigurableTask.multiple_input_contextr   c                 C  r   r   r   r   rB   rB   rC   r     r   zConfigurableTask.apply_filtersc                 C  s   | j jS rh   )r4   should_decontaminaterK   rB   rB   rC   r    s   z%ConfigurableTask.should_decontaminatec                 C  sb   | j jr/| j jd u r| |S | j j}|| jv r|| S t|r$||S tt	| j j|S d S rh   )
r4   r  rq   rs   rD  r8  astliteral_evalr   r  )rA   r_   rq   rB   rB   rC   rq     s   

z-ConfigurableTask.doc_to_decontamination_queryc                 C  ra   rb   rB   rc   rB   rB   rC   rd     re   zConfigurableTask._process_docc                 C  s   | j d ur	| j }n|d ur|}n| jj}t|tr|S t|tr@|| jv r)|| S t||}|	 r>| j
jd ur>t|S |S t|rH||S t|dre||}t|dkr\|d S td | jjS tt| t)Nr      r   #Applied prompt returns empty string)r?  r4   rs   r   r   r   rD  r   r  isdigitr>   rH  r  r  r8  r   r   r   r[   r\   r  printr   r   )rA   r_   rs   text_stringapplied_promptrB   rB   rC   rs     s0   







zConfigurableTask.doc_to_textr   int | str | listc              	   C  s(  | j d ur	| j }n|d ur|}n| jj}t|tr|S t|trf|| jv r)|| S t||}|	 r>| j
jd ur>t|S t|dkrd|d dkrd|d dkrdzt|W S  ttfyc   | Y S w |S t|trm|S t|ru||S t|dr||}t|dkr|d S td | jjS t)	Nr  r   []r   r}   r  )r?  r4   ru   r   r   r   rD  r   r  r  r>   rH  r  r  r   SyntaxErrorr   ri   r8  r   r   r[   r\   r  r   )rA   r_   ru   target_stringr  rB   rB   rC   ru     sB   








zConfigurableTask.doc_to_targetr
   c                 C  s   | j d ur	| j }n|d ur|}n| jjd u rtd n| jj}t|tr7|| jv r.|| S t	t
||S t|tr>|S t|trIt| S t|rQ||S t|dr[||S t)Nz.doc_to_choice was called but not set in configget_answer_choices_list)r?  r4   rH  r[   rI  r   r   rD  r  r  r   r  ri   r`   valuesr8  r   r  r   )rA   r_   rH  rB   rB   rC   rH    s(   






zConfigurableTask.doc_to_choiceint | str | list | Nonec                      |d ur|}nj jd urj j}nd S t|tr* fdd|D }dd |D S t|trA|jv r8 | S tt	| S t
|rI| S d S )Nc                      g | ]}  |qS rB   )rw   r   featurer   rB   rC   r          z1ConfigurableTask.doc_to_image.<locals>.<listcomp>c                 S     g | ]}|d ur|qS rh   rB   r  rB   rB   rC   r   #  r   )r4   rw   r   ri   r   rD  r  r  r   r  r8  )rA   r_   rw   image_featurerB   r   rC   rw     "   



zConfigurableTask.doc_to_imagec                   r  )Nc                   r  rB   )ry   r  r   rB   rC   r   7  r  z1ConfigurableTask.doc_to_audio.<locals>.<listcomp>c                 S  r  rh   rB   r  rB   rB   rC   r   :  r   )r4   ry   r   ri   r   rD  r  r  r   r  r8  )rA   r_   ry   audio_featurerB   r   rC   ry   .  r  zConfigurableTask.doc_to_audioc                 C  s2   | j j }d ur|| jv r|| S t||S d S rh   )r4   r   rD  r   r  )rA   r_   r   rB   rB   rC   r|   E  s
   
zConfigurableTask.doc_to_prefixr   list[Instance] | Instancec           
        s   dd} dd }d }| jdkr| f}nl| jdkr'| f}n`| jdkrz| }| jj|rG| jjrEt| jjsE| jjnd| jrZ|   fdd	D }n
fd
d	|D }d| j	
 v ryfdd	|D }|| n| jdkrt| jjf}i | jjri d| i| jjri d| itrt|trfdd	|D }n|f }| jdkr҇fdd	t|D }	|	S td| j|ddS )Nr   Fr   r&   r(   r'   r{   c                   s   g | ]
}|   fqS rB   rB   )r   context)contrK  rB   rC   r   f      z7ConfigurableTask.construct_requests.<locals>.<listcomp>c                   s   g | ]
}  | fqS rB   rB   )r   r  )r   rK  rB   rC   r   i  r  acc_mutual_infoc                   s   g | ]
}d   | fqS )r{   rB   )r   ra  )rK  rB   rC   r   t  s    r)   visualaudioc                   s   g | ]}| f qS rB   rB   )r   arg)multimodal_argrB   rC   r     r(  c              	     s(   g | ]\}}t dd  ||dqS )r&   request_typer_   	argumentsidxrB   r   )r   r  r  )r_   r   rB   rC   r     s    r   r  rB   )ri  r/   ru   rH  r4   rK  r   r   rF  r   rE  extendr   generation_kwargsrw   ry   r   r   ri   r
  r   )
rA   r_   r   r   r   r   aux_argumentsr  choicesrequest_listrB   )r  r   r_   r   r  rK  rC   r   M  sz   












z#ConfigurableTask.construct_requestsc              
     s~  t | jjr| j||S i }t| j }| jdkr<|d }|\}i d|v r,d|ini d|v r9dtiS i S | jdkr{|\}| | 	|}| 
| 	|}i d|v r_d||fini d|v rkd||fini d|v rxd||fiS i S | jd	krt|d
di\}	| | tdd  D }
tdd  D }dt  t|	krd| j v r|	t d  }t|t krt|	d t  }	t|	}t|	|
 }t|	| }| jr| |}n| 	|}d}t|tr fdd|D }d|v rd}n+t|tr|t k r|nd}nt|tr,| v r* |nd}|dkr3d}|r?td| d | jrl||v rJdnd}||v rSdnd}||v r\dnd}ttfdd|D }n(||krsdnd}||kr|dnd}||krdnd}|dkrt| nd}t|	}i d|v rd|ini d|v rd||fini d|v rd||fini d|v rd|ini d|v rd|ini d|v rd|ini d|v rd||fini d |v rd ||	fini }d|v rd!d t|	|dd"D }t||krdnd}||d< |S | jd#kr5| 	|}|d | jjd ur>| |  | }n&| jrGt|}nt|turdd$| j v sdttsdt|}| j D ]}| jrg }t|tsz|g}|dkrfd%dtt|D | j| d*|d&| j| | }|dkrdnd}no|D ]:}z| j| d*|ggd&| j| }W n t y   | j| |g}Y nw t|t!r|| }|"| qt|rdnd}n(z| j| d*|ggd&| j| }W n t y   | j| |g}Y nw t|t!r-|# D ]	\}}|||< q!qi|||< qi|S td'| j d(d))+Nr&   r   
perplexityaccr(   word_perplexitybyte_perplexityr   r'   strictTc                 S     g | ]}t t|qS rB   floatr   r   r  rB   rB   rC   r     r   z4ConfigurableTask.process_results.<locals>.<listcomp>c                 S  s   g | ]}t t|d qS )r   )r  r   r   r  rB   rB   rC   r     s    r  r  Fc                   s    g | ]}|t  k r|nd qS ))r   r  )r  rB   rC   r     s     r  zBLabel index was not in within range of available choices,Sample:

r         ?        c                 3  s$    | ]}|d kr | ndV  qdS )r  r   NrB   r  )	is_greedyrB   rC   r    s   " z3ConfigurableTask.process_results.<locals>.<genexpr>f1mccacc_norm	acc_bytesexact_matchbrier_score
likelihoodc                 S  s   g | ]\}}|| qS rB   rB   )r   ll_cll_urB   rB   rC   r     s    )r  r)   bypassc                   s   g | ]} qS rB   rB   )r   _)resultrB   rC   r   B  s    )
referencespredictionszPassed invalid output_type 'z' ! Please use one of zO'loglikelihood', 'loglikelihood_rolling', 'generate_until' or 'multiple_choice'rB   )$r8  r4   r   ri   r   rE  r/   r   r   ru   r   ziprH  nparrayr   r   argmaxrF  rs   r   r   indexr[   r\   rG  anyr   softmaxr   ranger   r   r`   r   r:  )rA   r_   r   result_dict
use_metricllr&   _words_bytesllscompletion_lenbyte_lengthlls_unconditionalpred	pred_norm	pred_bytegoldgold_index_errorr  r  r  r  	prob_normlls_mutual_infor  r   scoresresult_scoregold_optionrl   r'  rB   )r  r  r  rC   r     sR  









	
O










z ConfigurableTask.process_resultsc                 C  rJ   rh   )r   rK   rB   rB   rC   r   t     zConfigurableTask.aggregationc                 C  rJ   rh   )r   rK   rB   rB   rC   r   w  r
  z!ConfigurableTask.higher_is_betterr   r   c                 C  r   rh   r   r   rB   rB   rC   r   z  r   zConfigurableTask.get_configc                 C  s   t | jdd S )Nr]   )r   r4   rK   rB   rB   rC   	task_name}     zConfigurableTask.task_namec              	   C  s:   dt | jdd  d| j dt | jdd  dt| j d	S )NzConfigurableTask(task_name=r]   z,output_type=z,num_fewshot=rX   z,num_samples=))r   r4   r/   r   r   rK   rB   rB   rC   __repr__  s   zConfigurableTask.__repr__r  )r4   r  r6   r7   rh   )r;  rd  r6   r7   r6   r   )r6   rq  )NFFNN)r_   r`   rX   r   r   r,   r   r   r   r   r   r{  r   r,   r6   r|  )
r  r,   r  r  r  r  r   r,   r6   r  NF)r  r  r   r,   r  r  r   r{  r   r   r6   r  r  )r_   r`   r  )r_   r   r6   r  )r_   r
   r6   r  )r_   r
   r6   r  )r_   r`   r   r|  r6   r  r  r  )r6   r
   )'r   r  r  r+   r/   r-  rD   r:   rM   rO   rP   rT   rV   rW   r^   r   r  r   r  r  r   r  rq   rd   rs   ru   rH  rw   ry   r|   r   r   r   r   r   r  r  r  __classcell__rB   rB   ry  rC   r   e  sl     i





rF
3	


$.
] 
K

r   c                   @  sB   e Zd ZdZdddZdd
dZdddZdddZdddZdS )MultipleChoiceTaskr&   r_   r`   r6   r   c                 C  s   d|d |d   S )Nr  r  r  rB   rc   rB   rB   rC   ru     s   z MultipleChoiceTask.doc_to_targetr   rf   c                   s    fddt d D S )Nc              	     s2   g | ]\}}t dd  d| f|dqS )r&   r  r  rB   r  )r   r  ra  r   r_   r   rB   rC   r     s    z9MultipleChoiceTask.construct_requests.<locals>.<listcomp>r  )r
  r   rB   r  rC   r     s   
z%MultipleChoiceTask.construct_requestsr   Iterable[tuple[float, bool]]c                 C  sh   dd |D }|d }t ||krdnd}t dd |d D }t || |kr-dnd}||dS )	Nc                 S  s   g | ]}|d  qS )r   rB   )r   r  rB   rB   rC   r     s    z6MultipleChoiceTask.process_results.<locals>.<listcomp>r  r  r  c                 S  r  rB   r  r  rB   rB   rC   r     r   r  r  r  )r  r  r  )rA   r_   r   r  r  r  r  rB   rB   rC   r     s   z"MultipleChoiceTask.process_resultsc                 C  s
   dddS )NTr  rB   rK   rB   rB   rC   r        z#MultipleChoiceTask.higher_is_betterc                 C  s
   t t dS )Nr  )r   rK   rB   rB   rC   r     r  zMultipleChoiceTask.aggregationN)r_   r`   r6   r   )r_   r`   r   r   r6   rf   )r_   r`   r   r  r6   r`   r  )	r   r  r  r/   ru   r   r   r   r   rB   rB   rB   rC   r    s    



r  c                   @  s   e Zd ZdZd)ddZd*d	d
Zd+ddZd,ddZdd Zd-ddZ	dd Z
d.ddZd/d d!Zd,d"d#Zed0d$d%Zed0d&d'Zd(S )1PerplexityTaskr(   r6   r   c                 C  rL   r  rB   rK   rB   rB   rC   rM     rx   z PerplexityTask.has_training_docsrl   r   ri   c                 C  s   |dkrt dg S )Nr   >The number of fewshot examples must be 0 for perplexity tasks.r   rk   rB   rB   rC   rn     s
   zPerplexityTask.fewshot_examplesr_   r`   rX   Literal['']c                 C  s   |dkrt ddS )Nr   r  r{   r  )rA   r_   rX   rB   rB   rC   r     s
   zPerplexityTask.fewshot_contextc                 C  s   ddddS )NFr  r  r   rB   rK   rB   rB   rC   r        zPerplexityTask.higher_is_betterc                 C  ra   rh   rB   rc   rB   rB   rC   rq     rx   z+PerplexityTask.doc_to_decontamination_queryr   c                 C  rL   rz   rB   rc   rB   rB   rC   rs     rx   zPerplexityTask.doc_to_textc                 C  ra   rh   rB   rc   rB   rB   rC   ru     rx   zPerplexityTask.doc_to_targetr   r,   c                 K  s.   t |rttd| j|| |fdd|S )Nr   r  rB   )r   r   r   r/   ru   r   rB   rB   rC   r     s   
z!PerplexityTask.construct_requestsr   tuple[float]c                 C  s>   |\}|  | |}| | |}||f||f||fdS Nr  )r   ru   r   )rA   r_   r   r&   wordsbytes_rB   rB   rC   r     s   zPerplexityTask.process_resultsc                 C  s   t t tdS r  )r   r   rK   rB   rB   rC   r     r  zPerplexityTask.aggregationc                 C  r   )Nr   r   r   rB   rB   rC   r     r  zPerplexityTask.count_bytesc                 C  r   )zBDownstream tasks with custom word boundaries should override this!r   r   r   rB   rB   rC   r     r   zPerplexityTask.count_wordsNr  )rl   r   r6   ri   )r_   r`   rX   r   r6   r  r  )r6   r   )r_   r`   r   r,   )r_   r`   r   r  r6   r`   )r6   r   )r   r  r  r/   rM   rn   r   r   rq   rs   ru   r   r   r   r  r   r   rB   rB   rB   rC   r    s     








r  )D
__future__r   r  r  loggingr   r   collections.abcr   r   r   r   copyr   	functoolsr   typingr	   r
   r   r   rG   numpyr  r   lm_evalr   lm_eval.apir   lm_eval.api.instancer   r   lm_eval.api.metricsr   r   r   lm_eval.api.registryr   r   r   r   r   r   lm_eval.api.utilsr   r   r   r   r   lm_eval.caching.cacher    r!   lm_eval.config.taskr"   lm_eval.filtersr#   lm_eval.promptsr$   r%   r0  	getLoggerr   r[   ABCr*   r   r  r  rB   rB   rB   rC   <module>   sX     
    *        -/