o
    5tixZ                  
   @   s`  d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlmZm	Z	m
Z
mZ d dlmZ d dlmZ eeZG dd dZded	ee fd
dZd"ddZd#ddZdedB d	edB fddZ	 	 d$deded	eeef fddZdee d	eeeeeeef fddZ			d%d	eeeedf fddZedej d	ej fddZ!edee" fd d!Z#dS )&    N)ConfigurableGroup)aggregate_subtask_metricsmeanpooled_sample_stderrstderr_for_metric)Task)positional_deprecatedc                   @   sR   e Zd ZdZ									dddZedefddZddd
dZdd Z	dS )
TaskOutputa  
    Wrapper class for Task outputs.It contains various attributes and methods to manage and calculate metrics for the task.

        Attributes:
            task (object): The task object.
            task_name (str): The name of the task.
            task_config (dict): The configuration of the task.
            version (str): The version of the task.
            group_name (str): The name of the task group.
            n_shot (int): The number of shots for the task.
            task_alias (str): The alias of the task.
            group_alias (str): The alias of the task group.
            is_group (bool): Indicates if the task is a group.
            logged_samples (list): The list of logged samples.
            sample_len (int): The length of the samples.
            sample_metrics (defaultdict): The dictionary of samples' metrics.
            agg_metrics (defaultdict): The dictionary of aggregate metrics.

        Methods:
            from_taskdict(cls, task_name: str, task):
                Creates a TaskOutput instance from a task dictionary.

            calculate_aggregate_metric(bootstrap_iters=100000) -> None:
                Calculates the aggregate metrics for the task.
    Nc
           
      C   s^   || _ || _|| _|| _|| _|| _|| _|| _|	| _g | _	d | _
tt| _tt| _d S N)tasktask_config	task_name
group_nameversionn_shot
task_aliasgroup_aliasis_grouplogged_samples
sample_lencollectionsdefaultdictlistsample_metricsagg_metrics)
selfr   r   r   r   r   r   r   r   r    r   K/home/ubuntu/.local/lib/python3.10/site-packages/lm_eval/evaluator_utils.py__init__0   s   zTaskOutput.__init__r   c           
   
   C   s   t |tr
|\}}nd }|sd}| ||||dS |j}t| }|d }dkr4|di dd}|d}|d}	| ||||||||	dS )	NT)r   r   r   r   num_fewshotr   metadataaliasr   )r   r   r   r   r   r   r   r   )
isinstancetupleVERSIONdictdump_configget)
clsr   r   r   r   r   r   r   r   r   r   r   r   from_taskdictJ   s0   



zTaskOutput.from_taskdict順 returnc              	   C   s   | j  D ]`\\}}}z	| j | }W n ty   t}Y nw | d| }||| j|< t|| _t	|t
r^t||dv rCt|dn|d}|rSt|dkrS||nd| j| d| < qtd| d	d S )
N,)bleuchrfterd   )metricbootstrap_iters   N/A_stderr,zReceived bootstrap_iters 'zD' but expected an integer. Set to 0 to turn off stderr calculations.)r   itemsr   aggregationKeyErrorr   r   lenr   r"   intr   min
ValueError)r   r2   r1   
filter_keyr6   agg_fn
metric_key	stderr_fnr   r   r   calculate_aggregate_metrich   s,   



z%TaskOutput.calculate_aggregate_metricc                 C   s6   d| j  d| j d| j d| j d| j d| j dS )NzTaskOutput(task_name=z, group_name=z
, version=z	, n_shot=z, task_alias=z, group_alias=))r   r   r   r   r   r   )r   r   r   r   __repr__   s   
zTaskOutput.__repr__)	NNNNNNNNN)r*   r+   N)
__name__
__module____qualname____doc__r   classmethodstrr)   rA   rC   r   r   r   r   r	      s     
r	   	task_dictr+   c                 C   sN   g }|   D ]\}}t|trt|}|| qt||}|| q|S r
   )r6   r"   r%   get_task_listextendr	   r)   append)rK   outputsr   task_obj_outputstask_outputr   r   r   rL      s   
rL   c                    s
  i }|   D ]c\}}t|tr|j}n|}t|tr@t|| d d}|r9|| fg  fdd| D  i ||}qt|trI|j}nt|t	rQ|j
}|d u r^|| fg  q|| fg | q dkri }|  D ]\}	}
|	\} |
||< qt|}|S )Nr3   )	task_rootdepthc                    s    g | ]\}}|d   kr|qS )r3   r   ).0_task_depthrT   r   r   
<listcomp>   s
    z$get_subtask_list.<locals>.<listcomp>r   )r6   r"   r   r   r%   get_subtask_list
setdefaultrM   keysr   r   rN   )rK   rS   rT   subtask_list	group_objrP   r   _subtask_listgroup_or_task_name	group_key	task_listr   rX   r   rZ      s@   






rZ   c                 C   sb   | j D ]+}|jdk r.td|  d|j d|jd  d| |j d	 tdt|  qd S )	Nr3   zTask: z; document z.; context prompt (starting on next line):    
r   z`
(end of prompt on previous line)
target string or answer choice index (starting on next line):
z!
(end of target on previous line)z	Request: )	instancesdoc_ideval_loggerinfoargsdoc_to_targetdocrJ   )r   instr   r   r   print_writeout   s   


rk   limitc                 C   s4   |d ur|dk rt tt| j| nt |}|S )Ng      ?)r:   mathceilr9   	eval_docs)r   rl   r   r   r   get_sample_size   s   &rp   resultsc                 C   s  dd }t t}t t}|| } |  D ]\}}|dkr$d| d nd}	t|tr5|j}
d}||}nt|trF|}
t|trD|j	}
d}||
 
 ||
< |r]|jd	urY|j}n|j}nd
||
 v rj||
 d
 }n|
}|	| ||
 d
< d||
 v r||
 d |rd||
 vr|dkrd| d nd}||
 
 ||
< || ||
 d
< d||
 v r||
 d t|tr|d7 }|d7 }t||||\}}i ||}i ||}|d8 }|d8 }q||fS )a  
    @param task_dict: Dictionary representing the group hierarchy of tasks. Each key is a group name and its
    value is a list of task names.
    @param results: Dictionary containing the results of each task. Each key is a
    group name and its value is a dictionary of task results.
    @param task_depth: The indentation level for printing the task
    hierarchy. Default is 0.
    @param group_depth: The indentation level for printing the group
    hierarchy. Default is 0.
    @return: A tuple of two dictionaries: results_agg and groups_agg. results_agg contains
    aggregated results for each task, and groups_agg contains aggregated results for each group.

    Prepares the task hierarchy and aggregates the results for each task and group recursively for printing.
    c                 S   s   t t|  dd dS )z
        Helper utility. Sorts the task dict at the current level of the hierarchy based on alphabetized task name.
        Required so that we end up sorting within each sub-header correctly.
        c                 S   s    t | d tr| d jS | d S Nr   )r"   r   r   )itemr   r   r   <lambda>   s   z>prepare_print_tasks.<locals>._sort_task_dict.<locals>.<lambda>)key)r%   sortedr6   )rK   r   r   r   _sort_task_dict   s   z,prepare_print_tasks.<locals>._sort_task_dictr    z-  TFNr!   samplesr3   )r   r   r%   r6   r"   r   r   rJ   r   r   copyr   grouppopprepare_print_tasks)rK   rq   
task_depthgroup_depthrw   task_agg	group_aggtask_or_group_nametask_or_group_obj
tab_stringnamefrom_configurable_groupr!   group_tab_string	_task_agg
_group_aggr   r   r   r~      s`   







r~   
eval_tasksc                 C   sb  t t}t t}t t}t t}t t}t t}| D ]}d|j }v r3|d ||j d< n|j||j d< |j }	rO|	|vrO|j }
rO|	||
 d< |j	||j< |j||j< |j
||j< |j||j< |j ||j< |j D ]3\\}}}| d| }|j| ||j |< |j||j d< |j| d|  ||j | d| < qtq ||||||fS )a(  
    @param eval_tasks: list(TaskOutput).
    @return: A tuple containing the consolidated results, samples, configs, versions, and num_fewshot.

    Consolidates the results of multiple evaluation tasks into a single structure.

    The method iterates over each evaluation instance and extracts relevant information to create the consolidated
    results structure. The consolidated results structure has the following properties:

    - results: A defaultdict with task names as keys and dictionaries as values. Each dictionary contains
    metric/filter pairs as keys and corresponding metric values as values. The "alias" key is used to store task
    aliases specified in the task configuration.
    - samples: A defaultdict with task names as keys and lists of log samples as values.
    - configs: A defaultdict with task names as keys and task configurations as values.
    - versions: A defaultdict with task names as keys and task versions as values.
    - num_fewshot: A defaultdict with task names as keys and number of few-shot samples as values.
    - higher_is_better: A defaultdict with task names as keys and indicators of whether higher values are better
    for each metric as values.

    The method then returns the consolidated results, samples, configs, versions, and num_fewshot as a tuple.
    r   r!   r,   rz   r5   )r   r   r%   r   r:   r   r   r   r   r   r   r   r   higher_is_betterr   r6   r   r   )r   rq   rz   r   configsversionsr   rR   r   r   r   r1   r=   r6   r?   r   r   r   consolidate_results8  s8   






	r   Fc              
      s  |du ri }|du ri }|  D ]O\}}t|tr!|j}|j}nd}t|tr5|r4||g |j qt	|||||\}}}	|rQ||g 
||g  |du s[|d du rbd| d< qd|v rj|d }
|t|d B }|	| }tfdd|D }|D ] d d fdd	|D }fd
d	|D } fdd	|D }|
D ]V}|d D ]O} d|d |gkrq|d dkrt}nt|d r|d }ntd|d  d| d||||d |  < d|v rd| < qt||| < qqt|| d< |dd}|dur|dd||< q|D ]=}dd	 |  D }dd	 |D }|r^t|t|kr^|D ]}| |d |dd}| |d qDq"q|||fS )a  
    (Recursively) calculates groups' aggregated metrics and updates the results and versions dictionaries with this info.

    @return: a tuple [results, versions, show_group_table, task_aggregation_list] with formats described below:

    - results: A defaultdict with task names (and, after this function is called, group names of
    groups that perform aggregation) as keys, and dictionaries with "alias" and metric,filter_name pairs as keys.
    - versions: A defaultdict with task names (and, after this function is called, group names of
    groups that perform aggregation) as keys, and float values representing the task or group's version if a version is specified. (defaulting to None).
    - show_group_table: a boolean which is true if there exists a group that requires printing of its aggregated scores in a group table.
    - task_aggregation_list: a defaultdict listing the subtasks to average over to produce a given group's end metric.

    The method then returns the updated results, versions, show_group_table, and task_aggregation_list as a tuple.
    In the top-level invocation of this function, task_aggregation_list is ignored.
    Naggregate_metric_listrx   c                    s2   h | ]} |   D ]}d |vr
|dvr
|q
qS )_stderr)r   r!   rz   )r\   )rU   r   ru   )rq   r   r   	<setcomp>  s    
z,consolidate_group_results.<locals>.<setcomp>r5   r,   c                    s$   g | ]} | v r|   qS r   r   rU   r   r1   rq   r   r   rY     
    
z-consolidate_group_results.<locals>.<listcomp>c                    s$   g | ]} | v r |  qS r   r   r   )rq   stderrr   r   rY     r   c                    s$   g | ]} | v r| d  qS )rz   r   r   r   r   r   rY     r   filter_listr1   r7   r   zgCurrently, only 'mean' is supported for automatically aggregating scores across groups' subtasks. Got 'z' for group ''weight_by_sizer4   rz   r    r   c                 S   s"   g | ]}d |v r| ds|qS )r,   score_stderr
startswithrU   ru   r   r   r   rY     s
    c                 S   s   g | ]	}| d r|qS )score,r   r   r   r   r   rY     s
    
r   zscore_stderr,)r6   r"   r   configr   r   r[   rN   r   consolidate_group_resultsrM   r'   boolr   joinsplitr   callabler<   r   sumr\   r9   r}   replace)rq   r   rK   rS   show_group_tabletask_aggregation_listgroup_or_taskgroup_or_task_infogroup_config_task_aggregation_listagg_metric_listrb   metric_listmetricsstderrssizesmetric_configfilter_nameaggregate_fngroup_metadatar   task_metricsscore_metricsscore_metric
stderr_keyr   )r1   rq   r   r   r   v  s   

		




r   
start_pathc                 C   sV   |   }d}t|D ]}|d d  r|  S |j  }q
td| dd|   )z
    Search upward in the directory tree to a maximum of three layers
    to find and return the package root (containing the 'tests' folder)
       testsztest_version_stable.pyz#Unable to find package root within z upwardszof )resolverangeexistsparentFileNotFoundError)r   cur_path
max_layers_r   r   r   find_test_root  s   r   rb   c                 C   sr   ddl }tttd}d| }| dd| d| g}tjt	| |
|}|r7td|  d	| dS )
zE
    Find the package root and run the tests for the given tasks
    r   N)r   z or z/tests/test_version_stable.pyz
--rootdir=z-kz'Not all tests for the specified tasks (z ) ran successfully! Error code: )pytestr   pathlibPath__file__r   syspathrN   rJ   mainr<   )rb   r   package_roottask_stringrg   pytest_return_valr   r   r   run_task_tests%  s   

r   rr   rD   )r   r   )NFN)$r   loggingrm   r   r   lm_eval.api.groupr   lm_eval.api.metricsr   r   r   r   lm_eval.api.taskr   lm_eval.utilsr   	getLoggerrE   re   r	   r%   r   rL   rZ   rk   r:   rp   r#   r~   r   r   r   r   r   rJ   r   r   r   r   r   <module>   sP    
x

/

\
B
 