o
    5tie                     @   s   d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlmZm	Z	 d dl
m
Z
 d dlmZ d dlmZmZ d dlmZ d dlmZ d d	lmZmZmZmZmZmZmZmZmZmZmZ erdd d
l m!Z! e"e#Z$e	ddG dd dZ%G dd dZ&dS )    N)defaultdict)asdict	dataclass)datetime)Path)TYPE_CHECKINGAny)load_dataset)MetadataConfigs)get_file_datetimeget_file_task_nameget_results_filenamesget_sample_results_filenameshandle_non_serializablehash_string	info_oncerandom_name_idsanitize_listsanitize_model_namesanitize_task_nameHfApiF)initc                   @   s  e Zd ZU dZdZedB ed< dZedB ed< dZedB ed< dZ	edB ed< dZ
edB ed< dZedB ed< dZedB ed	< dZedB ed
< dZedB ed< dddZedeeeef B dB dedB fddZdedeeeef B dedB d	edB deddfddZdddZdS )GeneralConfigTrackera  Tracker for the evaluation parameters.

    Attributes:
        model_source (str | None): Source of the model (e.g. hf, vllm, etc.)
        model_name (str | None): Name of the model.
        model_name_sanitized (str | None): Sanitized model name for directory creation.
        system_instruction (str | None): System instruction/prompt provided to the model.
        system_instruction_sha (str | None): SHA hash of the system instruction for
            tracking and reproducibility.
        fewshot_as_multiturn (bool | None): Whether few-shot examples are formatted
            as multi-turn conversations.
        chat_template (str | None): Chat template used for formatting prompts.
        chat_template_sha (str | None): SHA hash of the chat template for tracking
            and reproducibility.
        start_time (float): Start time of the experiment. Logged at class init.
        end_time (float): End time of the experiment. Logged when calling
            `GeneralConfigTracker.log_end_time`.
        total_evaluation_time_seconds (str | None): Inferred total evaluation time
            in seconds (from the start and end times).
    Nmodel_source
model_namemodel_name_sanitizedsystem_instructionsystem_instruction_shafewshot_as_multiturnchat_templatechat_template_shatotal_evaluation_time_secondsreturnc                 C   s   t  | _dS )zStarts the evaluation timer.N)timeperf_counter
start_timeself r)   V/home/ubuntu/.local/lib/python3.10/site-packages/lm_eval/loggers/evaluation_tracker.py__init__F   s   zGeneralConfigTracker.__init__
model_argsc                 C   s   dt dt dt fdd}g d}t| tr'|D ]}|| v r$t | |   S qdS t| t rB|D ]}| d| v rA|| | d  S q.dS )	z1Extracts the model name from the model arguments.r,   keyr#   c                 S   s   |  |d }| dd S )z=Extracts the model name from the model arguments using a key.   ,r   )split)r,   r-   args_after_keyr)   r)   r*   extract_model_nameN   s   z@GeneralConfigTracker._get_model_name.<locals>.extract_model_name)peftdelta
pretrainedmodelpathengine=N)str
isinstancedict)r,   r2   prefixesr-   prefixr)   r)   r*   _get_model_nameJ   s   

z$GeneralConfigTracker._get_model_namec                 C   s^   || _ t|p
t | _t| j| _|| _|rt|nd| _	|| _
|r't|nd| _|| _dS )z!Logs model parameters and job ID.N)r   r   r?   r   r   r   r   r   r   r   r    r!   r   )r(   r   r,   r   r    r   r)   r)   r*   log_experiment_args_   s   	
z(GeneralConfigTracker.log_experiment_argsc                 C   s    t  | _t| j| j | _dS )zMLogs the end time of the evaluation and calculates the total evaluation time.N)r$   r%   end_timer:   r&   r"   r'   r)   r)   r*   log_end_timeu   s   
z!GeneralConfigTracker.log_end_timer#   N)__name__
__module____qualname____doc__r   r:   __annotations__r   r   r   r   r   boolr    r!   r"   r+   staticmethodr<   r   r?   r@   rB   r)   r)   r)   r*   r   %   s8   
 
(
r   c                   @   s   e Zd ZdZ												d dedB dededed	ed
edededededededdfddZed!dedB ddfddZ	d!de	de	dB ddfddZ
dede	ddfddZd"ddZdS )#EvaluationTrackerzKeeps track and saves relevant information of the evaluation process.

    Compiles the data from trackers and writes it to files, which can be published
    to the Hugging Face hub if requested.
    N Foutput_pathhub_results_orghub_repo_namedetails_repo_nameresults_repo_namepush_results_to_hubpush_samples_to_hubpublic_repotokenleaderboard_urlpoint_of_contactgatedr#   c                 C   s  t  | _|| _|| _|| _|| _|
| _|| _| |	| _	|| _
| j	s*|s&|r*td| j	rE|dkrE|s5|rE| j	 d }td| d |dkrZ|dkrO|nd}|dkrW|n|}n	|}|}td | d| | _| d| d	| _| d| | _| d| d	| _d
S )a  Creates all the necessary loggers for evaluation tracking.

        Args:
            output_path (str | None): Path to save the results. If not provided,
                the results won't be saved.
            hub_results_org (str): The Hugging Face organization to push the results
                to. If not provided, the results will be pushed to the owner of the
                Hugging Face token.
            hub_repo_name (str): The name of the Hugging Face repository to push
                the results to. If not provided, the results will be pushed to
                `lm-eval-results`. Deprecated in favor of details_repo_name and
                results_repo_name.
            details_repo_name (str): The name of the Hugging Face repository to push
                the details to. If not provided, defaults to `lm-eval-results`.
            results_repo_name (str): The name of the Hugging Face repository to push
                the results to. If not provided, defaults to details_repo_name.
            push_results_to_hub (bool): Whether to push the results to the Hugging
                Face hub.
            push_samples_to_hub (bool): Whether to push the samples to the Hugging
                Face hub.
            public_repo (bool): Whether to push the results to a public or private
                repository.
            token (str): Token to use when pushing to the Hugging Face hub. This
                token should have write access to `hub_results_org`.
            leaderboard_url (str): URL to the leaderboard on the Hugging Face hub
                on the dataset card.
            point_of_contact (str): Contact information on the Hugging Face hub
                dataset card.
            gated (bool): Whether to gate the repository.
        zHugging Face token is not defined, but 'push_results_to_hub' or 'push_samples_to_hub' is set to True. Please provide a valid Hugging Face token by setting the HF_TOKEN environment variable.rL   namez>hub_results_org was not specified. Results will be pushed to 'z'.zlm-eval-resultszhub_repo_name was specified. Both details and results will be pushed to the same repository. Using hub_repo_name is no longer recommended, details_repo_name and results_repo_name should be used instead./z-privateN)r   general_config_trackerrM   rR   rS   rT   rV   rW   _apiapi
gated_repo
ValueErrorwhoamieval_loggerwarningdetails_repodetails_repo_privateresults_reporesults_repo_private)r(   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   r)   r)   r*   r+      sL   -
zEvaluationTracker.__init__zHfApi | Nonec                 C   s   | sdS ddl m} || dS )z8Initializes the Hugging Face API if a token is provided.Nr   r   )rU   )huggingface_hubr   )rU   r   r)   r)   r*   r\      s   
zEvaluationTracker._apiresultssamplesc              
   C   s  | j   | jrztd i }|r+| D ]\}}dd |D }td|||< q|d|i |t	| j  t
j|dtdd}t| jrJ| jnt }t  d	d
| _|jdkru|jjddd ||j d| j d}	n|t| j j }|jddd |d| j d }	|	j|dd | jr| jr| jr| jn| j }
| jj!|
d| j dd | jj"|
t|	t#j$t| j j%|	j&dd| j j% d td|
  W dS W dS W dS  t'y } zt(d tt)| W Y d}~dS d}~ww td dS )a  Saves the aggregated results and samples to the output path.

        Pushes them to the Hugging Face hub if requested.

        Args:
            results (dict): The aggregated results to save.
            samples (dict | None): The samples results to save.
        zSaving results aggregatedc                 S   s$   g | ]}|d  |d  |d  qS )doc_hashprompt_hashtarget_hashr)   ).0sr)   r)   r*   
<listcomp>   s    z=EvaluationTracker.save_results_aggregated.<locals>.<listcomp>rL   task_hashes   F)indentdefaultensure_ascii:-.jsonTparentsexist_ok_results_utf-8encodingdatasetrepo_id	repo_typeprivaterz   zAdding aggregated results for )r   path_or_fileobjpath_in_repor   commit_messagezVSuccessfully pushed aggregated results to the Hugging Face Hub. You can find them at: z!Could not save results aggregatedNz<Output path not provided, skipping saving results aggregated)*r[   rB   rM   ra   infoitemsr   joinupdater   jsondumpsr   r   cwdr   now	isoformatreplacedate_idsuffixparentmkdir	with_namestemr:   r   
write_textr]   rR   rT   re   rf   create_repoupload_fileosr7   r   rY   	Exceptionrb   repr)r(   rh   ri   rp   	task_nametask_samplessample_hashesdumpedr7   file_results_aggregatedr   er)   r)   r*   save_results_aggregated   s   





z)EvaluationTracker.save_results_aggregatedr   c              
   C   s  | j rdz>td|  t| j r| j nt }|jdkr#|j}n|t| jj	 }|j
ddd |d| d| j d }ttd| d	 t|d
dda}|D ]V}i }t|d D ] \}}	i |d| < t|	D ]\}
}||d|  d|
 < qjq[t|d |d< t|d |d< ||d< t|d |d< tj|tddd }|| qQW d   n1 sw   Y  | jr>| jrAddlm}m}m} | jr| jn| j}| jj|d| j dd z| jr| }| jd| d|ddid}|| W n t y } zt!d  t"t#| W Y d}~nd}~ww | jj$|t|| jj	dd!| d"| jj% d# t"d$| d%|  W dS W dS W dS  t yc } zt!d& t"t#| W Y d}~dS d}~ww t"d' dS )(zSaves the samples results to the output path.

        Pushes them to the Hugging Face hub if requested.

        Args:
            task_name (str): The task name to save the samples for.
            samples (dict): The samples results to save.
        zSaving per-sample results for: rw   Trx   samples_r{   z.jsonlzSaving per-task samples to z/*.jsonlar}   r~   	arguments	gen_args_arg_respsfiltered_respstargetF)rs   rt   
Nr   )build_hf_headersget_sessionhf_raise_for_statusr   r   z$https://huggingface.co/api/datasets/z	/settingsrX   auto)urlheadersr   zCould not gate the repositoryzAdding samples results for z to )r   folder_pathr   r   r   z-Successfully pushed sample results for task: z0 to the Hugging Face Hub. You can find them at: zCould not save sample resultsz8Output path not provided, skipping saving sample results)&rM   ra   debugr   r   r   r   r:   r[   r   r   r   r   open	enumerater   r   r   r   writer]   rS   huggingface_hub.utilsr   r   r   rT   rc   rd   r   r^   putr   rb   r   r   upload_folderr   )r(   r   ri   r7   file_results_samplesfsampler   iargjtmpsample_dumpr   r   r   r   r   rr   r)   r)   r*   save_results_samples@  s   



*
z&EvaluationTracker.save_results_samplesc           "         s  ddl m}m}m} td | jr| jn| j}| j	j
|dd}t|}t|}tdd |D ]8}t|}|j}	|j}
t|	}t|	}t|}|
 d| }|
 d	}t| |  |< t|  |< q0t }|D ]Z}t|}|j}|j}
t|}td
d|}tdt|j }|
 d	}td
d| }||kr||dg i}|d |t|gd |||< || d dt|gd qn|D ]e}t|}|j}	|j}
t|	}t|	}t|}td
d|}tdt|	j }|
 d| }td
d| }||kr0||dg i}|d |t|gd |||< || d dt|gd qt  tfddd} fdd|D d }|||dd}td|dd}|d d }d|i}|| tj|dd}d}| j j!dkr|d| j j" d| j j" d 7 }n	|| j j" d!7 }|d"t#|d#  d$t#| d%7 }| j j!dkr|d&| d'| d(7 }|d)  d*|$d+d, d-| d.7 }||d/| j j" d0| j j" | j%| j&d1} |'|  |j(| | j)d2}!|!j*|dd3 d4S )5zvCreates a metadata card for the evaluation results dataset.

        Pushes the card to the Hugging Face hub.
        r   )DatasetCardDatasetCardData
hf_hub_urlzRecreating metadata cardr   )r   r   c                   S   s
   t j S N)r   minr   r)   r)   r)   r*   <lambda>  s   
 z:EvaluationTracker.recreate_metadata_card.<locals>.<lambda>__	__resultsz[^\w.]r{   z**
data_files)r0   r7   latestc                    s    |  S r   r)   )k)latest_task_results_datetimer)   r*   r     s    )r-   c                    s    g | ]}  d d|v r|qS )ru   rv   )r   )rm   r   )latest_datetimer)   r*   ro     s    z<EvaluationTracker.recreate_metadata_card.<locals>.<listcomp>)r   filenamer   r   train)r   r0   rh   all   )rr   zADataset automatically created during the evaluation run of model hf[z](https://huggingface.co/z)
r   zThe dataset is composed of r.   zk configuration(s), each one corresponding to one of the evaluated task.

The dataset has been created from aW   run(s). Each run can be found as a specific split in each configuration, the split being named using the timestamp of the run.The "train" split is always pointing to the latest results.

An additional configuration "results" store all the aggregated results of the run.

To load the details from a run, you can for instance do the following:
zC```python
from datasets import load_dataset
data = load_dataset(
	"z
",
	name="z",
	split="latest"
)
```

z:## Latest results

These are the [latest results from run z](z	/resolve/z/blob/z) (note that there might be results for other tasks in the repos if successive evals didn't cover the same tasks. You find each in the results and the "latest" split for each eval):

```python
z
```zhttps://huggingface.co/zEvaluation run of )dataset_summaryrepo_urlpretty_namerV   rW   )r   )r   N)+rg   r   r   r   ra   r   rT   rc   rd   r]   list_repo_filesr   r   r   r   rY   r   r   r   r   maxr
   resubgetappendr:   valuesr	   r   r   r   r[   r   r   lenr   rV   rW   to_dataset_card_datafrom_templater   push_to_hub)"r(   r   r   r   r   files_in_reporesults_filessample_files	file_pathr   r   r   results_datetimetask_name_sanitizedsamples_keyresults_keycard_metadataresults_filename	eval_dateeval_date_sanitizedconfig_name sanitized_last_eval_date_resultscurrent_resultscurrent_details_for_tasklatest_model_namelast_results_filelast_results_file_pathlatest_results_fileresults_dictnew_dictionaryresults_stringr   	card_datacardr)   )r   r   r*   recreate_metadata_card  s  









 

z(EvaluationTracker.recreate_metadata_card)NrL   rL   rL   rL   FFFrL   rL   rL   Fr   rC   )rD   rE   rF   rG   r:   rI   r+   rJ   r\   r<   r   r   r   r)   r)   r)   r*   rK   {   sv    	

[
Z
hrK   )'r   loggingr   r   r$   collectionsr   dataclassesr   r   r   pathlibr   typingr   r   datasetsr	   datasets.utils.metadatar
   lm_eval.utilsr   r   r   r   r   r   r   r   r   r   r   rg   r   	getLoggerrD   ra   r   rK   r)   r)   r)   r*   <module>   s&    4
U