o
    پi/                     @   s   d Z ddlZddlZddlZddlZddlmZmZmZm	Z	 ddl
mZ ddlmZ ddlmZmZmZmZmZmZ h dZdZd	Zd
edefddZdede	e fddZG dd deZdS )a
  
LongBench v2: Towards Deeper Understanding and Reasoning on Realistic Long-Context Multitasks
Yushi Bai, Shangqing Tu, Jiajie Zhang, Hao Peng, Xiaozhi Wang, Xin Lv, Shulin Cao, Jiazheng Xu, Lei Hou, Yuxiao Dong, Jie Tang, Juanzi Li
https://arxiv.org/abs/2412.15204
    N)AnyDictListOptional)AutoTokenizer)simple_eval_common)ANSWER_PATTERN_MULTICHOICE
HTML_JINJAEval
EvalResultSamplerBaseSingleEvalResult>   multi_document_qasingle_document_qalong_structured_datalong_dialogue_historycode_repo_understandinglong_in_context_learningzTHUDM/LongBench-v2trainrowreturnc           	      C   s   |  dd}|  dd}d| v rE| d }t|dkr|d nd}t|dkr*|d nd}t|dkr6|d nd}t|dkrB|d nd}n(|  d	|  d
d}|  d|  dd}|  d|  dd}|  d|  dd}d|  d|  d|  d|  d|  d|  d}|S )z;Format a LongBench-v2 question using the official template.context questionchoicesr            Achoice_ABchoice_BCchoice_CDchoice_DzF
Please read the following text and answer the question below.
<text>
z7
</text>

What is the correct answer to this question: z
Choices:
(A) z
(B) z
(C) z
(D) zP

Format your response as follows: "The correct answer is (insert answer here)".)getlenstrip)	r   r   r   r   r   r!   r#   r%   prompt r*   X/home/ubuntu/.local/lib/python3.10/site-packages/sglang/test/simple_eval_longbench_v2.pyformat_longbench_v2_question)   s4   	
r,   responsec                 C   s   |  dd} td| tj}|r|d S td| tj}|r(|d S tt| }|r7|d S td| tj}|rH|d S dS )zFExtract answer from model response using official LongBench-v2 method.*r   z!The correct answer is \(([A-D])\)r   zThe correct answer is ([A-D])zanswer\s+is\s*\(?([A-D])\)?N)replaceresearch
IGNORECASEgroupupperr   )r-   matchr*   r*   r+   extract_longbench_v2_answerN   s   r6   c                   @   s  e Zd ZdZdeddddddfdededee deded	eee  d
ee dee fddZ	dedee
eef  fddZdedee
eef  fddZdedee
eef  fddZde
eef de
eef fddZdededee dee def
ddZd edefd!d"ZdS )#LongBenchV2Evalz
    Evaluation utility for LongBench-v2 dataset.

    LongBench-v2 is designed to assess the ability of LLMs to handle long-context problems
    requiring deep understanding and reasoning across real-world multitasks.
    Nr   modeldata_sourcenum_examplesnum_threads	n_repeats
categoriesmax_context_lengthmin_context_lengthc	           
         s   t j|dd| _|| _|| _| |}	 r fdd|	D }	|r3|dks(J d|	dt|t|	 }	|	| }	|	s=td|	| _	|| _
|| _td	t| j	 d
  rZtd   |s^|rktd| d| d dS dS )a<  
        Initialize LongBench-v2 evaluation.

        Args:
            data_source: HuggingFace dataset name, local file path (CSV/JSON)
            num_examples: Number of examples to evaluate (None for all)
            num_threads: Number of threads for parallel processing
            n_repeats: Number of times to repeat evaluation for error bars
            categories: List of task categories to include (None for all)
            max_context_length: Maximum context length in characters
            min_context_length: Minimum context length in characters
        T)trust_remote_codec                    s   g | ]}| d  v r|qS )category)r&   ).0exr=   r*   r+   
<listcomp>       z,LongBenchV2Eval.__init__.<locals>.<listcomp>r   z3n_repeats only supported when not sampling examplesNzANo examples available for LongBench-v2 evaluation after filteringzLoaded z examples from LongBench-v2zFiltered to categories: zContext length filter: -z characters)r   from_pretrained	tokenizerr?   r>   _load_datasetminr'   
ValueErrorexamplesr<   r;   print)
selfr8   r9   r:   r;   r<   r=   r>   r?   rM   r*   rD   r+   __init__q   s2   
zLongBenchV2Eval.__init__r   c                    s<   |st }tj|r |}n |} fdd|D S )z1Load dataset from HuggingFace hub or local files.c                    s   g | ]}  |qS r*   )_normalize_example)rB   examplerO   r*   r+   rE      s    z1LongBenchV2Eval._load_dataset.<locals>.<listcomp>)DEFAULT_DATASETospathexists_load_local_file_load_hf_dataset)rO   r9   raw_examplesr*   rS   r+   rJ      s   
zLongBenchV2Eval._load_datasetrV   c                 C   sv  t j|d  }|dv r7t|ddd}|dkr"dd |D }nt|}W d	   n1 s1w   Y  nn|d
kr\t|ddd}t|}t	|}W d	   n1 sVw   Y  nIzt|ddd}t|}W d	   n1 stw   Y  W n* tj
y   t|ddd}t|}t	|}W d	   n1 sw   Y  Y nw t|tr|dg }t|t	std|S )z/Load examples from a local CSV/JSON/JSONL file.r   >   .json.jsonlrzutf-8)encodingr\   c                 S   s   g | ]}|  rt|qS r*   )r(   jsonloads)rB   liner*   r*   r+   rE      rF   z4LongBenchV2Eval._load_local_file.<locals>.<listcomp>Nz.csvdataz)Expected list of examples from local file)rU   rV   splitextloweropenr_   loadcsv
DictReaderlistJSONDecodeError
isinstancedictr&   rL   )rO   rV   suffixfhrb   readerr*   r*   r+   rX      s@   






z LongBenchV2Eval._load_local_file
identifierc              
   C   s|   |j ddd}|d }t|dkr|d nt}zddlm} W n ty0 } ztd|d}~ww |||d	}d
d |D S )z&Load the dataset from HuggingFace Hub.:r   )maxsplitr   r   )load_datasetzaPlease install the 'datasets' package to load LongBench-v2 from HuggingFace: pip install datasetsN)splitc                 S   s   g | ]}t |qS r*   )rl   )rB   r   r*   r*   r+   rE      s    z4LongBenchV2Eval._load_hf_dataset.<locals>.<listcomp>)rt   r'   DEFAULT_DATASET_SPLITdatasetsrs   ImportError)rO   rp   partsdataset_namert   rs   excdatasetr*   r*   r+   rY      s   z LongBenchV2Eval._load_hf_datasetrR   c                 C   s   t |}dD ]}d| }||vr||v r|| ||< qd|vr*d|v r*|d |d< |d}t|tr>|  |d< |S t|trXd|  krMdk rXn |S g d| |d< |S )z.Ensure each example exposes the expected keys.r   r    r"   r$   choice_rA   domainanswerr      )rl   r&   rk   strr(   r4   int)rO   rR   
normalizedletter
choice_keyr   r*   r*   r+   rQ      s    


 z"LongBenchV2Eval._normalize_exampleformatted_questionrI   
min_length
max_lengthc                 C   s>   | |}t|}|dur||k rdS |dur||krdS dS )z9Filter examples by context length measured in characters.NFT)encoder'   )rO   r   rI   r   r   	input_idscontext_lengthr*   r*   r+   _check_context_length   s   
z%LongBenchV2Eval._check_context_lengthsamplerc                    s0   dt f fdd}t|jj}t|S )zRun the evaluation.r   c                    s\  t | }js
jr|jjjsd S  j|ddg} |}|d u r)d}t|}| dd}t|t	r?|
  }nt|trVd|  krNdk rVn ng d| }||kr\dnd	}tjtj|t|d
d|||d}|t|d
dg }dt|i}	| d| dd}
|
tv r||	|
< | d}t|t	r|r||	d|  < t||||	dS )Nuser)contentroler   r   r   r   r|   g      ?g        	assistant)prompt_messagesnext_messagescorecorrect_answerextracted_answercharsrA   r~   unknown
difficultydifficulty_)htmlr   convometrics)r,   r?   r>   r   rI   _pack_messager6   r&   rk   r   r(   r4   r   common	jinja_envfrom_stringr	   renderrl   r'   TASK_CATEGORIESrd   r   )r   r   r   response_textr   r   r   r   r   r   rA   r   r   rO   r*   r+   fn  sT   
"
	
z$LongBenchV2Eval.__call__.<locals>.fn)rl   r   map_with_progressrM   r;   aggregate_results)rO   r   r   resultsr*   r   r+   __call__  s   D
zLongBenchV2Eval.__call__)__name__
__module____qualname____doc__rT   r   r   r   r   rP   r   r   rJ   rX   rY   rQ   r   boolr   r   r   r   r*   r*   r*   r+   r7   i   sV    	
	
: "
r7   )r   rg   r_   rU   r0   typingr   r   r   r   transformersr   sglang.testr   r   sglang.test.simple_eval_commonr   r	   r
   r   r   r   r   rT   ru   rl   r   r,   r6   r7   r*   r*   r*   r+   <module>   s    
	%