o
    پi                     @   s6  d dl Z d dlZd dlZd dlmZ d dlZdZdd Zdd Z	dd	 Z
d
d Zdd Zdd ZdededefddZdejfddZdejfddZdejfddZdeded efd!d"Zed#kre jed$Zejd%ed&d' ejd(ed)d* ejd+ed)d* ejd,ed-d' ejd.d/d0 e Zee dS dS )1    N)PathzCompare and find differences to benchmark outputs.

Supported inputs:
* The samples jsonl from `lm_eval --log_samples --output_path FOLDER_NAME`
* The output from `gsm8k/bench_sglang.py --raw-result-file FILE_NAME` (or mmlu)
c                    st  | j dkr
t|  ntt|  t fdddD sJ t } jddddt	d	
 }|d
 d
}|t	d
dk }|t	d
dk}td| j  t| jtjt| | | ddd | jstjdddddd2 td t| td t| d|fd|ffD ]\}}td| d t| qW d    d S 1 sw   Y  d S d S )Nsimple_evalsc                 3   s    | ]}| j v V  qd S N)columns).0cdf_input Z/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/debug_utils/text_comparator.py	<genexpr>   s
    
zmain.<locals>.<genexpr>categorytrial_index	prompt_idpromptoutputcorrectr   r   Tmaintain_orderr   correctness_deltar   zDump output to )df_metadf_good_to_baddf_bad_to_good   )indenti'  	UTF8_FULL)fmt_str_lengthstbl_colstbl_rowstbl_width_charstbl_formattingz#====== Correctness per trial ======zH====== Correctness Delta (-1.0 means all-right becomes all-wrong) ======z	Good->Badz	Bad->Goodz====== Concrete Examples: z ======)	data_type#_compute_df_input_mode_simple_evals_transform_df_input_compute_df_rawall_compute_df_metagroup_byaggplcolmeanlensortfilterprintoutput_pathr   
write_textjsondumpsdictto_dictsdisable_print_detailsConfig)argsr   df_correctness_per_trialdf_correctness_deltar   r   namedfr
   r   r   main   sb   



"r?   c                 C      t dd t| dD S )Nc                 S   s   g | ]	}t d i |qS )r
   )'_compute_df_input_one_mode_simple_evalsr   infor
   r
   r   
<listcomp>L   s    z7_compute_df_input_mode_simple_evals.<locals>.<listcomp>r:   r+   concat_get_file_infosrE   r
   r
   r   r$   J   s
   r$   c           	   	   C   s   t t|  }g }|d d D ]1}|d d }|d }|dv s(J d|t||t|t ||d d |d	kd
}|| qt	|S )Nmetadatasingle_eval_resultsexample_level_metadataactual_queried_prompt_messagesscore>                 ?zscore=response_textrO   r   )
r4   loadsr   	read_textr6   _compute_id_from_objectr5   appendr+   	DataFrame)	pathr   r   datarowssingle_eval_resultr   rM   rowr
   r
   r   rA   S   s$   

rA   c                 C   s8   t | tjr
|  } tj| ddd}t|d	 S )NTF)	sort_keysensure_asciizutf-8)

isinstancer+   Seriesto_listr4   r5   hashlibsha256encode	hexdigest)objjson_strr
   r
   r   rS   k   s   rS   c                 C   r@   )Nc                 S   s&   g | ]}t |d  |d |d dqS )rV   r   r   rV   r   r   )_read_df_rawrB   r
   r
   r   rD   t   s    z#_compute_df_raw.<locals>.<listcomp>rE   rF   rE   r
   r
   r   r&   r   s
   r&   c                 C   s   dd d| j fd| jffD S )Nc                 S   s0   g | ]\}}t |D ]\}}t|||d q
qS )rf   )	enumerater6   )r   r   pathsr   rV   r
   r
   r   rD      s    z#_get_file_infos.<locals>.<listcomp>baselinetarget)baseline_pathtarget_pathrE   r
   r
   r   rH      s
   rH   rV   r   r   c                 C   s   t | jt ||dS )N)r   r   )r+   read_ndjsonwith_columnslitrf   r
   r
   r   rg      s   

rg   r>   c              	   C   s   d| j v rctd | d jdd }t|dkr1|d }td|d	|  | td|k} | jtd
tdtdtdj	
dj	
dtdjdjdtdtd} | S d| j v rntd | S td| j  d)Ndoc_idzTransform mode: lm_evalr0   Tr      r   zChoose filter_name=z among r   r   	arguments
gen_args_0arg_0respsexact_match)r   r   r   r   r   zTransform mode: SGLang benchzUnknown data: z?. You may need to set `--data-type` if using e.g. simple_evals.)r   r1   uniquer_   r.   r0   r+   r,   selectstructfieldlistgetcastbool	Exception)r>   filter_namesfilter_namer
   r
   r   r%      s,   
	
r%   r	   c                 C   sX   |  ddd} tdd | jdddD }|jtdtd	 d
}| dd}|S )Nr   r   r   c                 S   s   g | ]}t |qS r
   )_handle_one_prompt)r   df_one_promptr
   r
   r   rD      s    z$_compute_df_meta.<locals>.<listcomp>Tr   correctness_targetcorrectness_baseline)r   r   output_same_prefix_len)r/   r+   rU   partition_byro   r,   )r	   r   r
   r
   r   r(      s   r(   r   c              	      s   t tdd | d D dksJ | tddk}| tddk}|d  }|d   t fd	d|D }t| d
 |d  |d  || d | dS )Nc                 s   s    | ]}t |V  qd S r   )rS   )r   rd   r
   r
   r   r      s    z%_handle_one_prompt.<locals>.<genexpr>r   rr   r   rj   rk   r   c                 3   s$    | ]} D ]}t ||V  qqd S r   )_compute_str_prefix_len)r   output_baselineoutput_targetoutputs_targetr
   r   r      s    )r   r   r   )r   r   )r   r   r   r   r   outputs_baseliner   )	r.   setr0   r+   r,   r_   maxr6   r-   )r   df_baseline	df_targetr   r   r
   r   r   r      s"   "

r   abreturnc                 C   s<   t t| t|}t|D ]}| | || kr|  S q|S r   )minr.   range)r   r   min_lenir
   r
   r   r      s   r   __main__)descriptionz--data-typeauto)typedefaultz--baseline-path+)r   nargsz--target-pathz--output-pathz /tmp/text_comparator_output.jsonz--disable-print-details
store_true)action)argparser`   r4   pathlibr   polarsr+   _DESCRIPTIONr?   r$   rA   rS   r&   rH   strintrg   rU   r%   r(   r   r   __name__ArgumentParserparseradd_argument
parse_argsr:   r
   r
   r
   r   <module>   s:    :	