o
    6ti                     @   sp   d dl Z d dlmZmZ d dlZdd Zdedeee ef fddZdedeee ef fd	d
Z	dd Z
dS )    N)ListTuplec                  K   s.   |  d}|stdtjd|dd}d|iS )z
    Load the graphwalks dataset with specific data file.

    Args:
        kwargs: Must contain 'data_file' key specifying which parquet file to load

    Returns:
        Dictionary with 'train' split containing the dataset
    	data_filez-data_file must be specified in dataset_kwargszopenai/graphwalkstrain)
data_filessplit)get
ValueErrordatasetsload_dataset)kwargsr   dataset r   R/home/ubuntu/.local/lib/python3.10/site-packages/lm_eval/tasks/graphwalks/utils.pyr      s   

r   responsereturnc                 C   sr   |  ddd }d|vrg dfS td|}|r5|d}| s'g dfS dd	 |d
D }|dfS g dfS )a  
    Extract the answer list from a model response.

    Args:
        response: The model's generated response

    Returns:
        Tuple of (list of nodes, is_error)
        - list of nodes: extracted node IDs
        - is_error: True if parsing failed, False otherwise
    
zFinal Answer:TFinal Answer:\s*\[(.*)\]   Fc                 S   "   g | ]}|  r|   d qS z'"strip.0itemr   r   r   
<listcomp>7       z'extract_answer_list.<locals>.<listcomp>,)rstripr   researchgroupr   )r   linematchbracket_contentresult_listr   r   r   extract_answer_list   s   
r(   c                 C   st   |  dd}t|D ])}td|}|r5|d}| s%g df  S dd |dD }|df  S qg dfS )	a  
    Extract the answer list from a model response (flexible version).
    Searches backwards through all lines to find "Final Answer:" pattern.
    More lenient than extract_answer_list which only checks the last line.

    Args:
        response: The model's generated response

    Returns:
        Tuple of (list of nodes, is_error)
        - list of nodes: extracted node IDs
        - is_error: True if parsing failed, False otherwise
    r   r   r   Fc                 S   r   r   r   r   r   r   r   r   Y   r   z0extract_answer_list_flexible.<locals>.<listcomp>r   T)r    r   reversedr!   r"   r#   r   )r   linesr$   r%   r&   r'   r   r   r   extract_answer_list_flexibleA   s   
r+   c                 C   s  |d }| d }t |\}}t|}t|}t||@ }t|}	t|}
|
dkr,||
 nd}|	dkr6||	 nd}|| dkrHd||  ||  nd}t|\}}t|}t||@ }t|}|
dkrf||
 nd}|dkrp|| nd}|| dkrd||  ||  nd}||dS )aB  
    Process results and compute set-based F1 scores.
    Returns both strict F1 (last line only) and flexible F1 (search all lines).

    Args:
        doc: Document containing ground truth answer_nodes
        results: List containing model generation

    Returns:
        Dictionary with f1 and flexible_f1 scores
    r   answer_nodesg           )f1flexible_f1)r(   setlenr+   )docresultsr   
gold_nodespredicted_nodes_strict_sampled_set_strict	truth_setn_overlap_strictn_sampled_strictn_goldenrecall_strictprecision_strict	f1_strictpredicted_nodes_flexiblesampled_set_flexiblen_overlap_flexiblen_sampled_flexiblerecall_flexibleprecision_flexiblef1_flexibler   r   r   process_resultsd   sB   	rF   )r!   typingr   r   r
   r   strboolr(   r+   rF   r   r   r   r   <module>   s    &#