o
    6ti                     @   sf   d dl Z d dlmZmZmZ e jdkre jne jZdd Z	dd Z
dd	 Zd
d Zdd Zdd ZdS )    N)f1_scoreprecision_scorerecall_scorez2.0.0c                 C   s   t dd | D }t dd | D }|dkr"t dd | D d }nt dd | D | }|dkr=t dd | D d }nt d	d | D | }d}|| dkrZd
| | ||  }|S )a_  
    Custom aggregation to compute corpus level metrics for the lexical substitution task
    predictions is a list of tuples (prec, has_answ, has_annotation)
    prec is the precision before dividing by |A|
    has_answ is 0 if the model did not produce any answer
    has_annotation is 0 if the gold answer is empty: no synonims from annotators
    c                 S      g | ]}|d  qS )    .0pr   r   U/home/ubuntu/.local/lib/python3.10/site-packages/lm_eval/tasks/evalita_llm/metrics.py
<listcomp>       z_aggreg_ls.<locals>.<listcomp>c                 S   r   )   r   r   r   r   r   r      r   r   c                 S   r   r   r   r   r   r   r   r      r   r   c                 S   r   r   r   r   r   r   r   r      r   c                 S   r   r   r   r   r   r   r   r      r   c                 S   r   r   r   r   r   r   r   r      r   r   )sum)predictionsATprecrecf1r   r   r   
_aggreg_ls
   s   	r   c                 C   s   t |  \} }t|| dd}|S )a  
    This aggregation considers the sentiment analysis task as a multiple choice one with four classes
    the f1 score is computed as the average of the f1 scores for each class weighted by the number of samples
    See sklearn.metrics.f1_score for more details

    weightedaverage)zipr   )r   
referencesr   r   r   r   _aggreg_sa_v2%   s   r   c                 C   s  t |  \} }	 dd }|| \}}||\}}t||dd}t||ddd }t|dkr2|d }	nd}	t||ddgdd\}
}t||ddgdd\}}t||dd}t||ddd }t|dkrlt||ddd }nd}||	 d }|| d }|| d }|S )	a  
    Custom aggregation function for the sentiment analysis task
    The original tasks compute the F1 score for each class and then average them
    Since the prompt cast the task to a multple choice one we need to aggregate the results in a different way
    c                 S   s   g }g }| D ]?}|dkr| d | d q|dkr&| d | d q|dkr5| d | d q|dkrD| d | d q	 q||fS )zf
        Return two separate list of labels for opos and oneg
        x is a list of integers
        r   r   r      )append)xoposonegir   r   r   _map_to_original_labels?   s"   



z+_aggreg_sa.<locals>._map_to_original_labelsNr   r   r   )labelsr   r   )r   r   lenr   r   )r   r   r$   	pred_opos	pred_onegref_oposref_onegopos_f1
opos_f1_c0
opos_f1_c1oneg_prec_c0oneg_prec_c1oneg_rec_c0oneg_rec_c1oneg_f1
oneg_f1_c0
oneg_f1_c1f1_score_oposf1_score_onegf1_finalr   r   r   
_aggreg_sa1   s2   


r8   c           	      C   s   t |  \}}g }|D ]}|| q
g }|D ]}|| qt||d d}t|dkr;t|d d t|d  }|S |d }|S )Nr   r   r   )r   extendr   r&   r   )	r   predrefall_predr
   all_refrr   f1_sumr   r   r   _aggreg_ner|   s   rA   c                 C   sN   t |  \}}g }|D ]}|| q
g }|D ]}|| qt||dd}|S )Nmacror   )r   r:   r   )r   r;   r<   r=   r
   r>   r?   r   r   r   r   _aggreg_rel   s   rC   c                 C   s.   t t|  }|d }|d }t||dd}|S )Nr   r   rB   r   )listr   r   )itemsunzipped_listgoldspredsfscorer   r   r   
_aggreg_dd   s
   rJ   )torchsklearn.metricsr   r   r   __version__inference_modeno_gradinference_decoratorr   r   r8   rA   rC   rJ   r   r   r   r   <module>   s    K