o
    ߥis                     @   s  d Z ddlZddlZddlZddlZddlZddlZddlmZm	Z	 ddl
Z
ddlZddlmZ dZeg dZdd ZG d	d
 d
Zd@ddZdAddZdAddZG dd deZdd Zdd ZdBddZdCddZdd Zdd  Zd!d" Zd#d$ Z d%d& Z!d'd( Z"d)d* Z#d+d, Z$d-d. Z%d/d0 Z&d1d2 Z'd3d4 Z(d5d6 Z)e*d7kre+ Z,e,j-d8d9d: e,j-d;d<d: e,j-d=d>d: e,. Z/e/j01 2d?de/_0e)e/ dS dS )Dz?
This module computes evaluation metrics for DuReader dataset.
    N)Counterdefaultdict)Rouge YesNoDependsc                    s   t | t  k r|   }  fddtdt | d D }tdt  d D ]@}tdt | d D ]4}| |d   |d  krP||d  |d  d || |< q1t||d  | || |d  || |< q1q&|t |  t   S )a  
    Calculates longest common subsequence for a pair of tokenized strings
    :param string : list of str : tokens from a string split using whitespace
    :param sub : list of str : shorter string, also split using whitespace
    :returns: length (list of int): length of the longest common subsequence between the two strings

    Note: my_lcs only gives length of the longest common subsequence, not the actual LCS
    c                    s(   g | ]}d d t dt d D qS )c                 S   s   g | ]}d qS r    ).0ir   r   _/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/nlp/palm_v2/dureader_eval.py
<listcomp>0       z%my_lcs.<locals>.<listcomp>.<listcomp>r      )rangelen)r   jsubr   r   r   0   s
    

zmy_lcs.<locals>.<listcomp>r   r   )r   r   max)stringr   lengthsr   r   r   r   r   my_lcs$   s   	


",r   c                   @   s&   e Zd Zd	ddZdd Zdd ZdS )
Bleu   c                 C   s   || _ i | _i | _d S N)_n_hypo_for_imageref_for_image)selfnr   r   r   __init__A   s   
zBleu.__init__c           
      C   s   t | t | ksJ t | }t| jd}|D ]2}|| }|| }t|t u s.J t|dks6J t|t u s>J t|dksFJ ||d |f7 }q|jddd\}}	||	fS )Nr"   r   r   closest)optionverbose)listkeys
BleuScorerr   typer   compute_score)
r!   gtsresimgIdsbleu_scoreridhyporefscorescoresr   r   r   r,   G   s   zBleu.compute_scorec                 C   s   dS )Nr   r   r!   r   r   r   method[   s   zBleu.methodNr   )__name__
__module____qualname__r#   r,   r7   r   r   r   r   r   ?   s    
r   r   Fc                 C   sn   |   }tt}td|d D ]!}tt|| d D ]}t||||  }||  d7  < qqt||fS )zTakes a string as input and returns an object that can be given to
    either cook_refs or cook_test. This is optional: cook_refs and cook_test
    can take string arguments as well.r   )splitr   intr   r   tuple)sr"   outwordscountskr   ngramr   r   r   precook_   s   rE   c           
      C   s   g }i }| D ]"}t ||\}}|| | D ]\}}	t||d|	||< qq|dkr5t|}||fS |dkrCtt|t| }||fS )zTakes a list of reference sentences for a single segment
    and returns an object that encapsulates everything that BLEU
    needs to know about them.r   shortestaverage)	rE   appenditemsr   getminfloatsumr   )
refseffr"   reflen	maxcountsr3   rlrB   rD   countr   r   r   	cook_refsl   s   
rT   c           
   	      s   |\}}t | |d\ }i }|dkr"t fdd|D d |d< n||d<  |d<  fdd	td|d D |d
< dg| |d< | D ]\}}	|d t|d   t||d|	7  < qE|S )znTakes a test sentence and returns an object that
    encapsulates everything that BLEU needs to know about it.Tr%   c                 3        | ]}t |  |fV  qd S r   absr   r3   testlenr   r   	<genexpr>       zcook_test.<locals>.<genexpr>r   rP   rZ   c                    s   g | ]}t d  | d qS )r   r   )r   r   rC   rY   r   r   r      s    zcook_test.<locals>.<listcomp>guessr   correct)rE   rK   r   rI   r   rJ   )
testxxx_todo_changemerO   r"   rP   refmaxcountsrB   resultrD   rS   r   rY   r   	cook_test   s     
rd   c                   @   s   e Zd ZdZdZdd Zd)ddZd	d
 Zd*ddZd*ddZ	d*ddZ
d*ddZd*ddZdd Zdd Zdd Zdd Zdd Zd+d d!Zd,d"d#Zd-d%d&Zd-d'd(ZdS ).r*   zBleu scorer.
    )r"   crefsctest_score_ratio_testlen_reflenspecial_reflenc                 C   s2   t | jd}t| j|_t| j|_d|_|S )z copy the refs.r$   N)r*   r"   copyrf   re   rg   )r!   newr   r   r   rl      s
   zBleuScorer.copyNr   c                 C   s(   || _ g | _g | _| || || _dS )z singular instance N)r"   re   rf   cook_appendrk   )r!   r`   rN   r"   rk   r   r   r   r#      s
   
zBleuScorer.__init__c                 C   sT   |dur%| j t| |durt|| j d }| j| n| jd d| _dS )zCcalled by constructor and __iadd__ to avoid creating new instances.N)re   rH   rT   rd   rf   rg   )r!   r`   rN   cooked_testr   r   r   rn      s   
zBleuScorer.cook_appendc                 C      | j |d | jS Nr&   )r,   rh   r!   r&   r   r   r   ratio      zBleuScorer.ratioc                 C   s   | j |d| j|dfS )zreturn (bleu, len_ratio) pairrs   )fscoreru   rt   r   r   r   score_ratio   s   zBleuScorer.score_ratioc                 C   s   d|  | S )Nz%.4f (%.2f))rx   rt   r   r   r   score_ratio_str   s   zBleuScorer.score_ratio_strc                 C   rq   rr   )r,   rj   rt   r   r   r   rP      rv   zBleuScorer.reflenc                 C   rq   rr   )r,   ri   rt   r   r   r   rZ      rv   zBleuScorer.testlenc                 C   sd   t |tu r	|g}t|t| jksJ |g | _t|| jD ]\}}| jt|| qd | _| S r   )	r+   strr   re   rf   ziprH   rd   rg   )r!   new_testtrsr   r   r   retest   s   zBleuScorer.retestc                 C   s   |  | S )z= replace test(s) with new test(s), and returns the new score.)r   r,   )r!   r|   r   r   r   rescore   s   zBleuScorer.rescorec                 C   s:   t | jt | jksJ dt | jt | jf t | jS )Nzrefs/test mismatch! %d<>%d)r   re   rf   r6   r   r   r   size   s   

zBleuScorer.sizec                 C   s\   t |tu r| |d |d  | S | |sJ d| j|j | j|j d| _| S )z.add an instance (e.g., from another sentence).r   r   zincompatible BLEUs.N)r+   r>   rn   
compatiblerf   extendre   rg   r!   otherr   r   r   __iadd__   s   zBleuScorer.__iadd__c                 C   s   t |to
| j|jkS r   )
isinstancer*   r"   r   r   r   r   r         zBleuScorer.compatiblerG   c                 C   s   |  | jd d |S Nr   )_single_reflenre   rt   r   r   r   single_reflen  r   zBleuScorer.single_reflenc                    sf   |dkr
t |}|S |dkrtt|t| }|S |dkr-t  fdd|D d }|S J d| )	NrF   rG   r%   c                 3   rU   r   rV   rX   rY   r   r   r[     r\   z,BleuScorer._single_reflen.<locals>.<genexpr>r   Fzunsupported reflen option %s)rK   rL   rM   r   )r!   reflensr&   rZ   rP   r   rY   r   r     s   zBleuScorer._single_reflenr   c                 C   s   d | _ | ||S r   )rg   r,   )r!   r&   r'   r   r   r   recompute_score  s   zBleuScorer.recompute_scorec              
   C   s  | j }d}d}dd t|D }| jd ur| jS |d u r't| jdkr%dnd}d| _d| _dddg| dg| d	}| jD ]}|d
 }	|  j|	7  _| jd u rY| 	|d ||	}
n| j}
|  j|
7  _dD ]}t|D ]}|| |  || | 7  < qkqed}t|D ]%}|t
|d | | t
|d | |  9 }|| |d|d    q|	| |
|  }|dk rt|D ]}|| d  tdd|  9  < q|dkrt||
 q=| j|d< | j|d
< g }d}t|D ]!}|t
|d | | |d | |  9 }||d|d    q| j| | j|  }|dk r1t|D ]}||  tdd|  9  < q|dkr?t| td| || _| j|fS )Ng&.>gV瞯<c                 S   s   g | ]}g qS r   r   )r   _r   r   r   r     r   z,BleuScorer.compute_score.<locals>.<listcomp>r   rG   r%   r   )rZ   rP   r^   r_   rZ   rP   )r^   r_         ?r_   r^   ro   zratio:)r"   r   rg   r   re   ri   rj   rf   rk   r   rL   rH   mathexpprint)r!   r&   r'   r"   smalltiny	bleu_list
totalcompscompsrZ   rP   keyrC   bleuru   bleusr   r   r   r,     sr   


$



"


zBleuScorer.compute_score)NNr   Nr   )rG   NNr   )r9   r:   r;   __doc__	__slots__rl   r#   rn   ru   rx   ry   rP   rZ   r   r   r   r   r   r   r   r   r,   r   r   r   r   r*      s(    
	







r*   c                 C   s<   | s| S g }| D ]}dd t |D }|d| q|S )z
    Normalize strings to space joined chars.

    Args:
        s: a list of strings.

    Returns:
        A list of normalized strings.
    c                 S   s    g | ]}t | d kr|qS r
   )r   strip)r   cr   r   r   r   o       znormalize.<locals>.<listcomp> )r(   rH   join)r?   
normalizedsstokensr   r   r   	normalizea  s   
r   c                 C   s   d| v sJ dd| v sJ d | d d| v s"J d | d t| d ts2J d | d d| v s?J d	 | d t| d trNt| d d
ksWJ d | d dS )zX
    Check data.

    Raises:
        Raises AssertionError when data is not legal.
    question_idzMissing 'question_id' field.question_typez.Missing 'question_type' field. question_id: {}yesno_answersz.Missing 'yesno_answers' field. question_id: {}z'yesno_answers' field must be a list, if the 'question_type' is not
            'YES_NO', then this field should be an empty list.
            question_id: {}entity_answersz/Missing 'entity_answers' field. question_id: {}r   z'entity_answers' field
            must be a list, and has at least one element, which can be a empty list.
            question_id: {}N)formatr   r(   r   )objtaskr   r   r   
data_checkt  s*   




r   c              
   C   s   ddd}i }g d}|r|dg7 }|  drt| dnd}|du r&| gn| }|D ]F}||d|dD ]<}	z	t|	 }
W n tyK   td	w t|
| |
d
 }||vs`J d	|i ||< |D ]
}|
| || |< qfq5q,|S )a  
    Read predict answers or reference answers from file.

    Args:
        file_name: the name of the file containing predict result or reference
                   result.

    Returns:
        A dictionary mapping question_id to the result information. The result
        information itself is also a dictionary with has four keys:
        - question_type: type of the query.
        - yesno_answers: A list of yesno answers corresponding to 'answers'.
        - answers: A list of predicted answers.
        - entity_answers: A list, each element is also a list containing the entities
                    tagged out from the corresponding answer string.
    Nc                 S   s   |d ur
| | |S t | |S r   )open)	file_namemodezip_objr   r   r   _open  s   
zread_file.<locals>._open)answersr   r   r   sourcez.zipr)r   z'Every line of data should be legal jsonr   zDuplicate question_id: {}r   )
endswithzipfileZipFilenamelistjsonloadsr   
ValueErrorr   r   )r   r   is_refr   resultsr)   zf	file_listfnliner   qidrC   r   r   r   	read_file  s8   


r   c           	      C   s   t |  t | ksJ dt | t |   i }t||| \}}t|D ]\}}||d|d  < q-t tt	dd | 
 tt	dd |
 }tdd |D t| }||d	< |S )
z(
    Compute bleu and rouge scores.
    zmissing keys: {}zBleu-%dr   c                 S      | d S r   r   xr   r   r   <lambda>      z$compute_bleu_rouge.<locals>.<lambda>c                 S   r   r   r   r   r   r   r   r     r   c                 S   s   g | ]}|d  d qS )zrouge-lfr   )r   dr   r   r   r     s    z&compute_bleu_rouge.<locals>.<listcomp>Rouge-L)setr)   r   r   r,   	enumerater   
get_scoresr(   mapvaluesrM   r   )		pred_dictref_dict
bleu_orderr5   bleu_scoresr   r   
bleu_scorerouge_scorer   r   r   compute_bleu_rouge  s$   r   c                 C   sf   t | t |@ }t| }|dkrdS d| t|  }d| t| }d| | ||  }|||fS )zp
    Compute local precision recall and f1-score,
    given only one prediction list and one reference list
    r   r   r   r   r      )r   rM   r   r   )	pred_listref_listcommonnum_samepr   f1r   r   r   	local_prf  s   
r   c                 C   sJ  t | }d\}}}|D ]i}| |g g}t|dks#J d||d }|| }d}	d}
|D ]}t||d }||	krB|}
|}	q1|
du rZt|dkrXt|dd d	d }
ng }
t |
}t |}|t||@ 7 }|t|7 }|t|7 }q|dkrt|| nd}|dkrt|| nd}|dkrd| | ||  nd}|||d
S )z0
    Compute precision recall and f1-score.
    r   r   z6the number of entity list for question_id {} is not 1.r   Nr   c                 S   s   t | S r   )r   r   r   r   r   r     r   zcompute_prf.<locals>.<lambda>)r   )	PrecisionRecallF1)r   r)   rJ   r   r   r   sortedrL   )r   r   ref_question_idscorrect_predstotal_correcttotal_predsr   pred_entity_listall_ref_entity_listsbest_local_f1best_ref_entity_listref_entity_listlocal_f1gold_entitiespred_entitiesr   r   r   r   r   r   compute_prf  sD   
 r   c                 C   s,   dd |   D }dd |  D }||fS )z6
    Prepares data for calculation of prf scores.
    c                 S      i | ]	\}}||d  qS r   r   r   rC   vr   r   r   
<dictcomp>
      zprepare_prf.<locals>.<dictcomp>c                 S   r   r   r   r   r   r   r   r     r   )rI   )r   r   predsrN   r   r   r   prepare_prf  s   r   c                 C   s,   i }|   D ]\}}||r|||< q|S )zM
    Filter a subset of the result_dict, where keys ends with 'key_tag'.
    )rI   r   )result_dictkey_tagfilteredrC   r   r   r   r   filter_dict  s   
r   c                    st  i }i }i }|dkr|}| }n|  D ]\}}|d |kr+|||< || v r+| | ||< q|dks8|dks8|dkrGt|||\t}|S |dkrt|||\g d}	fdd	|	D }
fd
d	|	D }t}t|	|
|D ]\ }}t||} fdd	|  D }|| qt|S |dkrt||\t|||\}}t}|t|| |S td|)z
    Computes metrics.
    bothr   mainalldescriptionyesnor   c                       g | ]}t  |qS r   r   r]   )r   r   r   r   6      zget_metrics.<locals>.<listcomp>c                    r  r   r  r]   )r   r   r   r   7  r  c                    s    g | ]\}} d  | |fqS )|r   )r   r   r   )rC   r   r   r   =  r   entityIllegal task name: {})	rI   prepare_bleur   r{   updater   r   r   r   )pred_result
ref_resultr   r   metricsref_result_filteredpred_result_filteredr   infor)   r   rN   predr3   mk_metricpred_dict_bleuref_dict_bleur   )rC   r   r   r   get_metrics  sZ   



r  c                 C   sZ  g g }}|  }|D ]V}|dkrt|| |\}}n;|dkr't|| |\}}n.|dkr4t|| |\}}n!|dkrAt|| |\}}n|dkrNt|| |\}}ntd||ra|ra||7 }||7 }qt|}	t|}
|
	 D ]$\}}t
|
| |
|< t
|	|tg|	|< |r|tgkr|
|= |	|= qn|		 D ]\}}t|dksJ d|q|	|
fS )	zA
    Prepares data for calculation of bleu and rouge scores.
    r  r  r  r	  r  r
  r   z8There should be only one predict answer. question_id: {})r)   get_main_resultget_yesno_resultget_all_resultget_entity_resultget_desc_resultr   r   dictrI   r   rJ   EMPTYr   )r  r  r   r   r   qidsr   r  r3   r   r   ansrC   r   r   r   r   r  M  s@   
r  c                 C   sP   ||  d }|st g}|| i dg dd }|st g}| |fg| |fgfS )a-  
    Prepare answers for task 'main'.

    Args:
        qid: question_id.
        pred_result: A dict include all question_id's result information read
                     from args.pred_file.
        ref_result: A dict incluce all question_id's result information read
                    from args.ref_file.
    Returns:
        Two lists, the first one contains predict result, the second
        one contains reference result of the same question_id. Each list has
        elements of tuple (question_id, answers), 'answers' is a list of strings.
    r   Nr   )r  rJ   )r   r  r  ref_anspred_ansr   r   r   r  r  s   r  c                 C       ||  d dkr
dS t | ||S )a/  
    Prepare answers for task 'entity'.

    Args:
        qid: question_id.
        pred_result: A dict include all question_id's result information read
                     from args.pred_file.
        ref_result: A dict incluce all question_id's result information read
                    from args.ref_file.
    Returns:
        Two lists, the first one contains predict result, the second
        one contains reference result of the same question_id. Each list has
        elements of tuple (question_id, answers), 'answers' is a list of strings.
    r   ENTITYr   r  r   r  r  r   r   r   r       r  c                 C   r$  )a4  
    Prepare answers for task 'description'.

    Args:
        qid: question_id.
        pred_result: A dict include all question_id's result information read
                     from args.pred_file.
        ref_result: A dict incluce all question_id's result information read
                    from args.ref_file.
    Returns:
        Two lists, the first one contains predict result, the second
        one contains reference result of the same question_id. Each list has
        elements of tuple (question_id, answers), 'answers' is a list of strings.
    r   DESCRIPTIONr   r&  r'  r   r   r   r    r(  r  c                    sT   dd dd  d fdd	}||  d d	krd
S || |dd}|| |}||fS )a.  
    Prepare answers for task 'yesno'.

    Args:
        qid: question_id.
        pred_result: A dict include all question_id's result information read
                     from args.pred_file.
        ref_result: A dict incluce all question_id's result information read
                    from args.ref_file.
    Returns:
        Two lists, the first one contains predict result, the second
        one contains reference result of the same question_id. Each list has
        elements of tuple (question_id, answers), 'answers' is a list of strings.
    c                 S   s   g }g }t  }| D ]\}}||vr|||f || q	|||f q	|rDt|}|D ]\}}||  |7  < q.dd | D }|S )Nc                 S   s   g | ]\}}||fqS r   r   r   r   r   r   r     s    z3get_yesno_result.<locals>._uniq.<locals>.<listcomp>)r   rH   addr  rI   )lir   uniq_lileftr)   rC   r   dict_lir   r   r   _uniq  s   zget_yesno_result.<locals>._uniqc                 S   s@   | d d  }t dd | D }t| D ]
}||tgf q|S )Nc                 S   s   g | ]}|d  qS r
   r   )r   r   r   r   r   r     s    z<get_yesno_result.<locals>._expand_result.<locals>.<listcomp>)r   YESNO_LABELSrH   r  )r,  expandedr)   rC   r   r   r   _expand_result  s
   z(get_yesno_result.<locals>._expand_resultFc                    sj    |vr fddg D S |  d }|  d }dd t ||D |} fdd|D }|S )Nc                    $   g | ]\}}t  d  | |fqS r   rz   r   r   r   r   r        $ z<get_yesno_result.<locals>._get_yesno_ans.<locals>.<listcomp>r   r   c                 S   s   g | ]	\}}||gfqS r   r   r   r   r   r   r     r   c                    r3  r4  r5  r   r6  r   r   r     r7  )r{   )r   r   r   r   r   lbl_ansretr2  r/  r6  r   _get_yesno_ans  s   z(get_yesno_result.<locals>._get_yesno_ansr   YES_NOr   Tr   NFr   )r   r  r  r;  r"  r#  r   r:  r   r    s   

r  c                 C   s(   ||  d dkrt | ||S t| ||S )a,  
    Prepare answers for task 'all'.

    Args:
        qid: question_id.
        pred_result: A dict include all question_id's result information read
                     from args.pred_file.
        ref_result: A dict incluce all question_id's result information read
                    from args.ref_file.
    Returns:
        Two lists, the first one contains predict result, the second
        one contains reference result of the same question_id. Each list has
        elements of tuple (question_id, answers), 'answers' is a list of strings.
    r   r<  )r  r  r'  r   r   r   r    s   r  c                 C   s  i }g d}|durt |dg dS g }|dkr|dkrdg}|dkrNd	d
g}g d}|| D ]}|D ]}	|t| |	 |dd d|	d}
||
 q3q/no|dkrd	d
g}g d}|d }	|D ]7}|t| |	 |dd ddd}
||
 |D ]}|t| |	 |d | dd d|d}
||
 qzq`n$d	d
g}|D ]}|D ]}	|t| |	 |dd d|	d}
||
 qq||d< d|d< d|d< |S )a  
    Format metrics. 'err' field returns any error occured during evaluation.

    Args:
        metrics: A dict object contains metrics for different tasks.
        task: Task name.
        err_msg: Exception raised during evaluation.
    Returns:
        Formatted result.
    r   searchzhidaoNr   )errorMsg	errorCodedatar  r  r   r	  zBleu-4r   )r   r   r   r   d   r   )namevaluer+   r  r   Allr  rD  rC  successrB  )rz   roundrJ   rH   )r  r   err_msgrc   sourcesrD  metric_namesmetric_names_prfrF  srcr   detailsr   r   r   r   format_metrics  sh   
rQ  c           	   
   C   s   d}i }z4t | j| j}t | j| jdd}g d}| jtddgvr(|dd }|D ]}t||| j|||< q*W n' tyL } z|}W Y d}~nd}~w ty_ } z|}W Y d}~nd}~ww tt	j
t|| j|dd	d
 dS )z
    Do evaluation.
    NTr=  r?  r  r  r   F)ensure_asciiutf8)r   	pred_filer   ref_filer   r  r   AssertionErrorr   r   dumpsrQ  encode)	argserrr  r  r  rL  r   veaer   r   r   r  E  s6   

r  __main__rT  zpredict file)helprU  zreference filer   z-task name: Main|Yes_No|All|Entity|Descriptionr   )r   F)Nr   r>  r8   )3r   argparserl   r   resysr   collectionsr   r   r   numpynprouger   r  r   r0  r   r   rE   rT   rd   objectr*   r   r   r   r   r   r   r   r   r  r  r  r  r  r  r  rQ  r  r9   ArgumentParserparseradd_argument
parse_argsrY  r   lowerreplacer   r   r   r   <module>   s`   
 

 A

/%	3%;C