o
    }oid:                     @   s   d Z ddlZddlZddlmZ edg dZdZdZ	dZ
d	Zd
ZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdd Zdd Zd d! Z d"d# Z!d$d% Z"d&d' Z#d(d) Z$d*d+ Z%dS ),a  Evaluation metrics for Schema-guided dialogue.

This library provides functions for calculating the evaluation metrics for a
single dialogue. The following metrics are defined:

(1) Active intent accuracy: The fraction of user turns for which the active
  intent has been correctly predicted.
(2) Slot tagging F1: The macro-averaged F1 score for tagging slot values for
  non-categorical slots. This metric is optional to report in the final paper
  if participants decide not to use slot tagging.
(3) Requested slots F1: The macro-averaged F1 score for requested slots over the
  turns. For a turn, if there are no requested slots in both the ground truth
  and the prediction, that turn is skipped. The reported number is the average
  F1 score for all un-skipped user turns. This metric is optional to report in
  the final paper.
(4) Average goal accuracy: For each turn, participants must predict a single
  value for each slot present in the dialogue state. The slots which have a
  non-empty assignment in the ground truth dialogue state are only considered.
  This is the average accuracy of predicting the value of a slot correctly. A
  fuzzy matching based score is used for non-categorical slots.
(5) Joint goal accuracy: This is the average accuracy of predicting all slot
  assignments for a turn correctly. A fuzzy matching based score is used for
  non-categorical slots. This is the primary evaluation metric used for ranking
  submissions. More details to follow with the evaluation script.

This file contains code artifacts adapted from the original implementation:
https://github.com/google-research/google-research/blob/master/schema_guided_dst/metrics.py
    N)fuzzF1Scoresf1	precisionrecallactive_intent_accuracyslot_tagging_f1slot_tagging_precisionslot_tagging_recallrequested_slots_f1requested_slots_precisionrequested_slots_recallaverage_goal_accuracyaverage_cat_accuracyaverage_noncat_accuracyjoint_goal_accuracyjoint_cat_accuracyjoint_noncat_accuracyaverage_cat_status_accuracyaverage_cat_value_accuracyaverage_noncat_status_accuracyaverage_noncat_value_accuracyjoint_cat_status_accuracyjoint_cat_value_accuracyjoint_noncat_status_accuracyjoint_noncat_value_accuracyNAc           
      C   s   t | }t |}t| }t| }t||@  }|r&t|| nd}|r0t|| nd}|| dkrCd| | ||  }	nd}	t|	||dS )a  Compute F1 score from reference (grouth truth) list and hypothesis list.
    Args:
      list_ref: List of true elements.
      list_hyp: List of postive (retrieved) elements.
    Returns:
      A F1Scores object containing F1, precision, and recall scores.
          ?        g       @r   )collectionsCountersumvaluesfloatr   )
list_reflist_hyprefhyptruepositivetrue_positiver   r   r    r,   \/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/nlp/metrics/sgd_metrics.py
compute_f1W   s   
	
r.   c                 C   s   t | |d S )zReturns fuzzy string similarity score in range [0.0, 1.0].
    Args:
      str_ref: reference string
      str_hyp: hypothesis string
    Returns:
      fuzzy string similarity
    g      Y@)r   token_sort_ratio)str_refstr_hypr,   r,   r-   fuzzy_string_matcho   s   
r2   c                 C   s8   d}| D ]}|rt ||}nt||k}t||}q|S )a/  Calculate non-categorical slots correctness.
    Args:
      str_ref_list: a list of reference strings.
      str_hyp: the hypothesis string.
      use_fuzzy_match: whether to use fuzzy string matching.
    Returns:
      score: The highest fuzzy string match score of the references and hypotheis.
    r   )r2   r$   max)str_ref_listr1   use_fuzzy_matchscorer0   match_scorer,   r,   r-   noncat_slot_value_match|   s   	r8   c                 C   sh  g }g }g }g }g }|d D ]z}	|	d }
| |	d  |
| v re| d |
|v rU| |
 }||
 d }|	d r?t|d |k}nt|||}| | | d | | q| d | d | d q| d	 |
|v ry| d | d n
| d | d | d qt|t|d ksJ t|t|d ksJ t|t|d ksJ |||||fS )
aQ  Compare and get correctness of goal state's slot_values.

    Args:
      slot_values_ref: goal state slot_values from reference (ground truth).
      slot_values_hyp: goal state slot_values from hypothesis (prediction).
      service: a service data structure in the schema. We use it to obtain the
        list of slots in the service and infer whether a slot is categorical.
      use_fuzzy_match: whether to use fuzzy string matching for non-categorical
        slot values

    Returns:
      list_cor: list of corectness scores, each corresponding to one slot in the
          service. The score is a float either 0.0 or 1.0 for categorical slot,
          and in range [0.0, 1.0] for non-categorical slot.
      slot_active: list indicating whether the element in list_cor corresponds to
          an active ground-truth slot.
      slot_cat: list indicating whether the element in list_cor corresponds to a
          categorical slot.
      list_cor_status: list of correct slot statuses 
      list_cor_value: list of correctness score only for active slots. Monactive slots are assigned -1.
    slotsnameis_categoricalTr   r   r   g      F)appendr$   r8   len)slot_values_refslot_values_hypservicer5   list_corlist_cor_statuslist_cor_valueslot_activeslot_catslot	slot_namevalue_ref_list	value_hypcorr,   r,   r-   compare_slot_values   sB   








rK   c                 C   s   t | d d |d d kS )a  Get active intent accuracy of a frame.

    Args:
      frame_ref: single semantic frame from reference (ground truth) file.
      frame_hyp: single semantic frame from hypothesis (prediction) file.

    Returns:
      1.0 if the intent prediction is correct, otherwise 0.0.
    stateactive_intent)r$   	frame_ref	frame_hypr,   r,   r-   get_active_intent_accuracy   s   
rQ   c                    sX   dd |d D  d|vrdS  fdd| d D } fdd|d D }t ||S )a  Get slot tagging (non-categorical slots only) F1 scores of a frame.

    Args:
      frame_ref: single semantic frame from reference (ground truth) file.
      frame_hyp: single semantic frame from hypothesis (prediction) file.
      utt: user utterance. Slot tagging annotations are the character positions in
        the utterance.
      service: a service data structure in the schema. We use it to infer whether
        a slot is non-categorical.

    Returns:
      A F1Scores object containing F1, precision, and recall scores.
    c                 S   s   g | ]
}|d  s|d qS )r;   r:   r,   .0sr,   r,   r-   
<listcomp>   s    z'get_slot_tagging_f1.<locals>.<listcomp>r9   Nc                    4   g | ]}|d   v r|d  |d |d  fqS rF   startexclusive_endr,   rR   list_noncat_slotsuttr,   r-   rU      
    c                    rV   rW   r,   rR   rZ   r,   r-   rU      r]   r.   )rO   rP   r\   r@   r%   r&   r,   rZ   r-   get_slot_tagging_f1   s   
r_   c                 C   s   t | d d |d d S )a!  Get requested slots F1 scores of a frame.

    Args:
      frame_ref: single semantic frame from reference (ground truth) file.
      frame_hyp: single semantic frame from hypothesis (prediction) file.

    Returns:
      A F1Scores object containing F1, precision, and recall scores.
    rL   requested_slotsr^   rN   r,   r,   r-   get_requested_slots_f1   s   
ra   c                 C   sz  i }t | d d |d d ||\}}}}}	dd t||D }
|
r't|
nt|t< dd t|||D }|r=t|nt|t< dd t|||D }|rSt|nt|t< |r^t|nt|t	< dd t||D }|rst|nt|t
< dd t||D }|rt|nt|t< d	d t|||D }|rt|nt|t< d
d t||D }|rt|nt|t< dd t|||D }|rt|nt|t< dd t||D }|rt|nt|t< dd t|	||D }|rt|nt|t< dd t|	|D }|r
t|nt|t< dd t|	||D }|r!t|nt|t< dd t|	|D }|r7t|nt|t< |S )aq  Get average and joint goal accuracies of a frame.

    Args:
      frame_ref: single semantic frame from reference (ground truth) file.
      frame_hyp: single semantic frame from hypothesis (prediction) file.
      service: a service data structure in the schema. We use it to obtain the
        list of slots in the service and infer whether a slot is categorical.
      use_fuzzy_match: whether to use fuzzy string matching for comparing
        non-categorical slot values.

    Returns:
      goal_acc: a dict whose values are average / joint
          all-goal / categorical-goal / non-categorical-goal accuracies.
    rL   slot_valuesc                 S      g | ]\}}|r|qS r,   r,   )rS   accactiver,   r,   r-   rU         z7get_average_and_joint_goal_accuracy.<locals>.<listcomp>c                 S   s   g | ]\}}}|r|r|qS r,   r,   rS   rd   re   catr,   r,   r-   rU   !      c                 S   s   g | ]\}}}|r|s|qS r,   r,   rg   r,   r,   r-   rU   $  ri   c                 S   rc   r,   r,   rS   rd   rh   r,   r,   r-   rU   *  rf   c                 S      g | ]\}}|s|qS r,   r,   rj   r,   r,   r-   rU   -  rf   c                 S   s   g | ]\}}}|r|r|qS r,   r,   rg   r,   r,   r-   rU   2  ri   c                 S   rc   r,   r,   rj   r,   r,   r-   rU   5  rf   c                 S   s   g | ]\}}}|s|r|qS r,   r,   rg   r,   r,   r-   rU   9  s    
c                 S   rk   r,   r,   rj   r,   r,   r-   rU   @  rf   c                 S   s&   g | ]\}}}|r|d kr|r|qS g      r,   rg   r,   r,   r-   rU   D      
c                 S   s    g | ]\}}|r|d kr|qS rl   r,   rj   r,   r,   r-   rU   I       c                 S   s&   g | ]\}}}|s|d kr|r|qS rl   r,   rg   r,   r,   r-   rU   M  rm   c                 S   s    g | ]\}}|s|d kr|qS rl   r,   rj   r,   r,   r-   rU   R  rn   )rK   zipnpmeanNAN_VALAVERAGE_GOAL_ACCURACYAVERAGE_CAT_ACCURACYAVERAGE_NONCAT_ACCURACYprodJOINT_GOAL_ACCURACYJOINT_CAT_ACCURACYJOINT_NONCAT_ACCURACYAVERAGE_CAT_STATUS_ACCURACYJOINT_CAT_STATUS_ACCURACYAVERAGE_NONCAT_STATUS_ACCURACYJOINT_NONCAT_STATUS_ACCURACYAVERAGE_CAT_VALUE_ACCURACYJOINT_CAT_VALUE_ACCURACYAVERAGE_NONCAT_VALUE_ACCURACYJOINT_NONCAT_VALUE_ACCURACY)rO   rP   r@   r5   goal_acclist_accrD   rE   list_status_acclist_value_acc
active_accactive_cat_accactive_noncat_acccat_acc
noncat_accactive_cat_status_acccat_status_accactive_noncat_status_accnoncat_status_accactive_cat_val_acccat_val_accactive_noncat_val_accnoncat_val_accr,   r,   r-   #get_average_and_joint_goal_accuracy  sN   


r   )&__doc__r    numpyrp   	rapidfuzzr   
namedtupler   ACTIVE_INTENT_ACCURACYSLOT_TAGGING_F1SLOT_TAGGING_PRECISIONSLOT_TAGGING_RECALLREQUESTED_SLOTS_F1REQUESTED_SLOTS_PRECISIONREQUESTED_SLOTS_RECALLrs   rt   ru   rw   rx   ry   rz   r~   r|   r   r{   r   r}   r   rr   r.   r2   r8   rK   rQ   r_   ra   r   r,   r,   r,   r-   <module>   sF   @