o
    wiN                     @   s   d dl Z d dlmZ d dlmZ d dlmZ zd dlZd dlmZ dZ	W n e
efy1   dZ	Y nw 	dd	ee d
ee dee deddf
ddZG dd dZG dd dZdS )    N)
namedtuple)tqdm)logging)tabulateTF[PUNCT]
references
hypothesespunctuation_markspunctuation_maskreturnc                 C   s   t | |||d}|  |jS )a  
    Computes Punctuation Error Rate
    
    Args:
        references (list[str]) - list of references
        hypotheses (list[str]) - list of hypotheses
        punctuation_marks (list[str]) - list of punctuation marks for computing metrics
        punctuation_mask (str, by default "[PUNCT]") - mask token that will be applied to
        given punctuation marks while edit distance calculation
        
    Return:
        punct_er (float) - Punctuation Error Rate
    )r   r   r	   r
   )DatasetPunctuationErrorRatecomputepunct_er)r   r   r	   r
   dper_obj r   e/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/nemo/collections/common/metrics/punct_er.pypunctuation_error_rate   s   r   c                   @   sb   e Zd ZdZddee deddfddZd	ed
efddZdedefddZ	dedefddZ
dS )OccurancePunctuationErrorRatea5  
    Class for computation puncutation-related absolute amounts of operations and thier rates
    between reference and hypothesis strings:
        - Absolute amounts of correct predictions, deletions, insertions
        and substitutions for each given punctuation mark
        - Rates of correct predictions, deletions, insertions
        and substitutions for each given punctuation mark
        - Overall rates of correct predictions, deletions, insertions
        and substiturions between reference and hypothesis string
        - Punctuation Error Rate

    Args to init:
        punctuation_marks (list[str]) - list of punctuation marks for computing metrics
        punctuation_mask (str, by default "[PUNCT]") - mask token that will be applied to
        given punctuation marks while edit distance calculation 
    
    How to use:
        1. Create object of OccurancePunctuationErrorRate class.
           Example:
                punctuation_marks = [".", ",", "!", "?"]
                oper_obj = OccurancePunctuationErrorRate(punctuation_marks)
        
        2. To compute punctuation metrics, pass reference and hypothesis string to the "compute" method
        of created object.
            Example:
                reference_str = "Hi, dear! Nice to see you. What's"
                hypothesis_str = "Hi dear! Nice to see you! What's?"
                oper_obj.compute(reference_str, hypothesis_str)

    Output (listed in order of output):
        1. Dict of absolute operations amounts for each given punctuation mark:
            Example:
            {'.': {'Correct': 0, 'Deletions': 0, 'Insertions': 0, 'Substitutions': 1},
             ',': {'Correct': 0, 'Deletions': 1, 'Insertions': 0, 'Substitutions': 0},
             '!': {'Correct': 1, 'Deletions': 0, 'Insertions': 0, 'Substitutions': 0},
             '?': {'Correct': 0, 'Deletions': 0, 'Insertions': 1, 'Substitutions': 0}}
              
        2. Dict of substitutions absolute amounts between given punctuation marks:
            Example:
            {'.': {'.': 0, ',': 0, '!': 1, '?': 0},
             ',': {'.': 0, ',': 0, '!': 0, '?': 0},
             '!': {'.': 0, ',': 0, '!': 0, '?': 0},
             '?': {'.': 0, ',': 0, '!': 0, '?': 0}}
            
        3. namedtuple "PunctuationRates" of punctuation operation rates (in range from 0 to 1):
            3.1. correct_rate - overall correct rate 
                Example: correct_rate=0.25
            3.2. deletions_rate - overall deletions rate
                Example: deletions_rate=0.25
            3.3. insertions_rate - overall insertions rate
                Example: insertions_rate=0.25
            3.4. substitutions_rate - overall substitutions_rate
                Example: substitutions_rate=0.25
            3.5. punct_er - Punctuation Error Rate
                Example: punct_er=0.75
            3.6. operation_rates - dict of operations rates for each given punctuation mark
                Example: 
                operation_rates={
                    '.': {'Correct': 0.0, 'Deletions': 0.0, 'Insertions': 0.0, 'Substitutions': 1.0},
                    ',': {'Correct': 0.0, 'Deletions': 1.0, 'Insertions': 0.0, 'Substitutions': 0.0},
                    '!': {'Correct': 1.0, 'Deletions': 0.0, 'Insertions': 0.0, 'Substitutions': 0.0},
                    '?': {'Correct': 0.0, 'Deletions': 0.0, 'Insertions': 1.0, 'Substitutions': 0.0}
                    }
  
            3.7. substitution_rates - dict of substitution rates for each given punctuation mark
                Example:
                substitution_rates={
                    '.': {'.': 0.0, ',': 0.0, '!': 1.0, '?': 0.0},
                    ',': {'.': 0.0, ',': 0.0, '!': 0.0, '?': 0.0},
                    '!': {'.': 0.0, ',': 0.0, '!': 0.0, '?': 0.0},
                    '?': {'.': 0.0, ',': 0.0, '!': 0.0, '?': 0.0}
                    }
    r   r	   r
   r   Nc                 C   s.   t |dks
J d|| _|| _g d| _d S )Nr   z"List of punctuation marks is empty)Correct	Deletions
InsertionsSubstitutions)lenr	   r
   
operations)selfr	   r
   r   r   r   __init__   s   z&OccurancePunctuationErrorRate.__init__operation_amountssubstitution_amountsc                    s@  fddj D }fddj D }j D ].t  dkr&qfddjD |< fdd  D |< qfddjD   fdd D }t| }td	g d
}|dkrv|ddddd||}|S |d | }	|d | }
|d | }|d | }|
| | }||	|
|||||}|S )Nc                       i | ]}|d d  j D qS )c                 S      i | ]}|d qS r   r   .0	operationr   r   r   
<dictcomp>       JOccurancePunctuationErrorRate.compute_rates.<locals>.<dictcomp>.<dictcomp>r   r"   pmr   r   r   r$          z?OccurancePunctuationErrorRate.compute_rates.<locals>.<dictcomp>c                    r   )c                 S   r   r    r   r(   r   r   r   r$      r%   r&   r	   r(   r*   r   r   r$      r+   r   c                    s   i | ]}|  |  qS r   r   r!   )r   operations_amount_by_pmr)   r   r   r$          c                    s   i | ]}| |   qS r   r   r"   _pm)r-   r)   r   r   r   r$      r.   c                    s$   i | ]   fd d  D qS )c                    s   i | ]	\}}||  qS r   r   )r"   r)   r   r#   r   r   r$      s    r&   )items)r"   )r   r1   r   r$      s    c                    s   i | ]}|t  |  qS r   )sumvaluesr!   )_operation_amountsr   r   r$      s    PunctuationRates)correct_ratedeletions_rateinsertions_ratesubstitutions_rater   operation_ratessubstitution_ratesr   r   r   r   )r	   r3   r4   r   keysr   )r   r   r   r;   r<   overall_amounts_by_operationoverall_operations_amountpunctuation_ratesratesr7   r8   r9   r:   r   r   )r5   r   r-   r)   r   r   r   compute_rates   sR   





z+OccurancePunctuationErrorRate.compute_rates	reference
hypothesisc           "         sz  fddj D }fddj D dtdtt ffdd}dtt dtt d	tfd
d}||j }||j }||j j}||j j}	|j}
|	j}|
| dkrb|fS t|}t|	  fddt|d D } fddt|d D }d}d\}}d\}}d\}}td|d D ]}|| || d< ||| d< qtd d D ]}|| |d |< ||d |< qtd d D ]}|| |d |< ||d |< qtd|d D ]~}td d D ]t}||d  |	|d  kr||d  |d  || |< ||| |< q||d  |d  | }|| |d  | }||d  | | }t||||| |< || | |krH||| |< q|| | |krX||| |< q||| |< qq|} }|dksn|dkr|| | |kr||d  jks|	|d  jkr||d  }||d  }||kr|| d  d7  < n|| d  d7  < | |  d7  < |d8 }|d8 }n-|| | |kr|d8 }|d8 }n|| | |kr|d8 }n|| | |kr|d8 }|dksn|dksnj D ]:| d }| d }tfddj D }|||  } | | d< |||  }!|!| d< q|fS )Nc                    r   )c                 S   r   r    r   r!   r   r   r   r$      r%   VOccurancePunctuationErrorRate.compute_operation_amounts.<locals>.<dictcomp>.<dictcomp>r'   r(   r*   r   r   r$      r+   zKOccurancePunctuationErrorRate.compute_operation_amounts.<locals>.<dictcomp>c                    r   )c                 S   r   r    r   r(   r   r   r   r$      r%   rE   r,   r(   r*   r   r   r$      r+   textr	   c                    s(   dd  j }td| d| }|S )N\z[\w']+|[])joinr	   refindall)rF   r	   tokensr*   r   r   tokenize   s   zIOccurancePunctuationErrorRate.compute_operation_amounts.<locals>.tokenizerL   r
   c                    s    fdd| D }|S )Nc                    s   g | ]
}| v r
n|qS r   r   )r"   tokenr	   r
   r   r   
<listcomp>   s    zfOccurancePunctuationErrorRate.compute_operation_amounts.<locals>.mask_punct_tokens.<locals>.<listcomp>r   )rL   r	   r
   maskedr   rO   r   mask_punct_tokens   s   zROccurancePunctuationErrorRate.compute_operation_amounts.<locals>.mask_punct_tokensr   c                    "   g | ]}d d t  d D qS )c                 S      g | ]}d qS r    r   r"   innerr   r   r   rP          VOccurancePunctuationErrorRate.compute_operation_amounts.<locals>.<listcomp>.<listcomp>   ranger"   outerh_lenr   r   rP         " zKOccurancePunctuationErrorRate.compute_operation_amounts.<locals>.<listcomp>rY   c                    rS   )c                 S   rT   r    r   rU   r   r   r   rP      rW   rX   rY   rZ   r\   r^   r   r   rP      r`   C)DrY   )IrY   )SrY   r   r   c                    s   g | ]}|   qS r   r   r/   )r)   r   r   r   rP   1  s    r   r   )	r	   strlistr
   countr   r[   minr3   )"r   rC   rD   r   rM   rR   r_tokensh_tokensr_maskedh_maskedr_punct_amounth_punct_amountr_lencosts	backtraceCORDELDEL_PENALTYINSINS_PENALTYSUBSUB_PENALTYijsubstitution_costinsertion_costdeletion_costr_tokenh_tokennum_of_correctnum_substitutions_of_pmnum_substitutions_to_pmnum_of_deletionsnum_of_insertionsr   )r_   r)   r   r   r   compute_operation_amounts   s   (




z7OccurancePunctuationErrorRate.compute_operation_amountsc                 C   s&   |  ||\}}| ||}|||fS N)r   rB   )r   rC   rD   r   r   r@   r   r   r   r   ;  s   
z%OccurancePunctuationErrorRate.computer   )__name__
__module____qualname____doc__rf   re   r   dictrB   r   r   r   r   r   r   r   <   s    J	@kr   c                   @   sV   e Zd ZdZ	ddee dee dee deddf
d	d
Zdd Zdd Zdd Z	dS )r   a
  
    Class for computation the total puncutation-related absolute amounts of operations and their rates 
    in pairs of reference and hypothesis strins:
        - Absolute amounts of correct predictions, deletions, insertions
        and substitutions for each given punctuation mark
        - Rates of correct predictions, deletions, insertions
        and substitutions for each given punctuation mark 
        - Total rates of correct predictions, deletions, insertions
        and substiturions in pairs of reference and hypothesis strings 
        - Punctuation Error Rate
        
    Args to init:
        references (list[str]) - list of references
        hypotheses (list[str]) - list of hypotheses
        punctuation_marks (list[str]) - list of punctuation marks for computing metrics
        punctuation_mask (str, by default "[PUNCT]") - mask token that will be applied to
        given punctuation marks while edit distance calculation
        
    How to use:
        1. Create object of DatasetPunctuationErrorRate class.
           Example:
                references = ["Hi, dear! Nice to see you. What's"]
                hypotheses = ["Hi dear! Nice to see you! What's?"]                
                punctuation_marks = [".", ",", "!", "?"]
                
                dper_obj = DatasetPunctuationErrorRate(references, hypotheses, punctuation_marks)
                
        2. To compute punctuation metrics, call the class method "compute()".
            Example:
                dper_obj.compute() 
                
    Result:
    The following atributes of class object will be updated with calculated metrics values.
    The values are available with calling the atributes:
        
        dper_obj.operation_rates - dict, rates of correctness and errors for each punctuation mark 
        from `preset dper_obj.punctuation_marks` list.
        
        dper_obj.substitution_rates - dict, substitution rates between puncutation marks from
        `preset dper_obj.punctuation_marks` list.
        
        dper_obj.correct_rate - float, total rate of correctness between provided pairs of 
        references and hypotheses.
        
        dper_obj.deletions_rate - float, total rate of deletions between provided pairs of 
        references and hypotheses.
        
        dper_obj.insertions_rate - float, total rate of insertions between provided pairs of 
        references and hypotheses.
        
        dper_obj.substitutions_rate - float, total rate of substitutions between provided pairs of 
        references and hypotheses.
        
        dper_obj.punct_er - float, total Punctuation Error Rate between provided pairs of 
        references and hypotheses.
    r   r   r   r	   r
   r   Nc                 C   sj   || _ || _|| _|| _t| j| jd| _g | _g | _g | _d | _	d | _
d | _d | _d | _d | _d | _d S )NrO   )r   r   r	   r
   r   oper_objr   r   rA   r;   r<   r7   r8   r9   r:   r   )r   r   r   r	   r
   r   r   r   r   {  s"   
z$DatasetPunctuationErrorRate.__init__c           
      C   s   dt t fdd}td tt| j| jt| jdD ] \}}| j	
||\}}}| j| | j| | j| q|| j}|| j}| j	j||d}	|	j| _|	j| _|	j| _|	j| _|	j| _|	j| _|	j| _d S )Namounts_dictsc                    s`    fdd d   D } D ]}| D ]\}}| D ]\}}|| |  |7  < qqq|S )Nc                    s$   i | ]}|d d  d | D qS )c                 S   r   r    r   )r"   _keyr   r   r   r$     r%   zWDatasetPunctuationErrorRate.compute.<locals>.sum_amounts.<locals>.<dictcomp>.<dictcomp>r   r   )r"   keyr   r   r   r$     s   $ zLDatasetPunctuationErrorRate.compute.<locals>.sum_amounts.<locals>.<dictcomp>r   )r=   r2   )r   amountsamounts_dict	outer_key
inner_dict	inner_keyvaluer   r   r   sum_amounts  s   z8DatasetPunctuationErrorRate.compute.<locals>.sum_amountsz Computing Punctuation Error Rate)total)r   r   )rf   r   r   infor   zipr   r   r   r   r   r   appendr   rA   rB   r;   r<   r7   r8   r9   r:   r   )
r   r   rC   rD   r   r   r@   overall_operation_amountsoverall_substitution_amountsoverall_ratesr   r   r   r     s&   
	$

z#DatasetPunctuationErrorRate.computec                 C   s@   g | _ g | _g | _d | _d | _d | _d | _d | _d | _d | _	d S r   )
r   r   rA   r;   r<   r7   r8   r9   r:   r   r*   r   r   r   reset  s   
z!DatasetPunctuationErrorRate.resetc                 C   s   t dttd| j d d  tr=t| jd }t| j	d }t dt
|ddd  t d	t
|ddd  d S t d
 t d| j d t d| j	 d d S )NzDataset PER d      %z1Rates of punctuation correctness and errors (%):
r=   psql)headerstablefmtz2Substitution rates between punctuation marks (%):
z:Some of the modules (pandas or tabulate) can't be importedz?Rates of punctuation correctness and errors (in range [0, 1]):

z@Substitution rates between punctuation marks (in range [0, 1]):
)r   r   re   roundr   HAVE_TABLUATE_AND_PANDASpd	DataFramer;   r<   r   warning)r   rates_by_pm_dfsubstitution_rates_by_pm_dfr   r   r   print  s&   "
z!DatasetPunctuationErrorRate.printr   )
r   r   r   r   rf   re   r   r   r   r   r   r   r   r   r   A  s"    >
 r   r   )rJ   collectionsr   r   
nemo.utilsr   pandasr   r   r   ImportErrorModuleNotFoundErrorrf   re   r   r   r   r   r   r   r   <module>   s6   
  