o
    qoi                     @   s  d dl Z d dlZd dlmZ d dlmZmZ d dlmZ d dl	m
Z
mZmZ d dlZd dlZd dlZd dlZd dlmZ d dlmZ dZd	d
 Zdd Zdd Zdd Zdd ZdcddZdd Zdd ZeG dd dZ dd Z!dd  Z"d!d" Z#eG d#d$ d$Z$G d%d& d&Z%eG d'd( d(Z&d)d* Z'd+d, Z(	-ddd.d/Z)d0d1 Z*d2d3 Z+d4d5 Z,	ded6d7Z-	-	-	-	dfd9d:Z.	;	;	-	-	-	-	-	-	;	-	-	;	-	8	;dg	=	>	?	@	A	B	C	D	E	F	G	H	I	J	K	L	M	N	O	P	Q	R	S	T	U	V	W	X	Y	Z	[	\	]dhd^d_Z/d`da Z0e1dbkre2e/ dS dS )i    N)defaultdict)	dataclassfield)chain)ListTupleDict)logger)coloredz<unk>c                    s(    fdd}||  }|| }||fS )Nc                    sR   g }| D ]"}| vrt dd | D d }|||< |||< |||  q|S )Nc                 s   s    | ]
}t |tr|V  qd S N)
isinstanceint.0v r   I/home/ubuntu/.local/lib/python3.10/site-packages/texterrors/texterrors.py	<genexpr>   s    z2convert_to_int.<locals>.convert.<locals>.<genexpr>   )maxvaluesappend)lstdct_symsintlstwidctr   r   convert   s   zconvert_to_int.<locals>.convertr   )lst_alst_br   r   int_aint_br   r   r   convert_to_int   s   
	
r$   c                 C   s"   t | trt| |S t| |S )zA This function assumes that elements of a and b are fixed width. )r   strtexterrors_alignlev_distance_strlev_distance)abr   r   r   r(   "   s   
r(   c                 C   sB   t | }t |}tj|d |d ftjdd}t|| |d}|S )zO This function is for when a and b have strings as elements (variable length). r   CdtypeorderF)lennpzerosfloat64r&   calc_sum_cost)r)   r*   len_alen_bsummed_costcostr   r   r   seq_distance*   s
   r8   c                 C   s   t jt| d t|d ft jdd}t|| ||}|r,t jdd t jd|ddd	 t|| ||}g g }}	t	|D ]&\}
}|
d
krK|
| n|
| |
  |d
kr\|	
| q=|	
||  q=||	|fS )Nr   r+   r,   ,  	linewidth
summedcost%.3f	fmt	delimiter)r0   r1   r/   r2   r&   r3   set_printoptionssavetxtget_best_pathreversedr   )words_awords_buse_chardiffdebug
insert_tokr6   r7   best_path_reversed	aligned_a	aligned_br   jr   r   r   _align_texts3   s&   

rP   c              
   C   s  t | }t |}	|g|  } |g| }tj|d |	d ftjdd}
t|
| |||||}|r=tjdd tjd|
ddd	 g }t|
|| ||||| t |d
 dksUJ g }t	dt |d
D ]}|| }||d  }|
||f q_|d g}d}|d }|t |d k r|| }||d  }|d d |d kr|d d |d krn|
| |d7 }|}|t |d k s|}g g }}d\}}tt|D ]+\}}||kr|
| |  n|
| ||kr|
||  n|
| ||}}q|||fS )Nr   r+   r,   r9   r:   r<   r=   r>   r?      r   )rB   rB   )r/   r0   r1   r2   r&   calc_sum_cost_ctmrC   rD   get_best_path_ctmranger   listrF   )
text_a_str
text_b_strtimes_atimes_bdurs_adurs_brJ   rK   r4   r5   r6   r7   best_path_lstpathnr   rO   newpathlasttpltplnexttplrM   rN   lastilastjr   r   r   align_texts_ctmM   sZ   



(




re   <eps>Tc                 C   s   t | tr
t |tsJ dt| dkrt | d tsJ t|dkr,t |d ts,J t| ||||d\}}}|rBt| t| |||fS )NzInput types should be a list!r   )rJ   rK   )r   rU   r/   r%   rP   print)text_atext_brJ   rK   rI   rM   rN   r7   r   r   r   align_texts   s   
rj   c                 C   s`   |d | d k r| d |d  }||d d k rdS dS |d | d  }||d d k r.dS dS )Nr   rQ   g      ?r   rB   r   )refwhypw
neg_offset
pos_offsetr   r   r   get_overlap   s   ro   c                 C   s   t | t |ks
J d}d}t| D ]i\}}||v r{|t |7 }|d dkr*|d nd}d}t||d D ]1}	|	|kr`|	t | d ksI| |	 dkrJq5|	|k rW|||	 d 7 }q5|d||	  7 }q5|||	 7 }q5| }|dd}t||}
||
7 }q||fS )Nr   r    rQ   rf    )r/   	enumeraterT   stripreplacer&   r'   )ref_alignedhyp_alignedoov_setoov_count_denomoov_count_errorr   ref_wstartidxhyp_widxdr   r   r   get_oov_cer   s,   r   c                   @   s>   e Zd ZU eed< eed< dZeed< dZeed< dd ZdS )UttuidwordsNtimesdursc                 C   
   t | jS r   )r/   r   selfr   r   r   __len__      
zUtt.__len__)	__name__
__module____qualname__r%   __annotations__rU   r   r   r   r   r   r   r   r      s   
 r   c                 C   s   i }t | 9}t|D ]+\}}|r'| ^}}||vsJ dt||||< q| }t|}t||||< qW d    |S 1 sBw   Y  |S )Nz8There are repeated utterances in reference file! Exiting)openrr   splitr   r%   )ref_fisarkref_uttsfhr   lineuttr   r   r   r   read_ref_file   s   



r   c           	      C   s   |si nt t}t| J}t|D ]<\}}|r9| ^}}dd |D }|s.t||||< q|| t|| q| }t|}t|dd |D ||< qW d    |S 1 sYw   Y  |S )Nc                 S      g | ]}|t kr|qS r   OOV_SYMr   r   r   r   r   
<listcomp>       z!read_hyp_file.<locals>.<listcomp>c                 S   r   r   r   r   r   r   r   r      r   )r   rU   r   rr   r   r   r   r%   )	hyp_fr   
oracle_werhyp_uttsr   r   r   r   r   r   r   r   read_hyp_file   s"   

r   c                 C   s   t t}t| )}|D ]}| ^}}}}}t|}t|}|| |||f qW d   n1 s4w   Y  i }	| D ]+\}}
g }g }g }|
D ]}||d ||d |dgf qKt|||||	|< q?|S )zK Assumes first field is utt and last three fields are word, time, duration Nr   r   rQ   )r   rU   r   r   floatr   itemsr   )futt_to_wordtimesr   r   r   _timedurwordutts	wordtimesr   r   r   er   r   r   read_ctm_file   s$   
(r   c                   @   s.   e Zd ZU ee ed< ee ed< eed< dS )LineElementr   lengths	has_colorN)r   r   r   r   r%   r   r   boolr   r   r   r   r     s   
 r   c                   @   sH   e Zd Zdd Zdd Zdd Zdd Zed	d
 Zdd Z	dd Z
dS )	MultiLinec                 C   s   g | _ || _|| _d S r   )line_elementsterminal_width	num_lines)r   r   r   r   r   r   __init__  s   
zMultiLine.__init__c                 C   s   t |||}| j| d S r   )r   r   r   )r   r   r   r   ler   r   r   add_lineelement  s   zMultiLine.add_lineelementc                 C   r   r   )r/   r   r   r   r   r   r     r   zMultiLine.__len__c                 C   s
   | j | S r   )r   )r   itemr   r   r   __getitem__  r   zMultiLine.__getitem__c                  G   s"   g }| D ]
}| d| q|S )Nrq   )r   join)linesjoined_linesr   r   r   r   	construct  s   zMultiLine.constructc                 C   sP   g }| j D ]}|ddd |jD d ddd |jD   qd|S )N|c                 s       | ]}|V  qd S r   r   r   r   r   r   r   %      z%MultiLine.__repr__.<locals>.<genexpr>rq   ,c                 s   s    | ]}t |V  qd S r   )r%   )r   r^   r   r   r   r   %  s    r>   )r   r   r   r   r   )r   elemsr   r   r   r   __repr__"  s   
8
zMultiLine.__repr__c                 c   s    d}dd t | jD }d}|t| jk rr| j| }|j}t| }|| | jkr=| j| }dd t | jD }|V  d}||d 7 }|j}t	|D ]\}	}
||	 }t
|rZ|d n|}|
|d|  qJ|d7 }|t| jk s| j| }|V  d S )Nr   c                 S      g | ]}g qS r   r   r   r   r   r   r   r   *      z,MultiLine.iter_construct.<locals>.<listcomp>c                 S   r   r   r   r   r   r   r   r   2  r   r   	   ^)rT   r   r/   r   r   r   r   r   r   rr   
_has_colorr   )r   indexr   written_lenr   r   
padded_lenr   r   r   r   r   wordlenr   r   r   iter_construct(  s.   



zMultiLine.iter_constructN)r   r   r   r   r   r   r   staticmethodr   r   r   r   r   r   r   r     s    
r   c                   @   s,  e Zd ZU dZeed< dZeed< eedZ	e
e ed< dZeed< edd dZeeef ed	< ed
d dZeeef ed< edd dZeeef ed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< edd dZeeef ed< dS )
ErrorStatsr   
total_costtotal_count)default_factoryr   	utt_wrongc                   C      t tS r   r   r   r   r   r   r   <lambda>G      zErrorStats.<lambda>insc                   C   r   r   r   r   r   r   r   r   H  r   delsc                   C   r   r   r   r   r   r   r   r   I  r   subschar_error_count
char_country   rx   oov_word_erroroov_word_countkeywords_predictedkeywords_outputkeywords_countc                   C   r   r   r   r   r   r   r   r   S  r   word_countsN)r   r   r   r   r   r   r   r   rU   r   r   r%   r   r   r   r   r   r   r   ry   rx   r   r   r   r   r   r   r   r   r   r   r   A  s$   
 "r   c                 C   s   |st | |}t|||}nt| }t|}t }	|r5t|D ]}
t|
 dks-J d|	|
  qi }|rPt|D ]}
|
jdd\}}| }|||< q=|||	|fS )Nr   z A keyword must be a single word!)maxsplit)	r   r   r   setr   r/   r   addrs   )r   r   r   isctm
keywords_futt_group_map_fr   r   r   keywordsr   utt_group_maputtidgroupr   r   r   
read_filesV  s"   

r   c           
   	      s"  |  d t| dd ddd | D ]\}}|  | d| d q|  d |  d t|  fd	dddd | D ]\}}|  | d| d|  d q@|  d |  d
 t|  fddddd | D ]\}}|dd  }	|  | d| d|	  d qqd S )Nz
Insertions:
c                 S   s   | d S )Nr   r   xr   r   r   r   p  r   z&print_detailed_stats.<locals>.<lambda>T)keyreverser>   
z/Deletions (second number is word count total):
c                    s     s| d S | d | d   S )Nr   r   r   r   	freq_sortr   r   r   r   t  s     zSSubstitutions (reference>hypothesis, second number is reference word count total):
c                    s6    s| d S | d | d  dd    | d fS )Nr   r   >)r   rs   r   r   r   r   r   z  s   6 r   r   )writesortedr   r   rs   )
r   r   r   r   num_top_errorsr   r   r   crz   r   r   r   print_detailed_statsn  s,   
&

"

"r   Fc           /   
   C   sb  t  }|dd|i}g }|  D ]}td|  | | }t|j }|rU|| }g }|D ]}t|j|j||d\}}}|| q-| jt	|7  _| j
t|7  _
q||}|d u rgtd|  q|j| td|j  td|j  |st|j|j||d\}}}nt|j|j|j|j|j|j||\}}}| j|7  _d}d}t|d} tt||D ]t\}!\}"}#|"|v r| jd7  _|#|v r| jd7  _|"|	v r| jd7  _|"|#kr|#|v r| jd7  _|s| |"d	ft|"d
fd n| |"|"ft|"t|"fd |j|"  d7  < |d7 }q|d7 }|"|	v r.| jd7  _|"dkru|rD| d	|#fd
t|#fd n'|sZ| d	t|#dddfd
t|#fd n|# }$| d|$fdt|$fd |j|#  d7  < q|#dkr|r| |"d	ft|"d
fd n'|s| t|"dddd	ft|"d
fd n|" }%| |%dft|%dfd |d7 }|j|"  d7  < |j|"  d7  < q|d7 }|" d|# }&|r| |"|#ft|"t|#fd n4|s| t|"dddt|#dddft|"t|#fd n|" }%|# }$| |%|$ft|%t|$fd |j|&  d7  < |j|"  d7  < q| j
|7  _
|s=||  |rX|| }'||' d  |7  < ||' d  |7  < |rb| j d7  _ |
rdd }(|(|j})|(|j}*t!|)|*|\}+},| j"t#$|+|,7  _"| j%t|+7  _%|rq|	rt&|||	\}-}.| j'|-7  _'| j(|.7  _(q||fS )Nr   z%s)rI   z"Missing hypothesis for utterance: zref: %szhyp: %srQ   r   rp   rB   Frf   redT)force_color*greenr   counterrorsc                 S   sB   g }t | D ]\}}|t| |t| d kr|d q|S )Nr   rq   )rr   extendrU   r/   r   )r   newr   r   r   r   r   convert_to_char_list  s   
z+process_lines.<locals>.convert_to_char_list))r   keysr	   rJ   r/   r   rj   r   r   minr   getwarningr   re   r   r   r   rr   zipr   r   r   r   r   r   r   r
   upperr   r   r   r   r$   r   r&   r(   r   r   ry   rx   )/r   r   rJ   rI   r   skip_detailedr   r   r   rw   cerr   group_statsnocolorrK   	fullprintsuppress_warningserror_statsdct_char
multilinesr   refis_empty_referencehypscostshypr   r7   ru   rv   error_countref_word_countdouble_liner   rz   r|   hyp_w_upperref_w_upperr   r   r  char_refchar_hypref_inthyp_interrcntr   r   r   process_lines  s  

























r$  c                 C   s   t dd|   S )Ns&   \x1b\[[0-9]{2}m([\p{L}\p{P}]+)\x1b\[0ms   \1)resubencodedecoder   r   r   r   _remove_color"  s   r*  c                 C   s
   |  dS )N)
startswithr)  r   r   r   r   &  r   r   c                 C   sF  g }t |  t   t | t   t| |D ]
\}}t|d}d\}}	|t|k r|	t|k r|| }
||	 }|
jd |jd krc|g |
j|jd R g |
j|jd R d |d7 }|	d7 }	nS|
jd dkr|g |
jdR g |
jdR d |d7 }n4|jd dkr|dd|jd fdd|jd fd |	d7 }	nt |
|t|
jd t|jd  td	|t|k r|	t|k s/|t|k r|| }
|g |
jdR g |
jdR d |d7 }|t|k s|	t|k r||	 }||jd d|jd f|jd d|jd fd |	d7 }	|	t|k st | |	| q|S )
N   )r   r   r   rB   Fr   rp   TzShould not be possible AA)
rg   r	  r   r/   r   r   r   r*  RuntimeErrorr   )multilines_amultilines_br   usecolorr  multiline_amultiline_b	multilineidx_aidx_ble_ale_br   r   r   _merge_multilines*  sh   



"r9  c                 C   s  |
d u rt  \}
}|
dkrdn|
}
t| |d|dd|
dg g dd d dddd\}}t| |d|dd|
dg g dd d dddd\}}t||d|dd|
dg g dd d ddd\}}t|||
d}|d| d| d|	 d	 t|j|D ] \}}|| d
 | D ]}|D ]
}|| d
 qqqot|j	
 }t|j
 }t|j
 }|| | t|j }|d| dd| dd| d| d| d|j dd|j t|j dd
 t||j	|j|j|||j |d t|j	
 }t|j
 }t|j
 }|| | t|j }|d|	 dd| dd| d| d| d|j dd|j t|j dd
 t||j	|j|j|||j |d |d t||j	|j|j|||j d S )Nx   Frf   T)r  rK   r  )r  rK   zPer utt details, order is "z", "z":
r   z
Results with file z
WER:       Y@.1f (ins , del , sub  / )
SER: z---
z
Difference between outputs:
)shutilget_terminal_sizer$  r9  r   r	  r   r   sumr   r   r   r   r   r   r   r/   r   r   )r   	hypa_utts	hypb_uttsr   r   rI   r   ref_filefile_afile_br   r   multilines_ref_hypaerror_stats_ref_hypamultilines_ref_hypberror_stats_ref_hypberror_stats_hypa_hypbmerged_multiliner   r4  r   r   	ins_count	del_count	sub_countwerr   r   r   process_multiple_outputs\  s   









rT  
   c           #      C   s  t  \}}|dkrdn|}|d u rt }|d u rt }|d u r"i }i }t| }|D ]}i ||< d|| d< d|| d< q,t| |||	|
||||||||||\}}|s|s|rf|d| d| d n|d| d| d	 |d
 t|j|D ]%\}}|| d | D ]\}}|| d || d qq}|	s|st	dd t
|j |j |j D }||jksJ | d|j |r|d|j|j  d d S t	|j }t	|j }t	|j } || |  t|j }!|s|d |dd|! dd| d| d|  d|j dd|j t|j dd |rI|jt|j }|dd| dd|j d|j d |rv|jrq|dd|j |j dd |dd|j |j dd ntd |r|d|jr|j|j ndd d!|jr|j|j ndd d |r|d" | D ]\}}"d|"d t|"d   }!|| d#|!dd q|d |st||j|j|j|||j  d S d S )$Nr:  r   r   r  "z" is treated as reference, "z)" as hypothesis. Errors are capitalized.
z." is treated as reference (white and green), "z!" as hypothesis (white and red).
zPer utt details:
r   c                 s   r   r   r   r   r   r   r   r     r   z!process_output.<locals>.<genexpr>rq   zOracle WER: zWER: r;  r<  r=  r>  r?  r@  rA  zCER: z (z)
z	OOV CER: z	OOV WER: zCNone of the words in the OOV list file were found in the reference!zKeyword results - recall rB   z.2fz - precision zGroup WERs:
r>   )!rB  rC  r   r   r$  r   r	  r   r   rD  r   r   r   r   r   r   r   r   r/   r   r   r   ry   rx   r   r	   errorr   r   r   r   r   r   )#r   r   r   rG  hyp_filer  r   rw   rJ   rI   r   r  r   r   r   r   r  rK   r   r   r  groupsr   r  r  r   r4  
upper_line
lower_linesrP  rQ  rR  rS  statsr   r   r   process_output  s   

,
,
( "
"


r^  rp   rG  Reference textrX  Hypothesis textoutfOptional output file
oov_list_fzList of OOVsoptionNr   z#Text files start with utterance ID.flagr   zDText files start with utterance ID and end with word, time, durationrg  rI   zTUse character lev distance for better alignment in exchange for slightly higher WER.rg  r  zCalculate CERrg  rJ   z;Print debug messages, will write cost matrix to summedcost.rg  r~   r  zNo per utterance outputrg  r\  r   z,Will filter out non keyword reference words.re  Nr   zBTurn on sorting del/sub errors by frequency (default is by count).rg  Nr   zbHyp file should have multiple hypothesis per utterance, lowest edit distance will be used for WER.rg  Nr   zJShould be a file which maps uttids to group, WER will be output per group.re  rp   r1  z`Show detailed output with color (use less -R). Red/white is reference, Green/white model output.rg  r   r   z5Number of errors to show per type in detailed output.re  second_hyp_fz2Will compare outputs between two hypothesis files.re  c                 C   s\  t   |rt jtjdd nt jtjdd |rt|d}ntj}|s|r5|r*|r,J d}	|r5t d t }|rc|sAt d t|}|D ]}||	 d  qHW d    n1 s^w   Y  t
| ||||
||\}}}}t|||f|||| |||	|||||| |d	 nt| |}t||d
}t||d
}t|||||||| ||
 |  d S )NDEBUG)levelINFOr   Tz|You probably would prefer running without `-use_chardiff`, the WER will be slightly better for the cost of a worse alignmentzBecause you are using standard alignment (not `-use_chardiff`) the alignments could be suboptimal
 which will lead to the OOV-CER being slightly wrong. Use `-use_chardiff` for better alignment, ctm based for the best.r   )r  rJ   rw   rG  rX  rI   r  r   r   r   r   r   r  r   F)r	   remover   sysstderrr   stdoutr  r   r   r   r^  r   r   rT  close)rG  rX  ra  rc  r   r   rI   r  rJ   r  r   r   r   r   r1  r   rs  r   rw   fh_oovr   r   r   r   r   	hyp_uttsa	hyp_uttsbr   r   r   main  sJ   






r  c                   C   s   t t d S r   )placcallr  r   r   r   r   cli%  s   r  __main__)rf   T)FFr   )FrU  NFTFFNNFFFrf   )rp   rp   FFFFFFrp   FFrp   FrU  rp   )"rG  r_  rX  r`  ra  rb  rc  rd  r   rf  r   rh  rI   ri  r  rj  rJ   rk  r  rl  r   rm  r   rn  r   ro  r   rp  r1  rq  r   rr  rs  rt  )3rB  ry  collectionsr   dataclassesr   r   	itertoolsr   typingr   r   r   numpyr0   r  regexr%  r&   logurur	   	termcolorr
   r   r$   r(   r8   rP   re   rj   ro   r   r   r   r   r   r   r   r   r   r   r$  r*  r   r9  rT  r^  r  r  r   r  r   r   r   r   <module>   s   	
;
6
 #3
:
R	

@
