o
    ‚o™iÕh  ã                
   @   s¸  d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZmZm	Z	m
Z
 d dlZd dlZd dlmZ d dlmZ d dlmZ d dlmZ dZedd	jZe e d
d¡¡e e e d¡e ¡ Zeƒ Zdee fdd„Zdd„ Zdee deee  fdd„Zdee deee  fdd„Z dGdee deee  dee fdd„Z!dee deee  fdd„Z"dee deee  fdd„Z#defd d!„Z$defd"d#„Z%dHd%ed&edefd'd(„Z&		$dIdee deee  dee d)ee' fd*d+„Z(d,efd-d.„Z)dJd0efd1d2„Z*d%edee de'fd3d4„Z+dKdee d5ee	eef  deee  fd6d7„Z,d8d9„ Z-d:d;„ Z.d%ede
ee ef d<ede'fd=d>„Z/d?d@„ Z0dAedBefdCdD„Z1d%ed&efdEdF„Z2dS )Lé    N)ÚListÚOptionalÚTupleÚUnion)Útop_rewrite)Útqdm)ÚCardinalFst)ÚInverseNormalizerz~~Úcased)Ú
input_casezIn zin ú Úinput_fsc                 C   s&  g }g }g }g }g }| D ]Û}|  d¡ret|dƒD}|D ]9}t |¡}z| |d  ¡ ¡ | |d  ¡ g¡ | dg¡ W q tyT }	 zt|	ƒ td|› ƒ‚d}	~	ww W d  ƒ n1 s_w   Y  qt|dƒu}|D ]j}|dkrÉz| ¡  	t
¡\}
}W n" ty  }	 z| d	¡r‘W Y d}	~	qmt|	ƒ td|› ƒ‚d}	~	ww |d
kr«| |
¡ qm|dkrº| |
¡ | d¡ qm|dkrÈ| |
¡ | d¡ qm| |¡ g }| |¡ g }qmW d  ƒ n1 sâw   Y  qt|ƒdkrø| |¡ | |¡ t|ƒt|ƒksJ ‚dd„ t||ƒD ƒ}||||fS )a>  
    loads data from list of abs file paths
    Returns:
        inputs: List[str] list of abs file paths
        targets: List[List[str]] list of targets, can contain multiple options for each target
        sentences: List[List[str]] list of sentence options
        labels: List[List[int]] list of labels (1,0)
    z.jsonÚrÚtextÚgt_normalizedé   zCheck format for line NÚ
ú#ÚRAWÚ1Ú0r   c                    s&   g | ]\}‰ ‡ fd d„t |ƒD ƒ‘qS )c                    s   g | ]
\}}ˆ | r|‘qS © r   )Ú.0ÚiÚx©Úlsr   úU/home/ubuntu/.local/lib/python3.10/site-packages/nemo_text_processing/hybrid/utils.pyÚ
<listcomp>a   ó    z(load_data.<locals>.<listcomp>.<listcomp>)Ú	enumerate)r   Úsentsr   r   r   r   a   s   & zload_data.<locals>.<listcomp>)ÚendswithÚopenÚjsonÚloadsÚappendÚstripÚ	ExceptionÚprintÚ
ValueErrorÚsplitÚ	DELIMITERÚ
startswithÚlenÚzip)r   ÚinputsÚ	sentencesÚcur_sentencesÚlabelsÚ
cur_labelsÚinput_fÚfÚlineÚeÚsentÚlabelÚtargetsr   r   r   Ú	load_data*   sp   	

€þúÿ€
€ü


€

ëÿ€

r<   c                 C   s&   t  dd| ¡} |  dd¡ dd¡} | S )Nz\|raw_start\|[^|]+\|raw_end\|Ú z|norm_start|z
|norm_end|©ÚreÚsubÚreplace)r   r   r   r   Úremove_whitelist_boudariese   s   rB   r0   r;   c           
      C   s²  t t|ƒƒD ]}t t|| ƒƒD ]}t|| | ƒ|| |< qqt t| ƒƒD ]®}|| D ]§}t| |  ¡ | ¡ d}|ddd… D ]‘}| | |d d |d d …  ¡ }||d d |d d …  ¡ }| | d|d d … | | | |d d d…  }	|dkr‡|dks|dkr”|dkr”|	| |< qA|dkrœ|d	ks¤|d	kr©|dkr©|	| |< qA|d
kr±|dks¹|dkr¾|d
kr¾|	| |< qA|dkrÆ|dksÎ|dkrÒ|dkrÒ|	| |< qAq,q&| |fS )z„
    standardizes format of inputs and targets before being normalized, so more rules apply.
    This is specific for libritts.
    ©ÚaÚbNéÿÿÿÿr   r   ÚsÚzr?   ÚerÚmer=   Úue)Úranger.   Úclean_libri_ttsÚget_diffÚlower)
r0   r;   r   ÚjÚtargetÚdiffsÚdiffÚin_diffÚtg_diffÚreplacementr   r   r   Ú_clean_pre_norm_librittsm   s.   ÿ$ 4 
 
 
 €õþrW   c           	   	   C   s   t t| ƒƒD ]D}t dd| | ¡| |< t dd| | ¡| |< t dd| | ¡| |< t dd| | ¡| |< t d	d
| | ¡| |< t dd| | ¡| |< t dd| | ¡| |< t dd| | ¡| |< t dd| | ¡| |< || D ]Ú}t| |  ¡ | ¡ d}|ddd… D ]Ä}| | |d d |d d …  ¡ }||d d |d d …  ¡ }| | d|d d … | | | |d d d…  }|dkrË|dksÓ|dkrØ|dkrØ|| |< q…|dkrà|dksè|dkrí|dkrí|| |< q…|dkrõ|dksÿ|dkr|dkr|| |< q…|dkr|dks|dkr|dkr|| |< q…|dkr'|dks1|dkr6|dkr6|| |< q…t d d|¡t d!d|¡krI|| |< q…qpq| |fS )"zŠ
    standardizes format of inputs and targets before being normalized, so more rules apply.
    This is specific for google dataset.
    z\$\s([0-9]{1,})z$\1z\bmr zMr. z\bdr zDr. z\bdr$zDr.z\bmrs zMrs. z\bjr zJr. z\bjr$zJr.z\dsr zSr. z\dsr$zSr.rC   NrF   r   r   rG   rH   r?   rI   rJ   r=   ÚurK   z\.z( |\.))rL   r.   r?   r@   rN   rO   )	r0   r;   r   rQ   rR   rS   rT   rU   rV   r   r   r   Ú_clean_pre_norm_googleˆ   s@   $ 4 
 
$
(
(
€ñþrY   Údatasetc                 C   s   t  | ¡}t  |¡}|dkrt||d\}}n|dkr$t||d\}}n	 tt|ƒƒD ]}t dd|| ¡||< t dd|| ¡||< q+||fS )za
    standardizes format of inputs and targets before being normalized, so more rules apply.
    Úlibritts)r0   r;   Úgooglezlibrivox.orgzlibrivox dot orgz:([0-9]?[0-9](\.|:)[0-9][0-9]\s?)(a|A|p|P)(\.?)\s(M|m)(\.?)z
\1\3\4\5\6)ÚcopyÚdeepcopyrW   rY   rL   r.   r?   r@   )r0   r;   rZ   Ú
pre_inputsÚpre_targetsr   r   r   r   Úclean_pre_norm®   s   



ÿra   c                 C   s   ||fS ©Nr   ©r0   r;   Ú
norm_textsr   r   r   Ú_clean_post_norm_librittsÈ   s   re   c              
   C   sø   t t|ƒƒD ]q}|| D ]j}t|| d ƒD ]_\}}t| ¡ | ¡ d}|ddd… D ]I}||d d |d d …  ¡ }	||d d |d d …  ¡ }
|d|d d … |
 ||d d d…  }|	t dd|
¡krt||| d |< q+qqq||fS )z‘
    standardizes format of inputs and targets, and predicted normalizations for easier evaluation.
    This is specific for google dataset.
    r   rC   NrF   r   r   r=   )rL   r.   r    rN   rO   r?   r@   )r0   r;   rd   r   rQ   rP   ÚnormrR   rS   Ú	norm_diffrU   rV   r   r   r   Ú_clean_post_norm_googleÌ   s     ,€ûþÿ
rh   Úreturnc                 C   sr   t  dd| ¡} t  dd| ¡} t  dd| ¡} t  dd| ¡} t  d	d| ¡} t  d
d| ¡} t  dd| ¡} |  dd¡} | S )zh
    standardizes format of inputs and targets, and predicted normalizations for easier evaluation.
    ú oh ú zero z oh$z zeroz^oh zzero z\sO\bÚzeroz o$z^o z'o z'zero ÚmountainÚmountr>   ©Ústrr   r   r   Ú_clean_post_generalß   s   rq   c                 C   s   t  dd| ¡} | S )zClean ground truth options.ú o rk   )r?   r@   ro   r   r   r   Ú_clean_targetsï   s   rs   TÚpredÚgtc                 C   sº  | }|}|rt  d| ¡s| S t  dd| ¡} t  dd| ¡} |  ¡  ¡ } | ¡  ¡ }d}|dv rÛ| |krÛt| |tdr<|} n/t| |ƒrkt  dd| ¡} t  dd|¡}t  d	d
| ¡} t  d	d
|¡}t  dd|¡}t  dd| ¡} | |krÛtj	|dd}tj	| dd}t
|ƒt
|ƒkr’t|ƒt|ƒkr’d}|} nId|v rÛt  dd|¡}| d¡}	d||	d d… v r·||	d d…  d¡n	t
||	d d… ƒ|	 d }
||	d |
… d |d|	…  ||
d…  }|dkr8| |kr8d|v rt  dd|¡}t  dd| ¡} t  d| ¡rt  d|¡st  dd| ¡} n*t  dd|¡}t  dd| ¡} t  dd|¡}t  dd|¡}t  dd|¡}t  d d!|¡}|tkr­|dv r­| d"d¡ d#d¡|  d$d¡ d#d¡krZ|} nS|d%v rg| d%v rg|} nF|d&krt| d'krtd&} n9|dd(… | dd(… kr–|d(d… d)v r–| d(d… d)v r–|} n| d*d¡ d+d¡|  d*d¡ d+d¡kr­|} t  dd|¡}t  d,d| ¡} t  dd| ¡} || krÉd}|rÛ|r×d-|› d.}|S |}|S |S )/z7Standardize prediction format to make evaluation easierz	< (.*?) >ú< r=   ú >F)r\   r[   )rt   ru   Úcardinal_graphú,rk   rr   ú +r   )ÚverboseTz of z(^the | of)r   Nr[   Údollarz\band\bz\bus dollarz(\bthe\b|\.)z\bone\brD   z\bmr\bÚmisterz\bmrs\bÚmissesz\bdr\bÚdoctorz\bco\bÚcompanyú/ú  Úslash)rG   rH   zhash tagÚhashéþÿÿÿ)rI   r?   Útoú-z(\.)z < z > )r?   Úsearchr@   rO   r'   Úis_daterx   Úcontains_monthÚinverse_normalizerÚ	normalizer.   ÚsetÚindexÚpdrA   )rt   ru   rZ   Údelim_presentÚ	orig_predÚorig_gtÚcan_be_adjustedÚgt_itnÚpred_itnÚidxÚidx2Úresr   r   r   Úadjust_predõ   s„   
 
F,
€*>*
ÿr™   r   c                 C   s†  t  |¡}t  |¡}|dkrt| ||d\}}n|dkr&t| ||d\}}n	 tt|ƒƒD ]D}t|| ƒD ]\}}	t|	ƒ|| |< q5t|| d ƒD ]&\}}	|	 d¡|	 d¡krf|	 	dd¡ 	dd¡ 	d	d
¡}	t|	ƒ|| d |< qJq-|dv r?t|ƒD ]Ã\}}
t|| d ƒD ]¶\}}t|
ƒD ]­\}}|s£t
|ddd}t
|ddd}t||d}|ddd… D ]‹}|d d |d d  dkrÏ|d d |d d  dkrÏq°||d d |d d … }||d d |d d … }t d|› d|› d¡ t||||d}|| d | d|d d … | || d | |d d d…  }t d|| d | › d|› d¡ ||| d |< q°qq‡q{||fS )a©  
    Args:
        inputs (List[str]): inputs
        targets (List[List[str]]): targets
        norm_texts (List[(List[str], List[float])]): List of normalization options, weights
        dataset (Optional[str], optional): _description_. Defaults to None.
        delim_present (Optional[str], optional): The flag indicates whether normalization output contain delimiters "<>".
            Set to False for NN baseline.
    r[   rc   r\   r   rv   rw   ú<ú>r‚   r   )r[   r\   FT©Úremove_spacesÚdo_lower)rt   ru   NrF   r   zpred: |z|	gt: |ú|)rt   ru   rZ   r   z| -> |)r]   r^   re   rh   rL   r.   r    rq   ÚcountrA   Úremove_punctuationÚdiff_pred_gtÚloggingÚdebugr™   )r0   r;   rd   rZ   r   Úpost_norm_textsÚpost_targetsr   rP   r   Ú_targetsÚjjÚoptionÚ_Ú_targetrR   rS   rt   ru   Únew_predr   r   r   Úclean_post_normA  s\   

ÿÿý
8ÿþÿ$óøÿr­   rQ   c                 C   s€  dddœ}ddi}i dd“dd	“d
d“dd“dd“dd“dd“dd“dd“dd“dd“dd“dd“dd “d!d"“d#d$“d%d&“i d'd(“d)d*“d+d,“d-d.“d/d0“d1d2“d3d4“d5d6“d7d8“d9d:“d;d<“d=d>“d?d@“dAd“dBdC“dDdE“dFdG“¥dHdHdIdJdKdLdMdNddOdPdQdRdSdTœ¥}|  ¡ D ]}|| v rŸt dU|› dVdW|| › dX| ¡} qˆ| ¡ D ]
\}}|  ||¡} q¤| ¡ D ]
\}}|  ||¡} q³| S )Yz-
	Replace abbreviations in LibriTTS dataset
	ÚsaintÚreverend)zSt.zRev.zvs.ÚversusÚmrr}   ÚMrÚMisterÚmrsr~   ÚMrsÚMissesÚdrr   ÚDrÚDoctorÚdrsÚdoctorsÚDrsÚDoctorsÚltÚ
lieutenantÚLtÚ
LieutenantÚsgtÚsergeantÚSgtÚSergeantÚstÚStÚSaintÚjrÚjuniorÚJrÚJuniorÚmajÚmajorÚMajÚMajorÚhonÚ	honorableÚHonÚ	HonorableÚgovÚgovernorÚGovÚGovernorÚcaptÚcaptainÚCaptÚCaptainÚesqÚesquireÚEsqÚEsquireÚgenÚgeneralÚGenÚGeneralÚltdÚlimitedÚLtdÚLimitedÚrevÚRevÚReverendÚcolÚcolonelÚColÚColonelzand Companyrn   ÚMountÚfortÚFortÚ	tennesseeÚ	TennesseeÚVersusÚandÚsectionr„   Úequals)zand cozand CoÚmtÚMtÚftÚFtÚtennÚTennÚvsÚVsú&õ   Â§r   ú=z	(^|\s|\W)z($|\s)z\1z\2)Úkeysr?   r@   ÚitemsrA   )rQ   Úlibri_sometimes_converts_abbrsÚlibri_wo_changes_abbrsÚgoogle_abbr2expandÚabbrÚtr   r   r   rM   ‡  s¾   
ÿþýüûúùø	÷
öõôóòñðïîíìëêéèçæåäãâá à!ß"Þ#Ð4"€rM   Úenr   c                 C   sŽ   t j}|dur|D ]}| |d¡}q	t d| d d| ¡} |dkr(t dd| ¡} t dd| ¡} |r=|  dd¡ d	d¡ ¡ } |rC|  ¡ } |  ¡ S )
zIRemoves punctuation (and optionally spaces) in text for better evaluationNr=   ú[ú]r   r  z[^\x00-\x7f]rz   õ   Â )ÚstringÚpunctuationrA   r?   r@   r'   rO   )r   r   rž   ÚlangÚexcludeÚall_punct_marksÚpr   r   r   r¡   Ö  s   r¡   c                 C   sf   dd„ }d}t | ddd} |D ]!}tt |dddƒ}tt |dddƒ}||ƒ|| ƒkr0d} |S q|S )z1Returns true if prediction matches target optionsc                 S   s¬   |   dd¡} |   dd¡  dd¡} |   dd¡  dd¡} |   dd¡  d	d
¡  dd¡} |   dd¡} |   dd¡  dd¡} |   dd¡  dd¡} |   dd¡  dd¡} |   dd
¡ ¡ } | S )Nz
us dollarsÚdollarsÚetceterar=   Úetczone half ouncezhalf an ounceÚ
televisionzt v r   ÚtvÚhundredz	forty twozfour twor?   rI   ÚouÚor‚   )rA   r'   )r   r   r   r   Ú_relax_diffï  s   z*get_alternative_label.<locals>._relax_diffFTrœ   )r¡   rq   rs   )rt   r;   r  Ú
acceptablerQ   r   r   r   Úget_alternative_labelì  s   ýr  Únorm_texts_weightsc                 C   s    t dƒ g }tt| ƒƒD ]A\}}g }dd„ |D ƒ}|| d D ](}t|ƒ}t|ƒ}t|||dr6| d¡ qt||drB| d¡ q| d¡ q| |¡ q|S )aT  
    Assign labels to generated normalization options (1 - for ground truth, 0 - other options)
    Args:
        targets: ground truth normalization sentences
        norm_texts_weights: List of tuples: (normalization options, weights of normalization options)
    returns:
        List of labels [1, 0] for every normalization option
    z3Assign labels to generated normalization options...c                 S   s   g | ]}t |ƒ‘qS r   )rs   )r   r
  r   r   r   r     ó    zget_labels.<locals>.<listcomp>r   ©rt   r;   r  r   )rt   r;   )r)   r   r    rs   rB   Ú
is_correctr&   r  )r;   r   r  r3   r   Úcur_targetsÚcurr_labelsÚnorm_optionr   r   r   Ú
get_labels  s   	r'  c                 C   s,   g d¢}|D ]}||v r|| v r dS qdS )z.Check is the pred/gt contain month in the span)ÚjanuaryÚfebruaryÚmarchÚaprilÚmayÚjuneÚjulyÚaugustÚ	septemberÚoctoberÚnovemberÚdecemberTFr   )rt   ru   ÚmonthsÚmonr   r   r   rŠ   "  s   €rŠ   c                 C   sæ   d}d| v r"d| v r"|   ¡  ¡ dd… |  ¡  ¡ dd… kr"d}|S d|v rBd|v rB|  ¡  ¡ dd… |   ¡  ¡ dd… krBd}|S z*t| dd¡ d	d¡|ƒ d
d¡t|  dd¡ d	d¡|ƒ d
d¡krjd}W |S W |S    Y |S )zHReturns True is pred and gt are date format modifications and are equal.FÚthousandr  r…   NTrj   rk   rr   r   r=   )r'   r+   r   rA   )rt   ru   rx   Úis_date_caser   r   r   r‰   9  s(   88õ
øÿ þùýr‰   r  c                    sF   t |tƒr‡ fdd„|D ƒ}n	t|ddˆ dg}t| ddd} | |v S )zG
    returns True if prediction matches targets for language lang.
    c                    s   g | ]
}t |d d ˆ d‘qS )T©r   rž   r  )r¡   ©r   r   ©r  r   r   r   S  r   zis_correct.<locals>.<listcomp>Tr8  rœ   )Ú
isinstancer   r¡   r"  r   r:  r   r#  N  s
   
r#  c              
   C   sF   t  dddddddd¡ t| ƒ W d  ƒ dS 1 sw   Y  dS )z
    prints data frame
    zdisplay.max_rowsNzdisplay.max_columnszdisplay.widthiè  zdisplay.max_colwidthi  )r   Úoption_contextr)   )Údfr   r   r   Úprint_df[  s
   ÿ
"ýr>  rD   rE   c           
      C   s&  t jd| |dd}| ¡ }dd„ |D ƒ}g d¢g| }g }g }t|dd… |dd… ƒD ]"\}}| |d	 |d
  |d	 g¡ | |d |d
  |d g¡ q,tt||ƒƒ}tt||ƒƒD ]/}	t d| |	d	 d	 |	d	 d … › ¡ t d||	d d	 |	d d … › ¡ t d¡ q]|dd… S )zreturns list of different substrings between and b

    Returns:
        list of Tuple(pred start and end, gt start and end) subsections
    NF©Úautojunkc                 S   s   g | ]
}|d  dkr|‘qS )é   r   r   r9  r   r   r   r   p  r   zget_diff.<locals>.<listcomp>©r   r   r   rF   r   r   rA  za: zb: ú====================)ÚdifflibÚSequenceMatcherÚget_matching_blocksr/   r&   Úlistr£   r¤   )
rD   rE   rG   ÚmatchesÚunmatches_lÚunmatches_rÚlr   ÚresultÚitemr   r   r   rN   e  s   " ((rN   c                 C   sj  t jd| |dd}| ¡ }tt d| ¡ƒ}dd„ |D ƒ}tt d| ¡ƒ}dd„ |D ƒ}d	g| t| ƒg }d
g| t| ƒg }g }t|ƒt|ƒksKJ ‚d}t|ƒD ]]\}}|t|ƒd kre|d d
kre nJ|t|ƒd k r‰|d
 || kr‰|d7 }|t|ƒd k r‰|d
 || ksu||d  |d
   kr›|| k r®n qQ|d
 |d  || kr®| |¡ qQg d¢g| t| ƒt|ƒd
gg }g }	g }
t	|dd	… |dd… ƒD ]"\}}|	 |d
 |d  |d
 g¡ |
 |d |d  |d g¡ qÑtt	|	|
ƒƒ}tt	|	|
ƒƒD ]0}t
 d| |d
 d
 |d
 d … › ¡ t
 d||d d
 |d d … › ¡ t
 d¡ q|S )a  returns list of different substrings between prediction and gt
    relies on that prediction uses '< '  ' >'  

    Args:
        pred (str): prediction
        gt (str): ground truth

    Returns:
        list of Tuple(pred start and end, gt start and end) subsections
    
    e.g. pred="< Edward third >., king Our own . loss had been < two thousand two hundred >"
         gt  ="Edward III., king Our own loss had been twenty two hundred"
         --> [([0, 16], [0, 10]),      ([32, 34], [26, 26]),      ([48, 76], [40, 58])]
    NFr?  rv   c                 S   ó   g | ]}|  ¡ ‘qS r   )Ústartr9  r   r   r   r   ˜  r!  z diff_pred_gt.<locals>.<listcomp>rw   c                 S   rN  r   )Úendr9  r   r   r   r   š  r!  rF   r   r   rA  rB  zpred: zgt  : rC  )rD  rE  rF  rG  r?   Úfinditerr.   r    r&   r/   r£   r¤   )rt   ru   rG   ÚleftÚrightrH  r–   r   ÚseqrI  rJ  rK  r   rL  rM  r   r   r   r¢   ‚  s@     ÿ@
€"" ((r¢   rb   )T)NT)TTr  N)r  )3r]   rD  r$   r£   r?   r  Útypingr   r   r   r   Úpandasr   ÚpyniniÚpynini.lib.rewriter   r   ÚCnemo_text_processing.inverse_text_normalization.en.taggers.cardinalr   ÚAnemo_text_processing.inverse_text_normalization.inverse_normalizer	   r,   Úgraph_no_exceptionrx   ÚclosureÚunionÚaccepr‹   rp   r<   rB   rW   rY   ra   re   rh   rq   rs   r™   Úboolr­   rM   r¡   r  r'  rŠ   r‰   r#  r>  rN   r¢   r   r   r   r   Ú<module>   sb   (ÿ;(&Pûÿ
þü
ûFO0&
