o
    oi\                  
   @   s   d dl Z d dlZd dlZd dlmZmZ d dlmZ d dlmZ zd dl	Z	W n e
y6 Z ze
ddZ[ww defddZd	eee ef d
efddZdddZdee fddZddee fddZdddZdS )    N)ListUnion)tqdm)	MLMScorerztorch is not installedmodel_name_listc                 C   s>   |  d}i }|D ]}tj rdnd}t||d||< q	|S )zQ
    returns dictionary of Masked Language Models by their HuggingFace name.
    ,cudacpu)
model_namedevice)splittorchr   is_availabler   )r   model_namesmodelsr
   r    r   [/home/ubuntu/.local/lib/python3.10/site-packages/nemo_text_processing/hybrid/model_utils.pyinit_models   s   
r   textsmodelc              
   C   st   zt | tr	| g} dt||  t|  }W |S  ty9 } zt| td|   tj}W Y d}~|S d}~ww )z/Computes MLM score for list of text using modelzScoring error: N)	
isinstancestrsumscore_sentenceslen	Exceptionprintmathinf)r   r   scoreer   r   r   	get_score*   s   
r"   Tc                 C   s   |r|   n| } td| }t|dkrNg }td| D ]/}| d| d  | dddd | | d d  }td|j	|}|
| q|} t| |S )a  text is normalized prediction which contains <> around semiotic tokens.
    If multiple tokens are present, multiple variants of the text are created where all but one ambiguous semiotic tokens are masked
    to avoid unwanted reinforcement of neighboring semiotic tokens.	<\s.+?\s>r   N<   >   )lowerrefindallr   finditerspangroupreplacesub
MASK_LABELappendr"   )textr   do_lowerspanstext_with_maskmatchnew_textr   r   r   get_masked_score7   s   >
r8   	sentencesc                 C   s   dd t d| d D }| dd D ]%}t d|}t|t|kr& dS tt|D ]}|| ||  q,qg }|D ]}|t|dk q=|S )zreturns None or index list of ambigous semiotic tokens for list of sentences.
    E.g. if sentences = ["< street > < three > A", "< saint > < three > A"], it returns [1, 0] since only 
    the first semiotic span <street>/<saint> is ambiguous.c                 S   s   g | ]}t |gqS r   )set.0xr   r   r   
<listcomp>O       z,_get_ambiguous_positions.<locals>.<listcomp>r#   r   r'   N)r)   r*   r   rangeaddr1   )r9   l_setssentencer4   i	ambiguousr,   r   r   r   _get_ambiguous_positionsK   s   rF   c              	      s  g } dur fdd| D }t tdd |D dkr|} d}| r-t| d tr-t| }t| D ]}t|trofdd|D }t| t| td t	d	d
 |D r^t
j}	ntt|t | d}	||	 q1t|tr|rttd|}
t|
ddd |ddd D ]&\}}|s|d| d  | dddd || d d  }q|tt|d q1t |S )zTreturn list of scores for each sentence in list where model is used for MLM Scoring.Nc                    s   g | ]}t | qS r   )	find_diffr<   s)context_lenr   r   r>   a   r?   z!score_options.<locals>.<listcomp>c                 S   s   g | ]}t |qS r   )r   )r<   dr   r   r   r>   b   s    r'   r   c                    s   g | ]}t | qS r   )r8   rH   )r3   r   r   r   r>   k   s    z2==================================================c                 s   s    | ]}t |V  qd S )N)r   isnanr;   r   r   r   	<genexpr>o   s    z score_options.<locals>.<genexpr>   r#   r   r$   r%   r&   )r3   )r   r:   r   r   rF   r   listloggingdebuganyr   r   roundr   r1   r)   r+   zipr,   r-   r.   r8   
ValueError)r9   rJ   r   r3   scoresdiffsambiguous_positionssentoption_scoresav_scorematchesr6   posr   )rJ   r3   r   r   score_options]   sB   




&r^      c           
         sd  g }dd  fdd}d}| |d v r|| |d   }|} | |d v r|| |d    t  }|| || }d|| d|  | d }t|dkri| d| d  ri|d }d|| |d  d| }	t|	dkr| | d  rd|	 }	||| |	  |d	7 }|d	 }nn| |d v st|dkr| g}|S )
z]Finds parts of text normalized by WFST and returns them in list with a context of context_lenr$   r&   c                    s   |  d  d ddS )Nr%   z   )r.   )rI   pattern_endpattern_startr   r   __clean   s   zfind_diff.<locals>.__cleanr   Nr`   r   r'   )indexr   joinr   isspacer1   )
r2   rJ   rW   rd   index_startoffset	index_endcenterleft_contextright_contextr   ra   r   rG      s2   $ "
rG   )T)r_   )rP   r   r)   typingr   r   r   &nemo_text_processing.hybrid.mlm_scorerr   r   ImportErrorr!   r   r   r"   r8   rF   r^   rG   r   r   r   r   <module>   s$   
'