o
    oi$                     @   s<  d dl Z d dlZd dlmZ d dlmZ d dlZd dlmZ 	 dd ZdZ	dZ
d	Zd
Zeejej ed Zdedeee  fddZdejfddZdejdedejfddZdee defddZdd Zdd Zdee ded ed!efd"d#Zed$kre  e j  e Z!ee!j"d%d&Z"ze"e!j# Z"W n   e$e!j# d'e!j%Z&e Z'ee"e&e'd(\Z(Z)e *d)e!j% d* e *d+e) d* e!j+du ree&Z,ne!j+e!j-fgZ,e,D ]AZ.ee.d  e.d, e(e!j/d-\Z+Z-e *d.e.d   d/e.d,  d0e+ d/e- d1	 e *d2e&e.d  e.d,   d3e)e+e-  d* qdS dS )4    N)ArgumentParser)List)Farc                  C   s   t d} | jddtdd | jddtdttgtd	 | jd
dtddd | jddtdd | jddtdd | jddtdd |  S )Nz map substring to output with FSTz--fstzFAR file containing FSTT)helptyperequiredz	--grammarz	tn or itnF)r   r   r   choicesdefaultz--rulez$rule name in FAR file containing FSTtokenize_and_classify)r   r   r	   r   z--textzinput stringzM2615 Forest Av, 90601 CA, Santa Clara. 10kg, 12/16/2018, $123.25. 1 Aug 2016.)r   r   r	   z--startz%start index of substring to be mappedz--endz#end index of substring to be mapped)r   add_argumentstrITN_MODETN_MODEint
parse_args)args r   `/home/ubuntu/.local/lib/python3.10/site-packages/nemo_text_processing/fst_alignment/alignment.pyr   J   s*   r   z<eps>u   ⎵itntnz$\:+-=textreturnc                 C   s   g }dg}t | D ]K\}}t|dkr|dkr|| q	|dkr6|| t|dks.J || g }q	|t| d krT|d7 }|| t|dksOJ || q	|S )zd
    Returns word segments from given text based on white space in form of list of index spans.
    r          )	enumeratelenappend)r   spanscur_spanidxchr   r   r   get_word_segmentsi   s"   



r"   c                  C   sD   t  } tddD ]
}| t|| q	| td | td | S )zg
    Creates and returns Pynini SymbolTable used to label alignment with ascii instead of integers
    "      r       )pyniniSymbolTablerange
add_symbolchrEPSWHITE_SPACE)tablenumr   r   r   create_symbol_table   s   r/   fst
input_textsymbol_tablec           	         s   t ||  }|j  d}| }| }td|   td|   t	t
 fdd|D  fdd|D }td|  |  | sOJ dttdd |D }||fS )	z
    create alignment of input text based on shortest path in FST. Symbols used for alignment are from symbol_table

    Returns:
        output: list of tuples, each mapping input character to output
    )input_token_typeoutput_token_typezinput: zoutput: c                    s   g | ]}  |qS r   )find.0xr2   r   r   
<listcomp>   s    z(get_string_alignment.<locals>.<listcomp>zalignment:  c                 S      g | ]}|d  qS )r   r   r6   r   r   r   r:          )r&   shortestpathpathsilabelsolabelsloggingdebugistringostringlistzipnextdonejoinmapremove)	r0   r1   r2   latticer?   r@   rA   output
output_strr   r9   r   get_string_alignment   s   *rP   	alignmentindexc                 C   sd   d}d}||k r| | d t kr|d7 }|d7 }||k s| | d t kr0|d7 }| | d t ks$|S )zj
    Given index in contracted input string computes corresponding index in alignment (which has EPS)
    r   r   r+   )rQ   rR   aligned_indexr    r   r   r   _get_aligned_index   s   rU   c                 C   s<   d}d}||k r| | d t kr|d7 }|d7 }||k s|S )z`
    Given index in aligned output, returns corresponding index in contracted output string
    r   r   rS   )rQ   rT   og_indexr    r   r   r   _get_original_index   s   rW   c                 C   s   | t krdS | tkrdS | S )Nr;   r   )r+   r,   )r8   r   r   r   <lambda>   s    rX   startendmodec                 C   sJ  t | |}t | |d }tdtttdd | ||d  D  d td| d|d  d |d dkr| |d  d tkr| |d  d tv sW| |d  d tkr|d8 }|d dkr| |d  d tkr| |d  d tv sW| |d  d tksW|d t| k r| |d  d tkr| |d  d tv s| |d  d tkr|d7 }|d t| k r| |d  d tkr| |d  d tv s| |d  d tks|t	kr|d t| k r| |d  d tv s| |d  d tkr|d7 }|d t| k r| |d  d tv s| |d  d tkst
| |d	}t
| |d d	}||fS )
a5  
    Given input start and end index of contracted substring return corresponding output start and end index

    Args:
        alignment: alignment generated by FST with shortestpath, is longer than original string since including eps transitions
        start: inclusive start position in input string
        end: exclusive end position in input string
        mode: grammar type for either tn or itn 

    Returns:
        output_og_start_index: inclusive start position in output string
        output_og_end_index: exclusive end position in output string
    r   z0: |c                 S   r<   )r   r   r6   r   r   r   r:      r=   z)indexed_map_to_output.<locals>.<listcomp>|z1: |:r   )rQ   rT   )rU   rB   rC   rF   rK   rL   r+   tn_itn_symbolsr   r   rW   )rQ   rY   rZ   r[   aligned_startaligned_endoutput_og_start_indexoutput_og_end_indexr   r   r   indexed_map_to_output   s6   
2((((
*(rc   __main__r)r[   z1 not found. Please specify valid --rule argument.)r0   r1   r2   zinp string: |r\   zout string: |r   )rY   rZ   rQ   r[   zinp indices: [r]   z] out indices: []zin: |z| out: |)0rB   stringargparser   typingr   r&   r   r   r+   r,   r   r   rF   ascii_lettersdigitsr^   r   r   r"   r'   r/   FstrP   tuplerU   rW   rL   rc   __name__	getLoggersetLevelINFOr   r0   rule
ValueErrorr   r1   r-   rQ   output_textinforY   indicesrZ   r8   grammarr   r   r   r   <module>   sT   2
2

,2