o
    Ni                     @   sh   d dl mZ d dlZd dlZd dlZd dlZd dlZd dl	m
Z
 ddlmZmZmZ G dd deZdS )    )unicode_literalsN)issparse   )WXOneHotEncoderUrduNormalizerc                   @   sX   e Zd ZdZdddZdd Zdd Zd	d
 ZdddZdd Z	dddZ
dddZdS )BaseTransliteratora  Base class for transliterator.

    Attributes
    ----------
    vectorizer_ : instance
        `OneHotEncoder` instance for converting categorical features to
        one-hot features.

    classes_ : dict
        Dictionary of set of tags with unique ids ({id: tag}).

    coef_ : array
        HMM coefficient array

    intercept_init_ : array
        HMM intercept array for first layer of trellis.

    intercept_trans_ : array
        HMM intercept/transition array for middle layers of trellis.

    intercept_final_ : array
        HMM intercept array for last layer of trellis.

    wx_process : method
        `wx2utf`/`utf2wx` method of `WX` instance

    nu : instance
        `UrduNormalizer` instance for normalizing Urdu scripts.

    Fc                 C   s   |dv rd}n|dkrd}|dv rd}n|dkrd}|| _ || _t | _|| _|\| _| _d| _d| _d| _	t
jt
jt| _|   d S )N)marnepkokbodhinasmbenzz )sourcetargetdictlookupbuild_lookupdecodedecodertabspaceesc_chospathdirnameabspath__file__dist_dirbase_fit)selfr   r   r   r    r#   C/home/ubuntu/.local/lib/python3.10/site-packages/indictrans/base.py__init__3   s$   zBaseTransliterator.__init__c                 C   s<  t  | _d| j| jf }td| j|f }t|| j_W d   n1 s'w   Y  t	jd| j|f dddd | _
t	jd	| j|f dddd t	j| _t	jd
| j|f dddt	j| _t	jd| j|f dddt	j| _t	jd| j|f dddt	j| _t| j
d t	jsdd | j
 D | _
dS dS )zLoads transliteration models.z%s-%sz%s/models/%s/sparse.vecNz%s/models/%s/classes.npylatin1T)encodingallow_pickler   z%s/models/%s/coef.npyz%s/models/%s/intercept_init.npyz %s/models/%s/intercept_trans.npyz %s/models/%s/intercept_final.npyc                 S   s   i | ]
\}}|| d qS )utf-8)r   ).0kvr#   r#   r$   
<dictcomp>g   s    z2BaseTransliterator.load_models.<locals>.<dictcomp>)r   vectorizer_r   r   openr    jsonloadunique_featsnpclasses_astypefloat64coef_intercept_init_intercept_trans_intercept_final_
isinstanceunicode_items)r"   modeljfpr#   r#   r$   load_modelsG   sn   zBaseTransliterator.load_modelsc                 C   s   t  | _tjd| j dd-}|D ]!}| \}}| jdkr+|dv r#q|| jt|< q|| jt|< qW d    d S 1 s>w   Y  d S )Nz%s/mappings/punkt.mapr)   )r'   urd)'")r   	punkt_tblior/   r    splitr   ord)r"   punkt_fplinestr#   r#   r$   load_mappingsj   s   
"z BaseTransliterator.load_mappingsc                 C   s|   |    d| j| jfv r|   | jdkrt | _| jdv r+td| jd}|j| _d S td| jd}|j	| _t
d| _d S )NrA   )engrA   wx2utf)orderlangutf2wxz([a-zA-Z]+))r@   r   r   rL   r   nur   rN   
wx_processrQ   recompile
mask_roman)r"   wxpr#   r#   r$   r!   x   s   

zBaseTransliterator.base_fit   c           	         s    j |}t|r| jj }n j|jj} jdkrA j| j	 j
 j} fdd|D }d|dd}|S t } j| j	 j
 j|}|D ]} fdd|D }d|dd}|| qS|S )z\Given encoded word matrix and HMM parameters, predicts output
        sequence (target word)viterbic                       g | ]} j | qS r#   r4   r*   pidr"   r#   r$   
<listcomp>       z.BaseTransliterator.predict.<locals>.<listcomp> _c                    rZ   r#   r[   r\   r^   r#   r$   r_      r`   )r.   	transformr   dotr7   Ttoarrayr   r   r9   r8   r:   joinreplacelistappend)	r"   wordk_bestXscoresytop_seqr   wr#   r^   r$   predict   s2   
zBaseTransliterator.predictc                 C   sl   | j dkr	| S | j dkr| j|S | j dkr%|dd}|dd}| jd| j |}| |}|S )	zConverts Indic scripts to WX.rM   rA   r   u   ৰu   রu   ৱu   বz%s\1)	r   lowerrR   	normalizerh   rV   subr   rS   )r"   textr#   r#   r$   convert_to_wx   s   



z BaseTransliterator.convert_to_wxNc                 C   s   g }|  |}|d| j}|d| j}|d}|D ]&}| s(|| qt }| j|}|D ]	}|| 	|7 }q3|| qd
|}|| jd}|| jd}|S )z3Single best transliteration using viterbi decoding.	 
)rw   rh   r   r   rF   striprj   str	non_alpha
case_transrg   )r"   rv   rl   
trans_listlinesrI   
trans_linerk   r#   r#   r$   transliterate   s$   



z BaseTransliterator.transliteratec                 C   sx   |dk rt dg }| |}| j|}|D ]}| ||}t|tr*|| q||g|  qdd t| D S )zReturns k-best transliterations using beamsearch decoding.

        Parameters
        ----------
        k_best : int, default: 5, optional
            Used by `Beamsearch` decoder to return k-best transliterations.
           z`k_best` value should be >= 2c                 S   s   g | ]}d  |qS )ra   )rg   )r*   rq   r#   r#   r$   r_      r`   z2BaseTransliterator.top_n_trans.<locals>.<listcomp>)	
ValueErrorrw   r}   rF   r~   r;   ri   rj   zip)r"   rv   rl   
trans_wordwordsrk   op_wordr#   r#   r$   top_n_trans   s   

zBaseTransliterator.top_n_trans)F)rX   )N)__name__
__module____qualname____doc__r%   r@   rL   r!   rr   rw   r   r   r#   r#   r#   r$   r      s    
#

r   )
__future__r   rE   rT   r0   os.pathr   numpyr3   scipy.sparser   _utilsr   r   r   objectr   r#   r#   r#   r$   <module>   s   