o
    'NiB"                     @   s  d dl Zd dlZd dlZd dlmZ d dlmZ d dlm	Z
 	 da	 da	 da	 da	 da	 dag dZd dgddgdd	gd	d
gd
dgddgddgddgddgddgddgddgdZd ZdZdZdZdZdZeZed ZdZd ZdZdd Zdd Zdd Z dd  Z!d!d" Z"d#d$ Z#d%d& Z$d'd( Z%d)d* Z&d+d, Z'd-d. Z(d/d0 Z)d1d2 Z*d3d4 Z+d5d6 Z,d7d8 Z-d9d: Z.d;d< Z/d=d> Z0d?d@ Z1dAdB Z2dCdD Z3dEdF Z4dGdH Z5dIdJ Z6dKdL Z7dMdN Z8dS )O    N)common)IndicNlpException)langinfo&      )
basic_typevowel_lengthvowel_strengthvowel_statusconsonant_typearticulation_place
aspirationvoicingnasalizationvowel_horizontalvowel_verticalvowel_roundness                            $                        c                   C   sz   t jtjt ddddat jtjt ddddatj	ddt
df jatj	ddt
df jatjd adS )zI
    To be called by library loader, do not call it in your program 
    scriptzall_script_phonetic_data.csvzutf-8)encodingztamil_script_phonetic_data.csvNr   )pdread_csvospathjoinr   get_resources_pathALL_PHONETIC_DATATAMIL_PHONETIC_DATAilocPHONETIC_VECTOR_START_OFFSETvaluesALL_PHONETIC_VECTORSTAMIL_PHONETIC_VECTORSshapePHONETIC_VECTOR_LENGTH r5   r5   Q/home/ubuntu/.local/lib/python3.10/site-packages/indicnlp/script/indic_scripts.pyinit`   s
   r7   c                 C   s   | t tj v S )N)listliSCRIPT_RANGESkeys)langr5   r5   r6   is_supported_languageo   s   r=   c                 C   s,   t |std|t| tj| d  S )NLanguage {}  not supportedr   )r=   r   formatordr9   r:   cr<   r5   r5   r6   
get_offsetr   s   rC   c                 C   s,   t |std|t| tj| d  S )z5
    Applicable to Brahmi derived Indic scripts 
    r>   r   )r=   r   r?   chrr9   r:   )offr<   r5   r5   r6   offset_to_charw   s   rF   c                 C   sL   t |std|t| |}|tkr|tk p%t| tjkp%t| tj	kS )z
    Applicable to Brahmi derived Indic scripts 
    Note that DANDA and DOUBLE_DANDA have the same Unicode codepoint for all Indic scripts 
    r>   )
r=   r   r?   rC   SCRIPT_OFFSET_STARTSCRIPT_OFFSET_RANGEr@   r9   DANDADOUBLE_DANDA)rB   r<   or5   r5   r6   is_indiclang_char   s   
rL   c                 C   s   | t jko	| t jkS )z6 
    Applicable to Brahmi derived Indic scripts 
    )r9   !COORDINATED_RANGE_START_INCLUSIVECOORDINATED_RANGE_END_INCLUSIVE)c_offsetr5   r5   r6   in_coordinated_range_offset   s   rP   c                 C   s$   t |std|tt| |S Nr>   )r=   r   r?   rP   rC   rA   r5   r5   r6   in_coordinated_range   s   rR   c                 C   sB   t | std| | tjkrtnt}| tjkrtnt}||fS rQ   )	r=   r   r?   r9   LC_TAr,   r-   r1   r2   )r<   phonetic_dataphonetic_vectorsr5   r5   r6   get_phonetic_info   s
   rV   c                   C   s   t dgt S Nr   )nparrayr4   r5   r5   r5   r6   invalid_vector   s   rZ   c                 C   sD   t | |}t|st S t|\}}|j| d dkrt S || S NzValid Vector Representationr   )rC   rP   rZ   rV   r.   )rB   r<   offsetrT   rU   r5   r5   r6   get_phonetic_feature_vector   s   
r]   c                 C   s:   t | st S t|\}}|j|  d dkrt S ||  S r[   )rP   rZ   rV   r.   )r\   r<   rT   rU   r5   r5   r6   "get_phonetic_feature_vector_offset   s   r^   c                 C   s   t | dkS rW   )rX   sumvr5   r5   r6   is_valid   s   rb   c                 C      | t  dkS Nr   )PVIDX_BT_VOWELr`   r5   r5   r6   is_vowel      rf   c                 C   rc   rd   )PVIDX_BT_CONSONANTr`   r5   r5   r6   is_consonant   rg   ri   c                 C   rc   rd   )PVIDX_BT_HALANTr`   r5   r5   r6   	is_halant   rg   rk   c                 C   rc   rd   )PVIDX_BT_NUKTAr`   r5   r5   r6   is_nukta   rg   rm   c                 C   rc   rd   )PVIDX_BT_ANUSVAARr`   r5   r5   r6   is_anusvaar   rg   ro   c                 C   rc   rd   )PVIDX_BT_MISCr`   r5   r5   r6   is_misc   rg   rq   c                 C   s   t | o	| t dkS rd   )rf   PVIDX_VSTAT_DEPr`   r5   r5   r6   is_dependent_vowel   s   rs   c                 C   s   t | ot| dd dkS )Nr   r   r   )ri   get_property_vectorr`   r5   r5   r6   
is_plosive      ru   c                 C      t dd t| |D S )Nc                 S   s$   g | ]\}}|| d krd ndqS r   r   r5   .0b1b2r5   r5   r6   
<listcomp>   s   $ zor_vectors.<locals>.<listcomp>rX   rY   zipv1v2r5   r5   r6   
or_vectors   rv   r   c                 C   rw   )Nc                 S   s    g | ]\}}||krd ndqS rx   r5   ry   r5   r5   r6   r}      s     zxor_vectors.<locals>.<listcomp>r~   r   r5   r5   r6   xor_vectors   rv   r   c                 C   s   | t | d t | d  S )Nr   r   )PV_PROP_RANGES)ra   	prop_namer5   r5   r6   rt      s   rt   c                 C   sF   t | | }d} d}|d d d D ]}| || 7 } |d }qt| S )Nr   r   g       @)rt   tolistint)ra   r   factor_bitsrB   br5   r5   r6   get_property_value   s   
r   c                 C   s,  t t| d t|d f}t| dD ]g\}}t|dD ]]\}}t||}	t||}
t|	rHt|
rH|	|
krH||d |d f d |||f< qt|	set|
se||kre||d |d f d |||f< qt|||d f ||d |f |||f< qq|d ttt| t| tt| tt|fS )a2  
    compute the Longest Common Subsequence Ratio (LCSR) between two strings at the character level.
    This works for Indic scripts by mapping both languages to a common script

    srcw: source language string
    tgtw: source language string
    slang: source language 
    tlang: target language 
    r         ?r   r   )rX   zeroslen	enumeraterC   rP   maxfloat)srcwtgtwslangtlang	score_matsisctitcsotor5   r5   r6   
lcsr_indic   s   


""4r   c                 C   s   t t| d t|d f}t| dD ]8\}}t|dD ].\}}||kr6||d |d f d |||f< qt|||d f ||d |f |||f< qq|d ttt| t| tt| tt|fS )zA
    LCSR computation if both languages have the same script
    r   r   r   )rX   r   r   r   r   r   )r   r   r   r   r   r   r   r5   r5   r6   lcsr_any  s   "4	r   c                 C   s0   ||kst |rt |st| |||S t| |S )z
    compute the Longest Common Subsequence Ratio (LCSR) between two strings at the character level.

    srcw: source language string
    tgtw: source language string
    slang: source language 
    tlang: target language 
    )r=   r   r   )r   r   r   r   r5   r5   r6   lcsr  s   

r   )9pandasr&   numpyrX   r(   indicnlpr   indicnlp.commonr   r   r9   r,   r-   r1   r2   r4   r/   PV_PROPr   re   rh   rl   rj   rn   rp   
PVIDX_BT_S
PVIDX_BT_Err   rG   rH   r7   r=   rC   rF   rL   rP   rR   rV   rZ   r]   r^   rb   rf   ri   rk   rm   ro   rq   rs   ru   r   r   rt   r   r   r   r   r5   r5   r5   r6   <module>   s   