o
    wi                     @   s2   d Z ddlZddlZddlmZ ejfddZdS )z
Some useful functions for converting and disambiguating between common alternative orthographies (ways of writing) the same text.
    N)	sanscriptc                 C   sN  |t jkr| }tdd|}tdd|}tdd|}tdd|}tdd|}tdd	|}td
d|}tdd|}tdd|}tdd|}tdd|}tdd|}tdd|}tdd|}tdd|}tdd|}tdd|}tdd |}td!d"|}t j||t jd#}|S td$||  tdd| S )%a  
    Given some (devanAgarI?) sanskrit text, this function produces a "key" so
    that
  
    1] The key should be the same for different observed orthographical
    forms of the same text. For example:
  
    ::
  
        - "dharmma" vs "dharma"
        - "rAmaM gacChati" vs "rAma~N gacChati" vs "rAma~N gacChati"
        - "kurvan eva" vs "kurvanneva"
  
    2] The key should be different for different for different texts.
  
    -  "stamba" vs "stambha"
  
    This function attempts to succeed at [1] and [2] almostall the time.
    Longer the text, probability of failing at [2] decreases, while
    probability of failing at [1] increases (albeit very slightly).
  
    Sources of orthographically divergent forms:
  
    -  Phonetically sensible grammar rules
    -  Neglect of sandhi while writing
    -  Punctuation, spaces, avagraha-s.
    -  Regional-language-influenced mistakes (La instead of la.)
  
    Some example applications of this function:
  
    -  Create a database of quotes or words with minimal duplication.
    -  Search a database of quotes or words while being robust to optional
       forms.
  
    Also see equivalent function in the scala indic-transliteration package.
    z\P{IsDevanagari} z\sz\p{P}u   [०-९।॥॰ऽ]|[॑-॔]u   [यरल]्ँu   म्u   [ङञणन]u   मu   ँ|ंu   ॐu	   ओम्u   [ळऴ]u   लu   ([क-हक़-य़])्\1+z\1u   [कग]्ख्u   ख्u   [कग]्घ्u   घ्u   च्छ्u   छ्u   ज्झ्u   झ्u   त्थ्u   थ्u   द्ध्u   ध्u   ड्ढ्u   ढ्u   प्फ्u   फ्u   ब्भ्u   भ्)_from_tozgot script {} for '{}')	r   
DEVANAGARIregexsubtransliterate	OPTITRANSloggingwarningformat)textencoding_schemekey r   f/home/ubuntu/maya3_transcribe/venv/lib/python3.10/site-packages/indic_transliteration/deduplication.pyget_approx_deduplicating_key   s2   
%r   )__doc__r   r   indic_transliterationr   r   r   r   r   r   r   <module>   s
   