o
    'Ni                  	   @   s  d dl Z d dlZd dlZd dlZd dlZd dlZd dlmZ d dlm	Z	 d dlm
Z
 d dlmZ G dd deZG dd	 d	eZed
kreejdk rTed ed ejd Zejd e
_dZeejdkrsejd dkrqdndZede  eeeZede  e ejd ddMZe ejd dd*Ze D ]Ze ZeeZ e!e Z"e#d$e" e#d qW d   n1 sw   Y  W d   dS W d   dS 1 sw   Y  dS dS )    N)	lru_cache)langinfo)common)indic_tokenizec                   @   s    e Zd ZdZdd Zdd ZdS )MorphAnalyzerIz'
     Interface for Morph Analyzer
    c                 C      d S N )wordr	   r	   U/home/ubuntu/.local/lib/python3.10/site-packages/indicnlp/morph/unsupervised_morph.pymorph_analyze      zMorphAnalyzerI.morph_analyzec                 C   r   r   r	   )tokensr	   r	   r   morph_analyze_document   r   z%MorphAnalyzerI.morph_analyze_documentN)__name__
__module____qualname____doc__r   r   r	   r	   r	   r   r      s    r   c                   @   sD   e Zd ZdZdddZdd Zdd Zed	d
dd Zdd Z	dS )UnsupervisedMorphAnalyzerzG
    Unsupervised Morphological analyser built using Morfessor 2.0
    Fc              
   C   sr   || _ || _t }|tjtj	ddd
|| _d
ttj| d ttj| d | _t| j| _d S )Nmorph	morfessorz{}.modelz
^[{}-{}]+$r      )lang
add_markerr   MorfessorIOread_any_modelospathjoinr   INDIC_RESOURCES_PATHformat_morfessor_modelchrr   SCRIPT_RANGES_script_range_patrecompile_script_check_re)selfr   r   ior	   r	   r   __init__'   s   "*z"UnsupervisedMorphAnalyzer.__init__c                 C   sL   | j tjv r$|D ]}t|tj| j  d  }|tjkr#|tjkr# dS qdS )Nr   TF)r   r   r#   ordNUMERIC_OFFSET_STARTNUMERIC_OFFSET_END)r(   textcoffsetr	   r	   r   _contains_number1   s   z*UnsupervisedMorphAnalyzer._contains_numberc                 C   s   | j |o| | S r   )r'   matchr1   )r(   r
   r	   r	   r   _morphanalysis_needed9   s   z/UnsupervisedMorphAnalyzer._morphanalysis_neededi @  )maxsizec                 C   sX   g }|  |r| j|}|d }| jrdd t|D }|S | jr'd|}|g}|S )z
        Morphanalyzes a single word and returns a list of component morphemes

        @param word: string input word 
        r   c                 S   s,   g | ]\}}|d krd |nd |qS )r   z{}_S_z{}_R_)r    ).0imr	   r	   r   
<listcomp>H   s   , z;UnsupervisedMorphAnalyzer.morph_analyze.<locals>.<listcomp>z{}_E_)r3   r!   viterbi_segmentr   	enumerater    )r(   r
   m_listvalr	   r	   r   r   <   s   

z'UnsupervisedMorphAnalyzer.morph_analyzec                 C   s&   g }|D ]}|  |}|| q|S )a&  
        Morphanalyzes a document, represented as a list of tokens
        Each word  is morphanalyzed and result is a list of morphemes constituting the document 

        @param tokens: string sequence of words 

        @return list of segments in the document after morph analysis 
        )r   extend)r(   r   
out_tokenstokenmorphsr	   r	   r   r   W   s
   

z0UnsupervisedMorphAnalyzer.morph_analyze_documentN)F)
r   r   r   r   r*   r1   r3   r   r   r   r	   r	   r	   r   r   "   s    


r   __main__   zgUsage: python unsupervised_morph.py <infile> <outfile> <language> <indic_resources_path> [<add_marker>]r      F      TrueTzLoading morph analyser for zLoaded morph analyser for rzutf-8   w 
)%codecssys	itertoolsr%   r   r   	functoolsr   indicnlpr   r   indicnlp.tokenizer   objectr   r   r   lenargvprintexitlanguager   r   analyzeropenifileofile	readlineslinestriptrivial_tokenizer   r   morph_tokenswriter   r	   r	   r	   r   <module>   sD   (R




"