o
    qi                     @   sL   d Z ddlmZ ddlmZ ddlmZmZ ddlm	Z	 G dd de	Z
d	S )
z#Parser for performing normalization    )CoNLL)Document)sentence_tokenizerword_tokenizer   )Parserc                   @   s(   e Zd ZdZdd Zdd Zdd ZdS )	TokenizeParserz1Parser covert Urdu text into sentences and words.c                 C   s   dS )passN )selfconfigr
   r
   V/home/ubuntu/.local/lib/python3.10/site-packages/urduhack/pipeline/parsers/tokenize.py_set_up   s    zTokenizeParser._set_upc                 C   s   g }g }t |}d}|D ]D}t|}|d| g }t|D ](\}	}
|tjt|	d tj|
tj	d| d|t
|
  i |t
|
d 7 }q |i |f q|d|fS )z"generate dictionary data structurer       zstart_char=z
|end_char=)r   r   appendjoin	enumerater   IDstrTEXTMISClen)r   textdocconll_format	sentencesidxsentencewordssenttoken_idtokenr
   r
   r   _tokenized_text   s    zTokenizeParser._tokenized_textc                 C   s   |  |\}}t||S )zGenerate sentences and words)r#   r   )r   document
conll_datar   r
   r
   r   parse(   s   
zTokenizeParser.parseN)__name__
__module____qualname____doc__r   r#   r&   r
   r
   r
   r   r   
   s
    r   N)r*   urduhack.conllr   urduhack.core.unit.documentr   urduhack.tokenizationr   r   parserr   r   r
   r
   r
   r   <module>   s   