o
    qi>A                     @   s   d Z ddlZddlZddlmZ ddlmZ ddlmZ ddl	m
Z
 dZdZd	Zd
ZG dd dZG dd deZG dd dZdS )z
Token, Word data structures
    N)Dict)CoNLL)	Conllable)
STOP_WORDSner
start_charend_char	span_typec                   @   s   e Zd ZdZd&defddZdd Zedd	 Zej	d
d	 Zedd Z
e
j	dd Z
edd Zej	dd Zedd Zej	dd Zedd Zedd Zedd Zej	dd Zdd Zdd Zdd  Zed!efd"d#Zed!efd$d%ZdS )'Tokenah   A token class that stores attributes of a token and carries a list of words. A token corresponds to a unit in
     the raw
    text. In some languages such as English, a token has a one-to-one mapping to a word, while in other languages such
     as French,
    a (multi-word) token might be expanded into multiple words that carry syntactic annotations.
    Ntoken_entryc                 C   s   | tjr| tjsJ ddgd \| _| _| _| _| _| _	| _
| tj| _| tj| _| tjd| _| td| _|durF|ng | _| jdurT|   dS dS )zu Construct a token given a dictionary format token entry. Optionally link itself to the corresponding words.
        z,id and text should be included for the tokenN   )getr   IDTEXT_id_text_misc_words_start_char	_end_char_neridtextMISCmiscNERr   wordsinit_from_misc)selfr   r    r   L/home/ubuntu/.local/lib/python3.10/site-packages/urduhack/core/unit/token.py__init__   s    &
zToken.__init__c                 C   sl   | j dD ]-}|dd}t|dkrq|\}}|ttfv r#t|}d| }t| |r3t| || qdS )z4 Create attributes by parsing from the `misc` field.|=   _N)r   splitlen
START_CHAREND_CHARinthasattrsetattrr   item	key_valuekeyvalueattrr   r   r    r   *   s   

zToken.init_from_miscc                 C      | j S )z! Access the index of this token. r   r   r   r   r    r   8      zToken.idc                 C   
   || _ dS )z Set the token's id value. Nr4   r   r1   r   r   r    r   =      
c                 C   r3   )z/ Access the text of this token. Example: 'The' r   r5   r   r   r    r   B   r6   z
Token.textc                 C   r7   )z, Set the token's text value. Example: 'The' Nr:   r8   r   r   r    r   G   r9   c                 C   r3   )z- Access the miscellaneousness of this token. r   r5   r   r   r    r   L   r6   z
Token.miscc                 C      |  |rd}|| _dS )z* Set the token's miscellaneousness value. N_is_nullr   r8   r   r   r    r   Q      

c                 C   r3   )z; Access the list of syntactic words underlying this token. r   r5   r   r   r    r   Z   r6   zToken.wordsc                 C   s   || _ | j D ]}| |_qdS )z6 Set this token's list of underlying syntactic words. N)r   parent)r   r1   wordr   r   r    r   _   s   
c                 C   r3   )zB Access the start character index for this token in the raw text. r   r5   r   r   r    r   f   r6   zToken.start_charc                 C   r3   )z@ Access the end character index for this token in the raw text. r   r5   r   r   r    r   k   r6   zToken.end_charc                 C   r3   )z3 Access the NER tag of this token. Example: 'B-ORG')r   r5   r   r   r    r   p   r6   z	Token.nerc                 C   r<   )z* Set the token's NER tag. Example: 'B-ORG'N)r>   r   r8   r   r   r    r   u   r?   c                 C      t j|  dddS N   F)indentensure_asciijsondumpsto_dictr5   r   r   r    __repr__      zToken.__repr__c                 C   s"   g }| j D ]	}||  q|S )z Dumps the token into a list of dictionary for this token with its extended words
        if the token is a multi-word token.
        )r   appendrM   )r   retrB   r   r   r    rM      s   
zToken.to_dictc                 C      |d u p|dkS Nr%   r   r8   r   r   r    r>         zToken._is_nullreturnc                 C   s
   | j tv S )zd
        Check the token is stop_word

        Returns:
            bool: Return true|False
        )r   r   r5   r   r   r    is_stop   s   
zToken.is_stopc                 C   s&   | j D ]}t|ds dS qdS )z`
        Check the token is punct

        Returns:
            bool: Return true|False
        PFT)r   unicodedatacategory
startswith)r   charr   r   r    is_punct   s
   
zToken.is_punct)N)__name__
__module____qualname____doc__r   r!   r   propertyr   setterr   r   r   r   r   r   rN   rM   r>   boolrV   r\   r   r   r   r    r
      sF    











			r
   c                   @   s  e Zd ZdZdefddZdd Zedd Zej	d	d Zed
d Z
e
j	dd Z
edd Zej	dd Zedd Zej	dd Zedd Zej	dd Zedd Zej	dd Zedd Zej	dd Zedd Zej	dd Zedd  Zej	d!d  Zed"d# Zej	d$d# Zed%d& Zej	d'd& Zed(d) Zej	d*d) Zd+d, Zd-d. Zd/d0 Zd1efd2d3Zd4S )5Wordz4 A word class that stores attributes of a word.
    
word_entryc                 C   s  | tjr| tjsJ d|dgd \| _| _| _| _| _	| _
| _| _| _| _| _| tj| _| tj| _| tjd| _| tjd| _| tjd| _| tjd| _| tjd| _| tjd| _| tjd| _| tj d| _!| j!dur| "  dS dS )z@ Construct a word given a dictionary format word entry.
        z/id and text should be included for the word. {}N   )#r   r   r   r   formatr   r   _lemma_upos_xpos_feats_head_deprel_depsr   _parentr   r   LEMMAlemmaUPOSuposXPOSxposFEATSfeatsHEADheadDEPRELdeprelDEPSdepsr   r   r   )r   re   r   r   r    r!      s&   &
zWord.__init__c                 C   sX   | j dD ]#}|dd}t|dkrq|\}}d| }t| |r)t| || qdS )z= Create attributes by parsing from the `misc` field.
        r"   r#   r$   r%   N)r   r&   r'   r+   r,   r-   r   r   r    r      s   

zWord.init_from_miscc                 C   r3   )z  Access the index of this word. r4   r5   r   r   r    r      r6   zWord.idc                 C   r7   )z Set the word's index value. Nr4   r8   r   r   r    r      r9   c                 C   r3   )z- Access the text of this word. Example: 'The'r:   r5   r   r   r    r      r6   z	Word.textc                 C   r7   )z* Set the word's text value. Example: 'The'Nr:   r8   r   r   r    r      r9   c                 C   r3   )z  Access the lemma of this word. )rh   r5   r   r   r    rq      r6   z
Word.lemmac                 C   s"   |  |s
| jdkrd}|| _dS )z Set the word's lemma value. r%   N)r>   r   rh   r8   r   r   r    rq      s   
c                 C   r3   zB Access the universal part-of-speech of this word. Example: 'NOUN'ri   r5   r   r   r    rs      r6   z	Word.uposc                 C   r<   z? Set the word's universal part-of-speech value. Example: 'NOUN'Nr>   ri   r8   r   r   r    rs      r?   c                 C   r3   )zI Access the treebank-specific part-of-speech of this word. Example: 'NNP')rj   r5   r   r   r    ru      r6   z	Word.xposc                 C   r<   )zF Set the word's treebank-specific part-of-speech value. Example: 'NNP'N)r>   rj   r8   r   r   r    ru     r?   c                 C   r3   )zF Access the morphological features of this word. Example: 'Gender=Fem')rk   r5   r   r   r    rw     r6   z
Word.featsc                 C   r<   )z> Set this word's morphological features. Example: 'Gender=Fem'N)r>   rk   r8   r   r   r    rw     r?   c                 C   r3   )z- Access the id of the governer of this word. )rl   r5   r   r   r    ry     r6   z	Word.headc                 C   s"   |  |r
d| _dS t|| _dS )z# Set the word's governor id value. N)r>   rl   r*   r8   r   r   r    ry     s   

c                 C   r3   )z= Access the dependency relation of this word. Example: 'nmod')rm   r5   r   r   r    r{   '  r6   zWord.deprelc                 C   r<   )z: Set the word's dependency relation value. Example: 'nmod'N)r>   rm   r8   r   r   r    r{   ,  r?   c                 C   r3   )z' Access the dependencies of this word. )rn   r5   r   r   r    r}   5  r6   z	Word.depsc                 C   r<   )z$ Set the word's dependencies value. N)r>   rn   r8   r   r   r    r}   :  r?   c                 C   r3   )z, Access the miscellaneousness of this word. r;   r5   r   r   r    r   C  r6   z	Word.miscc                 C   r<   )z) Set the word's miscellaneousness value. Nr=   r8   r   r   r    r   H  r?   c                 C   r3   )z Access the parent token of this word. In the case of a multi-word token, a token can be the parent of
        multiple words. Note that this should return a reference to the parent token object.
        ro   r5   r   r   r    rA   Q  s   zWord.parentc                 C   r7   )z Set this word's parent token. In the case of a multi-word token, a token can be the parent of
        multiple words. Note that value here should be a reference to the parent token object.
        Nr   r8   r   r   r    rA   X  s   
c                 C   r3   r~   r   r5   r   r   r    pos_  r6   zWord.posc                 C   r<   r   r   r8   r   r   r    r   d  r?   c                 C   rE   rF   rJ   r5   r   r   r    rN   n  rO   zWord.__repr__c                 C   s2   i }t  D ]}t| |durt| |||< q|S )z+ Dumps the word into a dictionary.
        N)r   
get_fieldsgetattr)r   	word_dictfieldr   r   r    rM   q  s   zWord.to_dictc                 C   rR   rS   r   r8   r   r   r    r>   z  rT   zWord._is_nullrU   c                 C   sL   g }t  D ]}t| |}|du r|t j q|t| qt j|S )a  
        Convert this Word to its CoNLL-U representation.

        A Token's CoNLL-U representation is a line. Note that this method does
        not include a newline at the end.

        Returns:
            str:  A string representing the Word in CoNLL-U format.
        N)r   r   r   rP   EMPTYstrFIELD_DELIMITERjoin)r   itemsr   r1   r   r   r    conll}  s   

z
Word.conllN)r]   r^   r_   r`   r   r!   r   ra   r   rb   r   rq   rs   ru   rw   ry   r{   r}   r   rA   r   rN   rM   r>   r   r   r   r   r   r    rd      sp    























		rd   c                   @   s   e Zd ZdZd"ddZdd Zdd Zed	d
 Zej	dd
 Zedd Z
e
j	dd Z
edd Zej	dd Zedd Zej	dd Zedd Zej	dd Zedd Zej	dd Zedd Zej	dd Zdd Zd d! ZdS )#Spanz A span class that stores attributes of a textual span. A span can be typed.
    A range of objects (e.g., entity mentions) can be represented as spans.
    Nc                 C   s   |dus|dur|dusJ d|dusJ ddgd \| _ | _| _| _g | _g | _|| _|| _|dur:| | |durF| 	|| dS dS )z Construct a span given a span entry or a list of tokens. A valid reference to a doc
        must be provided to construct a span (otherwise the text of the span cannot be initialized).
        NzMEither a span_entry or a token list needs to be provided to construct a span.z2A parent doc must be provided to construct a span.   )
r   
_span_typer   r   _tokensr   _doc_sentinit_from_entryinit_from_tokens)r   
span_entrytokensr	   docsentr   r   r    r!     s   
zSpan.__init__c                 C   s>   | tjd| _| td| _| td| _| td| _	dS )z init from entryN)
r   r   r   r   TYPEr	   r(   r   r)   r   )r   r   r   r   r    r     s   zSpan.init_from_entryc                 C   sx   t |ts	J dt|dksJ d|| _|| _| jd j| _| jd j| _| jj| j| j | _dd |D | _	dS )z init from tokensz6Tokens must be provided as a list to construct a span.r   z)Tokens of a span cannot be an empty list.c                 S   s   g | ]
}|j D ]}|qqS r   )r   ).0twr   r   r    
<listcomp>  s    z)Span.init_from_tokens.<locals>.<listcomp>N)

isinstancelistr'   r   r	   r   r   r   r   r   )r   r   r	   r   r   r    r     s   zSpan.init_from_tokensc                 C   r3   )z% Access the parent doc of this span. r   r5   r   r   r    r     r6   zSpan.docc                 C   r7   )z" Set the parent doc of this span. Nr   r8   r   r   r    r     r9   c                 C   r3   )z= Access the text of this span. Example: 'Stanford University'r:   r5   r   r   r    r     r6   z	Span.textc                 C   r7   )z: Set the span's text value. Example: 'Stanford University'Nr:   r8   r   r   r    r     r9   c                 C   r3   )zD Access reference to a list of tokens that correspond to this span. r   r5   r   r   r    r     r6   zSpan.tokensc                 C   r7   )z  Set the span's list of tokens. Nr   r8   r   r   r    r     r9   c                 C   r3   )zC Access reference to a list of words that correspond to this span. r@   r5   r   r   r    r     r6   z
Span.wordsc                 C   r7   )z Set the span's list of words. Nr@   r8   r   r   r    r     r9   c                 C   r3   )z0 Access the type of this span. Example: 'PERSON'r   r5   r   r   r    r	     r6   zSpan.span_typec                 C   r7   )z Set the type of this span. Nr   r8   r   r   r    r	     r9   c                 C   r3   )z1 Access the start character offset of this span. rC   r5   r   r   r    r     r6   zSpan.start_charc                 C   r7   )z. Set the start character offset of this span. NrC   r8   r   r   r    r     r9   c                 C   r3   )z/ Access the end character offset of this span. rD   r5   r   r   r    r     r6   zSpan.end_charc                 C   r7   )z, Set the end character offset of this span. NrD   r8   r   r   r    r      r9   c                 C   s(   g d}i }|D ]	}t | |||< q|S )z# Dumps the span into a dictionary. )r   r	   r   r   )r   )r   attrs	span_dict	attr_namer   r   r    rM     s
   zSpan.to_dictc                 C   rE   rF   rJ   r5   r   r   r    rN     rO   zSpan.__repr__)NNNNN)r]   r^   r_   r`   r!   r   r   ra   r   rb   r   r   r   r	   r   r   rM   rN   r   r   r   r    r     sF    














	r   )r`   rK   rX   typingr   urduhack.conllr   urduhack.conll.conllabler   urduhack.stop_wordsr   r   r(   r)   r   r
   rd   r   r   r   r   r    <module>   s      m