o
    qi                     @   sH   d Z ddlZddlmZmZ ddlmZ ddlmZ G dd deZ	dS )	z
Document data structure
    N)ListTuple)	Conllable   )Sentencec                   @   s   e Zd ZdZddeeeef  defddZ	e
defdd	Ze
defd
dZe
defddZejdd Ze
defddZejdd Zdd Zdd Zdd ZdefddZdefddZdd ZdS )DocumentzW A document class that stores attributes of a document and carries a list of sentences.N	sentencestextc                 C   s&   g | _ || _d| _d| _| | dS )a   Construct a document given a list of sentences in the form of lists of CoNLL-U dicts.

        Args:
            sentences (list): List of sentences, which being a tuple of dict, list of token entry (CoNLL-U dict).
            text (str): Urdu text of the document.
        r   N)
_sentences_text_num_tokens
_num_words_process)selfr   r	    r   O/home/ubuntu/.local/lib/python3.10/site-packages/urduhack/core/unit/document.py__init__   s
   zDocument.__init__returnc                 C      | j S )zj
        Access the Urdu text for this document.

        Returns:
            str: Document text
        )r   r   r   r   r   r	      s   zDocument.textc                 C   r   )z1 Access the list of sentences for this document. )r
   r   r   r   r   r   (      zDocument.sentencesc                 C   r   )z0 Access the number of tokens for this document. r   r   r   r   r   
num_tokens-   r   zDocument.num_tokensc                 C   
   || _ dS )z- Set the number of tokens for this document. Nr   r   valuer   r   r   r   2      
c                 C   r   )z/ Access the number of words for this document. r   r   r   r   r   	num_words7   r   zDocument.num_wordsc                 C   r   )z, Set the number of words for this document. Nr   r   r   r   r   r   <   r   c                 C   s   |D ]8}| j t|| d | j d jd j| j d jd j}}t| jdu|du|dugr:| j|| | j d _qtdd | j D | _	tdd | j D | _
dS )z(process sentences in to words and tokens)docr   Nc                 S      g | ]}t |jqS r   )lentokens.0sentencer   r   r   
<listcomp>I       z%Document._process.<locals>.<listcomp>c                 S   r!   r   )r"   wordsr$   r   r   r   r'   J   r(   )r   appendr   r#   
start_charend_charallr	   sumr   r   )r   r   r&   	begin_idxend_idxr   r   r   r   A   s   &zDocument._processc                 c       | j D ]}|jE dH  qdS )z= An iterator that returns all of the words in this Document. N)r   r)   r   r&   r   r   r   
iter_wordsL      
zDocument.iter_wordsc                 c   r1   )z> An iterator that returns all of the tokens in this Document. N)r   r#   r2   r   r   r   iter_tokensQ   r4   zDocument.iter_tokensc                 C   s   dd | j D S )zp Dumps the whole document into a list of list of dictionary for each token in each sentence in the doc.
        c                 S   s   g | ]}|  qS r   )to_dictr$   r   r   r   r'   Y   s    z$Document.to_dict.<locals>.<listcomp>)r   r   r   r   r   r6   V   s   zDocument.to_dictc                 C   s(   t tdd | j}|d d|S )z
        Output the Conll object to a CoNLL-U formatted string.

        Returns:
            str: The CoNLL-U object as a string. This string will end in a newline.
        c                 S   s   |   S N)conll)sentr   r   r   <lambda>d   s    z Document.conll.<locals>.<lambda> z

)listmapr   r*   join)r   
componentsr   r   r   r8   [   s   	

zDocument.conllc                 C   s   t j|  dddS )N   F)indentensure_ascii)jsondumpsr6   r   r   r   r   __repr__i   s   zDocument.__repr__r7   )__name__
__module____qualname____doc__r   r   dictr<   strr   propertyr	   r   intr   setterr   r   r3   r5   r6   r8   rE   r   r   r   r   r      s*     	

r   )
rI   rC   typingr   r   urduhack.conll.conllabler   r&   r   r   r   r   r   r   <module>   s   