o
    qi                     @   s   d Z ddlmZmZmZmZmZmZ ddlZ	dZ
e
d Ze
d ZdZded	efd
dZded	eeeee f ee f fddZded	ee fddZded	ee fddZdS )zConll format parser    )DictTupleAnyOptionalListIteratorN#z\s*([^=]+?)\s*=\s*(.+)z\s*(\S.*?)\s*$	linereturnc                 C   s~   | d dkr| dd } |  t}t|dkrtd|  |d |d |d |d	 |d
 |d |d |d |d |d d
S )z
    Parse single conll line

    Args:
        line (str): A single conll-u token line
    Returns:
         dict: A dictionary containing conll-u token attributes
    Raises:
        ValueError: If the number of columns in line are not 10
    
N
   z@The number of columns per token line must be 10. Invalid token: r                           	   )
idtextlemmauposxposfeatsheaddepreldepsmisc)splitFIELD_DELIMITERlen
ValueError)r
   fields r'   I/home/ubuntu/.local/lib/python3.10/site-packages/urduhack/conll/parser.pyparse_conll_token   s    
r)   sentencec                 C   s   |  d}i }g }|D ]<}|rG|d tkr>tt|}tt|}|r2|d}|d}|||< q|r=|d}	d||	< qt|}
||
 q||fS )z
    Parse single conll sentence

    Args:
        sentence (str):  A complete conll-u sentence
    Returns:
        tuple: Two dicts containing sentence metadata and token data
    r   r   r   r   N)	r"   COMMENT_MARKERrematchKEY_VALUE_COMMENT_PATTERNSINGLETON_COMMENT_PATTERNgroupr)   append)r*   linessentence_metatokensr
   kv_matchsingleton_matchkeyvalktokenr'   r'   r(   parse_conll_sentence.   s(   
	




r;   r2   c                 c   sp    g }| D ] }|  }|r|| q|r%d|}t|}|  |V  q|r6d|}t|}|V  dS dS )a  
    Iterate over the constructed sentences in the given lines.

    This method correctly takes into account newpar and newdoc comments as well.

    Args:
        lines: An iterator over the lines to parse.
    Yields:
        An iterator over the constructed Sentence objects found in the source.
    Raises:
        ValueError: If there is an error constructing the Sentence.
    r   N)stripr1   joinr;   clear)r2   
sent_linesr
   single_conll_sentence	_sentencer'   r'   r(   _iter_linesM   s"   


rB   	file_namec                 C   sN   g }t | dd}t|D ]}|| qW d   |S 1 s w   Y  |S )a_  
    Load a CoNLL-U file given its location.

    Args:
        file_name (str): The location of the file.
    Returns:
        List[Tuple]: A Conll object equivalent to the provided file.
    Raises:
        IOError: If there is an error opening the given filename.
        ParseError: If there is an error parsing the input into a Conll object.
    utf8)encodingN)openrB   r1   )rC   
conll_datafiler*   r'   r'   r(   
_load_filem   s   
rI   )__doc__typingr   r   r   r   r   r   regexr,   r+   r.   r/   r#   strdictr)   r;   iterrB   rI   r'   r'   r'   r(   <module>   s    *  