o
    is#                     @   s  d dl Z d dlmZmZmZmZmZmZmZ ddl	m
Z
mZ ddlmZmZ dee dee fdd	Zdee dee fd
dZdee dee fddZdee dee fddZd%dedefddZdedee fddZ	d%dedeeeeeeef f  dedee fddZdedee dee fddZdedee deeeeeeef f  fddZdee deeeeef  fddZd edeeef fd!d"Zd edefd#d$ZeZeZeZdS )&    N)DictIterableIteratorListTupleUnioncast   )ErrorsWarnings)DocSpantagsreturnc                 C   s4   g }t | } | r|t|  |t|  | s|S )N)listextend_consume_os_consume_ent)r   out r   L/home/ubuntu/.local/lib/python3.10/site-packages/spacy/training/iob_utils.pyiob_to_biluo   s   r   c                 C   sH   g }| D ]}|d u r| | q|dddddd}| | q|S )NU-B-   L-I-)appendreplace)r   r   tagr   r   r   biluo_to_iob   s   r    c                 c   s>    | r| d dkr|  dV  | r| d dks	d S d S d S d S )Nr   O)pop)r   r   r   r   r      s    r   c                    s   | sg S |  d}d|dd   }d|dd   }d}| r8| d ||hv r8|d7 }|  d | r8| d ||hv s%|dd   |dkrVt dkrQttjj|dd  gS d  }d	  } fd
dtd|d D }|g| |g S )Nr   Ir   Lr	   r   r   r   r   c                    s   g | ]}d   qS )r   r   .0_labelr   r   
<listcomp>3       z _consume_ent.<locals>.<listcomp>)r"   len
ValueErrorr
   E177formatrange)r   r   	target_intarget_lastlengthstartendmiddler   r)   r   r   !   s&   


r   r!   docmissingc                 C   s   t | dd | jD |dS )Nc                 S      g | ]
}|j |j|jfqS r   
start_charend_charlabel_)r'   entr   r   r   r+   :       z%doc_to_biluo_tags.<locals>.<listcomp>r9   )offsets_to_biluo_tagsents)r8   r9   r   r   r   doc_to_biluo_tags7   s
   rD   c                 C   s4   t | dd}t| D ]\}}|jdkrd||< q
|S )N-rA   r	   r!   )rD   	enumerateent_iob)r8   rC   itokenr   r   r   _doc_to_biluo_tags_with_partial?   s   
rJ   entitiesc                 C   s(  i }dd | D }dd | D }dd | D }|D ]\}}}	|	s4|D ]}
|
|kr2|
|k r2d|||
 < q"qt ||D ]+}|| v r]ttjj|| d || d || d	 f|||	fd
|||	f||< q9||}||}|dur|dur||krd|	 ||< qd|	 ||< t |d |D ]	}d|	 ||< qd|	 ||< qt }|D ]\}}}	t ||D ]}|| qq| D ]}t |j	|j	t
| D ]}||v r nq|||j< qd|v r|dkrt|}ttjjt
| jdkr| jdd d n| jt
|dkr|dd d n|d |S )u  Encode labelled spans into per-token tags, using the
    Begin/In/Last/Unit/Out scheme (BILUO).

    doc (Doc): The document that the entity offsets refer to. The output tags
        will refer to the token boundaries within the document.
    entities (iterable): A sequence of `(start, end, label)` triples. `start`
        and `end` should be character-offset integers denoting the slice into
        the original string.
    missing (str): The label used for missing values, e.g. if tokenization
        doesn’t align with the entity offsets. Defaults to "O".
    RETURNS (list): A list of unicode strings, describing the tags. Each tag
        string will be of the form either "", "O" or "{action}-{label}", where
        action is one of "B", "I", "L", "U". The missing label is used where the
        entity offsets don't align with the tokenization in the `Doc` object.
        The training algorithm will view these as missing values. "O" denotes a
        non-entity token. "B" denotes the beginning of a multi-token entity,
        "I" the inside of an entity of three or more tokens, and "L" the end
        of an entity of two or more tokens. "U" denotes a single-token entity.

    EXAMPLE:
        >>> text = 'I like London.'
        >>> entities = [(len('I like '), len('I like London'), 'LOC')]
        >>> doc = nlp.tokenizer(text)
        >>> tags = offsets_to_biluo_tags(doc, entities)
        >>> assert tags == ["O", "O", 'U-LOC', "O"]
    c                 S   s   i | ]}|j |jqS r   )idxrH   r'   rI   r   r   r   
<dictcomp>f   r,   z)offsets_to_biluo_tags.<locals>.<dictcomp>c                 S   s   i | ]}|j t| |jqS r   )rL   r-   rH   rM   r   r   r   rN   g   s    c                 S   s   g | ]}d qS )rE   r   r&   r   r   r   r+   h   s    z)offsets_to_biluo_tags.<locals>.<listcomp>r!   r   r   r	   )span1span2Nr   r   r   r   rE   2   z...)textrK   )r1   keysr.   r
   E103r0   getsetaddrL   r-   rH   strwarningswarnr   W030rR   )r8   rK   r9   tokens_in_entsstartsendsbiluor<   r=   r*   stoken_indexstart_token	end_tokenrH   entity_charsrI   ent_strr   r   r   rB   G   sl   






$ rB   c                 C   s>   t |}g }|D ]\}}}t| ||d |d}|| q|S )a  Encode per-token tags following the BILUO scheme into Span object, e.g.
    to overwrite the doc.ents.

    doc (Doc): The document that the BILUO tags refer to.
    tags (iterable): A sequence of BILUO tags with each tag describing one
        token. Each tag string will be of the form of either "", "O" or
        "{action}-{label}", where action is one of "B", "I", "L", "U".
    RETURNS (list): A sequence of Span objects. Each token with a missing IOB
        tag is returned as a Span with an empty label.
    r   r)   )tags_to_entitiesr   r   )r8   r   token_offsetsspansr*   	start_idxend_idxspanr   r   r   biluo_tags_to_spans   s   rl   c                 C   s   t | |}dd |D S )a  Encode per-token tags following the BILUO scheme into entity offsets.

    doc (Doc): The document that the BILUO tags refer to.
    tags (iterable): A sequence of BILUO tags with each tag describing one
        token. Each tags string will be of the form of either "", "O" or
        "{action}-{label}", where action is one of "B", "I", "L", "U".
    RETURNS (list): A sequence of `(start, end, label)` triples. `start` and
        `end` will be character-offset integers denoting the slice into the
        original string.
    c                 S   r:   r   r;   )r'   rk   r   r   r   r+      r@   z)biluo_tags_to_offsets.<locals>.<listcomp>)rl   )r8   r   rh   r   r   r   biluo_tags_to_offsets   s   
rm   c                 C   s,  g }d}t | D ]\}}|du s|dr%|durd}q|d||f q|dr+q|drG|du rFttjjdt| d|d  dq|drY||d	d ||f q|d
ra|}q|dr|du r|ttjjdt| d|d  d||d	d ||f d}qttjj|d|S )zxNote that the end index returned by this function is inclusive.
    To use it for Span creation, increment the end by 1.NrE    r!   r#   r   )r5   r   Ur	   Br$   r%   )	rF   
startswithr   r.   r
   E067r0   r   E068)r   rK   r5   rH   r   r   r   r   rf      s8   




rf   r*   c                 C   s   t tttf | ddS NrE   r   )r   r   rX   splitr)   r   r   r   split_bilu_label   s   rv   c                 C   s   |  ddd S rt   )ru   r)   r   r   r   remove_bilu_prefix   s   rw   )r!   ) rY   typingr   r   r   r   r   r   r   errorsr
   r   tokensr   r   rX   r   r    r   r   rD   rJ   intrB   rl   rm   rf   rv   rw   offsets_from_biluo_tagsspans_from_biluo_tagsbiluo_tags_from_offsetsr   r   r   r   <module>   sD    $		
W
$#