o
    Lεi 	                     @   s~  d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlmZ	 d dl
mZ ejddddZedZeed ejdd	gd
ZejdddZe Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd ZeeZ dd Z!e"dkrd dl#Z#eeeeeeefZ$e$D ]4Z%e## Z&e'd D ]Z(e!ee%Z)qe## e& Z*e+  e+e%j" e+d!,e) e+d",e*d# d   qdS dS )$    N)	Tokenizer)GOLDEN_EN_RULESenF)languageclean	char_spansentencizeren_core_web_smner)disabletokenize)lang
processorsc                 C   s   t | dS )N
)	blingfiretext_to_sentencessplittext r   R/home/ubuntu/.local/lib/python3.10/site-packages/benchmarks/benchmark_sbd_tools.pyblingfire_tokenize   s   r   c                 C   s
   t | S N)nltksent_tokenizer   r   r   r   nltk_tokenize   s   
r   c                 C   s   t | }dd |D S )Nc                 S   s   g | ]}|  qS r   )strip).0sr   r   r   
<listcomp>   s    z"pysbd_tokenize.<locals>.<listcomp>)pysbd_segmentersegment)r   segmentsr   r   r   pysbd_tokenize   s   
r#   c                 C      dd t | jD S )Nc                 S      g | ]}|j qS r   r   r   sentr   r   r   r   !       z"spacy_tokenize.<locals>.<listcomp>)nlpsentsr   r   r   r   spacy_tokenize       r+   c                 C   r$   )Nc                 S   r%   r   r   r&   r   r   r   r   $   r(   z&spacy_dep_tokenize.<locals>.<listcomp>)nlp_depr*   r   r   r   r   spacy_dep_tokenize#   r,   r.   c                 C   r$   )Nc                 S   r%   r   r   )r   er   r   r   r   '   r(   z#stanza_tokenize.<locals>.<listcomp>)
stanza_nlp	sentencesr   r   r   r   stanza_tokenize&   r,   r2   c                 c   s*    | D ]}d dd |D  V  qd S )N c                 s   s    | ]}t |V  qd S r   )str)r   tokenr   r   r   	<genexpr>+   s    z!make_sentences.<locals>.<genexpr>)joinr   )segmented_tokenssentencer   r   r   make_sentences)   s   r:   c                 C   s.   t | }tt|}dd t|D }|S )Nc                 S   s   g | ]}|qS r   r   r&   r   r   r   r   0   s    z#syntok_tokenize.<locals>.<listcomp>)syntok_tokenizerr   syntok_segmenteriterr:   )r   tokensresultr"   r   r   r   syntok_tokenize-   s   
r@   c                 C   s>   d}| D ]}|\}}||}||kr|d7 }q|t  d }|S )Nr      g      Y@)total_rules)golden_rulestokenize_funcscoreruler   expectedr"   percent_scorer   r   r   	benchmark6   s   rI   __main__d   zGRS score: {:0.2f}%z&Speed(Avg over 100 runs): {:>10.2f} msi  )-r   r   pysbdspacystanzasyntok.tokenizerr   syntok.segmenter	segmenterr<   english_golden_rulesr   	Segmenterr    blankr)   add_pipecreate_pipeloadr-   Pipeliner0   r;   r   r   r#   r+   r.   r2   r:   r@   lenrB   rI   __name__time	librariesrD   trangeirH   
time_takenprintformatr   r   r   r   <module>   sZ    


