o
    Lεiw                  	   @   sz  d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlmZ	 ej
ddddZedZeed ejddgd	Zejdd
dZe Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zedkrd dlZeeeeeeefZ e D ]<Z!e Z"e#dZ$e$% Z&W d   n1 sw   Y  ee&e!Z'e e" Z(e)  e)e!j e)d *e(d!  q~dS dS )"    N)	TokenizerenF)languageclean	char_spansentencizeren_core_web_smner)disabletokenize)lang
processorsc                 C   s   t | dS )N
)	blingfiretext_to_sentencessplittext r   V/home/ubuntu/.local/lib/python3.10/site-packages/benchmarks/bigtext_speed_benchmark.pyblingfire_tokenize   s   r   c                 C   s
   t | S N)nltksent_tokenizer   r   r   r   nltk_tokenize   s   
r   c                 C   s   t | }dd |D }|S )Nc                 S   s   g | ]}|  qS r   )strip).0sr   r   r   
<listcomp>   s    z"pysbd_tokenize.<locals>.<listcomp>)pysbd_segmentersegment)r   segmentsr   r   r   pysbd_tokenize   s   
r"   c                 C      dd t | jD S )Nc                 S      g | ]}|j d qS r   r   r   r   sentr   r   r   r           z"spacy_tokenize.<locals>.<listcomp>)nlpsentsr   r   r   r   spacy_tokenize      r,   c                 C   r#   )Nc                 S   r$   r%   r&   r'   r   r   r   r   #   r)   z&spacy_dep_tokenize.<locals>.<listcomp>)nlp_depr+   r   r   r   r   spacy_dep_tokenize"   r-   r/   c                 C   r#   )Nc                 S   s   g | ]}|j qS r   r   )r   er   r   r   r   &   s    z#stanza_tokenize.<locals>.<listcomp>)
stanza_nlp	sentencesr   r   r   r   stanza_tokenize%   r-   r3   c                 c   s*    | D ]}d dd |D  V  qd S )N c                 s   s    | ]}t |V  qd S r   )str)r   tokenr   r   r   	<genexpr>*   s    z!make_sentences.<locals>.<genexpr>)joinr   )segmented_tokenssentencer   r   r   make_sentences(   s   r;   c                 C   s.   t | }tt|}dd t|D }|S )Nc                 S   s   g | ]}|qS r   r   r'   r   r   r   r   /   s    z#syntok_tokenize.<locals>.<listcomp>)syntok_tokenizerr   syntok_segmenteriterr;   )r   tokensresultr!   r   r   r   syntok_tokenize,   s   
rA   c                 C   s   || }|S r   r   )big_texttokenize_funcr!   r   r   r   speed_benchmark2   s   rD   __main__zbenchmarks/1661-0.txtzSpeed : {:>20.2f} msi  )+r   r   pysbdspacystanzasyntok.tokenizerr   syntok.segmenter	segmenterr=   	Segmenterr   blankr*   add_pipecreate_pipeloadr.   Pipeliner1   r<   r   r   r"   r,   r/   r3   r;   rA   rD   __name__time	librariesrC   topenbigfilereadrB   r2   
time_takenprintformatr   r   r   r   <module>   sX    
	



