o
    ߥi                     @   s:   d dl Z d dlZd dlZd dlZG dd dZdd ZdS )    Nc                   @   s    e Zd Zdd Zedd ZdS )NLTKSegmenterc                 C   s
   t   d S N)download_nltk)self r   d/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/models/nlp/mglm/data_utils/extraction.py__init   s   
zNLTKSegmenter.__initc                 C   s   t j| S r   )nltktokenizesent_tokenize)articler   r   r   segment_string   s   zNLTKSegmenter.segment_stringN)__name__
__module____qualname___NLTKSegmenter__initstaticmethodr   r   r   r   r   r   
   s    r   c                  C   sT  t d d} d}t }t|d}tjtj| dddD ]}tjtj|dd	dD ]}t| g }d}t|d
ddd}|D ]}	|		 }	d|	v rOd	}qBd|	v rg g }
}d g }}|dd  D ]R}t
|dkr|rt
|dksyt
|dkr|
| || n|d | d g }}|
|d  ||dd   qct
|dkr|r||d  qc|d }qc|rt
|dkst
|dkr|
| || n|d | dd |D }|
|d}|t| |d d}g }qB|r|	r||	}|| qBW d    n	1 sw   Y  q-qW d    d S 1 s#w   Y  d S )Npunktzdata/extractedzformatted/wiki-key.txtw*F)	recursivezwiki_*Tr
zutf-8)modenewlineencodingz<doc id=z</doc>   r   c                 S   s   g | ]}d  |qS ) )join).0contentr   r   r   
<listcomp>B   s    
z!download_nltk.<locals>.<listcomp>)keyr!   )r	   downloadr   openglobospathr   printrstriplenappendwritejsondumpsr   )	wiki_pathoutput_path	segmenteroutputdirnamefilenamearticle_linesarticle_openfilelinekey_sentencescontentsr#   r!   	sentencesr   r   r   r   r      s   










$r   )r&   r'   r.   r	   r   r   r   r   r   r   <module>   s   
