o
    Qii"                     @   s   d dl mZmZmZ d dlZd dlZd dlZd dlmZ d dl	Z	d dl
Z
d dlZd dlZd dlmZ ejejdd eeZ				dddZdd Zdd ZdS )    )divisionprint_functionabsolute_importN)OrderedDict)Taggerz%(message)s)levelformat	EOS      
                d   皙?333333?  c              	   C   s  t | ti d|d|	d|
d|d|d|d|d|d	|d
|d|d|d|d|d| d|d||||d |d |d |d d}tj|d |d |d |d ||d}|durvtj|d |d d\}}||d< nd}tj|d |d |||d}tj|d |d |||d}tj|d |d |||d}t|j|d< t|j|d < t|j|d!< t|d" |d#< t|d$ |d%< t|d |d&< t|d' |d(< t	j
||d)}t|||||d* dS )+a  Train a joint word segmentation and sequence labeling (e.g, POS-tagging, NER) model.

    args:
        - train_file (str): Path to a train file.
        - dev_file (str): Path to a development file for early stopping.
        - test_file (str):  Path to a test file for evaluation.
        - model_name (str): Output model filename.
        - dict_file (str, optional): Path to a dictionary file.
        - emb_file (str, optional): Path to a pre-trained embedding file (word2vec format).
        - delimiter (str, optional): Separate word and tag in each line by 'delimiter'.
        - newline (str, optional):  Separate lines in the file by 'newline'.
        - layers (int, optional): RNN Layer size.
        - min_count (int, optional): Ignores all words with total frequency lower than this.
        - decay (int, optional): Learning rate decay.
        - epoch (int, optional): Epoch size.
        - window_size (int, optional): Window size of the context characters for word segmentation.
        - dim_uni (int, optional): Dimensionality of the char-unigram vectors.
        - dim_bi (int, optional): Dimensionality of the char-bigram vectors.
        - dim_word (int, optional): Dimensionality of the word vectors.
        - dim_ctype (int, optional): Dimensionality of the character-type vectors.
        - dim_tagemb (int, optional): Dimensionality of the tag vectors.
        - dim_hidden (int, optional): Dimensionality of the BiLSTM's hidden layer.
        - learning_rate (float, optional): Learning rate of SGD.
        - dropout_rate (float, optional): Dropout rate of the input vector for BiLSTMs.
        - seed (int, optional): Random seed.

    return:
        - Nothing. After finish training, however,
          save the three model files (*.vocabs, *.params, *.hp) in the current directory.

    LAYERS	THRESHOLDDECAYEPOCHWINDOW_SIZEDIM_UNIDIM_BIDIM_WORD	DIM_CTYPE
DIM_TAGEMB
DIM_HIDDENLEARNING_RATEDROPOUT_RATESEEDTRAINSETTESTSETDEVSETz.hpz.paramsz.vocabsz_epoch.params)
DICTIONARY	EMBEDDINGHYPERPARAMSMODELVOCABEPOCH_MODELr'   r+   )trainset	thresholdfn_dictionary	fn_vocabs	delimiternewlineNr(   r   )fn_embeddingword2id)filenamewindow_sizevocabsr1   r2   	NUM_TRAINNUM_TESTNUM_DEVr   VOCAB_SIZE_UNIr   VOCAB_SIZE_BIVOCAB_SIZE_WORDr   VOCAB_SIZE_POSTAG)hpembs)model
train_data	test_datadev_data)randomseedr   preprocreate_vocabs_from_trainsetembedding_loader	from_filelenws_datarA   Model_start)
train_filedev_file	test_file
model_name	dict_fileemb_filer1   r2   layers	min_countdecayepochr6   dim_unidim_bidim_word	dim_ctype
dim_tagemb
dim_hiddenlearning_ratedropout_raterF   r?   r7   r@   	TrainDataTestDataDevData_model re   @/home/ubuntu/.local/lib/python3.10/site-packages/nagisa/train.pyfit   s   
&	








rg   c                    s   t | d || d d dd }g }g }dd tt|jD }|D ]5}|j| }|j| d }	 fd	d|	D }
||||
  d
|}|j}|j	}|||| q!t
||}t
|\}}}}}}||fS )Nr+   r)   )r7   paramsr?   c                 S   sT   g }t | |D ] \}}|d | }tjdu r |d}|d}|||g q|S )Nr	   TzUTF-8)zipmecab_system_evalPY_3encodeappend)wordspostagssentwpre   re   rf   data_for_eval   s   


z"_evaluation.<locals>.data_for_evalc                 S      g | ]}|qS re   re   .0ire   re   rf   
<listcomp>       z_evaluation.<locals>.<listcomp>r   c                    s   g | ]} j | qS re   )id2pos)rv   pidtaggerre   rf   rx      s     )r   rangerK   rL   rn   pos_datarm   taggingjoinro   rj   
mecab_evalcalculate_fvalues)r?   fn_modeldatars   sys_dataans_dataindicerw   rn   pidsro   output	sys_wordssys_postagsr_ws_fpos_fre   r|   rf   _evaluation   s"   

r   c                 C   sj  |   D ]\}}td|| qdddddddd	d
}t| t| | d  d}d}	dd tt|jD }
td| d d D ]}t		 }d}t
|
 |
D ]X}|j| d }|j| d }|j|dd}|||}||}|| }|  |j  || 7 }|j| d }|j| d }|||}|| 7 }|  |j  qS|j| d  t| | d |d\}}||	kr|}	d}|j| d  t| | d |d\}}n|d7 }|| d kr|jjd |j_d}|t|
 }||jj|t		 | d ||||g}dd tt|D }d|d |d |d |d |d |d |d |d  }t| qDd S )!Nz[nagisa] {}: {}z'{:5}	{:5}	{:5}	{:5}	{:8}	{:8}	{:8}	{:8}EpochLRLossTime_mDevWS_f1	DevPOS_f1	TestWS_f1
TestPOS_f1r)   r   g      c                 S   rt   re   re   ru   re   re   rf   rx      ry   z_start.<locals>.<listcomp>r   r   g        T)trainr,   )r   r   r*   r   r   <   c                 S   s   g | ]}|d d qS )N   re   )rv   logre   re   rf   rx      s    r      r         )itemslogginginfor   utils	dump_datar   rK   rL   timerE   shuffle	encode_wsscore_sentenceforwardbackwardtrainerupdatevaluer   get_POStagging_lossrA   saver   r_   mapstr)r?   rA   rB   rC   rD   kvlogsdecay_counterbest_dev_scorer   etlossesrw   XYobs
gold_scoreforward_scorelossdev_ws_f	dev_pos_f	test_ws_f
test_pos_fre   re   rf   rN      sj   




rN   )NNr	   r
   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )
__future__r   r   r   r   rE   r   collectionsr   rA   rG   rj   nagisa_utilsr   r}   r   basicConfigINFO	getLogger__name__loggerrg   r   rN   re   re   re   rf   <module>   s&   

u 