o
    Xεi\3                     @   s  d Z ddlZddlZddlZddlZddlZddlZddlZddlZddl	m
Z
 ddlZddlZedZejeejeeeeeje f f ZG dd dZ					
ddejee
f dejee
f dedededefddZdd Zdd Zdd Zdd Zdd Zedkre  dS dS )aO  
Part of speech tagging using python CRF suite.

Credit to: https://towardsdatascience.com/pos-tagging-using-crfs-ea430c5fb78b

Training requires conllu package:

.. code-block:: sh

    pip install conllu

Training data comes from Univeral Dependencies (https://universaldependencies.org/)

Example:

.. code-block:: sh

    python3 -m gruut.pos train --conllu train.conllu --output model.crf --label xpos

Pre-trained models have the following settings:

* c1 = 0.25
* c2 = 0.3
* max-iterations = 100

English model is trained with "xpos" label.
French model is trained with "upos" label.
    N)Pathz	gruut.posc                   @   s  e Zd ZdZdejeeej	f fddZ
deje deje fddZe			
						d)dedededededededededefddZe				d*deje dedededededefdd Zedeje deje fd!d"Zed#edefd$d%Zed#edefd&d'Zd(S )+PartOfSpeechTaggerz3Part of speech tagger using a pre-trained CRF model
crf_taggerc                 K   s4   t |tjr|| _d S t | _| jt| d S )N)
isinstance
pycrfsuiteTaggerr   openstr)selfr   kwargs r   =/home/ubuntu/.local/lib/python3.10/site-packages/gruut/pos.py__init__7   s   

zPartOfSpeechTagger.__init__wordsreturnc                 C   s   t |}| j|S )zReturns POS tag for each word)r   sent2featuresr   tag)r
   r   featuresr   r   r   __call__A   s   
zPartOfSpeechTagger.__call__       ?T   wordprefixbiasadd_punctuation	add_digit
add_lengthchars_front
chars_backencodec	                 C   s   | d|| d|rt | n| i}	|rt| |	| d< |r*| tjv |	| d< |r5|  |	| d< td|d D ]}
| d|
 |	| d	|
 d
< q<td|d D ]}
| |
 d |	| d|
 d< qT|	S )zGet features for a single wordr   r   z	len(word)zword.ispunctuationzword.isdigit()      Nzword[:]zword[-z:])r   encode_stringlenstringpunctuationisdigitrange)r   r   r   r   r   r   r   r   r    r   ir   r   r   local_featuresF   s   
 z!PartOfSpeechTagger.local_featuresr!   sentencer*   add_bosadd_eoswords_backwardwords_forwardc              	   K   s   | | }t | }tj|fi |}	|dkr|rd|	d< ||d kr'|r'd|	d< td|d D ]}
||
krL| ||
  }|	tj|fdd|
 di| q.td|d D ] }
|||
 k rt| ||
  }|	tj|fdd	|
 di| qT|	S )
z/Get features for a word and surrounding contextr   TBOSr"   EOSr   -:+)r%   r   r+   r)   update)r,   r*   r-   r.   r/   r0   r   r   	num_wordsr   j	word_prev	word_nextr   r   r   word2featuresk   sD   

z PartOfSpeechTagger.word2featuresc                    s    fddt tD S )z(Get features for all words in a sentencec                    s    g | ]}t j|fi  qS r   )r   r;   ).0r*   r   r,   r   r   
<listcomp>   s    z4PartOfSpeechTagger.sent2features.<locals>.<listcomp>)r)   r%   )r,   r   r   r=   r   r      s   
z PartOfSpeechTagger.sent2featuressc                 C   s   t |  dS )zMEncodes string in a form that crfsuite will accept (ASCII) and can be decodedascii)base64	b64encoder    decoder?   r   r   r   r$         z PartOfSpeechTagger.encode_stringc                 C   s   t | d S )z)Decodes a string encoded by encode_stringr@   )rA   	b64decoder    rC   rD   r   r   r   decode_string   rE   z PartOfSpeechTagger.decode_stringN)r   r   TTTr   r   T)TTr!   r!   )__name__
__module____qualname____doc__typingUnionr	   r   r   r   r   Sequencer   staticmethodfloatboolintFEATURES_TYPEr+   r;   Listr   r$   rG   r   r   r   r   r   4   s    

	
$(	r   xpos      ?333333?d   conllu_pathoutput_pathlabelc1c2max_iterationsc              
   C   s  zddl }W n ty } ztd td |d}~ww t| } t|}|jjddd td|  t| dd	d
}|	|
 }	W d   n1 sNw   Y  td| tjdd}
tdt|	 |	D ]9}dd |D }t|}g }d}|D ]}||}|du rtd|| d} n|| q}|rqi|
|| qi|
|||dd t|
  td t }|
t| t }td||  t|
jj dS )z"Train a new model from CONLLU datar   N'conllu package is required for trainingpip install 'conllu>=4.4'T)parentsexist_okzLoading train file (%s)rutf-8encodingz&Training model for %s max iteration(s)F)verbosez,Getting features for %s training sentence(s)c                 S      g | ]}|d  qS formr   r<   tokenr   r   r   r>          ztrain_model.<locals>.<listcomp>z"Example has empty label for %s: %s)r\   r]   r^   zfeature.possible_transitionsTrainingz"Training completed in %s second(s))conlluImportError_LOGGERcriticalr   parentmkdirdebugr   parsereadr   Trainerr%   r   r   getwarningappend
set_params
get_paramsinfotimeperf_countertrainr	   	logparserlast_iteration)rY   rZ   r[   r\   r]   r^   ro   econllu_filetrain_sentstrainersentr   r   labels	skip_sentrl   token_label
start_timeend_timer   r   r   train_model   s^   	



	
r   c                 C   s$   t | j| j| j| j| j| jd dS )zCLI method for train_model)rY   rZ   r[   r\   r]   r^   N)r   ro   outputr[   r\   r]   r^   argsr   r   r   do_train   s   
r   c              
   C   s   zddl }W n ty } ztd td |d}~ww t }t| j ddd&}|| D ]}|D ]}|| j	}|durG|
| q6q2W d   n1 sSw   Y  tt| dS )z"Print label set from a CONLLU filer   Nr_   r`   rc   rd   re   )ro   rp   rq   rr   setr   rv   rw   ry   r[   addprintsorted)r   ro   r   r   r   r   rl   r   r   r   r   do_print_labels  s(   


r   c                 C   s   t | j}| jr| j}ntj}ttj rtdtj	d t
jtjdd}|D ]}| }|s1q(| }tt|||}|| q(dS )zCLI method for predictzReading sentences from stdin...)fileT)flushN)r   modeltextssysstdinosisattyfilenor   stderr	jsonlinesWriterstdoutstripsplitlistzipwrite)r   taggerlineswriterliner   words_and_tagsr   r   r   
do_predict  s   
r   c              
      s^  zddl }W n ty } ztd td |d}~ww t j}td j  d}d}d}d}t j dddJ}||	 D ]:}	d	d
 |	D }
 fdd
|	D }||
}d}t
||D ]\}}||krn|d7 }d}|d7 }q`|ry|d7 }|d7 }qCW d   n1 sw   Y  |dk s|dk rdS td||||  td||||  dS )zCLI method for testingr   Nr_   r`   zTesting file (%s)rc   rd   re   c                 S   rh   ri   r   rk   r   r   r   r>   C  rm   zdo_test.<locals>.<listcomp>c                    s   g | ]}|  jqS r   )ry   r[   rk   r   r   r   r>   D  s    Fr"   Tz7{0} out of {1} word(s) had an incorrect tag ({2:0.2f}%)z={0} out of {1} sentence(s) had at least one error ({2:0.2f}%))ro   rp   rq   rr   r   r   ru   r   rv   rw   r   r   format)r   ro   r   r   num_sentencesr7   sents_with_errorstotal_errorsr   r   r   actual_labelsexpected_labels	had_erroractualexpectedr   r   r   do_test0  sT   






r   c                  C   s  t jdd} |  }d|_d|_|jddd}|jddd	d
 |jdddd
 |jdddd |jdtddd |jdtddd |jdtddd |j	t
d |jddd}|jdddd
 |jddd d
 |jdddd |j	td |jd!d"d}|jddd	d
 |jdddd |j	td |jd#d$d}|jdddd
 |jd%d&d'd( |j	td ||||fD ]
}|jd)d*d+d, q|  }|jrtjtjd- ntjtjd- t| || d.S )/zMain entry pointzpos.py)progTcommandr   z&Train a new POS model from CONLLU file)helpz--conlluzCONLLU file with training data)requiredr   z--outputzPath to write output modelz--labelrU   z!Field to predict in training data)defaultr   z--c1rV   z
L1 penalty)typer   r   z--c2rW   z
L2 penaltyz--max-iterationsrX   z)Maximum number of iterations to train for)functestz!Test a POS model on a CONLLU filez--modelzPath to POS tagger modelzCONLLU file with testing datazprint-labelsz-Print set of unique labels from a CONLLU filepredictz Predict POS tags for sentence(s)r   *	Sentences)nargsr   z--debug
store_truezPrint DEBUG messages to console)actionr   )levelN)argparseArgumentParseradd_subparsersr   dest
add_parseradd_argumentrP   rR   set_defaultsr   r   r   r   
parse_argsru   loggingbasicConfigDEBUGINFOrq   r   )parsersub_parserstrain_parsertest_parserprint_labels_parserpredict_parser
sub_parserr   r   r   r   mainf  s   
r   __main__)rU   rV   rW   rX   )rK   r   rA   r   r   r&   r   r   rL   pathlibr   r   r   	getLoggerrq   Dictr	   rM   rQ   rR   rP   rN   rS   r   r   r   r   r   r   r   rH   r   r   r   r   <module>   sT   
{
F6_
