o
    MiVz                     @   s&  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZddl	m
Z
 ddl	mZ ddlmZmZmZmZmZ ddlmZ ddlmZ ddlmZmZmZmZ ejd  d	kZer^eZnd
d Zee Z!dZ"dd Z#dd Z$ej%e"ddd Z&ej%e"ddd Z'dd Z(dd Z)dd Z*dS )    N   )get_version)utils)BaselineModelAnnotationCorpusWeightMorphLengthCorpusWeightNumMorphCorpusWeightFixedCorpusWeight)ArgumentException)MorfessorIO)MorfessorEvaluationEvaluationConfigWilcoxonSignedRankFORMAT_STRINGS   c                 C   s   t | t dS )Nencoding)unicodelocalegetpreferredencodingx r   A/home/ubuntu/.local/lib/python3.10/site-packages/morfessor/cmd.py<lambda>   s    r   i@B c               
   C   sN  dd l } | jddt  d| jdd}|dj}|dd	d
d ddd |dddd ddd |ddddg ddd |ddddg ddd |dj}|dddd dd!d |d"d#d$d dd%d |d&d'd(d dd)d |d*d+d dd,d |d-d.d/d dd0d |d1d2d3td4d5d6 |d7j}|d8d9d:d;d<d= |d>d?dd@dAdB |dCdDdd@dEdB |dFdGtd dHdIdJ |dKdLtdMdHdNdJ |dOdPtdQdRdSdJ |dTdUtdVdWdXdJ |dYdZtd[dRd\dJ |d]d^dd@d_dB |d`j}|dadbdcdddeg dfdgdh |didjdkdldmdldngdodh |dpdqdrtdsdtg dudvdw |dxdydztd gd{d|dJ |d}d~dt	ddddJ |dddd ddd |dddd t	ddd6 |dddd@ddB |ddtd3d4ddJ |ddtd d4ddJ |ddtd dHddJ |ddtdd4ddJ |dddt	ddd6 |dddtd4dd6 |
 j}|dddd ddd |ddd t	ddd6 |ddd t	ddd6 |dj}|dddt	ddddJ |ddddt	dd |ddd@ddd |dddd ddd |dddt	d dddJ |dġj}|dddd ddd |dɡj}|dddtd3d4ddJ |ddddd= |dddd@ddB |dԡj}|dddddٍ |dddt  ddލ |S )Nr   zmorfessor.pyuq  
Morfessor %s

Copyright (c) 2012-2019, Sami Virpioja, Peter Smit, and Stig-Arne Grönroos.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:

1.  Redistributions of source code must retain the above copyright
    notice, this list of conditions and the following disclaimer.

2.  Redistributions in binary form must reproduce the above
    copyright notice, this list of conditions and the following
    disclaimer in the documentation and/or other materials provided
    with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.

Command-line arguments:
z
Simple usage examples (training and testing):

  %(prog)s -t training_corpus.txt -s model.pickled
  %(prog)s -l model.pickled -T test_corpus.txt -o test_corpus.segmented

Interactive use (read corpus from user):

  %(prog)s -m online -v 2 -t -

F)progdescriptionepilogformatter_classadd_helpzinput data filesz-lz--loadloadfile<file>z4load existing model from file (pickled model object))destdefaultmetavarhelpz-Lz--load-segmentationloadsegfilezAload existing model from segmentation file (Morfessor 1.0 format)-tz--traindata
trainfilesappendzinput corpus file(s) for training (text or bz2/gzipped text; use '-' for standard input; add several times in order to append multiple files))r"   actionr#   r$   r%   z-Tz
--testdata	testfileszinput corpus file(s) to analyze (text or bz2/gzipped text;  use '-' for standard input; add several times in order to append multiple files)zoutput data filesz-oz--outputoutfile-zWoutput file for test data results (for standard output, use '-'; default '%(default)s')z-sz--savesavefilez/save final model to file (pickled model object)z-Sz--save-segmentationsavesegfilez7save model segmentations to file (Morfessor 1.0 format)z--save-reducedsavereducedzsave final model to file in reduced form (pickled model object). A model in reduced form can only be used for segmentation of new words.z-xz	--lexiconlexfilez"output final lexicon to given filez--nbestnbestr   <int>zoutput n-best viterbi results)r"   r#   typer$   r%   zdata format options-e
--encodingr   
<encoding>bencoding of input and output files (if none is given, both the local encoding and UTF-8 are tried)r"   r$   r%   z--lowercase	lowercase
store_truezlowercase input datar"   r#   r*   r%   z--traindata-listlistz`input file(s) for batch training are lists (one compound per line, optionally count as a prefix)z--atom-separator	separator<regexp>z+atom separator regexp (default %(default)s)r"   r4   r#   r$   r%   z--compound-separator
cseparatorz\s+z1compound separator regexp (default '%(default)s')z--analysis-separatoranalysisseparator,z<str>ziseparator for different analyses in an annotation file. Use  NONE for only allowing one analysis per linez--output-formatoutputformatz{analysis}\n<format>ai  format string for --output file (default: '%(default)s'). Valid keywords are: {analysis} = constructions of the compound, {compound} = compound string, {count} = count of the compound (currently always 1), {logprob} = log-probability of the analysis, and {clogprob} = log-probability of the compound. Valid escape sequences are '\n' (newline) and '\t' (tabular)z--output-format-separatoroutputformatseparator zMconstruction separator for analysis in --output file (default: '%(default)s')z--output-newlinesoutputnewlineszRfor each newline in input, print newline in --output file (default: '%(default)s')z!training and segmentation optionsz-mz--mode	trainmode
init+batchz<mode>)nonebatchinitrJ   onlineonline+batchzitraining mode ('none', 'init', 'batch', 'init+batch', 'online', or 'online+batch'; default '%(default)s'))r"   r#   r$   choicesr%   z-az--algorithm	algorithm	recursivez<algorithm>viterbiz>algorithm type ('recursive', 'viterbi'; default '%(default)s')z-dz--dampening	dampeningonesz<type>)rK   logrU   zWfrequency dampening for training data ('none', 'log', or 'ones'; default '%(default)s'))r"   r4   r#   r$   rP   r%   z-fz--forcesplit
forcesplitz<list>ztforce split on given atoms (default '-'). The list argument is a string of characthers, use '' for no forced splits.z-Fz--finish-thresholdfinish_thresholdg{Gzt?z<float>zStopping threshold. Training stops when the improvement of the last iteration issmaller then finish_threshold * #boundaries; (default '%(default)s')z-rz
--randseedrandseedz<seed>z seed for random number generatorz-Rz--randsplit	splitprobzainitialize new words by random splitting using the given split probability (default no splitting)z--skipsskipszCuse random skips for frequently seen compounds to speed up trainingz--batch-minfreqfreqthresholdzEcompound frequency threshold for batch training (default %(default)s)z--max-epochs	maxepochsz"hard maximum of epochs in trainingz--nosplit-renosplitzfif the expression matches the two surrounding characters, do not allow splitting (default %(default)s)z--online-epochintepochinterval'  z8epoch interval for online training (default %(default)s)z--viterbi-smoothingviterbismoothzXadditive smoothing parameter for Viterbi training and segmentation (default %(default)s)z--viterbi-maxlenviterbimaxlen   zVmaximum construction length in Viterbi training and segmentation (default %(default)s)z-Dz
--develset	develfilez:load annotated data for tuning the corpus weight parameterz--morph-lengthmorphlengthz@tune the corpusweight to obtain the desired average morph lengthz--num-morph-types
morphtypeszAtune the corpusweight to obtain the desired number of morph typesz semi-supervised training optionsz-wz--corpusweightcorpusweightg      ?zfcorpus weight parameter (default %(default)s); sets the initial value if other tuning options are usedz--weight-threshold	thresholdg{Gz?z7percentual stopping threshold for corpusweight updaters)r"   r#   r$   r4   r%   z--full-retrainfullretrainz2do a full retrain after any weights have converged)r"   r*   r#   r%   z-Az--annotationsannofilez0load annotated data for semi-supervised learningz-Wz--annotationweightannotationweightzcorpus weight parameter for annotated data (if unset, the weight is set to balance the number of tokens in annotated and unannotated data sets)zEvaluation optionsz-Gz--goldstandardgoldstandardz9If provided, evaluate the model against the gold standardlogging options-v	--verboseverbosefverbose level; controls what is written to the standard error stream or log file (default %(default)s)	--logfilelog_file?write log messages to file in addition to standard error streamz--progressbarprogresszcForce the progressbar to be displayed (possibly lowers the log level for the standard error stream)other options-h--helpr%   show this help message and exitr*   r%   	--versionversion	%(prog)s show version number and exitr*   r|   r%   )argparseArgumentParserr   RawDescriptionHelpFormatteradd_argument_groupadd_argumentint_strr=   floatadd_mutually_exclusive_group)r   parseradd_argr   r   r   get_default_argparser!   s   !2






















r   c           	      C   s   | j dkr	tj}n| j dkrtj}ntj}t }|tj d}d}d}|}| jdus4t| dr:| j	r:t
|tj}t }|| |t| || | jdurot| jd}|| |t|| || |S )	z-Initialize loggers based on command line args   r   z%%(asctime)s %(levelname)s:%(message)sz%Y-%m-%d %H:%M:%Sz%(message)sNru   w)rp   loggingDEBUGINFOWARNING	getLoggersetLevelrs   hasattrru   maxStreamHandlersetFormatter	Formatter
addHandlerFileHandler)	argsloglevel
rootloggerlogfile_formatdate_formatconsole_formatconsole_levelchfhr   r   r   initialize_logging$  s.   






r   )maxsizec                 C   s   |  |||S N)viterbi_segment)modelatomssmoothmaxlenr   r   r   _viterbi_segmentJ  s   r   c                 C   s   |  ||||S r   )viterbi_nbest)r   r   r2   r   r   r   r   r   _viterbi_nbestO  s   r   c                 C   s  t | }|tjksttjdrtj sdt_| j	rdt_| j
d u r2| jd u r2t| jdkr2td| jd ur=t| j t| j| j| j| jd}| j
d urU|| j
}nt| j| j| j| jd}| jd uro||| j | jdkrw| jnd }| j d ur|j!| j |d	}|"|| j# | j$d ur|j!| j$|d	}t%|| j&}|'| | j(d urt)| j(| j&}|'| | j*d urt+| j*| j&}|'| |, }| j-d
krd }	n| j-dkrdd }	n| j-dkrdd }	ntd| j- | j.dkr| j/| j0f}
nd}
| j1d
krn{| j1dkrRt|2 dkrt34d net| jdkr't34d t55 }|6| j.|
| j7| j8\}}t55 }t39d| t39d| t39d||  n-t| jdkrzt55 }| j1dkr| j:ro|;| j}n|<| j}|=|| j>|	| j?}n| j1dkr| j:r|;| j}n|<| j}|=|| j>|	| j?}|6| j.|
| j7| j8\}}t39d| | j@rtA|, | dkr|'tB|,  |C  |6| j.|
| j7| j8\}}t39d| n| j1dkr|<| j}|D||	| jE| j.|
| j?| j8\}}t39d| n_| j1dkra|<| j}|D||	| jE| j.|
| j?| j8\}}|6| j.|
| j7| j8| \}}t39d| | j@r`tA|, | dkr`|C  |6| j.|
| j7| j8\}}t39d| ntd| j1t55 }t39d| t39d||  nt34d  | jFd ur|G| jF| | jHd ur|I| jH|J  | jKd ur|L| jK|M  | jNd ur|O  |G| jN| t| jPdkrt39d! | jQ}| jR}|Sd"d#}|Sd$d%}d&d' tTU V|D }|W| jX}|<| jP}d}|D ]\}}|jYd u rd(Z|}n|jYZ|}t|dkr| j[r|\d# qd)|v r*|]|}nd}| j^d*krZt_||| j^| j/| j0}|D ]\}}|j`||d+}|\|ja|||||d, q?ntb||| j/| j0\}}|j`||d+}|\|ja|||||d, |d*7 }|d- dkrtj\d. qtj\d# W d    n	1 sw   Y  t39d/ | jcd urt39d0 td|!| jc}|je|d1d2id3}tf|atgd4  t39d5 d S d S )6NisattyFTr   z4either model file or training data should be defined)r   compound_separatoratom_separatorr:   )forcesplit_listrg   	use_skips
nosplit_reNONE)analysis_seprK   rV   c                 S   s   t tt| d dS )Nr   r   )r   roundmathrV   r   r   r   r   r     s    zmain.<locals>.<lambda>rU   c                 S   s   dS )Nr   r   r   r   r   r   r     s    zunknown dampening type '%s'rS   r   rL   zVModel contains no compounds for batch training. Use 'init+batch' mode to add new data.z`Training mode 'batch' ignores new data files. Use 'init+batch' or 'online' to add new compounds.z
Epochs: %szFinal cost: %szTraining time: %.3fsrM   rJ   g?zRetrain Epochs: %srN   rO   zunknown training mode '%s'z!No training data files specified.zSegmenting test data...z\n
z\t	c                 S   s   g | ]}|d  qS )r   r   ).0r   r   r   r   
<listcomp>  s    zmain.<locals>.<listcomp> clogprobr   )csep)analysiscompoundcountlogprobr   r`   .zDone.zEvaluating ModelnameMODEL)	meta_datar#   Done)hr   r   r   r   sysstderrr   r   show_progress_barru   r    r&   lenr(   r
   rY   randomseedr   r   rA   r>   r:   read_binary_model_filer   rW   rg   r[   r^   load_segmentationsread_segmentation_filerB   rj   read_annotations_fileset_annotationsrk   rd   r   rh   set_corpus_weight_updaterre   r   rf   r   get_corpus_coding_weightrT   rQ   ra   rb   rI   get_compounds_loggerwarningtimetrain_batchrX   r]   infor=   read_corpus_list_filesread_corpus_files	load_datar\   rZ   ri   absr	   clear_segmentationtrain_onliner_   r.   write_binary_model_filer/   write_segmentation_fileget_segmentationsr1   write_lexicon_fileget_constructionsr0   make_segment_onlyr+   rD   rF   replacestringr   parse_open_text_file_writer,   r   joinrH   writeforward_logprobr2   r   format_constructionsformatr   rl   r   evaluate_modelprintr   )r   r   ior   r   annotationsdevelannotsupdaterstart_corpus_weightdampfunc	algparamstsectedata	outformatr   keywordsfobjtestdatair   r   r   r   	nbestlistconstructionslogpr   resultr   r   r   mainT  s  


































(
r  c               	   C   s\  dd l } t }| jdd|j| jdd}|dj}|ddtd	d
dd |ddtd	ddd |dj}|ddddd |dddddd |dj}|ddtd d!d"d# |d$d%d&d'd(d |d)j}|d*d+d,td-d	d.d# |d/d0d1d2d |d3j}|d4d5d6d7d8 |d9d:d;t	  d<d= |j}|d>d?d-d@dA |dBdCdDdEdA |dFdGdHg dIdJdK |S )LNr   zmorfessor-evaluatez>Simple usage example:

  %(prog)s gold_standard model1 model2
F)r   r   r   r   r   zevaluation optionsz--num-samples
numsamplesr3   
   z%number of samples to take for testing)r"   r4   r$   r#   r%   z--sample-size
samplesizei  zsize of each testing sampleszformatting optionsz--format-stringformatstringrE   a  Python new style format string used to report evaluation results. The following variables are a value and and action separated with and underscore. E.g. fscore_avg for the average f-score. The available values are "precision", "recall", "fscore", "samplesize" and the available actions: "avg", "max", "min", "values", "count". A last meta-data variable (without action) is "name", the filename of the model See also the format-template option for predefined stringsr9   z--format-templatetemplatez
<template>r#   zUses a template string for the format-string options. Available templates are: default, table and latex. If format-string is defined this option is ignored)r"   r$   r#   r%   zfile optionsz--construction-separatorrA   rG   r?   zJconstruction separator for test segmentation files (default '%(default)s')r@   r5   r6   r   r7   r8   rm   rn   ro   rp   r   rq   rr   rs   r!   rt   rv   rw   rx   r%   ry   rz   r{   r|   r}   r~   r   rl   z<goldstandard>z0gold standard file in standard annotation format)r$   nargsr%   modelsz<model>+zRmodel files to segment (either binary or Morfessor 1.0 style segmentation models).r'   z--testsegmentationtest_segmentationsr)   zXSegmentation of the test set. Note that all words in the gold-standard must be segmentedr<   )
r   r   r   r   r   r   r   r   r   r   )r   standard_parserr   r   r   r   r   get_evaluation_argparser;  sx   




r  c                 C   s0  t |  t| jd}t|| jd }g }| j}| j}| j}|du r(t	| j
 }| jD ]"}|j||t||dtj|id}|| t|| q+| j|_| jD ]%}||d}	|j|	t||dtj|id}|| t|| qUt|dkr|dkrt }
|
|}t| dS dS dS )z Separate main for running evaluation and statistical significance
    testing. Takes as argument the results of an get_evaluation_argparser()
    r   r   Nr   )configurationr   Fr   )r   r   r   r   r   rl   r  r	  r  r   r  r  r   read_any_modelr   ospathbasenamer)   r   r   rA   construction_separatorr  r   evaluate_segmentationr   r   significance_testprint_table)r   r   evresultssample_sizenum_samplesf_stringfr  segmentationwsrrr   r   r   main_evaluation  sH   






r&  )+r   r   r   r   os.pathr  r   r   r   r   r   r   baseliner   r   r   r   r	   	exceptionr
   r   r   
evaluationr   r   r   r   version_infoPY3strr   r   __name__r   LRU_MAX_SIZEr   r   	lru_cacher   r   r  r  r&  r   r   r   r   <module>   s@   
  
&


 hM