o
    Mi                     @   s  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZddlmZm	Z	 ddl
mZmZ eeZdd Ze dg dZG d	d
 d
eZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZdS )    N   )	_progress
_is_string)MorfessorExceptionSegmentOnlyModelExceptionc                 C   s*   t | d rd| S dtdd | S )z5Return a readable string for a list of constructions.r   z + c                 S   s
   d | S )N )join)x r
   F/home/ubuntu/.local/lib/python3.10/site-packages/morfessor/baseline.py<lambda>   s   
 z'_constructions_to_str.<locals>.<lambda>)r   r   map)constructionsr
   r
   r   _constructions_to_str   s   
r   
ConstrNode)rcountcountsplitlocc                   @   s|  e Zd ZdZdZ		dZddZdd Zd	d
 Zedd Z	edd Z
dd Zdd Zdd Zd[ddZdd Zdd Zdd Zdd Zd\d"d#Zd$d% Zd&d' Zd(d) Zd*d+ Zed,d- Zed.d/ Zed0d1 Zd2d3 Zd4d5 Zd6d7 Zd8d9 Z		d]d;d<Z d=d> Z!d^d?d@Z"dAdB Z#	D	d_dFdGZ$	H	D	d`dIdJZ%dadLdMZ&dNdO Z'dadPdQZ(dRdS Z)dTdU Z*dVdW Z+dXdY Z,dS )bBaselineModela  Morfessor Baseline model class.

    Implements training of and segmenting with a Morfessor model. The model
    is complete agnostic to whether it is used with lists of strings (finding
    phrases in sentences) or strings of characters (finding morphs in words).

    3333NFc                 C   s   i | _ d| _t | _t| j| _d| _| | || _d| _	t
 | _|du r+g | _n|| _|du r6d| _nt|tj| _d| _dS )a  Initialize a new model instance.

        Arguments:
            forcesplit_list: force segmentations on the characters in
                               the given list
            corpusweight: weight for the corpus cost
            use_skips: randomly skip frequently occurring constructions
                         to speed up training
            nosplit_re: regular expression string for preventing splitting
                          in certain contexts

        FN)	_analyses_segment_onlyLexiconEncoding_lexicon_codingCorpusEncoding_corpus_coding_annot_codingset_corpus_weight_updater
_use_skips_supervisedcollectionsCounter_counterforcesplit_list
nosplit_rerecompileUNICODEannotations)selfr#   corpusweight	use_skipsr$   r
   r
   r   __init__,   s    


zBaselineModel.__init__c                 C   sD   |d u r
t d| _nt|tjrt || _n|| _| j| d d S )N      ?r   )FixedCorpusWeight_corpus_weight_updater
isinstancenumbersNumberupdate)r)   corpus_weightr
   r
   r   r   ]   s   z'BaselineModel.set_corpus_weight_updaterc                 C   s   | j rt d S N)r   r   r)   r
   r
   r   _check_segment_onlyg   s   z!BaselineModel._check_segment_onlyc                 C      | j jS )z)Return the number of construction tokens.)r   tokensr6   r
   r
   r   r9   k   s   zBaselineModel.tokensc                 C   s   | j jd S )z(Return the number of construction types.r   )r   typesr6   r
   r
   r   r:   p   s   zBaselineModel.typesc                 C   sH   | j  j|7  _| || | j| j}| j| j|| d| j|< dS )z"Add compound with count c to data.r   N)r   
boundaries_modify_construction_countr   r   _replace)r)   compoundcoldrcr
   r
   r   _add_compoundu   s
   zBaselineModel._add_compoundc                 C   s&   | j | \}}}| ||  ||fS )zRemove construction from model.)r   r=   )r)   constructionr   r   r   r
   r
   r   _remove}   s   zBaselineModel._removec                    s,   t  fddtdt|D }| ||S )zReturn a random split for compound.

        Arguments:
            compound: compound to split
            threshold: probability of splitting at each position

        c                 3   s     | ]}t    k r|V  qd S r5   )random).0i	thresholdr
   r   	<genexpr>   s    z.BaselineModel._random_split.<locals>.<genexpr>r   )tuplerangelen_splitloc_to_segmentation)r)   r?   rI   r   r
   rH   r   _random_split   s    zBaselineModel._random_splitrbranchc                 C   sF  t |dkr| |\}}t|dt | j|< | || dS |dkrE| |\}}| |}t|||| j|< |D ]}| || q:dS |dkr|}tt |D ]I}	| |\}}||	 }
|	t |d krvt|dd| j|< | || qQ| ||	d d }t||t |
| j|< | |
| | || |}qQdS t	d| )a  Set analysis of compound to according to given segmentation.

        Arguments:
            compound: compound to split
            parts: desired constructions of the compound
            ptype: type of the parse tree to use

        If ptype is 'rbranch', the analysis is stored internally as a
        right-branching tree. If ptype is 'flat', the analysis is stored
        directly to the compound's node.

        r   r   flatrP   NzUnknown parse type '%s')
rM   rD   r   rK   r   r=   segmentation_to_splitlocrL   _join_constructionsr   )r)   r?   partsptyper   r   r   constrrC   pprefixsuffixr
   r
   r   _set_compound_analysis   s<   


z$BaselineModel._set_compound_analysisc           	      C   s   | j sdS t }| j D ]'\}}|| jvr| |d | |\}}|D ]}||  | j| j7  < q&q| j	
| | D ]\}}d}|| jv rW| j| jsW| j| j}| j	|| q@dS )zUpdate the selection of alternative analyses in annotations.

        For semi-supervised models, select the most likely alternative
        analyses included in the annotations of the compounds.

        Nr   r   )r   r    r!   r(   itemsr   rB   _best_analysisr   r   set_constructionsr   r   	set_count)	r)   r   r?   alternativesanalysiscostmfr   r
   r
   r   _update_annotation_choices   s"   
z(BaselineModel._update_annotation_choicesc                 C   s   d}d}|D ]8}d}|D ]%}|| j v r,| j | js,|t| jjt| j | j 7 }q|| j8 }q|du s:||k r>|}|}q||fS )z2Select the best analysis out of the given choices.N        )r   r   mathlogr   r9   r   penalty)r)   choicesbestcostbestanalysisr`   ra   rb   r
   r
   r   r\      s   zBaselineModel._best_analysisc                 C   s   t | jdkr
|gS t |}d}g }td|D ]+}|| | jv rBt ||| dkr3||||  ||||d   |d }q||k rP|||d  dd |D S )z$Return forced split of the compound.r   r   Nc                 S   s   g | ]
}t |d kr|qS r   )rM   )rF   rW   r
   r
   r   
<listcomp>   s    z.BaselineModel._force_split.<locals>.<listcomp>)rM   r#   rL   append)r)   r?   clenjrT   rG   r
   r
   r   _force_split   s   zBaselineModel._force_splitc                 C   sD   || j v r| j | }t dtd| krdS | j |  d7  < dS )z.Return true if construction should be skipped.r-   r   TF)r"   rE   max)r)   rC   tr
   r
   r   
_test_skip   s   

zBaselineModel._test_skipr      c                 C   st   t |}|dkr|gS | jr| |r| |S | |}g }|D ]}|| j|||dd 7 }q!| j||dd |S )a%  Optimize segmentation of the compound using the Viterbi algorithm.

        Arguments:
          compound: compound to optimize
          addcount: constant for additive smoothing of Viterbi probs
          maxlen: maximum length for a construction

        Returns list of segments.

        r   )addcountmaxlenr   rQ   )rU   )rM   r   rt   segmentrq   viterbi_segmentrZ   )r)   r?   rv   rw   ro   rT   r   partr
   r
   r   _viterbi_optimize  s   


zBaselineModel._viterbi_optimizec                 C   sx   t |dkr	|gS | jr| |r| |S | |}t |dkr&| |S | || g }|D ]	}|| |7 }q0|S )zmOptimize segmentation of the compound using recursive splitting.

        Returns list of segments.

        r   )rM   r   rt   rx   rq   _recursive_splitrZ   )r)   r?   rT   r   rz   r
   r
   r   _recursive_optimize  s   


z!BaselineModel._recursive_optimizec                 C   s  t |dkr	|gS | jr| |r| |S | |\}}| || |  }| ||  d}tdt |D ]F}| jrK| j	||d |d  rKq7|d| }||d }| || | || |  }	| ||  | ||  |	|kr}|	}|}q7|rt
|||| j|< |d| }||d }| || | || | |}
||kr|
| | S |
|
 S t
|dt | j|< | || |gS )znOptimize segmentation of the construction by recursive splitting.

        Returns list of segments.

        r   r   N)rM   r   rt   rx   rD   r=   get_costrL   r$   matchr   r   r|   rK   )r)   rC   r   r   mincostr   rG   rX   rY   ra   lpr
   r
   r   r|   1  sP   


zBaselineModel._recursive_splitc           	      C   s   || j v r| j | \}}}nd\}}}|| }|dkr | j |= n	t|||| j |< |r>| ||}|D ]}| || q3dS | j||| | jrQ| j||| |dkra|dkra| j	| dS |dkrq|dkrs| j
| dS dS dS )zModify the count of construction by dcount.

        For virtual constructions, recurses to child nodes in the
        tree. For real constructions, adds/removes construction
        to/from the lexicon whenever necessary.

        )r   r   r   r   N)r   r   rN   r=   r   update_countr   r   r   addremove)	r)   rC   dcountr   r   r   newcountchildrenchildr
   r
   r   r=   d  s,   



z(BaselineModel._modify_construction_countc                 C   sF   d}| j | |r|d7 }| jrt | _| jr!|   | j	  |S )am  Do model updates that are necessary between training epochs.

        The argument is the number of training epochs finished.

        In practice, this does two things:
        - If random skipping is in use, reset construction counters.
        - If semi-supervised learning is in use and there are alternative
          analyses in the annotated data, select the annotations that are
          most likely given the model parameters. If not hand-set, update
          the weight of the annotated corpus.

        This method should also be run prior to training (with the
        epoch number argument as 0).

        r      )
r/   r3   r   r    r!   r"   r   rd   r   update_weight)r)   	epoch_numforced_epochsr
   r
   r   _epoch_update  s   

zBaselineModel._epoch_updatec                 C   s8   g }d}| D ]}|t |7 }|| qt|dd S )z:Return a list of split locations for a segmented compound.r   N)rM   rn   rK   )r   r   rG   r@   r
   r
   r   rR     s   z&BaselineModel.segmentation_to_splitlocc                 C   sv   t |tjr| d| | |d gS g }d}d}tt|D ]}|| }|| ||  |}q|| |d  |S )zAReturn segmentation corresponding to the list of split locations.Nr   )r0   r1   r2   rL   rM   rn   )r?   r   rT   startposendposrG   r
   r
   r   rN     s   z'BaselineModel._splitloc_to_segmentationc                 C   s$   t | d  }| D ]}||7 }q	|S )z`Append the constructions after each other by addition. Works for
        both lists and strings r   )type)r   resultr@   r
   r
   r   rS     s   
z!BaselineModel._join_constructionsc                 C   s   |    dd | j D S )z.Return the compound types stored by the model.c                 S   s   g | ]\}}|j d kr|qS rl   r;   )rF   wnoder
   r
   r   rm     s    
z/BaselineModel.get_compounds.<locals>.<listcomp>)r7   r   r[   r6   r
   r
   r   get_compounds  s   zBaselineModel.get_compoundsc                 C   s   t dd | j D S )z<Return a list of the present constructions and their counts.c                 s   s$    | ]\}}|j s||jfV  qd S r5   )r   r   )rF   r@   r   r
   r
   r   rJ     s   
 z2BaselineModel.get_constructions.<locals>.<genexpr>)sortedr   r[   r6   r
   r
   r   get_constructions  s   zBaselineModel.get_constructionsc                 C   s,   | j  | j  }| jr|| j  S |S )z#Return current model encoding cost.)r   r~   r   r   r   )r)   ra   r
   r
   r   r~     s   zBaselineModel.get_costc                 c   sH    |    t| j D ]}| j| j}|dkr!||| |fV  qdS )z>Retrieve segmentations for all compounds encoded by the model.r   N)r7   r   r   keysr   rx   )r)   r   r@   r
   r
   r   get_segmentations  s   zBaselineModel.get_segmentationsr   c           	      C   s   |    t }|D ]\}}t|dkr||  |7  < q
| D ]0\}}||k r*q!|dur7| ||| n| || |durQ|dkrQ| ||}| || q!|  S )a  Load data to initialize the model for batch training.

        Arguments:
            data: iterator of (count, compound_atoms) tuples
            freqthreshold: discard compounds that occur less than
                             given times in the corpus (default 1)
            count_modifier: function for adjusting the counts of each
                              compound
            init_rand_split: If given, random split the word with
                               init_rand_split as the probability for each
                               split

        Adds the compounds in the corpus to the model lexicon. Returns
        the total cost.

        r   N)	r7   r    r!   rM   r[   rB   rO   rZ   r~   )	r)   datafreqthresholdcount_modifierinit_rand_split
totalcountr   atomsrT   r
   r
   r   	load_data  s"   zBaselineModel.load_datac                 C   s4   |    |D ]\}}}| || | || qdS )zLoad model from existing segmentations.

        The argument should be an iterator providing a count, a
        compound, and its segmentation.

        N)r7   rB   rZ   )r)   segmentationsr   r?   segmentationr
   r
   r   load_segmentations  s
   z BaselineModel.load_segmentationsc                 C   s6   |    d| _|| _t| j|d| _t| j| j_dS )zVPrepare model for semi-supervised learning with given
         annotations.

         TweightN)r7   r   r(   AnnotatedCorpusEncodingr   r   rM   r<   )r)   r(   annotatedcorpusweightr
   r
   r   set_annotations  s   zBaselineModel.set_annotationsc                 C   sR   |    | j| \}}}g }|r"| ||D ]	}|| |7 }q|S || |S )zSegment the compound by looking it up in the model analyses.

        Raises KeyError if compound is not present in the training
        data. For segmenting new words, use viterbi_segment(compound).

        )r7   r   rN   rx   rn   )r)   r?   r   r   r   r   r   r
   r
   r   rx     s   
zBaselineModel.segment	recursiver
   {Gzt?c                 C   sf  d}t d| |}|  }t|  }tdt|| jj	 td td|| 	 t
| t|D ]-}	|dkrE| j|	g|R  }
n|dkrS| j|	g|R  }
ntd	| td
|	t|
 q5|d7 }td|   t || |}|}|  }td|| |dkr|||| jj	  krn|dkr|d8 }|dur||krtd nq,td ||fS )a  Train the model in batch fashion.

        The model is trained with the data already loaded into the model (by
        using an existing model or calling one of the load_ methods).

        In each iteration (epoch) all compounds in the training data are
        optimized once, in a random order. If applicable, corpus weight,
        annotation cost, and random split counters are recalculated after
        each iteration.

        Arguments:
            algorithm: string in ('recursive', 'viterbi') that indicates
                         the splitting algorithm used.
            algorithm_params: parameters passed to the splitting algorithm.
            finish_threshold: the stopping threshold. Training stops when
                                the improvement of the last iteration is
                                smaller then finish_threshold * #boundaries
            max_epochs: maximum number of epochs to train

        r   r   z0Compounds in training data: %s types / %s tokenszStarting batch trainingzEpochs: %s	Cost: %sTr   viterbiunknown algorithm '%s'z	#%s -> %szCost before epoch update: %sN+Max number of epochs reached, stop trainingzDone.)rr   r   r~   listr   _loggerinforM   r   r<   rE   shuffler   r}   r{   r   debugr   )r)   	algorithmalgorithm_paramsfinish_threshold
max_epochsepochsr   newcost	compoundsr   segmentsoldcostr
   r
   r   train_batch0  sN   



zBaselineModel.train_batch'  c              	   C   s  |    |dur
i }td d}	d}
d}|r| |	 |  }td|
| tt|D ]}zt|\}}W n tyC   d}Y  nw t	|dkrKq-|dur{||vr\d}d||< d}n|| }|d ||< ||d || }|dkrz| 
|| n| 
|d |dur|dkr| ||}| || |dkr| j|g|R  }n|d	kr| j|g|R  }ntd
| td|
|t| |
d7 }
q-|	d7 }	|dur|	|krtd n|s| |	 |  }td|
| |	|fS )a  Train the model in online fashion.

        The model is trained with the data provided in the data argument.
        As example the data could come from a generator linked to standard in
        for live monitoring of the splitting.

        All compounds from data are only optimized once. After online
        training, batch training could be used for further optimization.

        Epochs are defined as a fixed number of compounds. After each epoch (
        like in batch training), the annotation cost, and random split counters
        are recalculated if applicable.

        Arguments:
            data: iterator of (_, compound_atoms) tuples. The first
                    argument is ignored, as every occurence of the
                    compound is taken with count 1
            count_modifier: function for adjusting the counts of each
                              compound
            epoch_interval: number of compounds to process before starting
                              a new epoch
            algorithm: string in ('recursive', 'viterbi') that indicates
                         the splitting algorithm used.
            algorithm_params: parameters passed to the splitting algorithm.
            init_rand_split: probability for random splitting a compound to
                               at any point for initializing the model. None
                               or 0 means no random splitting.
            max_epochs: maximum number of epochs to train

        NzStarting online trainingr   TzTokens processed: %s	Cost: %sFr   r   r   r   z#%s: %s -> %sr   )r7   r   r   r   r~   r   rL   nextStopIterationrM   rB   rO   rZ   r}   r{   r   r   r   )r)   r   r   epoch_intervalr   r   r   r   countsr   rG   more_tokensr   _r   r@   addcrT   r   r
   r
   r   train_onlineq  sj   !




/zBaselineModel.train_onliner-   c              	   C   s  t |}dg}| jj| jj | dkr!t| jj| jj | }nd}|| d }td|d D ]}d}	d}
| jrW||k rW| j||d |d  rW|	|| |d f q0tt
d|| |D ]}|| d du rlqa|| d }||| }|| jv r| j| js| j| jdkrtd|| j| jf ||t| j| j|  7 }ne|dkr| jjdkr||t| | j|| jj  7 }nG||t| | jj| t| jj|  | jjt| jj  | j| | jj  7 }nt |dkr||7 }n| jr|t || 7 }nqa|
du s||
k r|}
|}	qa|	|
|	f q0g }|d \}}|d }|durL|}|	|||  || d }|}|dus4|  |t| jj| jj t| jj 7 }||fS )a  Find optimal segmentation using the Viterbi algorithm.

        Arguments:
          compound: compound to be segmented
          addcount: constant for additive smoothing (0 = no smoothing)
          maxlen: maximum length for the constructions

        If additive smoothing is applied, new complex construction types can
        be selected during the search. Without smoothing, only new
        single-atom constructions can be selected.

        Returns the most probable segmentation and its log-probability.

        )re   Nr   r-   r   N Construction count of '%s' is %sr   )rM   r   r9   r<   rf   rg   rL   r$   r   rn   rr   r   r   r   r   r   get_codelengthr   reverse)r)   r?   rv   rw   ro   grid	logtokensbadlikelihoodrs   bestpathrj   ptra   rC   r   pathltr
   r
   r   ry     s   








zBaselineModel.viterbi_segmentc           
      C   sD  t |}dg}| jj| jj dkrt| jj| jj }nd}td|d D ]a}d}td|D ]B}|| }||| }	|	| jv rh| j|	 jsh| j|	 j	dkrZt
d|	| j|	 j	f ||t| j|	 j	 7 }nq/|t| 7 }q/|dkr|t|  q&|| j  q&|d }|t| jj| jj t| jj 7 }|S )aW  Find log-probability of a compound using the forward algorithm.

        Arguments:
          compound: compound to process

        Returns the (negative) log-probability of the compound. If the
        probability is zero, returns a number that is larger than the
        value defined by the penalty attribute of the model object.

        re   r   r   r   r   )rM   r   r9   r<   rf   rg   rL   r   r   r   r   exprn   rh   )
r)   r?   ro   r   r   rs   psumr   ra   rC   r
   r
   r   forward_logprob,  sL   



zBaselineModel.forward_logprobc              
   C   s\  t |}dgg}| jj| jj | dkr"t| jj| jj | }nd}|| d }td|d D ]}	g }
| jrZ|	|k rZ| j||	d |	d  rZ|	| | |	d dfg q1tt
d|	| |	D ]}tt || D ]}|| | d du r{qn|| | d }|||	 }|| jv r| j| js| j| jdkrtd|| j| jf ||t| j| j|  8 }ng|dkr| jjdkr||t| | j|| jj  8 }nH||t| | jj| t| jj|  | jjt| jj  | j| | jj  8 }nt |dkr||8 }n| jr|t || 8 }nqnt |
|k r0t|
|||f qnt|
|||f qnqd|	|
 q1g }tt |d D ]Y}g }|d | \}}}|d }|dur|}	|	||	|  ||	 | d }||	 | d }|	}|dusa|  |t| jj| jj t| jj 8 }|	| |f qKd	d
 t|D S )aP  Find top-n optimal segmentations using the Viterbi algorithm.

        Arguments:
          compound: compound to be segmented
          n: how many segmentations to return
          addcount: constant for additive smoothing (0 = no smoothing)
          maxlen: maximum length for the constructions

        If additive smoothing is applied, new complex construction types can
        be selected during the search. Without smoothing, only new
        single-atom constructions can be selected.

        Returns the n most probable segmentations and their
        log-probabilities.

        )re   NNr   r-   r   r   Nr   r   c                 S   s   g | ]\}}||fqS r
   r
   )rF   ra   rV   r
   r
   r   rm     s    z/BaselineModel.viterbi_nbest.<locals>.<listcomp>)rM   r   r9   r<   rf   rg   rL   r$   r   rn   rr   r   r   r   r   r   r   r   heapqheappushheappushpopr   r   )r)   r?   nrv   rw   ro   r   r   r   rs   bestnr   kra   rC   resultsr   r   kir   r
   r
   r   viterbi_nbest]  s   





	
-

zBaselineModel.viterbi_nbestc                 C   r8   r5   )r   r   r6   r
   r
   r   get_corpus_coding_weight  s   z&BaselineModel.get_corpus_coding_weightc                 C   s   |    || j_d S r5   )r7   r   r   r)   r   r
   r
   r   set_corpus_coding_weight  s   z&BaselineModel.set_corpus_coding_weightc                 C   s    d| _ dd | j D | _dS )a  Reduce the size of this model by removing all non-morphs from the
        analyses. After calling this method it is not possible anymore to call
        any other method that would change the state of the model. Anyway
        doing so would throw an exception.

        Tc                 S   s   i | ]
\}}|j s||qS r
   )r   )rF   r   vr
   r
   r   
<dictcomp>  s    z3BaselineModel.make_segment_only.<locals>.<dictcomp>N)r   r   r[   r6   r
   r
   r   make_segment_only  s   zBaselineModel.make_segment_onlyc                 C   s$   t |  D ]	}| ||g qd S r5   )r   r   rZ   )r)   r?   r
   r
   r   clear_segmentation  s   z BaselineModel.clear_segmentation)NNFN)rP   )r   ru   )r   NNr5   )r   r
   r   N)Nr   r   r
   NN)r-   ru   )-__name__
__module____qualname____doc__rh   r,   r   r7   propertyr9   r:   rB   rD   rO   rZ   rd   r\   rq   rt   r{   r}   r|   r=   r   staticmethodrR   rN   rS   r   r   r~   r   r   r   r   rx   r   r   ry   r   r   r   r   r   r   r
   r
   r
   r   r   !   sn    
1



*
	3"
	


&

A

^]
1er   c                   @   s   e Zd Zedd ZdS )CorpusWeightc                 C   s\   |dkr,|  }|dkr|dd|  9 }n
|ddd|   9 }|| td| dS dS )Nr   r          @r-   zCorpus weight set to %sTF)r   r   r   r   )clsmodel	directionepochr   r
   r
   r   move_direction  s   
zCorpusWeight.move_directionN)r   r   r   classmethodr   r
   r
   r
   r   r     s    r   c                   @   s   e Zd Zdd Zdd ZdS )r.   c                 C   s
   || _ d S r5   r   r   r
   r
   r   r,     s   
zFixedCorpusWeight.__init__c                 C   s   | | j dS )NF)r   r   )r)   r   r   r
   r
   r   r3     s   zFixedCorpusWeight.updateNr   r   r   r,   r3   r
   r
   r
   r   r.     s    r.   c                   @   sB   e Zd ZdZdddZdd Zedd Zed	d
 Zdd Z	dS )AnnotationCorpusWeightzcClass for using development annotations to update the corpus weight
    during batch training

    {Gz?c                 C      || _ || _d S r5   )r   rI   )r)   	devel_setrI   r
   r
   r   r,        
zAnnotationCorpusWeight.__init__c                    sN   |dk rdS | j  }t| \}} fdd|D }| ||}|  ||S )zuTune model corpus weight based on the precision and
        recall of the development data, trying to keep them equalr   Fc                    s   g | ]	}  |d  qS rl   )ry   )rF   r   r   r
   r   rm     s    z1AnnotationCorpusWeight.update.<locals>.<listcomp>)r   r[   zip_estimate_segmentation_dirr   )r)   r   r   tmpwlistr(   r   dr
   r   r   r3     s   
zAnnotationCorpusWeight.updatec                 C   s   d}d}t ||D ]G\}}d}|D ]2}tt|}	t|	dkr$d} n |D ]}
tt|
}t|	|tt|	 }||krB|}q&q|dkrP||7 }|d7 }q	||fS )z:Calculate average boundary recall for given segmentations.r   re   r   r-   r   )r   setr   rR   rM   intersectionfloat)r   
prediction	reference	rec_totalrec_sumpre_listref_listbestrefref_bprepre_brr
   r
   r   _boundary_recall  s*   z'AnnotationCorpusWeight._boundary_recallc           
      C   sN   |  ||\}}|  ||\}}|| }|| }d| | ||  }	|||	fS )zAReturn boundary precision, recall, and F-score for segmentations.r   )r  )
r   r   r  rec_srec_tpre_spre_trecr	  rc   r
   r
   r   _bpr_evaluation  s   
z&AnnotationCorpusWeight._bpr_evaluationc                 C   sP   |  dd |D |\}}}td|| t|| | jk r dS ||kr&dS dS )a  Estimate if the given compounds are under- or oversegmented.

        The decision is based on the difference between boundary precision
        and recall values for the given sample of segmented data.

        Arguments:
          segments: list of predicted segmentations
          annotations: list of reference segmentations

        Return 1 in the case of oversegmentation, -1 in the case of
        undersegmentation, and 0 if no changes are required.

        c                 S   s   g | ]}|gqS r
   r
   )rF   r	   r
   r
   r   rm   6  s    zEAnnotationCorpusWeight._estimate_segmentation_dir.<locals>.<listcomp>z0Boundary evaluation: precision %.4f; recall %.4fr   r   r   )r  r   r   absrI   )r)   r   r(   r	  r  rc   r
   r
   r   r   (  s   z1AnnotationCorpusWeight._estimate_segmentation_dirNr   )
r   r   r   r   r,   r3   r   r  r  r   r
   r
   r
   r   r     s    


	r   c                   @   s*   e Zd Zd	ddZdd Zedd ZdS )
MorphLengthCorpusWeightr   c                 C   r   r5   )morph_lengthrI   )r)   morph_lenghtrI   r
   r
   r   r,   A  r   z MorphLengthCorpusWeight.__init__c                 C   sf   |dk rdS |  |}td| t| j| | j | jkr1t| j| | j|  }| |||S dS )Nr   FzCurrent morph-length: %s)calc_morph_lengthr   r   r  r  rI   r   )r)   r   r   
cur_lengthr   r
   r
   r   r3   E  s   
zMorphLengthCorpusWeight.updatec                 C   sV   d}d}|  D ]}||}|D ]}|d7 }|t|7 }qq|dkr)t|| S dS )Nr   r   re   )r   rx   rM   r   )r   r   total_constructionstotal_atomsr?   r   rC   r
   r
   r   r  S  s   
z)MorphLengthCorpusWeight.calc_morph_lengthNr  )r   r   r   r,   r3   r   r  r
   r
   r
   r   r  @  s
    
r  c                   @   s   e Zd ZdddZdd ZdS )NumMorphCorpusWeightr   c                 C   r   r5   )num_morph_typesrI   )r)   r  rI   r
   r
   r   r,   c  r   zNumMorphCorpusWeight.__init__c                 C   sd   |dk rdS |j j}td| t| j| | j | jkr0t| j| | j|  }| |||S dS )Nr   FzNumber of morph types: %s)r   r<   r   r   r  r  rI   r   )r)   r   r   cur_morph_typesr   r
   r
   r   r3   g  s   zNumMorphCorpusWeight.updateNr  r   r
   r
   r
   r   r  b  s    
r  c                   @   sb   e Zd ZdZdddZedej Ze	dd Z
edd	 Zd
d Zdd Zdd Zdd ZdS )EncodingzBase class for calculating the entropy (encoding length) of a corpus
    or lexicon.

    Commonly subclassed to redefine specific methods.

    r-   c                 C   s   d| _ d| _d| _|| _dS )z`Initizalize class

        Arguments:
            weight: weight used for this encoding
        re   r   N)logtokensumr9   r<   r   r   r
   r
   r   r,   }  s   
zEncoding.__init__r   c                 C   s   dS )zxDefine number of types as 0. types is made a property method to
        ensure easy redefinition in subclasses

        r   r
   r6   r
   r
   r   r:     s   zEncoding.typesc                 C   sH   |dk rdS |dk rt t |S t |}|| | d|| j   S )zaCalculate logarithm of n!.

        For large n (n > 20), use Stirling's approximation.

        r   re      g      ?)rf   rg   	factorial_log2pi)r   r   lognr
   r
   r   _logfactorial  s   
zEncoding._logfactorialc                 C   sH   | j dk rdS | j| j }| |d | | j d  | || j   S )z~Calculate -log[(u - 1)! (v - u)! / (v - 1)!]

        v is the number of tokens+boundaries and u the number of types

        r   re   r   )r:   r9   r<   r%  r)   r9   r
   r
   r   frequency_distribution_cost  s   
z$Encoding.frequency_distribution_costc                 C   s   |  | j S )z'The permutations cost for the encoding.)r%  r<   r6   r
   r
   r   permutations_cost  s   zEncoding.permutations_costc                 C   sZ   |  j || 7  _ |dkr|  j|t| 8  _|dkr+|  j|t| 7  _dS dS )z"Update the counts in the encoding.r   N)r9   r   rf   rg   )r)   rC   	old_count	new_countr
   r
   r   r     s   zEncoding.update_countc                 C   sV   | j dkrdS | j| j  }|t| | j t| j   | j |   | j |   S )z2Calculate the cost for encoding the corpus/lexiconr   re   )r<   r9   rf   rg   r   r(  r   r'  r)   r   r
   r
   r   r~     s   
zEncoding.get_costNr-   )r   r   r   r   r,   rf   rg   pir#  r   r:   r   r%  r'  r(  r   r~   r
   r
   r
   r   r  v  s    


r  c                       s>   e Zd ZdZd fdd	Zedd Zdd Zd	d
 Z  Z	S )r   zEncoding the corpus class

    The basic difference to a normal encoding is that the number of types is
    not stored directly but fetched from the lexicon encoding. Also does the
    cost function not contain any permutation cost.
    r-   c                    s   t t| | || _d S r5   )superr   r,   lexicon_encoding)r)   r/  r   	__class__r
   r   r,     s   
zCorpusEncoding.__init__c                 C   s   | j jd S )z}Return the number of types of the corpus, which is the same as the
         number of boundaries in the lexicon + 1

        r   )r/  r<   r6   r
   r
   r   r:     s   zCorpusEncoding.typesc                 C   sF   | j dk rdS | j}| |d | | j d  | || j  d  S )zXCalculate -log[(M - 1)! (N - M)! / (N - 1)!] for M types and N
        tokens.

        r   re   r   )r:   r9   r%  r&  r
   r
   r   r'    s   
z*CorpusEncoding.frequency_distribution_costc                 C   sN   | j dkrdS | j| j  }|t| | j t| j   | j | j |   S )zhOverride for the Encoding get_cost function. A corpus does not
        have a permutation cost

        r   re   )r<   r9   rf   rg   r   r   r'  r+  r
   r
   r   r~     s   
zCorpusEncoding.get_costr,  )
r   r   r   r   r,   r   r:   r'  r~   __classcell__r
   r
   r0  r   r     s    
r   c                       sJ   e Zd ZdZd fdd	Zdd Zdd	 Zd
d Zdd Zdd Z	  Z
S )r   zsEncoding the cost of an Annotated Corpus.

    In this encoding constructions that are missing are penalized.

    Nr   c                    sH   t t|   d| _d| _|durd| _|| _|| _|| _t | _	dS )a  
        Initialize encoding with appropriate meta data

        Arguments:
            corpus_coding: CorpusEncoding instance used for retrieving the
                             number of tokens and boundaries in the corpus
            weight: The weight of this encoding. If the weight is None,
                      it is updated automatically to be in balance with the
                      corpus
            penalty: log penalty used for missing constructions

        Tr-   NF)
r.  r   r,   do_update_weightr   corpus_codingrh   r    r!   r   )r)   r4  r   rh   r0  r
   r   r,     s   z AnnotatedCorpusEncoding.__init__c                 C   s   || _ t| | _d| _dS )zMethod for re-initializing the constructions. The count of the
        constructions must still be set with a call to set_count

        re   N)r   sumvaluesr9   r   )r)   r   r
   r
   r   r]     s   
z)AnnotatedCorpusEncoding.set_constructionsc                 C   sF   | j | }|dkr|  j|t| 7  _dS |  j|| j 7  _dS )z`Set an initial count for each construction. Missing constructions
        are penalized
        r   Nr   r   rf   rg   rh   )r)   rC   r   annot_countr
   r
   r   r^     s   
z!AnnotatedCorpusEncoding.set_countc                 C   s   || j v rC| j | }|dkr|  j|t| 8  _n
|  j|| j 8  _|dkr7|  j|t| 7  _dS |  j|| j 7  _dS dS )zoUpdate the counts in the Encoding, setting (or removing) a penalty
         for missing constructions

        r   Nr7  )r)   rC   r)  r*  r8  r
   r
   r   r   &  s   

z$AnnotatedCorpusEncoding.update_countc                 C   sL   | j sdS | j}| jjt| jj | j | _| j|kr$td| j dS dS )zxUpdate the weight of the Encoding by taking the ratio of the
        corpus boundaries and annotated boundaries
        Nz)Corpus weight of annotated data set to %s)r3  r   r4  r   r<   r   r   )r)   oldr
   r
   r   r   6  s   

z%AnnotatedCorpusEncoding.update_weightc                 C   sT   | j dkrdS | j| j  }|t| jj| jj   | j t| jj   | j | j S )z)Return the cost of the Annotation Corpus.r   re   )r<   r9   rf   rg   r4  r   r   r+  r
   r
   r   r~   B  s   
z AnnotatedCorpusEncoding.get_cost)Nr   )r   r   r   r   r,   r]   r^   r   r   r~   r2  r
   r
   r0  r   r     s    	
r   c                       sD   e Zd ZdZ fddZedd Zdd Zdd	 Zd
d Z	  Z
S )r   z7Class for calculating the encoding cost for the Lexiconc                    s   t t|   t | _dS )zInitialize Lexcion EncodingN)r.  r   r,   r    r!   r   r6   r0  r
   r   r,   P  s   zLexiconEncoding.__init__c                 C   s   t | jd S )zdReturn the number of different atoms in the lexicon + 1 for the
        compound-end-token

        r   )rM   r   r6   r
   r
   r   r:   U  s   zLexiconEncoding.typesc                 C   sF   |  j d7  _ |D ]}| j| }|d | j|< | |||d  q	dS )zcAdd a construction to the lexicon, updating automatically the
        count for its atoms

        r   Nr<   r   r   r)   rC   atomr@   r
   r
   r   r   ]     
zLexiconEncoding.addc                 C   sF   |  j d8  _ |D ]}| j| }|d | j|< | |||d  q	dS )zfRemove construction from the lexicon, updating automatically the
        count for its atoms

        r   Nr:  r;  r
   r
   r   r   h  r=  zLexiconEncoding.removec                 C   sj   t |d }|t| j|  }|t| jd 8 }|D ]}|| jv r)| j| }nd}|t|8 }q|S )z6Return an approximate codelength for new construction.r   )rM   rf   rg   r9   r<   r   )r)   rC   lra   r<  r@   r
   r
   r   r   s  s   
zLexiconEncoding.get_codelength)r   r   r   r   r,   r   r:   r   r   r   r2  r
   r
   r0  r   r   M  s    
r   )r    r   loggingrf   r1   rE   r%   utilsr   r   	exceptionr   r   	getLoggerr   r   r   
namedtupler   objectr   r   r.   r   r  r  r  r   r   r   r
   r
   r
   r   <module>   s>    
       ?	O"R.W