o
    XεiU4                     @   s6  d Z ddlZddlZddlZddlZddlZddlZddlZddlZddl	Z	ddl
mZ ddlZedZejeejeeeef f ZdZdZG dd dZd	deedd
ddfdejeef dejeef dededededejeje  dededefddZdd Zdd Zdd Zdd  Ze d!kre  dS dS )"u^  
Grapheme to phoneme prediction using python CRF suite.

Training requires pre-aligned corpus in Phonetisaurus format.
https://github.com/AdolfVonKleist/Phonetisaurus

The format of this corpus is:

    t}t e}ˈɛ s}s t}t

Each line contains a single word, with graphemes and phonemes separated by "}".
Multiple graphemes are separated by "|":

    s|h}ʃ o|w}ˈoʊ

The empty phoneme is "_":

    w}w h}_ y}ˈaɪ

Example:

.. code-block:: sh

    python3 -m gruut.g2p train --corpus g2p.corpus --output model.crf

Pre-trained models have the following settings:

* c1 = 0
* c2 = 1
* max-iterations = 100
    N)Pathz	gruut.g2p_|c                   @   s   e Zd ZdZeefdejee	e
jf dedefddZd ded	ed
eje fddZe	d dejeeje f d	efddZe						d!dejeeje f dededededededed
efddZeded
efddZeded
efddZdS )"GraphemesToPhonemeszGrapheme to phoneme CRF tagger
crf_taggereps_phonemephoneme_joinc                 C   s>   t |tjr
|| _nt | _| jt| || _|| _d S )N)
isinstance
pycrfsuiteTaggerr   openstrr   r   )selfr   r   r    r   =/home/ubuntu/.local/lib/python3.10/site-packages/gruut/g2p.py__init__:   s   

zGraphemesToPhonemes.__init__Tword	normalizereturnc           	      C   s\   t j||d}| j|}g }|D ]}t |}|| jD ]}|| jkr*|| qq|S )zGuess phonemes for word)r   )	r   word2featuresr   tagdecode_stringsplitr   r   append)	r   r   r   featurescoded_phonemesphonemescoded_ps
decoded_pspr   r   r   __call__M   s   


zGraphemesToPhonemes.__call__c                    s6   |rt trtd fddttD S )z0Create feature dicts for all graphemes in a wordNFCc                    s    g | ]}t j|fi  qS r   )r   grapheme2features).0ikwargsr   r   r   
<listcomp>g   s    z5GraphemesToPhonemes.word2features.<locals>.<listcomp>)r	   r   unicodedatar   rangelen)r   r   r&   r   r%   r   r   ]   s
   
z!GraphemesToPhonemes.word2features         ?r$   	add_beginadd_endchars_backwardchars_forwardbiasencodec                 C   s   | | }t | }	||rt|n|d}
|dkr|rd|
d< td|d D ]}||kr?| ||  }|r8t|n||
d| < q%td|d D ]}||	| k rc| ||  }|r\t|n||
d| < qG||	d krp|rpd|
d< |
S )	z'Create feature dict for single grapheme)r1   graphemer   Tbegin   z	grapheme-z	grapheme+end)r*   r   encode_stringr)   )r   r$   r-   r.   r/   r0   r1   r2   gnum_gr   jg_prevg_nextr   r   r   r"   l   s,   z%GraphemesToPhonemes.grapheme2featuressc                 C   s   t |  dS )zMEncodes string in a form that crfsuite will accept (ASCII) and can be decodedascii)base64	b64encoder2   decoder=   r   r   r   r7         z!GraphemesToPhonemes.encode_stringc                 C   s   t | d S )z)Decodes a string encoded by encode_stringr>   )r?   	b64decoder2   rA   rB   r   r   r   r      rC   z!GraphemesToPhonemes.decode_stringN)T)TTr+   r+   r,   T)__name__
__module____qualname____doc__EPS_PHONEMEPHONEME_JOINtypingUnionr   r   r
   r   r   boolSequencer    staticmethodListr   intfloatFEATURES_TYPEr"   r7   r   r   r   r   r   r   7   sb    
	)r   }        r,   d   corpus_pathoutput_pathgroup_separatoritem_separatorr   r   remove_phonemesc1c2max_iterationsc
                    s  t | } t |}|jjddd t pg  tjdd}
t| ddd}t|D ]\}}| }|s3q(d}|	 }g }g }|D ]K}|j	|dd	\}}|	|} fd
d|	|D }t
j||gddD ]!\}}|du rpd} n|| |r||| qd|| qd|r nq?|rtd|d ||| q(zdd |D }|
t|| W q( ty } z	td|| |d}~ww W d   n1 sw   Y  |
|||	dd t|
  td t }|
t| t }td||  t|
jj dS )zTrain a new G2P modelT)parentsexist_okF)verboserzutf-8)encodingr5   maxsplitc                    s   g | ]}| vr|qS r   r   r#   r   r[   r   r   r'      s    ztrain.<locals>.<listcomp>N)	fillvaluez7Failed to align line %s: %s (graphemes=%s, phonemes=%s)c                 S   s   g | ]}t |qS r   )r   r7   rf   r   r   r   r'      s    zgraphemes=%s phonemes=%s)r\   r]   r^   zfeature.possible_transitionsTrainingz"Training completed in %s second(s))r   parentmkdirsetr
   Trainerr   	enumeratestripr   	itertoolszip_longestr   join_LOGGERwarningr   r   	Exception	exception
set_paramsdebug
get_paramsinfotimeperf_countertrainr   	logparserlast_iteration)rW   rX   rY   rZ   r   r   r[   r\   r]   r^   trainercorpusr$   line	skip_lineparts	aligned_g	aligned_ppartgs_strps_strgspsg1p1	encoded_pe
start_timeend_timer   rg   r   r}      s   


8

r}   c              
   C   s,   t | j| j| j| j| j| j| j| jd dS )zCLI method for train)rW   rX   rY   rZ   r[   r\   r]   r^   N)	r}   r   outputrY   rZ   r[   r\   r]   r^   )argsr   r   r   do_train  s   
r   c                 C   st   t | j}| jr| j}ntj}ttj rtdtj	d |D ]}|
 }|s)q |}||}t|g|R   q dS )zCLI method for predictzReading words from stdin...fileN)r   modeltextssysstdinosisattyfilenoprintstderrro   )r   taggerlinesr   r   r   r   r   r   
do_predict  s   
r   c              
   C   s  zddl m} W n ty  } ztd td |d}~ww t| j}| jr-| j}ntj	}t
tj	 r?tdtjd i }|D ]}| }|rOd|vrPqC|jd	d
\}}|||< qCi }	t }
|D ]}||}d||	|< qet }d}d}d}| D ]&\}}|	|d}|r|||}||7 }|t|7 }q|d	7 }td| q|dksJ dt|| d}tt|	||
  d}td|d|d| |dkrtd| dS dS )zCLI method for testr   )levenshteinz4rapidfuzz library is needed for levenshtein distancezpip install 'rapidfuzz>=1.4.1'Nz#Reading lexicon lines from stdin...r    r5   rd    zNo pronunciation for %szNo phonemes were read   zPER:zErrors:z
words/sec:zTotal missing:)rapidfuzz.string_metricr   ImportErrorrs   criticalr   r   r   r   r   r   r   r   r   r   ro   r   r{   r|   rr   itemsgetr*   rt   round)r   r   r   r   r   lexiconr   r   actual_phonemespredicted_phonemesr   r   r   
num_errorsnum_missingnum_phonemesexpected_phonemesdistanceperwpsr   r   r   do_test-  sZ   




r   c                  C   s  t jdd} |  }d|_d|_|jddd}|jddd	d
 |jdddd
 |jdtddd |jdtddd |jdtddd |jdddd |jdddd |jddd d! |j	t
d" |jd#d$d}|jd%dd&d
 |jd'dd(d! |j	td" |jd)d*d}|jd%dd&d
 |jd'dd+d! |j	td" |||fD ]
}|jd,d-d.d/ q|  }|jrtjtjd0 ntjtjd0 t| || d1S )2zMain entry pointzg2p.py)progTcommandr}   z=Train a new G2P model from a pre-aligned Phonetisaurus corpus)helpz--corpusz(Path to aligned Phonetisaurus g2p corpus)requiredr   z--outputzPath to output tagger modelz--c1rU   z
L1 penalty)typedefaultr   z--c2r,   z
L2 penaltyz--max-iterationsrV   z4Maximum number of training iterations (default: 100)z--group-separatorrT   z(Separator between graphemes and phonemes)r   r   z--item-separatorr   z"Separator between items in a groupz--remove-phonemes*zRemove phonemes from examples)nargsr   )funcpredictzPredict phonemes for word(s)z--modelzPath to G2P tagger modelr   WordstestzTest G2P model on a lexiconz+Lines with '<word> <phoneme> <phoneme> ...'z--debug
store_truezPrint DEBUG messages to console)actionr   )levelN)argparseArgumentParseradd_subparsersr   dest
add_parseradd_argumentrR   rQ   set_defaultsr   r   r   
parse_argsrx   loggingbasicConfigDEBUGINFOrs   r   )parsersub_parserstrain_parserpredict_parsertest_parser
sub_parserr   r   r   r   mains  sr   
r   __main__)!rH   r   r?   rp   r   r   r   r{   rK   r(   pathlibr   r
   	getLoggerrs   Dictr   rL   rM   rQ   rR   rS   rI   rJ   r   OptionalIterabler}   r   r   r   r   rE   r   r   r   r   <module>   sj   
p	

dFV
