o
    Xεi;                     @   s   d Z ddlZddlZddlZddlZddlZddlZddlmZ ddl	m
Z
 ddlZedZejeejf Zdd Zdd	 Zd
d Ze ZG dd dZedkrVe  dS dS )zlGuess word pronunciations using a Phonetisaurus FST

See bin/fst2npz.py to convert an FST to a numpy graph.
    N)defaultdict)Pathg2p_phonetisaurusc                  C   s  t jdd} |  }d|_d|_|jddd}|jddd	d
 |jdddd |jddtdd |jddtdd |jddtdd |jddtdd |jdddd |jd d!d"d |jd#d$d%d& |j	t
d' |jd(d)d}|jddd	d
 |jd*dd+d |jddtdd |jddtdd |jddtdd |jd#d$d%d& |j	td' ||fD ]
}|jd,d$d-d& q|  }|jrtjtjd. ntjtjd. t| || d/S )0zMain entry pointr   )progTcommandpredictzPredict phonemes for word(s))helpz--graphz&Path to graph npz file from fst2npy.py)requiredr   words*z!Words to guess pronunciations for)nargsr   z--max-guesses   z/Maximum number of guesses per word (default: 1))defaulttyper   z--beami  z+Initial width of search beam (default: 500)z
--min-beamd   z+Minimum width of search beam (default: 100)z--beam-scale333333?z8Scalar multiplied by beam after each step (default: 0.6)z--grapheme-separator z1Separator between input graphemes (default: none))r   r   z--phoneme-separator z2Separator between output phonemes (default: space)z--preload-graph
store_truez)Preload graph into memory before starting)actionr   )functestzTest G2P model on a lexicontextsz+Lines with '<word> <phoneme> <phoneme> ...'z--debugzPrint DEBUG messages to console)levelN)argparseArgumentParseradd_subparsersr	   dest
add_parseradd_argumentintfloatset_defaults
do_predictdo_test
parse_argsdebugloggingbasicConfigDEBUGINFO_LOGGERr   )parsersub_parserspredict_parsertest_parser
sub_parserargs r2   K/home/ubuntu/.local/lib/python3.10/site-packages/gruut/g2p_phonetisaurus.pymain   s   
r4   c                 C   s   t | j| _td| j tj| j| jd}| jr%| j}tdt	| nt
j}tt
j r7tdt
jd |j|| j| j| j| j| jdD ]\}}}|sVtd|| qGt|| j| qGdS )	zPredict phonemes for wordsLoading graph from %spreloadz&Guessing pronunciations for %s word(s)zReading words from stdin...file)grapheme_separatormax_guessesbeammin_beam
beam_scalezNo pronunciation for %s (%s)N)r   graphr+   r&   PhonetisaurusGraphloadpreload_graphr
   infolensysstdinosisattyfilenoprintstderrg2pr:   r;   r<   r=   r>   warningphoneme_separatorjoin)r1   
phon_graphr
   word	graphemesphonemesr2   r2   r3   r#      s,   r#   c              
   C   s  zddl m} W n ty  } ztd td |d}~ww t| j| _td| j tj	| j| j
d}| jr>| j}ntj}ttj rPtdtjd	 i }|D ]}| }|r`d
|vraqT|jdd\}}|||< qTi }	t }
|D ]}|j|g| j| j| jddD ]\}}}d
||	|<  qvt }d}d}d}| D ]&\}}|	|d}|r|||}||7 }|t|7 }q|d7 }td| q|dksJ dt || d}t t|	||
  d}td|d|d| |dkrtd| dS dS )z)Test performance relative a known lexiconr   )levenshteinz4rapidfuzz library is needed for levenshtein distancezpip install 'rapidfuzz>=1.4.1'Nr5   r6   z#Reading lexicon lines from stdin...r8   r   r   )maxsplit)r<   r=   r>   r;   r   zNo pronunciation for %szNo phonemes were read   zPER:zErrors:z
words/sec:zTotal missing:)!rapidfuzz.string_metricrT   ImportErrorr+   criticalr   r?   r&   r@   rA   rB   r   rE   rF   rG   rH   rI   rJ   rK   stripsplittimeperf_counterrL   r<   r=   r>   rO   itemsgetrD   rM   round)r1   rT   erP   lineslexiconlinerQ   actual_phonemespredicted_phonemes
start_time_guessed_phonemesend_time
num_errorsnum_missingnum_phonemesexpected_phonemesdistanceperwpsr2   r2   r3   r$      sn   



r$   c                   @   s   e Zd ZdZddedefddZedej	e
ef dd fd	d
Zdejej	e
eje
 f  dejejej	e
eje
 f eje
 eje
 f f fddZ						ddej	e
eje
 f de
dededede
dedejejeje
 eje
 f  fddZdS ) r@   zGraph of numpy arrays that represents a Phonetisaurus FST

    Also contains shared cache of edges and final state probabilities.
    These caches are necessary to ensure that the .npz file stays small and fast
    to load.
    Fr?   r7   c                 C   s   || _ t| j d  | _| j d | _| j d | _g | _| j d D ]}|ddd}| j	t
||f q!| j d | _| j d	 | _|| _tt| _i | _|rqt| jD ]\}^}}| j| 	| qU| jt| j| j d S d S )
N
start_nodeedges
edge_probssymbolsrh   r   |final_nodesfinal_probs)r?   r    itemrr   rs   rt   ru   replacer[   appendrD   rw   rx   	preloadedr   list	out_edgesfinal_node_probs	enumerateupdatezip)selfr?   r7   
symbol_strsymbol_listedge_idx	from_noderh   r2   r2   r3   __init__  s$   
zPhonetisaurusGraph.__init__
graph_pathreturnc                 K   s   t j| dd}t|fi |S )zLoad .npz file with numpy graphT)allow_pickle)nprA   r@   )r   kwargsnp_graphr2   r2   r3   rA   2  s   zPhonetisaurusGraph.loadr
   c                 k   s8    |D ]}| j |fi |D ]
\}}|||fV  qqdS )zGuess phonemes for wordsN)g2p_one)r   r
   r   rQ   rR   rS   r2   r2   r3   rL   8  s   
zPhonetisaurusGraph.g2p<eps>  r   r   r   r   rQ   epsr<   r=   r>   r:   r;   c           &      c   s`   |}g }	t |tr| }|r||}	nt|}	n|}	|	s#|	g fS d| j|	g dfg}
g }g }t }|
rd}g }|
D ]-\}}}}}|rft|}||vrZ|||f |	| t
||kred} nq<|duslJ |s| jry| j|t}n.| j|}|du rtt| j|}| j| |krt| j| }|| j|< nt}|| j|< |tkrtt|}||| dg |df t
|}| jr| j| }n@| j|}|du rtt| jdddf |}g }| j| d |kr|| |d7 }| j| d |ks|| j|< n|}|D ]]}| j| \}}}}| j| }| j| \} }!| |kr*q|!|gkr@|| |||df}"||" q|d|  }#|!|#krh| j| \}}$|| ||t
|#d ||$ df}"||" qq<|ront|dd d	d| }|}
t|t|| }|
s6|rt|d
d d	d| D ]\}}%|	dd |%D fV  qdS |	g fV  dS )zGuess phonemes for wordg        FTNr   r   c                 S      | d S Nr   r2   ry   r2   r2   r3   <lambda>      z,PhonetisaurusGraph.g2p_one.<locals>.<lambda>)keyc                 S   r   r   r2   r   r2   r2   r3   r     r   c                 S   s   g | ]}|r|qS r2   r2   ).0pr2   r2   r3   
<listcomp>  s    z.PhonetisaurusGraph.g2p_one.<locals>.<listcomp>)
isinstancestrrZ   r[   r}   rr   settupler{   addrD   r|   r   r_   
_NOT_FINALr    r   searchsortedrw   r!   rx   typingcastr~   rs   rt   ru   sortedmax)&r   rQ   r   r<   r=   r>   r:   r;   current_beamrR   qq_next	best_heapri   done_with_wordprobnodenext_graphemesoutputis_finalrS   
final_prob	final_idxlen_next_graphemes	edge_idxsmaybe_edge_idxsr   rh   to_node
ilabel_idx
olabel_idxout_problen_igraphemes
igraphemesry   sub_graphemesolabelguess_phonemesr2   r2   r3   r   F  s   









]zPhonetisaurusGraph.g2p_oneN)F)r   r   r   r   r   r   )__name__
__module____qualname____doc__NUMPY_GRAPHboolr   staticmethodr   Unionr   r   rA   IterableSequenceTuplerL   r    r!   r   r2   r2   r2   r3   r@     sP    "
	r@   __main__)r   r   r'   rG   rE   r\   r   collectionsr   pathlibr   numpyr   	getLoggerr+   Dictr   ndarrayr   r4   r#   r$   objectr   r@   r   r2   r2   r2   r3   <module>   s*   
x%P ^
