o
    ,wi                     @   s0  d Z ddlmZ ddlmZ ddlZddlmZ e jZddl	Z
ddlZddlZddlZddlZddlmZ ddlmZ zejd	 W n eyR   ed
 Y nw zejd W n eyi   ed Y nw ejeZdd ZG dd deZedkrg dZe Z eD ]Z!e e!Z"e#e" qdS dS )z~
By kyubyong park(kbpark.linguist@gmail.com) and Jongseok Kim(https://github.com/ozmig77)
https://www.github.com/kyubyong/g2p
    )pos_tag)cmudictN)TweetTokenizer)str   )normalize_numbersz&taggers/averaged_perceptron_tagger.zipaveraged_perceptron_taggerzcorpora/cmudict.zipr   c                  C   sp   t jtd} t }t| dd  D ] }|	drq|
 d\}}}}| | |f|| < q|S )Nzhomographs.enrutf8#|)ospathjoindirnamedictcodecsopenread
splitlines
startswithstripsplitlower)fhomograph2featureslineheadwordpron1pron2pos1 r!   G/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/g2p_en/g2p.pyconstruct_homograph_dictionary    s   r#   c                       sV   e Zd Z fddZdd Zdd Zdd Zdd
dZdd Zdd Z	dd Z
  ZS )G2pc                    s   t    g dtd | _g dg d | _dd t| jD | _dd t| jD | _dd t| jD | _d	d t| jD | _	t
 | _|   t | _d S )
N)<pad><unk></s>abcdefghijklmnopqrstuvwxyz)r%   r&   z<s>r'   )FAA0AA1AA2AE0AE1AE2AH0AH1AH2AO0AO1AO2AW0AW1AW2AY0AY1AY2BCHDDHEH0EH1EH2ER0ER1ER2EY0EY1EY2FGHHIH0IH1IH2IY0IY1IY2JHKLMNNGOW0OW1OW2OY0OY1OY2PRSSHTTHUH0UH1UH2UWUW0UW1UW2VWYZZHc                 S      i | ]\}}||qS r!   r!   .0idxgr!   r!   r"   
<dictcomp>A       z G2p.__init__.<locals>.<dictcomp>c                 S      i | ]\}}||qS r!   r!   rp   r!   r!   r"   rt   B   ru   c                 S   ro   r!   r!   rq   rr   pr!   r!   r"   rt   D   ru   c                 S   rv   r!   r!   rw   r!   r!   r"   rt   E   ru   )super__init__list	graphemesphonemes	enumerateg2idxidx2gp2idxidx2pr   r   cmuload_variablesr#   r   self	__class__r!   r"   rz   5   s   
	
zG2p.__init__c                 C   s   t tjtd| _| jd | _| jd | _| jd | _	| jd | _
| jd | _| jd | _| jd | _| jd	 | _| jd
 | _| jd | _| jd | _| jd | _d S )Nzcheckpoint20.npzenc_embenc_w_ihenc_w_hhenc_b_ihenc_b_hhdec_embdec_w_ihdec_w_hhdec_b_ihdec_b_hhfc_wfc_b)nploadr   r   r   r   	variablesr   r   r   r   r   r   r   r   r   r   r   r   r   r!   r!   r"   r   K   s   zG2p.load_variablesc                 C   s   ddt |   S )Nr   )r   exp)r   xr!   r!   r"   sigmoid[   s   zG2p.sigmoidc                 C   s   t ||j| }t ||j| }|d d d |jd d d f |d d |jd d d d f }	}
|d d d |jd d d f |d d |jd d d d f }}| |	| }t |dd\}}t |
||  }d| | ||  }|S )N      r   )r   matmulra   shaper   r   tanh)r   r   hw_ihw_hhb_ihb_hhrzn_ihrzn_hhrz_ihn_ihrz_hhn_hhrzr	   znr!   r!   r"   grucell^   s   FFzG2p.grucellNc              	   C   s   |d u rt |jd |jd ft j}|}t |jd ||jd ft j}	t|D ] }
| |d d |
d d f |||||}||	d d |
d d f< q)|	S )Nr   r   )r   zerosr   float32ranger   )r   r   stepsr   r   r   r   h0r   outputstr!   r!   r"   grum   s    &zG2p.gruc                    s>   t |dg } fdd|D }tj jt|ddd}|S )Nr'   c                    s    g | ]} j | j d  qS r&   )r   getrq   charr   r!   r"   
<listcomp>y   s     zG2p.encode.<locals>.<listcomp>r   axis)r{   r   taker   expand_dims)r   wordcharsr   r!   r   r"   encodew   s   z
G2p.encodec           
         s    |} j|t|d  j j j jtd jj	d ftj
d}|d d dd d f }tj jdgdd}|}g }tdD ]4} || j j j j}t| jj j }| }	|	dkrf n||	 tj j|	gdd}qA fd	d
|D }|S )Nr   r   )r   r   r   r      r   c                    s   g | ]	} j |d qS r   )r   r   )rq   rr   r   r!   r"   r      s    zG2p.predict.<locals>.<listcomp>)r   r   lenr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   ra   r   argmaxappend)
r   r   enclast_hiddendecr   predsilogitspredr!   r   r"   predict~   s"   
 
zG2p.predictc                 C   s
  t |}t|}ddd td|D }| }tdd|}|dd}|dd	}t	|}t
|}g }|D ]E\}}td
|d u rI|g}n*|| jv ra| j| \}}	}
||
r^|}n|	}n|| jv rn| j| d }n| |}|| |dg q9|d d S )N c                 s   s"    | ]}t |d kr|V  qdS )MnN)unicodedatacategoryr   r!   r!   r"   	<genexpr>   s    zG2p.__call__.<locals>.<genexpr>NFDz[^ a-z'.,?!\-]zi.e.zthat isze.g.zfor examplez[a-z]r    r   )unicoder   r   r   	normalizer   resubreplaceword_tokenizer   searchr   r   r   r   extend)r   textwordstokenspronsr   pospronr   r   r    r!   r!   r"   __call__   s0   




zG2p.__call__)N)__name__
__module____qualname__rz   r   r   r   r   r   r   r   __classcell__r!   r!   r   r"   r$   4   s    

r$   __main__)zI have $250 in my pocket.z popular pets, e.g. cats and dogsz+I refuse to collect the refuse around here.zI'm an activationist.)$__doc__nltkr   nltk.corpusr   nltk.tokenizer   tokenizer   numpyr   r   r   r   r   builtinsr   r   expandr   datafindLookupErrordownloadr   r   __file__r#   objectr$   r   textsg2pr   outprintr!   r!   r!   r"   <module>   sF    
