o
    jo™iJ$  ã                   @   sÆ   d Z ddlZddlZddlmZ ddlmZ ddlmZ dejdejfdd	„Z	dejdejfd
d„Z
dejdejfdd„Ze ej¡ ¡ ZG dd„ dƒZG dd„ dƒZG dd„ dƒZG dd„ dƒZdS )a  Grammar to handle the "channel" part of source-channel chatspeak normalizer.

We consider the following cases

1) Common chatspeak terms, from a lexicon.
2) Common patterns, as described by regular expression.
3) Letter-duplication cases, such as "cooooooool"
4) Some abbreviations with optional sonorant and vowel deletions.

Chatspeak terms are normalized by this grammar to a lattice of possible
verbalizations for a sentence, which then gets scored by a language model.

For simplicity we assume that all text is case-free (lower case).
é    N)Úbyte)Úpynutil)ÚrewriteÚtokenÚreturnc                 C   s   t  | d¡S )Né   ©ÚpyniniÚclosure©r   © r   úM/home/ubuntu/.local/lib/python3.10/site-packages/pynini/examples/chatspeak.pyÚ_plus)   s   r   c                 C   s
   t  | ¡S ©Nr   r   r   r   r   Ú_star-   s   
r   c                 C   s   t  | dd¡S )Nr   r   r   r   r   r   r   Ú_ques1   s   r   c                   @   sd   e Zd ZU dZejed< ejed< dejfdd„Zede	dejfd	d
„ƒZ
dejdejfdd„ZdS )ÚDeduplicatorz-Container for a deduplicator for all letters.Ú_dedupÚ_lexiconÚlexiconc                 C   sP   t tjƒ}t|ƒ}t |¡| _|D ]}|  jt |¡  _| j ¡  q|| _dS )z\Constructs the deduplicator.

    Args:
      lexicon: an FSA representing the lexicon.
    N)	ÚiterÚstringÚascii_lowercaseÚnextr   Ú
dedup_ruler   Úoptimizer   )Úselfr   ÚitÚletterr   r   r   Ú__init__G   s   

zDeduplicator.__init__r   r   c                 C   s6   t j|  }t t t| ƒt| ƒ¡d|B |  d|B t¡S )zíCompiles transducer that optionally deletes multiple letters.

    One or two of the same letter must be encountered beforehand.

    Args:
      letter: a letter.

    Returns:
      An FST deleting that in an appropriate sequence.
    z[BOS]ú[EOS])r   ÚLOWERr	   Ú	cdrewriteÚcrossr   r   Ú_sigma_star)r   Ú
not_letterr   r   r   r   U   s
   
þzDeduplicator.dedup_ruler   c                 C   s<   zt  || j¡}t  || j¡W S  t jy   t ¡  Y S w )zµFinds deduplication candidates for a token in a lexicon.

    Args:
      token: a "cooooool"-like token.

    Returns:
      An FST representing a lattice of possible matches.
    )r   Úrewrite_latticer   r   ÚErrorr	   ÚFst)r   r   Úlatticer   r   r   Úexpandf   s   	ÿzDeduplicator.expandN)Ú__name__Ú
__module__Ú__qualname__Ú__doc__r	   r(   Ú__annotations__r   ÚstaticmethodÚstrr   ÚFstLiker*   r   r   r   r   r   A   s   
 

r   c                   @   sÊ   e Zd ZU dZe ddddd¡Ze ddd	¡Zej	e Z
eeƒed
ƒ Zeje e¡ee
e edd ¡ Zeje e¡e
e
dB edd ¡ Zej	d Zejed< dejfdd„Zdejdejfdd„ZdS )ÚDeabbreviatora  Expands abbreviations formed by deleting vowels or sonorants.

  The result must have at least two letters and should not already be in the
  lexicon (i.e., we don't wish to expand things that could be words), and the
  expansion should have at least three letters.
  ÚaÚeÚiÚoÚuÚrÚlÚnÚyÚopt)Úmoder    )é   .Ú	_deabbrevr   c                 C   sH   t jd | }| ¡  || j | j | j }|| ¡  | _| j ¡  dS )z]Constructs the deabbreviator.

    Args:
      lexicon: an FSA representing the lexicon.
    )é   .N)r   r!   r   Ú_three_lettersÚ_r_deletionÚ_v_deletionÚinvertr@   )r   r   Útwo_letters_not_in_lexiconÚrulesr   r   r   r   Œ   s
   zDeabbreviator.__init__r   r   c                 C   ó.   zt  || j¡W S  t jy   t ¡  Y S w r   )r   r&   r@   r'   r	   r(   ©r   r   r   r   r   r*   ˜   ó
   ÿzDeabbreviator.expandN)r+   r,   r-   r.   r	   ÚunionÚ_vÚ_rr   r!   Ú_cr   r   Ú_vowel_spanr"   r   Údeleter$   r   rC   rD   rB   r(   r/   r   r2   r*   r   r   r   r   r3   v   s*   
 
ÿÿþþ

r3   c                	   @   s$  e Zd ZdZejg e dedƒ edƒ d¡‘e dedƒ d¡‘e dedƒ d¡‘e ded	ƒ d
¡‘e dedƒ d¡‘e dedƒ d¡‘e dedƒ d¡‘e dedƒ d d¡‘e de	dƒ edƒ d¡‘e de	dƒ edƒ d¡‘e de	dƒ edƒ d¡‘e de	dƒ edƒ d¡‘e de	dƒ edƒ d¡‘e dedƒ edƒ d¡‘e dedƒ d¡‘e d ed!ƒ d d¡‘e dedƒ d¡‘e edƒe d!d"¡ 
d#¡ edƒ edƒ e d$d%¡¡‘e edƒed"ƒ edƒ ed&ƒ d'¡‘e edƒed"ƒ ed&ƒ edƒ edƒ d'¡‘e d(ed	ƒ d)¡‘e d*ed*ƒ d d+¡‘e d,edƒ d-¡‘e d.ed	ƒ d/¡‘e d/edƒ edƒ d/¡‘e d0edƒ d-¡‘e d1edƒ d2¡‘e d3edƒ d2¡‘e dedƒ d2¡‘e d4edƒ d5¡‘e d6edƒ d5¡‘e d7edƒ d8¡‘e d9edƒ d:¡‘e d;edƒ d< d:¡‘e d=ed>ƒ d?¡‘e d@edƒ d¡‘e dAedƒ dB¡‘e dCedƒ dC¡‘e dDed!ƒ dE edEƒ edƒ dC¡‘e dFedƒ dC¡‘e dGedEƒ dH¡‘e dIed=ƒ dH¡‘e dJed=ƒ dH¡‘e dJedƒ dH¡‘e dKed*ƒ dL¡‘e ed"ƒdM¡‘e dNed	ƒ dO¡‘e d"ed	ƒ d¡‘e dPedJƒ dQ¡‘e dEedRƒ edƒ dS¡‘e dEedRƒ edƒ edRƒ dT¡‘e edUƒedƒ ed"ƒ edVƒ dT¡‘e dWedRƒ dX¡‘e dEed"ƒ edJƒ dX¡‘e edYƒdZ¡‘e d[ed	ƒ d\¡‘e d[edƒ d\¡‘e d]edRƒ d\¡‘e d^ed	ƒ d^¡‘e d]edƒ d^¡‘e d_edDƒ d_¡‘e d_edƒ d_¡‘e d`edƒ d5¡‘e ded"ƒ edVƒ dT¡‘R Ž  ¡ Zdaejdbejfdcdd„ZdeS )fÚRegexpsz#Container for regexp substitutions.ÚbÚzzbye byeÚcongratÚcongratulationsÚcoolÚdelisÚhÚ	deliciousr5   r9   ÚuhÚfÚffÚgr:   z	good luckÚher<   Úheyú'zhe'sÚhowzhow iszhow haszhow waszhow doesÚkewÚkÚokÚkor7   r8   r   zlaugh out loudÚlaughÚvÚloveÚmisÚmissÚmzmm okÚn00bÚnewbieÚnaÚnoÚnoobÚokeÚokayÚokiÚomz	oh my godÚomgÚorlyz	oh reallyÚplÚpleaseÚpwÚaseÚqr;   Ú	_questionÚqoolÚrox0rÚrocksÚsorryÚsÚwÚsryÚthankez	thank youÚthankÚtÚtyvzthank you very muchÚyouÚugÚughÚwaiÚwaitr4   zwhat'sz	what's upÚwaÚpÚwhÚwhatÚxoz'hugs and kisses'ÚyaÚyeahÚyeÚyesÚyupÚzomr   r   c                 C   rH   )z Finds regexps candidates for a token.

    Args:
      token: a "zomggg"-like token.

    Returns:
      An FST representing a lattice of possible matches.
    )r   r&   Ú_regexpsr'   r	   r(   rI   r   r   r   r*   ë   s
   	ÿzRegexps.expandN)r+   r,   r-   r.   r	   rK   r#   r   r   r   r
   r   r˜   r2   r(   r*   r   r   r   r   rQ   Ÿ   s    ÿþýüûúùø	÷
öõôóòñðï ÿ
þî&ë&þêçæåäãâá à!ß"Þ#Ý$Ü%Û&Ú'Ù(Ø)×*Ö&+Õ,Ô-Ó.Ò/Ñ0Ð1Ï2Î3Í4Ì5Ë6Ê"7É8 ÿÈ:Æ;Å<Ä=Ã>Â?Á@ÀA¿B¾C½D¼E»E»GrQ   c                   @   s@   e Zd ZU dZejed< defdd„Zdej	dejfdd	„Z
d
S )ÚLexiconz%Container for a substitution lexicon.r   Úpathc                 C   s   t  |¡ ¡ | _d S r   )r	   Ústring_filer   r   )r   rš   r   r   r   r   ÿ   s   zLexicon.__init__r   r   c                 C   rH   r   )r   r&   r   r'   r	   r(   rI   r   r   r   r*     rJ   zLexicon.expandN)r+   r,   r-   r.   r	   r(   r/   r1   r   r2   r*   r   r   r   r   r™   ú   s
   
 
r™   )r.   r   r	   Ú
pynini.libr   r   r   r2   r(   r   r   r   r
   ÚBYTEr   r$   r   r3   rQ   r™   r   r   r   r   Ú<module>   s   
5)[