o
    
i5                     @   s  d dl mZ d dlmZ ddlmZ ddlmZ ddlm	Z	 ddl
ZddlZddlZddlZddlZi edd	ed
d	eddeddeddeddeddeddeddeddeddeddeddeddedd ed!d"ed#d$i ed%d&ed'd(ed)d*ed+d,ed-d.ed/d0ed1d2ed3d4ed5d6ed7d8ed9d:ed;d<ed=d>ed?d@edAdBedCdDedEd2i edFdGedHd6edIdJedKdLedMdNedOdPedQdRedSdTedUdVedWdXedYdZed[d\ed]d^ed_d`edadbedcddededfi edgdhedidjedkdledmdnedodpedqdredsdtedudvedwdxedydzed{d|ed}d~eddeddeddeddeddi eddeddeddeddeddeddeddeddeddeddeddeddeddeddeddedd$Zeedkredd eddD sJ eeddeddeddeddi eedkredd eddD s!J ei eded deded deded deded deded deded deded deded deded deded deded deded deded deded deded ded!ed ded!ed di ed!ed ded!ed ded/ed ded/ed dÓed/ed dēed/ed dœed1ed dƓed1ed dǓed1ed dȓed1ed dɓedCed dʓedCed d˓edCed d̓edCed d͓edEed dǓedEed dȓedEed dɓi edFed dΓedFed dϓedFed dГedFed dѓedIed dғedIed dӓedKed dԓedKed dՓedMed d֓edOed dדedSed dؓedSed dٓedSed dړedSed dۓedaed dܓedaed dݓedaed dޓi edaed dߓedced dedced dedced dedeed dedeed dedeed dedged dedged dedged dedged dedged dedged ded{ed ded{ed ded{ed deded deded deded deded deded deded deded deded deded di eedksJ ee D ] \ZZeedkr e\ZZeev reev s J eefqei dddddddddeddedd eddeddddddddd	d
ddd	ddddddddddededddd g dZedd eeddeD sJ dd eD ZejedZ e!dd e D Z"W d   n	1 sw   Y  dd Z#e!d Z$e!d!Z%eG d"d# d#Z&eG d$d% d%Z'G d&d' d'Z(dS ((     )data)Convert    )	dataclass)Tagger)TupleNiA0  aiB0  iC0  iiD0  iE0  u   ɯiF0  iG0  eiH0  iI0  oiJ0  iK0  kaiL0  u   ɡaiM0  u   kʲiiN0  u   ɡʲiiO0  u   kɯiP0  u   ɡɯiQ0  keiR0  u   ɡeiS0  koiT0  u   ɡoiU0  saiV0  u   ʣaiW0  u   ɕiiX0  u   ʥiiY0  u   sɨiZ0  u   zɨi[0  sei\0  u   ʣei]0  soi^0  u   ʣoi_0  tai`0  daia0  u   ʨiib0  id0  u   ʦɨie0  if0  teig0  deih0  toii0  doij0  naik0  u   ɲiil0  u   nɯim0  nein0  noio0  haip0  baiq0  pair0  u   çiis0  u   bʲiit0  u   pʲiiu0  u   ɸɯiv0  u   bɯiw0  u   pɯix0  heiy0  beiz0  pei{0  hoi|0  boi}0  poi~0  mai0  u   mʲii0  u   mɯi0  mei0  moi0  jai0  i0  u   jɯi0  i0  joi0  i0  u   ɾai0  u   ɾʲii0  u   ɾɯi0  u   ɾei0  u   ɾoi0  u   βai0  i0  i0  i0  i0  u   vɯi0  i0  T   c                 c   s$    | ]}|d v pt |tv V  qdS )>   c0  0  NchrHEPBURN.0r	    r1   A/home/ubuntu/.local/lib/python3.10/site-packages/misaki/cutlet.py	<genexpr>f      " r3   i0  i0  vai0  u   vʲii0  vei0  voX   c                 c   s    | ]	}t |tv V  qd S Nr,   r/   r1   r1   r2   r3   n   s    i0  jeu   βiu   βeu   βou   kʲeu   kʲau   kʲɨu   kʲou   ɡʲau   ɡʲɨu   ɡʲou   kᵝau   kᵝiu   kᵝeu   kᵝou   ɡᵝau   ɡᵝiu   ɡᵝeu   ɡᵝou   ɕeu   ɕau   ɕɨu   ɕou   ʥeu   ʥau   ʥɨu   ʥou   ʨeu   ʨau   ʨɨu   ʨou   ʦau   ʦʲiu   ʦeu   ʦou   tʲiu   tʲɨu   dʲiu   dʲɨu   tɯu   dɯu   ɲeu   ɲau   ɲɨu   ɲou   çeu   çau   çɨu   çou   bʲau   bʲɨu   bʲou   pʲau   pʲɨu   pʲou   ɸau   ɸʲiu   ɸeu   ɸou   ɸʲɨu   ɸʲou   mʲau   mʲɨu   mʲou   ɾʲau   ɾʲɨu   ɾʲo      u   。.u   、,u   ？?u   ！!u   「i   u   」i   u   『u   』u   ：:u   ；;u   （(u   ）)u   《u   》u   【[u   】]u   ・    — )u   ，u   ～u   〜rH      «   »u   ゚u   ゙)u   ㇰクu   ㇱシu   ㇲスu   ㇳトu   ㇴヌu   ㇵハu   ㇶヒu   ㇷフu   ㇸヘu   ㇹホu   ㇺムu   ㇻラu   ㇼリu   ㇽルu   ㇾレu   ㇿロc                 c   s$    | ]\}}t ||d  kV  qdS )r   N)r-   )r0   r	   kkr1   r1   r2   r3      r4   i1  i 2  c                 C   s   i | ]	}|d  |d qS )r   r   r1   )r0   rL   r1   r1   r2   
<dictcomp>   s    rM   zja_words.txtc                 C   s   h | ]}|  qS r1   )strip)r0   liner1   r1   r2   	<setcomp>       rP   c                 C   s*   z
d | }d| W S  ty   Y dS w )z6Given a kana (single-character string), add a dakuten.u<   かきくけこさしすせそたちつてとはひふへほu<   がぎぐげござじずぜぞだぢづでどばびぶべぼN)index
ValueError)rL   iir1   r1   r2   add_dakuten   s   

rU   u   ゃゅょぁぃぅぇぉu   〃々ゝゞヽc                   @   s&   e Zd ZU eed< eed< eed< dS )Wordsurfacehira	char_typeN)__name__
__module____qualname__str__annotations__intr1   r1   r1   r2   rV         
 rV   c                   @   s&   e Zd ZU eed< eed< dd ZdS )TokenrW   spacec                 C   s   | j rdnd}| j | S )NrG   rI   )rb   rW   )selfspr1   r1   r2   __str__  s   zToken.__str__N)rZ   r[   r\   r]   r^   boolre   r1   r1   r1   r2   ra      r`   ra   c                   @   sJ   e Zd Zdd Zdeedf fddZdd Zd	d
 Zdd Z	dd Z
dS )Cutletc                 C   s   t  | _tt| _i | _d S r9   )r   taggerdictr.   table
exceptionsrc   r1   r1   r2   __init__  s   

zCutlet.__init__returnNc                 C   s|   |sdS |  |}dd | |D }| |}ddd |D }tdd| dd	d
d}tdd|}|dfS )z(Build a complete string from input text.)rI   Nc                 S   sH   g | ] }t |jt|jjp|jjp|j|jd ks|jsdn|jqS )      )	rV   rW   jaconv	kata2hirafeaturepronkanarY   is_unkr0   wr1   r1   r2   
<listcomp>  s    z#Cutlet.__call__.<locals>.<listcomp>rI   c                 S   s   g | ]}t |qS r1   )r]   )r0   tokr1   r1   r2   ry     rQ   z\s+rG   rC   rJ   rD   rK   u5   (?<![!",.:;?»—…”]) (?=ʔ)|(?<=ʔ) (?!["«“])N)_normalize_textrh   _romaji_tokensjoinresubrN   replace)rc   textwordstokensoutpsr1   r1   r2   __call__  s   

"zCutlet.__call__c                 C   sr   t dd|}t D ]
\}}|||}qtd|}tj|dd}tj	|ddd}d
dd	 t d
|D S )a-  Given text, normalize variations in Japanese.

        This specifically removes variations that are meaningless for romaji
        conversion using the following steps:

        - Unicode NFKC normalization
        - Full-width Latin to half-width
        - Half-width katakana to full-width
        u   [〜～](?=\d)u   からNFKCF)ru   )digitasciirI   c                 S   s$   g | ]}|  rd t| n|qS )rG   )isdigitr   )r0   tr1   r1   r2   ry   0  s   $ z*Cutlet._normalize_text.<locals>.<listcomp>z\d+|\D+)r~   r   Katakana_Phonetic_Extensionsitemsr   unicodedata	normalizemojimoji
zen_to_han
han_to_zenr}   findall)rc   r   kvr1   r1   r2   r{     s   zCutlet._normalize_textc                    s  g }d  t k rSt fddt d t D t }t fddt| dD d}|du rB|  g  d7  n| |  |  t k s
dd	 |D g }tD ]d\}}|rj|d nd}|dkrv|d  nd}	|t d k r|d  nd}
| |}t|d
}|j}|dv s|dv r|rd|_n|dv s|dv r|rd
|_d|_n|dkrd
|_nd|_|| q`|D ]
}|j	dd|_q|S )z(Build a list of tokens from input nodes.r   c                 3   s(    | ]}| j   j kr|V  qd S r9   )rY   )r0   zr	   r   r1   r2   r3   7  s   & z(Cutlet._romaji_tokens.<locals>.<genexpr>r   c                 3   s4    | ]}d  dd  | D tv r|V  qdS )rI   c                 s       | ]}|j V  qd S r9   rW   rw   r1   r1   r2   r3   8      z2Cutlet._romaji_tokens.<locals>.<genexpr>.<genexpr>N)r}   JA_WORDS)r0   jr   r1   r2   r3   8  s   2 Nc                 S   s>   g | ]}t d dd |D d dd |D |d jqS )rI   c                 s   r   r9   r   rw   r1   r1   r2   r3   @  r   z3Cutlet._romaji_tokens.<locals>.<listcomp>.<genexpr>c                 s   r   r9   )rX   rw   r1   r1   r2   r3   A  r   r   )rV   r}   rY   )r0   gr1   r1   r2   ry   ?  s    z)Cutlet._romaji_tokens.<locals>.<listcomp>Fu   「『«z([Tu   」』»z]).,?!:rG      っrI   )
lennextrangeappend	enumerate_romaji_wordra   rW   rb   r   )rc   r   groupsr   r   r   wiwordr#   pwnwromarz   rW   r1   r   r2   r|   2  sJ   ,"
	 

zCutlet._romaji_tokensc           	         s   |j }| jv r j| S | rJ || r|S |jdkr,dt fdd|S |jdkr3dS d}|j}t|D ])\}}|t	|d k rN||d  nd}|dkrZ||d  nd}| 
|||7 }q<|S )	z+Return the romaji for a single word (node).   rI   c                    s    j | | S r9   )rj   get)crl   r1   r2   <lambda>g  s    z%Cutlet._romaji_word.<locals>.<lambda>rp   r   Nr   )rW   rk   r   isasciirY   r}   maprX   r   r   _get_single_mapping)	rc   r   rW   r   rX   kicharnkpkr1   rl   r2   r   ^  s"   



 zCutlet._romaji_wordc                    sL  |t v r%|dv r|r|S dS |dv r#|sdS t|}|r!| j| S dS dS |r5|| | jv r5| j||  S |r@|| | jv r@dS |rZ|tv rZ|dkrLdS | j| dd | j|  S |tv r`dS |dkrfdS |dkrld	S |d
kr| j|  r d dv rdS  d dv rdS t fdddD rdS  d dv rdS dS | j|dS )z@Given a single kana and its neighbors, return the mapped romaji.u   ゝヽrI   u   ゞヾr   Nr   u   ーu   ːu   ʔu   んr   mpbmu   kɡu   ŋc                 3   s    | ]}  |V  qd S r9   )
startswith)r0   ptnkr1   r2   r3     s    z-Cutlet._get_single_mapping.<locals>.<genexpr>)   ɲu   ʨu   ʥr   u   ntdɾznu   ɴ)ODORIrU   rj   SUTEGANAr   any)rc   r   rL   r   vvr1   r   r2   r   r  sH   zCutlet._get_single_mapping)rZ   r[   r\   rm   r   r]   r   r{   r|   r   r   r1   r1   r1   r2   rg     s    ,rg   ))rI   r   num2kanar   dataclassesr   fugashir   typingr   importlib.resources	importlibrq   r   r~   r   r-   r.   r   allr   updatelistr   r   r   r   br   zip	resources	open_textr	frozensetr   rU   r   r   rV   ra   rg   r1   r1   r1   r2   <module>   s  	
 !"$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRTUV,X,	
 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEN 
	


 *
	
