o
    PεiDy                     @   s   d Z ddlZddlZddlZddlZddlmZ ddlmZm	Z	m
Z
mZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZm Z  ddl!m"Z" e#dZ$G dd dZ%G d	d
 d
Z&G dd dZ'G dd dZ(dS )z*Functions for manipulating phones/phonemes    N)defaultdict)	_DATA_DIR_DIR
CONSONANTSFEATURE_COLUMNSFEATURE_EMPTYFEATURE_KEYSFEATURE_ORDINAL_COLUMNSIPALANG_ALIASESSCHWASVOWELSAccentBreak	BreakType	ConsonantConsonantPlaceConsonantTypeDipthong
IntonationPhonemeLengthSchwaStressVowelVowelHeightVowelPlacement)resolve_lang	gruut_ipac                   @   s  e Zd ZdZ								ddedeje dejeje	  de
d	ejeje  d
ejeje  dejejeeje f  dejeje  defddZedefddZede
fddZede
fddZede
fddZdefddZededd fddZdS )Phonez4Single IPA phone with diacritics and suprasegmentalsNF lettersstressaccentsis_longnasalraised
diacriticssuprasegmentalstonec
                 C   s  t d|| _|| _t|pg | _|| _|pt | _t	| j| _
|p$t | _t	| j| _|	| _|p4tt| _|p:t | _| jtjkrJ| jtj n| jtjkrW| jtj tj| jv rd| jtj tj| jv rq| jtj | jr{| jtj | jD ]}
| j|
}|d u rt }|| j|
< |tj q~| jD ]}
| j|
}|d u rt }|| j|
< |tj  qd| _!t"| j| _#t$| j| _%t&| j| _'d S )NNFCr   )(unicodedata	normalizer    r!   listr"   r#   setr$   boolis_nasalr%   	is_raisedr(   r   r&   r'   r   PRIMARYaddr
   STRESS_PRIMARY	SECONDARYSTRESS_SECONDARYr   ACUTEACCENT_ACUTEGRAVEACCENT_GRAVELONGgetNASALRAISED_textr   vowelr   	consonantr   schwa)selfr    r!   r"   r#   r$   r%   r&   r'   r(   letter_indexletter_diacritics rE   F/home/ubuntu/.local/lib/python3.10/site-packages/gruut_ipa/phonemes.py__init__.   sN   



zPhone.__init__returnc                 C   s  | j r| j S | jD ]}|tjkr|  j tj7  _ q	|tjkr&|  j tj7  _ q	| jt	j
kr6|  j tj7  _ n| jt	jkrD|  j tj7  _ t| jD ]\}}|  j |7  _ | j|g D ]	}|  j |7  _ q[qI| jrq|  j | j7  _ | jr||  j tj7  _ td| j | _ | j S )z4Get textual representation of phone (NFC normalized)r)   )r>   r"   r   r6   r
   r7   r8   r9   r!   r   r1   r3   r4   r5   	enumerater    r&   r;   r(   r#   r:   r*   r+   )rB   accentrC   letter	diacriticrE   rE   rF   texts   s.   


z
Phone.textc                 C   
   | j duS )zTrue if phone is a vowelN)r?   rB   rE   rE   rF   is_vowel      
zPhone.is_vowelc                 C   rN   )zTrue if phone is a consonantN)r@   rO   rE   rE   rF   is_consonant   rQ   zPhone.is_consonantc                 C   rN   )zTrue if phone is a schwaN)rA   rO   rE   rE   rF   is_schwa   rQ   zPhone.is_schwac                 C      | j S NrM   rO   rE   rE   rF   __repr__      zPhone.__repr__	phone_strc                 C   s  t d| }dttdg t t d}d}d}d}|D ]}|tjkr-|s-|d tj q|tj	kr=|s=|d tj
 q|tjkrHtj|d< q|tjkrStj|d< q|rf|tjtjhv rf|d  |7  < qt|rpd	|d
< qt|r}|d | qt|r|d | qt|st|rqt|r|d  |7  < |d7 }qt|r|d  |7  < d	}qt |dkr|d | | q|d  |7  < |r|d7 }d	}qtdi |S )zParse phone from stringNFDr   )r    r&   r(   r"   r$   r%   Fr   r"   r!   r(   Tr#   r$   r%   r       r&   NrE   )r*   r+   r   r-   r
   r7   appendr   r6   r9   r8   r3   r   r1   r5   r4   TONE_GLOTTALIZED
TONE_SHORTr#   r/   r2   r0   
is_bracketis_breakis_tieis_tone	combiningr   )rY   
codepointskwargsin_tone
new_letterrC   crE   rE   rF   from_string   sV   	








zPhone.from_string)NNFNNNNr   )__name__
__module____qualname____doc__strtypingOptionalr   Iterabler   r.   SetintDictrG   propertyrM   rP   rR   rS   rW   staticmethodri   rE   rE   rE   rF   r   +   sP    	

E&r   c                   @   s   e Zd ZdZdejejeee	f  fddZ
edefddZdefdd	Zd
d Zdd Ze				ddededeje dededd fddZdS )PronunciationzLCollection of phones and breaks for some unit of text (word, sentence, etc.)phones_and_othersc                 C   st   || _ g | _g | _g | _| j D ]%}t|tr| j| qt|tr)| j| qt|tr4| j| qd| _	d S )Nr   )
rx   phonesbreaksintonations
isinstancer   r\   r   r   r>   )rB   rx   prE   rE   rF   rG      s   




zPronunciation.__init__rH   c                 C   s$   | j sddd | jD | _ | j S )z9Get text representation of pronunciation (NFC normalized)r   c                 s       | ]}|j V  qd S rU   rV   .0r}   rE   rE   rF   	<genexpr>      z%Pronunciation.text.<locals>.<genexpr>)r>   joinrx   rO   rE   rE   rF   rM     s   zPronunciation.textc                 C   rT   rU   rV   rO   rE   rE   rF   rW     rX   zPronunciation.__repr__c                 C   
   t | jS rU   )iterrx   rO   rE   rE   rF   __iter__     
zPronunciation.__iter__c                 C   
   | j | S rU   )rx   )rB   idxrE   rE   rF   __getitem__  r   zPronunciation.__getitem__TNFpron_strkeep_stresskeep_accents
drop_tones	keep_tiesc                 C   s  |du r|}g }d}d}d}d}	d}
d}d}d}t d| }|D ]}d}d}d}
| s7t|s7|tjhv r8q t|sBt|rDd}t|rT|sTd}
|rSd}d}nNt	|rbd}|rad}d}n@|rs|tj
tjhv rs|sr||7 }q t|ryn)t|r|rd}nq t|r|s||7 }d}q t |dkr|rd}n|rd}|r|r||	| | |  d}	d}d}d}|
r|r|	|7 }	q |r|r||7 }q ||7 }q |r||	| | |  g }|D ]&}t|r|t| qt|r|t| q|t| qt|S )ue  Split an IPA pronunciation into phones.

        Stress/accent markers bind to the next non-combining codepoint (e.g., ˈa).
        Elongation markers bind to the previous non-combining codepoint (e.g., aː).
        Ties join two non-combining sequences (e.g. t͡ʃ).

        Whitespace and brackets are skipped.

        Returns list of phones.
        Nr   FrZ   Tr   )r*   r+   isspacer
   r_   BREAK_SYLLABLEr`   is_intonation	is_accent	is_stressr]   r^   r#   ra   rb   rc   r\   r   ri   r   r   rw   )r   r   r   r   r   clustersclusterr!   r   r"   r   r(   rf   skip_next_clusterrd   	codepointnew_clusterrx   rE   rE   rF   ri     s   






zPronunciation.from_stringTNFT)rj   rk   rl   rm   ro   ListUnionr   r   r   rG   ru   rn   rM   rW   r   r   rv   r.   rp   ri   rE   rE   rE   rF   rw      s8    
rw   c                   @   s   e Zd ZdZ				ddededed	ejeje  d
ef
ddZ	e
defddZe
defddZdddZdefddZdejeejf fddZdefddZdS )Phonemez;Phoneme composed of international phonetic alphabet symbolsr   FNTrM   exampleunknowntonesis_ipac                 C   s  d| _ d| _|| _|| _t|pg | _d | _g | _d| _t	 | _
t	 | _tt| _td|}d| _d| _|rd}d}d}	|D ]}
|
tjkrP|sP| jtj q?|
tjkr_|s_| jtj q?|
tjkritj| _q?|
tjkrstj| _q?|r|
tjtjhv r|  j|
7  _q?t |
rd| _q?t!|
r| j
"| q?t#|
r| j"| q?t$|
st%|
rq?t&|
r|  j|
7  _d}q?|
tj'tj(tj)hv r| j| |
 q?|  j|
7  _|	r|d7 }d}	q?n|| _td| j| _t*| j| _+t,-| j| _.t/-| j| _0t1-| j| _2d | _3| j.s@| j0sB| j2sDt4| jdkrFt,-| jd }t,-| jd }|rH|rJt5||| _3d S d S d S d S d S d S d S )	Nr   FrZ   r   Tr[   r)      )6r>   _text_comparer   r   r,   r   r!   r"   	elongatedr-   	nasalatedr%   r   _extra_combiningr*   r+   r    r(   r
   r7   r\   r   r6   r9   r8   r3   r   r1   r5   r4   r]   r^   r#   r/   r2   r0   r_   r`   rb   SYLLABICNON_SYLLABICEXTRA_SHORT	graphemesletters_graphemesr   r;   r?   r   r@   r   rA   dipthonglenr   )rB   rM   r   r   r   r   rd   rf   rC   rg   rh   vowel1vowel2rE   rE   rF   rG     s   








)zPhoneme.__init__rH   c                 C   s>  | j r| j S | jD ]}|tjkr|  j tj7  _ q	|tjkr&|  j tj7  _ q	| jt	j
kr6|  j tj7  _ n| jt	jkrD|  j tj7  _ t| jD ]4\}}|  j |7  _ || jv ra|  j tj7  _ || jv rn|  j tj7  _ | j| D ]	}|  j |7  _ qsqI| jr|  j | j7  _ | jr|  j tj7  _ td| j | _ | j S )z:Return letters with stress and elongation (NFC normalized)r)   )r>   r"   r   r6   r
   r7   r8   r9   r!   r   r1   r3   r4   r5   rI   r    r   r<   r%   r=   r   r(   r   r:   r*   r+   )rB   rJ   rC   rK   rh   rE   rE   rF   rM     s6   




zPhoneme.textc                 C   s   | j r| j S t| jD ]4\}}|  j |7  _ || jv r#|  j tj7  _ || jv r0|  j tj7  _ | j| D ]	}|  j |7  _ q5q| j	rK|  j tj
7  _ td| j | _ | j S )zCReturn letters and elongation with no stress/tones (NFC normalized)r)   )r   rI   r    r   r
   r<   r%   r=   r   r   r:   r*   r+   )rB   rC   rK   rh   rE   rE   rF   text_compare#  s   

zPhoneme.text_comparec                 C   s   t | j| j| jdS )zCreate a copy of this phonemes)rM   r   r   )r   rM   r   r   rO   rE   rE   rF   copy=  s   zPhoneme.copyc                 C   rT   )z)Return symbol with stress and elongation.rV   rO   rE   rE   rF   rW   A  s   zPhoneme.__repr__c                 C   s$  d}t | | j| j| jd}| jrd|d< | jr| j|d< dd | jD |d< | jd	ur0| jjnd
|d< | j	rNd}| j	j
j|d< | j	jj|d< | j	j|d< n+| jrhd}| jjj|d< | jjj|d< | jj|d< n| jrnd}n| jryd}| jj|d< ||d< t| j|d< t| j|d< | j|d< |S )z&Return properties of phoneme as a dictr   )rM   r    r(   r   Tr   r   c                 S      g | ]}|j qS rE   )value)r   arE   rE   rF   
<listcomp>U      z#Phoneme.to_dict.<locals>.<listcomp>r"   Nr   r!   r   height	placementroundedr   typeplacevoicedr   r   
r_colouredr   r%   r   )reprr    r(   r   r   r   r"   r!   r   r?   r   r   r   r@   r   r   r   r   rA   r   r,   r   r%   r   )rB   	type_namepropsrE   rE   rF   to_dictE  s@   

zPhoneme.to_dictc                 C   s>   |   }|dd}dd | D }| dd| d S )z$Return descriptive string of phonemer   r   c                 S   s   g | ]\}}| d | qS )=rE   )r   kvrE   rE   rF   r   u  s    z%Phoneme.to_string.<locals>.<listcomp>(z, ))r   r;   itemsr   )rB   r   r   	prop_strsrE   rE   rF   	to_stringp  s   zPhoneme.to_string)r   FNT)rH   r   )rj   rk   rl   rm   rn   r.   ro   rp   rq   rG   ru   rM   r   r   rW   rt   Anyr   r   rE   rE   rE   rF   r     s2    
b'
+r   c                   @   s   e Zd ZdZdZdddZdd Zdd	 Zd
d Zdd Z	e
dedd fddZe
d ddZdd Z				d!dejeef dedeje dededeje fddZdS )"Phonemesz-Set of phonemes and allophones for a language#Nc                 C   s8   |pg | _ |pi | _d | _d | _i | _i | _|   d S rU   )phonemesipa_map_ipa_map_regex_phonemes_sortedgruut_ipa_mapphoneme_textsupdate)rB   r   r   rE   rE   rF   rG     s   

zPhonemes.__init__c                 C   r   rU   )r   r   rO   rE   rE   rF   r     r   zPhonemes.__iter__c                 C   r   rU   )r   r   rO   rE   rE   rF   __len__  r   zPhonemes.__len__c                 C   r   rU   )r   )rB   keyrE   rE   rF   r     r   zPhonemes.__getitem__c                 C   s   t |tr
|| jv S || jv S rU   )r|   rn   r   r   )rB   itemrE   rE   rF   __contains__  s   


zPhonemes.__contains__languagerH   c           
      C   s   t | } t|  d }t|ddd}t|}W d   n1 s!w   Y  d}t|  d }| rci }t|ddd }|D ]}| }|sGq>|jdd\}}	|	||< q>W d   n1 s^w   Y  |rh||_|S )	z"Load phonemes for a given languagezphonemes.txtrzutf-8)encodingNzipa_map.txtr[   maxsplit)	r   r   openr   	from_textis_filestripsplitr   )
r   phonemes_pathphonemes_filer   r   map_pathmap_filelinefrom_phonemeto_iparE   rE   rF   from_language  s*   
	zPhonemes.from_languagec           
      C   s   t  }| D ]X}|jt jdd^}}| }|r]| }|d }d}t|dkr+|d }g }t|dkrRd}|dd D ]}	|	dkrDd	}q;|rL||	 q;||j|	< q;|jt|||d
 q|	  |S )z6Load text file with phonemes, examples, and allophonesr[   r   r   r   r   FN!T)rM   r   r   )
r   r   COMMENT_STRr   r   r\   r   r   r   r   )
	text_filelangr   _partsphoneme_ipar   r   rf   partrE   rE   rF   r     s2   zPhonemes.from_textc                 C   s  g }t | j tddD ]R}|dr||dd  qd}| jD ]1}t|jt| }|dkrS|j|rS|dt	
|d| t	
|j|d  d} nq"|s^|t	
| qd	d
|}t	|| _dd | jD }t |dd dd| _tdd | jD | _dS )z3Call after modifying phonemes or IPA map to re-sortT)r   reverse,r[   NFr   z{}(?!{})z({})|c                 S   s&   g | ]}d d t |jD |fqS )c                 S   r   rE   rV   r   pbrE   rE   rF   r     r   z.Phonemes.update.<locals>.<listcomp>.<listcomp>)rw   ri   rM   r   rE   rE   rF   r     s    z#Phonemes.update.<locals>.<listcomp>c                 S   s   t | d S )Nr   )r   )kprE   rE   rF   <lambda>  s    z!Phonemes.update.<locals>.<lambda>c                 s   r~   rU   rV   r   rE   rE   rF   r      r   z"Phonemes.update.<locals>.<genexpr>)sortedr   keysr   
startswithr\   r   rM   formatreescaper   compiler   r   r-   r   )rB   cases
match_text
case_addedphoneme	num_extraipa_map_regex_strsplit_phonemesrE   rE   rF   r     s<   


zPhonemes.updateTFr   r   r   r   r   c                    s   j s   |du r|}g } jr.t|tr!ddd |D } fdd} j ||}t|tr;dd |D }n|rNtj||||d	}	d
d |	D }nt	|}t
t}
t
t}|rd}t|D ]^\}}|rd}|D ]O}t|r|s|r|
|  |7  < qmt|r|r|
|  |7  < qm|r|tjtjhv r|s||  |7  < qmt|r|s||  |7  < d}qm||7 }qm|||< qct|}tt|D ]}|| }|du rqd} jD ]h\}}||t| krDd}d}d}tt|D ]!}||
||  7 }||||  7 }|| |||  krd} nq|rD|s |r,t||j | |jd}|| tdt|D ]	}d||| < q8 nq|sQ|t|dd q|S )z(Split an IPA pronunciation into phonemesNr   c                 s   r~   rU   rV   r   rE   rE   rF   r   5  r   z!Phonemes.split.<locals>.<genexpr>c                    s   |  d} j||S )Nr[   )groupr   r;   )matchrM   rO   rE   rF   handle_replace7  s   
z&Phonemes.split.<locals>.handle_replacec                 S   r   rE   rV   r   rE   rE   rF   r   @  r   z"Phonemes.split.<locals>.<listcomp>)r   r   r   c                 S   r   rE   rV   r   rE   rE   rF   r   I  r   FT)rM   r   r[   )rM   r   )r   r   r   r|   rw   r   subri   r
   r   r   rn   rI   r   r   r]   r^   rb   r   ranger   r   rM   r   r\   )rB   r   r   r   r   r   word_phonemesr	  ipaspron
ipa_stress	ipa_tonesrf   ipa_idxipakeep_ipar   num_ipasphoneme_matchphoneme_ipasr  phoneme_stressphoneme_tonesphoneme_idxrE   rO   rF   r   "  s   	






zPhonemes.split)NN)rH   r   r   )rj   rk   rl   rm   r   rG   r   r   r   r   rv   rn   r   r   r   ro   r   rw   r.   rp   r   r   r   rE   rE   rE   rF   r   }  s<    
'>r   ))rm   loggingr   ro   r*   collectionsr   gruut_ipa.constantsr   r   r   r   r   r   r	   r
   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   gruut_ipa.utilsr   	getLogger_LOGGERr   rw   r   r   rE   rE   rE   rF   <module>   s    l
 F ' h