o
    wi                     @   s   d Z g dZddlZddlZddlZi dddddd	d
ddddddddddddddddddddddddd Zd)d"efd#d$Zd"efd%d&ZG d'd( d(Z	dS )*z4OpenAI's non-english basic text normalization module)ADDITIONAL_DIACRITICSremove_symbols_and_diacriticsremove_symbolsBasicTextNormalizer    Nu   œoeu   ŒOE   øo   ØO   æae   ÆAE   ßssu   ẞSSu   đdu   ĐD   ð   Ð   þth   Þu   łlu   ŁL sc                    s    d  fddtd| D S )z
    Replace any other markers, symbols, and punctuations with a space,
    and drop any diacritics (category 'Mn' and some manual mappings)
    r   c                 3   sX    | ]'}| v r
|n|t v rt | nt|d krdnt|d dv r&dn|V  qdS )Mnr   r   MSP N)r   unicodedatacategory.0ckeep U/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/whisper_normalizer/basic.py	<genexpr>,   s    
z0remove_symbols_and_diacritics.<locals>.<genexpr>NFKDjoinr!   	normalize)r   r'   r(   r&   r)   r   '   s   
r   c                 C   s   d dd td| D S )z[
    Replace any other markers, symbols, punctuations with a space, keeping diacritics
    r   c                 s   s*    | ]}t |d  dv rdn|V  qdS )r   r   r    N)r!   r"   r#   r(   r(   r)   r*   B   s
    
z!remove_symbols.<locals>.<genexpr>NFKCr,   )r   r(   r(   r)   r   >   s   

r   c                   @   s6   e Zd ZdZ		ddedefddZdefdd	Zd
S )r   a  As per the text normalization/standardization approach mentioned in  Appendix Section C pp.21 in  the paper [Robust Speech Recognition via Large-Scale  Weak Supervision](https://cdn.openai.com/papers/whisper.pdf). The `BasicTextNormalizer` does the following functionality:

        1. Remove any phrases between matching brackets ([, ]).
        2. Remove any phrases between matching parentheses ((, )).
        3. Replace any markers, symbols, and punctuation characters with a space, i.e. when the Unicode category of each
        character in the NFKC-normalized string starts with M, S, or P.
        4. make the text lowercase.
        5. replace any successive whitespace characters with a space

    Note: It's not recommended to use this function for non-english languages because it may removes vowels in languages as identified by [kavya in this tweet](https://twitter.com/kavya_manohar/status/1752574864618365059).
    Fremove_diacriticssplit_lettersc                 C   s   |rt nt| _|| _dS )a$  
        remove_diaciritics - Replace any other markers, symbols, and punctuations with a space and drop any diacritics
        split_letters  - It uses a regular expression \X to find all Unicode graphemes (extended grapheme clusters) in the string s and join them together by space
        N)r   r   cleanr1   )selfr0   r1   r(   r(   r)   __init__U   s   


zBasicTextNormalizer.__init__r   c                 C   s`   |  }tdd|}tdd|}| |  }| jr'dtd|tj}tdd|}|S )Nz[<\[][^>\]]*[>\]]r   z\(([^)]+?)\)r    z\Xz\s+)	lowerresubr2   r1   r-   regexfindallU)r3   r   r(   r(   r)   __call__c   s   zBasicTextNormalizer.__call__N)FF)__name__
__module____qualname____doc__boolr4   strr;   r(   r(   r(   r)   r   H   s    
r   )r   )
r?   __all__r6   r!   r8   r   rA   r   r   r   r(   r(   r(   r)   <module>   sR    	

