o
    i                     @   sZ  d dl Z d dlmZ d dlmZmZ d dlZd dlmZ d dl	m
Z
 d dlmZ deded	 fd
dZd&dededefddZ				d'dedededededefddZd&deded dedefddZdeded	 ded defddZd&deded	 ded dedef
dd Z	d&deded	 ded dedef
d!d"Zd(ded#ee fd$d%ZdS ))    N)replace)LiteralOptional)NormalizerConfig)FSTS)TokenParsertextreturn)enzhjac                 C   s>   d}| D ]}d|  krdkrn qd}q|s|   rdS dS )z
    Get the language of the text.

    Args:
        text: The text to get the language of.
    Returns:
        The language of the text.
    Fu   一u   鿿Tr   r
   )isdigit)r   contains_chinesech r   @/home/ubuntu/.local/lib/python3.10/site-packages/wetext/utils.pyget_lang   s   	r   Ftraditional_to_simplec                 C   s   |r
t d d | } |  S )z
    Preprocess the text before normalization.

    Args:
        text: The text to preprocess.
        traditional_to_simple: Whether to convert traditional Chinese to simplified Chinese.
    Returns:
        The preprocessed text.
    
preprocessr   r   strip)r   r   r   r   r   r   *   s   
r   full_to_halfremove_interjectionsremove_punctstag_oovc                 C   sX   |r
t d d | } |rt d d | } |rt d d | } |r(t d d | } |  S )a  
    Postprocess the text after normalization.

    Args:
        text: The text to postprocess.
        full_to_half: Whether to convert full-width characters to half-width.
        remove_interjections: Whether to remove interjections.
        remove_puncts: Whether to remove punctuations.
        tag_oov: Whether to tag out-of-vocabulary words.
    Returns:
        The postprocessed text.
    postprocessr   r   r   r   r   )r   r   r   r   r   r   r   r   r   9   s   r   operator)tnitnremove_erhuac                 C   s@   |dkrt td| rdS |rtd| rdS dS t| dkS )a  
    Check if the text should be normalized.

    Args:
        text: The text to check.
        operator: The operator to use.
        remove_erhua: Whether to remove erhua for TN.
    Returns:
        True if the text should be normalized, False otherwise.
    r   z\dTu   儿|兒Fr   )boolresearchlen)r   r   r   r   r   r   should_normalizeW   s   r$   langc                 C   s   t ||| S )z
    Reorder the text.

    Args:
        text: The text to reorder.
        lang: The language of the text.
    Returns:
        The reordered text.
    )r   reorder)r   r%   r   r   r   r   r&   k   s   
r&   enable_0_to_9c                 C   s@   t | | d }|r|dkr|dkrt | d d }||  S )z
    Tag the text.

    Args:
        text: The text to tag.
        lang: The language of the text.
        operator: The operator to use.
        enable_0_to_9: Whether to enable 0-to-9 conversion for ITN.
    Returns:
        The tagged text.
    taggerr
   r   tagger_enable_0_to_9r   )r   r%   r   r'   r(   r   r   r   tagx   s   r*   c                 C   s@   t | | d }|r|dkr|dkrt d d d }||  S )z
    Verbalize the text.

    Args:
        text: The text to verbalize.
        lang: The language of the text.
        operator: The operator to use.
        remove_erhua: Whether to remove erhua for TN.
    Returns:
        The verbalized text.
    
verbalizerr   r   verbalizer_remove_erhuar   )r   r%   r   r   r+   r   r   r   	verbalize   s   r-   configc                 K   s   t |pt fi |}|jrd| v rt| } t| |j} |j}t| |j	|j
rT|dkr0t| }|dkr;|j	dkr;d}t| ||j	|j} t| ||j	} t| ||j	|j
} t| |j|j|j|j} | S )z
    Normalize the text.

    Args:
        text: The text to normalize.
        config: Optional normalization config object.
    Returns:
        The normalized text.
    'autor
   r   r   )r   r   fix_contractionscontractionsfixr   r   r%   r$   r   r   r   r*   r'   r&   r-   r   r   r   r   r   )r   r.   kwargsr%   r   r   r   	normalize   s   

r5   )F)FFFF)N)r!   dataclassesr   typingr   r   r2   wetext.configr   wetext.constantsr   wetext.token_parserr   strr   r    r   r   r$   r&   r*   r-   r5   r   r   r   r   <module>   sT   
 "(
