o
    }oi                  	   @   sp   d dl Z d dlmZ zd dlZd dlZdZdZW n eefy'   dZdZY nw G dd dZ	G dd dZ
dS )	    N)ListTFc                   @   sP   e Zd ZdZdefddZdee defddZdefd	d
ZdefddZ	dS )EnJaProcessorz
    Tokenizer, Detokenizer and Normalizer utilities for Japanese & English
    Args:
        lang_id: One of ['en', 'ja'].
    lang_idc                 C   sF   ddl m}m}m} || _||d| _||d| _||ddd| _d S )Nr   )MosesDetokenizerMosesPunctNormalizerMosesTokenizer)langT)r   pre_replace_unicode_punctpost_remove_control_chars)
sacremosesr   r   r   r   moses_tokenizermoses_detokenizer
normalizer)selfr   r   r   r    r   g/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/common/tokenizers/en_ja_tokenizers.py__init__#   s   zEnJaProcessor.__init__tokensreturnc                 C   s   | j |S )z
        Detokenizes a list of tokens
        Args:
            tokens: list of strings as tokens
        Returns:
            detokenized Japanese or English string
        )r   
detokenize)r   r   r   r   r   r   -   s   zEnJaProcessor.detokenizec                 C   s   | j |}d|S )I
        Tokenizes text using Moses. Returns a string of tokens.
         )r   tokenizejoin)r   textr   r   r   r   r   7   s   
zEnJaProcessor.tokenizec                 C   s   | j dkr| j|S |S )Nen)r   r   	normalizer   r   r   r   r   r   >   s   
zEnJaProcessor.normalizeN)
__name__
__module____qualname____doc__strr   r   r   r   r   r   r   r   r   r      s    

r   c                   @   sJ   e Zd ZdZdd Zdee defddZdefdd	Zdefd
dZ	dS )JaMecabProcessorzV
    Tokenizer, Detokenizer and Normalizer utilities for Japanese MeCab & English
    c                 C   s&   t rtstdttjd | _d S )NzRPlease ensure that you have installed `MeCab` and `ipadic` to use JaMecabProcessorz	 -Owakati)
HAVE_MECABHAVE_IPADICImportErrorMeCabTaggeripadic
MECAB_ARGSmecab_tokenizer)r   r   r   r   r   L   s   zJaMecabProcessor.__init__r   r   c                    s2   ddl m td  fdd}|d|S )Nr   )spacingz([\u2018\u2019\u201c\u201d\u2e80-\u312f\u3200-\u32ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff00-\uffef])\s+(?=[\u2018\u2019\u201c\u201d\u2e80-\u312f\u3200-\u32ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff00-\uffef])c                    s     d|  S )Nz\1)substrip)sRE_WS_IN_FWr,   r   r   <lambda>Y   s    z-JaMecabProcessor.detokenize.<locals>.<lambda>r   )pangur,   recompiler   )r   r   r   r   r0   r   r   R   s   zJaMecabProcessor.detokenizec                 C   s   | j | S )r   )r+   parser.   r   r   r   r   r   \   s   zJaMecabProcessor.tokenizec                 C   s   |S )Nr   r   r   r   r   r   b   s   zJaMecabProcessor.normalizeN)
r   r   r    r!   r   r   r"   r   r   r   r   r   r   r   r#   G   s    
r#   )r4   typingr   r)   r'   r$   r%   r&   ModuleNotFoundErrorr   r#   r   r   r   r   <module>   s   +