o
    ॵiS                     @   sN   d dl mZ G dd deZG dd deZG dd deZG dd	 d	Zd
S )    )normalize_chinese_numberc                   @   s   e Zd Zdd ZdS )TrieNodec                 C   s   i | _ d| _dS )6
        Initialize your data structure here.
        FN)datais_wordself r	   a/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/preprocessors/ofa/utils/text2phone.py__init__   s   
zTrieNode.__init__N)__name__
__module____qualname__r   r	   r	   r	   r
   r      s    r   c                   @   s8   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d ZdS )Triez
    trie-tree
    c                 C   s   t  | _dS )r   N)r   rootr   r	   r	   r
   r      s   zTrie.__init__c                 C   s@   | j }|D ]}|j|}|st |j|< |j| }qd|_dS )z\
        Inserts a word into the trie.
        :type word: str
        :rtype: void
        TN)r   r   getr   r   )r   wordnodecharschildr	   r	   r
   insert   s   
zTrie.insertc                 C   s,   | j }|D ]}|j|}|s dS q|jS )zb
        Returns if the word is in the trie.
        :type word: str
        :rtype: bool
        F)r   r   r   r   )r   r   r   r   r	   r	   r
   search)   s   zTrie.searchc                 C   s*   | j }|D ]}|j|}|s dS qdS )z
        Returns if there is any word in the trie that starts with the given prefix.
        :type prefix: str
        :rtype: bool
        FT)r   r   r   )r   prefixr   r   r	   r	   r
   
startsWith6   s   zTrie.startsWithc                    s\    fdd g }|  |s|S | |r|| |S | j}|D ]}|j|}q  ||S )zn
          Returns words started with prefix
          :param prefix:
          :return: words (list)
        c                    sH   g }|j r
||  |j D ]}| | t| |j| q|S N)r   appendr   keysextendstrr   )prepre_node	word_listxget_keyr	   r
   r$   J   s   
"zTrie.get_start.<locals>.get_key)r   r   r   r   r   r   )r   r   wordsr   r   r	   r#   r
   	get_startC   s   



zTrie.get_startN)	r   r   r   __doc__r   r   r   r   r&   r	   r	   r	   r
   r      s    r   c                       sH   e Zd ZdZ fddZdd Zdd Zdd	 Zd
d Zdd Z	  Z
S )TrieTokenizerz'
    word_split based on trie-tree
    c                    s    t t|   || _|   d S r   )superr(   r   	dict_pathcreate_trie_tree)r   r*   	__class__r	   r
   r   c   s   zTrieTokenizer.__init__c                 C   sh   g }t | jddd }|D ]}|| dd dd qW d    |S 1 s-w   Y  |S )Nrzutf-8)modeencoding	r   z	utf-8-sig)openr*   r   stripsplitencodedecode)r   r%   fileliner	   r	   r
   	load_dicth   s   
zTrieTokenizer.load_dictc                 C   s    |   }|D ]}| | qd S r   )r9   r   )r   r%   r   r	   r	   r
   r+   p   s   zTrieTokenizer.create_trie_treec                 C   sF   |t |d kr!|| |jv r!|d }| |j||d   ||}|S )N   )lenr   	mine_tree)r   treesentencetrace_indexr	   r	   r
   r<   u   s   zTrieTokenizer.mine_treec                 C   s   g }t |}|dkrGd}| | j||}|dkr.||dd  |dt | }t |}n||d|  ||t | }t |}|dks
|S )Nr   r:   )r;   r<   r   r   )r   r>   tokenssentence_lenr?   r	   r	   r
   tokenize~   s   
zTrieTokenizer.tokenizec                 C   s   d}g }g }|D ]=}t |dkr4|dkr||d d   q|d| ||d d   g }d}q|dkr@|| d}q|| q|S )Nr   r:    )r;   r   join)r   
token_listflagoutputtempir	   r	   r
   combine   s    
zTrieTokenizer.combine)r   r   r   r'   r   r9   r+   r<   rB   rJ   __classcell__r	   r	   r,   r
   r(   ^   s    	r(   c                   @   s$   e Zd Zdd Zdd Zdd ZdS )
Text2Phonec                 C   s   t || _| || _d S r   )r(   trie_cwsget_phone_map	phone_map)r   phone_dict_pathr	   r	   r
   r      s   
zText2Phone.__init__c                 C   sb   t  }t|d}|D ]}| d\}}||vr|||< qW d    |S 1 s*w   Y  |S )Nr.   r1   )dictr2   r3   r4   )r   rP   rO   phone_map_file_readerr8   keyphone_seriesr	   r	   r
   rN      s   
zText2Phone.get_phone_mapc                 C   sx   t |}| j|}g }|D ](}|| jv r|| j|  qt|dkr6|D ]}|| jv r5|| j|  q&qd|S )Nr:    )r   rM   rB   rO   r   r;   rD   )r   textr@   phonesr   charr	   r	   r
   trans   s   


zText2Phone.transN)r   r   r   r   rN   rY   r	   r	   r	   r
   rL      s    	rL   N)modelscope.utils.chinese_utilsr   objectr   r   r(   rL   r	   r	   r	   r
   <module>   s
   
NH