o
    i                   !   @   s<  d dl Z d dlmZ d dlZd dlmZ d dlZd dlmZ d dlm	Z	 d dl
mZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ de	e defddZdedede	e de	e dedede	e de	e dedededededee de	e de	e f dd Zde jfd!d"Zd&d#d$Zed%kre  dS dS )'    N)Counter)Path)List)Optional)get_commandline_args)build_tokenizer)TextCleaner)g2p_classes)str2bool)str_or_nonefieldreturnc                 C   s   |   } zBd| v r5| jddd\}}|  dkrd}nt|}|dkr'td|  dkr0d}nt|}nt| }|d }|dkrEtdW n tyT   td	|  w |du r`td|}|S t|d |}|S )
a  Convert field string to slice

    Note that field string accepts 1-based integer.

    Examples:
        >>> field2slice("1-")
        slice(0, None, None)
        >>> field2slice("1-3")
        slice(0, 3, None)
        >>> field2slice("-3")
        slice(None, 3, None)
    -   )maxsplit Nr   z1-based stringzmust be 1 or more valuez)Format error: e.g. '2-', '2-5', or '-5': )stripsplitint
ValueErrorRuntimeErrorslice)r   s1s2slic r   L/home/ubuntu/.local/lib/python3.10/site-packages/funasr/bin/tokenize_text.pyfield2slice   s2   

r   inputoutput	delimiter
token_typespace_symbolnon_linguistic_symbolsbpemodel	log_levelwrite_vocabularyvocabulary_sizeremove_non_linguistic_symbolscutoff
add_symbolcleanerg2pc               	      s  t j|dd | dkrtj}n	t| jddd}|dkr tj}nt|}|jjddd |jd	dd}t	|}t
|||||||d
}t }|d urMt|}|D ]E}| }|d urq||}|| }|d u rld|}n||}||}||}|	s|d|d  qO|D ]
}||  d7  < qqO|	sd S |D ]%}z	|d\}}W n ty   td| w | }||v r||= qtt fddt| dd d}|
dkr|
t|k rtd|
 |d |
t|  }|D ]8}z|d\}}t|}W n ty   td| w | }|dk r!t|d | }|||d f q|D ]\}}||d  q,t| }tdd |D }t d|| | d  d d S )Nz>%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s)levelformatr   rzutf-8)encodingT)parentsexist_okw)r!   r$   r    r"   r#   r(   g2p_type 
r   :z Format error: e.g. '<blank>:0': c                    s   | d  kS Nr   r   xr)   r   r   <lambda>   s    ztokenize.<locals>.<lambda>c                 S   s
   | d  S r8   r   r9   r   r   r   r<      s   
 )keyr   zvocabulary_size is too small: c                 s   s     | ]\}}|d ur|V  qd S Nr   ).0r3   cr   r   r   	<genexpr>   s    ztokenize.<locals>.<genexpr>zOOV rate = d   z %)loggingbasicConfigsysstdinr   openstdoutparentmkdirr   r   r   r   rstripr   jointext2tokenswriter   r   r   listfiltersorteditemslenr   insertsumvaluesinfo) r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   finfoutp	tokenizercounterlinetokenstsymbol_and_idsymbolidxwords_and_countsr3   r@   total_countinvocab_countr   r;   r   tokenizeA   s   



 
"rf   c                  C   sL  t jdt jd} | jddd dddd	 | jd
dddd | jddddd | jdddd | jdddg ddd | jddd dd | jd d!d"d | jd#d d$d | jd%td&d' | jd(td)d*d+ | jd,tg d-d d.d/ | jd0ttd d1d/ | d2}|jd3td)d4d+ |jd5td6d7d+ |jd8d6td9d: |jd;t	g d<d=d> | S )?NzTokenize texts)descriptionformatter_classz--log_levelc                 S   s   |   S r>   )upperr9   r   r   r   r<      s    zget_parser.<locals>.<lambda>INFO)CRITICALERRORWARNINGrj   DEBUGNOTSETzThe verbose level of logging)typedefaultchoiceshelpz--inputz-iTz!Input text. - indicates sys.stdin)requiredrs   z--outputz-oz#Output text. - indicates sys.stdoutz--fieldz-fz?The target columns of the input text as 1-based integer. e.g 2-)rs   z--token_typez-tchar)ru   bpewordphnz
Token type)rq   rr   rs   z--delimiterz-dzThe delimiter)rq   rs   z--space_symbolz<space>zThe space symbolz
--bpemodelzThe bpemodel file pathz--non_linguistic_symbolsz non_linguistic_symbols file path)rp   rs   z--remove_non_linguistic_symbolsFz'Remove non-language-symbols from tokens)rp   rq   rs   z	--cleaner)Ntacotronjaconv
vietnamesekorean_cleanerzApply text cleaning)rp   rr   rq   rs   z--g2pz&Specify g2p method if --token_type=phnzwrite_vocabulary mode relatedz--write_vocabularyz4Write tokens list instead of tokenized text per linez--vocabulary_sizer   zVocabulary sizez--cutoffz0cut-off frequency used for write-vocabulary mode)rq   rp   rs   z--add_symbolappendzBAppend symbol e.g. --add_symbol '<blank>:0' --add_symbol '<unk>:1')rp   rq   actionrs   )
argparseArgumentParserArgumentDefaultsHelpFormatteradd_argumentr   r
   r	   add_argument_groupr   str)parsergroupr   r   r   
get_parser   s   
r   c                 C   s:   t t tjd t }|| }t|}tdi | d S )N)filer   )printr   rE   stderrr   
parse_argsvarsrf   )cmdr   argskwargsr   r   r   main  s
   
r   __main__r>   )r   collectionsr   rC   pathlibr   rE   typingr   r   funasr.utils.cli_utilsr    funasr.tokenizer.build_tokenizerr   funasr.tokenizer.cleanerr   "funasr.tokenizer.phoneme_tokenizerr	   funasr.utils.typesr
   r   r   r   r   boolr   rf   r   r   r   __name__r   r   r   r   <module>   sh   .	

z
Q
