o
    iE                  !   @   s8  d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZm	Z	 d dl
mZ d dlmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZ de	e defddZdedede	e de	e dedede	e de	e dedededededee de	e de	e f ddZde jfd d!Zd%d"d#Ze d$kre  dS dS )&    N)Counter)Path)ListOptional)check_argument_types)build_tokenizer)TextCleaner)g2p_choices)str2boolstr_or_none)get_commandline_argsfieldreturnc                 C   s   |   } zBd| v r5| jddd\}}|  dkrd}nt|}|dkr'td|  dkr0d}nt|}nt| }|d }|dkrEtdW n tyT   td	|  w |du r`td|}|S t|d |}|S )
a  Convert field string to slice

    Note that field string accepts 1-based integer.

    Examples:
        >>> field2slice("1-")
        slice(0, None, None)
        >>> field2slice("1-3")
        slice(0, 3, None)
        >>> field2slice("-3")
        slice(None, 3, None)
    -   )maxsplit Nr   z1-based stringzmust be 1 or more valuez)Format error: e.g. '2-', '2-5', or '-5': )stripsplitint
ValueErrorRuntimeErrorslice)r   s1s2slic r   M/home/ubuntu/.local/lib/python3.10/site-packages/espnet2/bin/tokenize_text.pyfield2slice   s2   

r   inputoutput	delimiter
token_typespace_symbolnon_linguistic_symbolsbpemodel	log_levelwrite_vocabularyvocabulary_sizeremove_non_linguistic_symbolscutoff
add_symbolcleanerg2pc               	      sh  t  sJ tj|dd | dkrtj}n	t| jddd}|dkr%tj}nt|}|jj	ddd |jd	dd}t
|}t|||||||d
}t }|d urRt|}|D ]E}| }|d urv||}|| }|d u rqd|}n||}||}||}|	s|d|d  qT|D ]
}||  d7  < qqT|	sd S tt fddt| dd d}|
dkr|
t|k rtd|
 |d |
t|  }|D ]6}z|d\}}t|}W n ty   td| w | }|dk rt|d | }|||d f q|D ]\}}||d  qt| }tdd |D }td|| | d  d d S )Nz>%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s)levelformatr   rzutf-8)encodingT)parentsexist_okw)r"   r%   r!   r#   r$   r)   g2p_type 
r   c                    s   | d  kS Nr   r   xr*   r   r   <lambda>   s    ztokenize.<locals>.<lambda>c                 S   s
   | d  S r8   r   r9   r   r   r   r<      s   
 )keyr   zvocabulary_size is too small: :z Format error: e.g. '<blank>:0': c                 s   s     | ]\}}|d ur|V  qd S Nr   ).0r4   cr   r   r   	<genexpr>   s    ztokenize.<locals>.<genexpr>zOOV rate = d   z %) r   loggingbasicConfigsysstdinr   openstdoutparentmkdirr   r   r   r   rstripr   jointext2tokenswritelistfiltersorteditemslenr   r   r   r   insertsumvaluesinfo) r   r    r   r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   finfoutp	tokenizercounterlinetokenstwords_and_countssymbol_and_idsymbolidxr4   rA   total_countinvocab_countr   r;   r   tokenize@   s   




 "rg   c                  C   sL  t jdt jd} | jddd dddd	 | jd
dddd | jddddd | jdddd | jdddg ddd | jddd dd | jd d!d"d | jd#d d$d | jd%td&d' | jd(td)d*d+ | jd,tg d-d d.d/ | jd0ttd d1d/ | d2}|jd3td)d4d+ |jd5td6d7d+ |jd8d6td9d: |jd;t	g d<d=d> | S )?NzTokenize texts)descriptionformatter_classz--log_levelc                 S   s   |   S r?   )upperr9   r   r   r   r<      s    zget_parser.<locals>.<lambda>INFO)CRITICALERRORWARNINGrk   DEBUGNOTSETzThe verbose level of logging)typedefaultchoiceshelpz--inputz-iTz!Input text. - indicates sys.stdin)requiredrt   z--outputz-oz#Output text. - indicates sys.stdoutz--fieldz-fz?The target columns of the input text as 1-based integer. e.g 2-)rt   z--token_typez-tchar)rv   bpewordphnz
Token type)rr   rs   rt   z--delimiterz-dzThe delimiter)rr   rt   z--space_symbolz<space>zThe space symbolz
--bpemodelzThe bpemodel file pathz--non_linguistic_symbolsz non_linguistic_symbols file path)rq   rt   z--remove_non_linguistic_symbolsFz'Remove non-language-symbols from tokens)rq   rr   rt   z	--cleaner)Ntacotronjaconv
vietnamesekorean_cleanerzApply text cleaning)rq   rs   rr   rt   z--g2pz&Specify g2p method if --token_type=phnzwrite_vocabulary mode relatedz--write_vocabularyz4Write tokens list instead of tokenized text per linez--vocabulary_sizer   zVocabulary sizez--cutoffz0cut-off frequency used for write-vocabulary mode)rr   rq   rt   z--add_symbolappendzBAppend symbol e.g. --add_symbol '<blank>:0' --add_symbol '<unk>:1')rq   rr   actionrt   )
argparseArgumentParserArgumentDefaultsHelpFormatteradd_argumentr   r
   r	   add_argument_groupr   str)parsergroupr   r   r   
get_parser   s   
r   c                 C   s:   t t tjd t }|| }t|}tdi | d S )N)filer   )printr   rF   stderrr   
parse_argsvarsrg   )cmdr   argskwargsr   r   r   main  s
   
r   __main__r?   )!r   rD   rF   collectionsr   pathlibr   typingr   r   	typeguardr   espnet2.text.build_tokenizerr   espnet2.text.cleanerr   espnet2.text.phoneme_tokenizerr	   espnet2.utils.typesr
   r   espnet.utils.cli_utilsr   r   r   r   boolr   rg   r   r   r   __name__r   r   r   r   <module>   sf   .	

o
U
