o
    ॵin                     @   sj   d dl Z d dlmZ e ZdZdd Zdd Zdd	 Zd
d Zdd Z	dd Z
dd Zdd Zdd ZdS )    N)
get_loggeruT   [’!"#$%&'()*+,-./:;<>=?@，。?★、…【】《》？“”‘’！[\]^_`{|}~]+c                 C   sr   g }|   }t|dkr7td|}|d ur|d}n|dd }|| ||ddd}t|dks|S )Nr   z[A-Za-z!?,<>()\']+     )lowerlenrematchgroupappendreplacestrip)	input_strtokenssr	   word r   b/home/ubuntu/.local/lib/python3.10/site-packages/modelscope/trainers/audio/kws_utils/file_utils.pysplit_mixed_label   s   
r   c                 C   s$   t | }ddd |D }| S )Nr   c                 s   s    | ]}| d V  qdS )r   Nr   ).0subr   r   r   	<genexpr>(   s    z$space_mixed_label.<locals>.<genexpr>)r   joinr   )r   splits	space_strr   r   r   space_mixed_label&   s   r   c                 C   s\   g }t | ddd}|D ]}| dkr||  qW d    |S 1 s'w   Y  |S )Nrutf8encodingr   )openr   r   )	list_filelistsfinliner   r   r   
read_lists,   s   
r%   c              	   C   s   i }|D ]+}|  dd }t|dk r!td|   q||d d  ||d < qg }| D ]6}|  dd }t|dkr`|d |v r`|t|d ||d  |d dd	 q4td
|d  q4|S )N	r      zinvalid line in trans file: {}r   r   r   i>  )keytxtwavsample_ratez*can't find corresponding trans for key: {})	r   r   splitr   loggerdebugformatr   dict)	wav_liststrans_liststrans_tabler$   arrr"   r   r   r   	make_pair5   s0   
r5   c                 C   s|   i }t | ddd'}|D ]}|  }t|dksJ t|d d ||d < qW d    n1 s3w   Y  |  |S )Nr   r   r   r'   r   r   )r    r   r,   r   intclose)
token_filetokens_tabler#   r$   r4   r   r   r   
read_tokenQ   s   r:   c                 C   s   i }t | ddd)}|D ]}| dd }t|dks J |dd  ||d < qW d    n1 s5w   Y  |  |S )	Nr   r   r   r&   r   r'   r   r   )r    r   r   r,   r   r7   )lexicon_filelexicon_tabler#   r$   r4   r   r   r   read_lexicon\   s   r=   c                 C   s  t  }t  }t| }|D ]a}|dks|dks|dkr|d }q|dks'|dkr,|d }q|dks<|d	ks<|d
ks<|dkrA|d }q||v rK||f }q||v r\|| D ]}||f }qSqttd|}|D ]}||f }qeq|D ]c}||v r~||| f }qp|dkrd|v r||d f }qp||d f }qp|dkrd|v r||d f }qp||d f }qpd|v r||d f }td| d qp||d f }td| d qp||fS )N!sil(sil)<sil>)r>   <blk><blank>)rA   (noise)noise)(noise<noise>)<GBG>r   silrG   ')' is not in token set, replace with <GBG>)' is not in token set, replace with <blk>)tupler   r   r   
symbol_strr-   infor)   symbol_tabler<   
tokens_str
tokens_idxpartspartchr   r   r   query_token_setg   sR   

 


rV   c                 C   s  g }g }t | }|D ]d}|dks|dks|dkr|d q
|dks&|dkr,|d q
|dks<|dks<|dks<|d	krB|d
 q
||v rL|| q
||v r]|| D ]}|| qTq
ttd|}|D ]}|| qfq
|D ]c}||v r|||  qq|dkrd|v r||d  qq||d  qq|d
krd
|v r||d
  qq||d  qqd
|v r||d
  td| d qq||d  td| d qq||fS )Nr>   r?   r@   rA   rB   rC   rD   rE   rF   rG   r   rH   rI   rJ   rK   )r   r   r   r   rM   r-   rN   rO   r   r   r   query_token_list   sR    

rW   c                 C   sF   | D ]}d|v s
J |d   }t|||\}}||d< ||d< q| S )Nr)   r   )r   rW   )	data_listrP   r<   sampler)   strsindexsr   r   r   tokenize   s   
r\   )r   modelscope.utils.loggerr   r-   rM   r   r   r%   r5   r:   r=   rV   rW   r\   r   r   r   r   <module>   s   	11