o
    i&                     @   s   d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlmZm	Z	 d dl
Zd dlZd dlmZ d dlmZ 	ddedefdd	Zd
d ZddedefddZejddddefddZ	 edkrie  dS dS )    N)
DictConfig	OmegaConf)tqdmsourcetargetiic/SenseVoiceSmalljsonl_file_out	model_dirc              
      s  z
t  }t  }W n   d}d}Y t pd}td|  |dkri }t|| D ]}\}	i |< t|	dh}
|
  t	 d | d t	 |krP|nd}|dkrt
jj|d( fddt|D }t
j|D ]}| |  qrW d    n1 sw   Y  nt }| | W d    n1 sw   Y  q,d|vsd	|vsd
|vrddlm} ||d}i }|d  D ]1}|d | d }|j|i ddd}|d d }d}t||}|d d \}}}|||g||< qd|vr+|d d|vri |d< |d  D ]}i |d |< || d |d | d< qd	|vrZ|d	 d	|vr>i |d	< |d  D ]}i |d	 |< || d |d	 | d	< qDd
|vr|d
 d
|vrmi |d
< |d  D ]}i |d
 |< || d |d
 | d
< qst|d9}
||d   D ](}d|i}|D ]}	|||	 |  qtj|dd}|
|d  |
  qW d    n	1 sw   Y  tdt	||d   d n	 |dkrt   d S d S )Nr      z%convert wav.scp text to jsonl, ncpu: r)max_workersc              	      s0   g | ]} t | |d    |qS )r   )submitparse_context_length).0idata_file_lists	data_typeexecutorlines_for_each_th c/home/ubuntu/.local/lib/python3.10/site-packages/funasr/datasets/audio_datasets/sensevoice2jsonl.py
<listcomp>(   s    	z0gen_jsonl_from_wav_text_list.<locals>.<listcomp>text_language
emo_targetevent_target)	AutoModel)modelr   autoT)inputcachelanguageuse_itntextz<\|[^|]+\|>      wkeyF)ensure_ascii
z
processed z samples)distget_rankget_world_sizeos	cpu_countprintzipopen	readlineslen
concurrentfuturesThreadPoolExecutorrangeas_completedupdateresultr   funasrr   keysgeneraterefindallappendjsondumpswriteflushbarrier)pathdata_type_listr	   r
   kwargsrank
world_size	cpu_cores	json_dict	data_fileftask_numr6   futureresr   r   	rich_dictr(   	input_wavr$   patternmatchesr   r   r   
jsonl_liner   r   r   gen_jsonl_from_wav_text_list   s   
	










rX   c                    s    t jd  t fdd| D S )Nu[   ，。、；：？！""（）【】《》〈〉「」『』〔〕［］｛｝～·…—–c                 3   s    | ]}| v V  qd S )Nr   )r   charpunctuationsr   r   	<genexpr>   s    z'contains_punctuation.<locals>.<genexpr>)stringpunctuationany)sr   rZ   r   contains_punctuationz   s
   ra   	data_listr   c                 C   s:  t t| dd}i }t| D ]\}}|d |d|  | jdd}|d }t|dkr5|d nd}| }tj	|rYt
j|dd	\}	}
t|	}t|d d
 d }nd|v rct| nt|}|dkrw||| d|i||< q|dkrt|}|rd}nd}||| d|d|i||< q||i||< q|S )NT)totaldynamic_ncolsr   zcpu: )maxsplitr    i>  )sri  
    r   _lenr   z<|withitn|>z	<|woitn|>with_or_wo_itn)r   r4   	enumerater:   set_descriptionstripsplitr.   rG   existslibrosaloadintra   )rb   r   idpbarrR   r   linelinesr(   waveform_
sample_numcontext_lenpuncrk   r   r   r   r      s0   
r   )config_nameversion_basecfgc                 C   sl   t j| dd}t| |dd}t|trt|}|dd}|dd}|d	d
}t||||d d S )NT)resolvescp_file_list)z)/Users/zhifu/funasr1.0/test_local/wav.scpz*/Users/zhifu/funasr1.0/test_local/text.txtrH   r   r	   z6/Users/zhifu/funasr1.0/test_local/audio_datasets.jsonlr
   r   )rH   r	   r
   )r   to_containerr0   get
isinstancestrevalrX   )r   rI   r   rH   r	   r
   r   r   r   
main_hydra   s    

r   __main__)r   Nr   )r   )r.   rB   torchlogginghydrar?   r]   	omegaconfr   r   concurrent.futuresr5   rq   torch.distributeddistributedr+   r   r   rX   ra   listr   mainr   __name__r   r   r   r   <module>   s4    
k
