o
    i                     @   s   d dl Z d dlZd dlZd dlZd dlZd dlmZmZ d dlZ	d dl
Z
d dlmZ d dlmZ 	ddefddZdded	efd
dZejddddefddZ	 edkr[e  dS dS )    N)
DictConfig	OmegaConf)tqdmsourcejsonl_file_outc              	      s  z
t  }t  }W n   d}d}Y t pd}td|  |dkri }|d | }i |< t|dl}	|	  td t d | d t |krQ|nd}
|
dkrt	j
j|d( fddt|
D }t	j
|D ]}| |  qsW d    n1 sw   Y  nt }| | W d    n1 sw   Y  t|d	;}	||d   D ]*}d
|i}|D ]}||| |  q|d }| d| }|	|d  |	  qW d    n1 sw   Y  tdt||d   d n	 |dkrt   d S d S )Nr      z%convert wav.scp text to jsonl, ncpu: r )max_workersc              	      s0   g | ]} t | |d    |qS )r   )submitparse_context_length).0idata_file_lists	data_typeexecutorlines_for_each_th Z/home/ubuntu/.local/lib/python3.10/site-packages/funasr/datasets/audio_datasets/scp2len.py
<listcomp>)   s    z0gen_jsonl_from_wav_text_list.<locals>.<listcomp>wkey
source_len 
z
processed z samples)distget_rankget_world_sizeos	cpu_countprintopen	readlineslen
concurrentfuturesThreadPoolExecutorrangeas_completedupdateresultr   keyswriteflushbarrier)pathdata_type_listr   kwargsrank
world_size	cpu_cores	json_dict	data_fileftask_numr'   futureresr   
jsonl_liner   r   r   r   gen_jsonl_from_wav_text_list   s`   





r>   	data_listr   c                 C   s   t t| dd}i }t| D ]d\}}|d |d|  | jdd}|d }t|dkr5|d nd}| }tj	|rYt
j|dd	\}	}
t|	}t|d d
 d }nd|v rct| nt|}||| d|i||< q|S )NT)totaldynamic_ncolsr   zcpu: )maxsplitr   r
   i>  )sri  
   r   _len)r   r%   	enumerater+   set_descriptionstripsplitr    r1   existslibrosaloadint)r?   r   idpbarr<   r   linelinesr   waveform_
sample_numcontext_lenr   r   r   r   M   s    
r   )config_nameversion_basecfgc                 C   sL   t j| dd}t| |dd}|dd}|dd}t|||d	 d S )
NT)resolvescp_file_listz./Users/zhifu/funasr1.0/data/list/train_wav.scpr2   r   r   z,/Users/zhifu/funasr1.0/data/list/wav_len.txt)r2   r   )r   to_containerr"   getr>   )rX   r3   rZ   r2   r   r   r   r   
main_hydraa   s   
r]   __main__)r   N)r   )r    jsontorchlogginghydra	omegaconfr   r   concurrent.futuresr&   rK   torch.distributeddistributedr   r   strr>   listr   mainr]   __name__r   r   r   r   <module>   s*    
@
