o
    iR                     @   s   d dl Z d dlZd dlZd dlZd dlmZmZ d dlmZ d dl	m
Z
mZmZ d dlmZ d dlZd dlmZ d dlmZ d dlmZmZmZ G d	d
 d
Ze jddddefddZedkrfe  dS dS )    N)ThreadPoolExecutoras_completed)BytesIO)DictOptionalTuple)urlopen)AutoTokenizer)tqdm)
DictConfig	OmegaConf
ListConfigc                   @   s2   e Zd Zdd Zdeeef dee fddZdS )LineProcessorc                 C   s   || _ t | _d S )N)	tokenizer	threadingLocklock)selfr    r   ^/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/fun_asr_nano/tools/scp2jsonl.py__init__   s   zLineProcessor.__init__	line_pairreturnc              
   C   s  |\}}|  |  }}|r|sd S |jdd|jdd}}t|dks,t|dkr.d S |d |d }}|d |d }}	||krNdd| d| iS zc|drrt|}
|
jd	kredd
| iW S t|
 }t	|j
}ntj|sdd
| iW S t	|j
}ddddd| ddd|	dgt|d d d d t| j|	d}||dW S  ty } zdd| dt| iW  Y d }~S d }~ww )N   )maxsplit   r   errorzUTT mismatch: z vs http   zWAV not found: systemzYou are a helpful assistant.)rolecontentuseru!   语音转写：<|startofspeech|>!z<|endofspeech|>	assistanti     
   )messagesspeech_lengthtext_length)successuttzError processing z: )stripsplitlen
startswithr   statusr   readsfinfodurationospathexistsintr   tokenize	Exceptionstr)r   r   line1line2parts1parts2utt1utt2wav_pathtextresponse
audio_filer3   dataer   r   r   process_line   sF   


$zLineProcessor.process_lineN)	__name__
__module____qualname__r   r   r:   r   r   rG   r   r   r   r   r      s    "r   )config_nameversion_basecfgc                    s  fdd| }|d }|d }| dt }|d }t|d%}t|d}| }| }	W d    n1 s<w   Y  W d    n1 sKw   Y  t|t|	krftdt| d	t|	  td
}
t	|
t
t||	}d}d}g }tt|dd}t|d t|dj} fddt|D }t|D ]Q}| }|rd|v rԈj tj|d |dd |d W d    n1 sw   Y  |d7 }n|rd|v r|d7 }||d  |d |||d qW d    n1 sw   Y  W d    n	1 sw   Y  W d    n	1 sw   Y  td tdt|  td|  td|  |r^t|dkr^td |d d D ]
}td|  qQd S |rtd  |d d D ]
}td|  qktd!t|d  d" d S d S )#Nc                    s<   t | trtj| ddS t | tr fdd|  D S | S )NT)resolvec                    s   i | ]	\}}| |qS r   r   ).0kvto_plain_listr   r   
<dictcomp>J   s    z5main_hydra.<locals>.to_plain_list.<locals>.<dictcomp>)
isinstancer   r   to_containerr   items)cfg_itemrR   r   r   rS   F   s
   

z!main_hydra.<locals>.to_plain_listscp_filetranscript_filemax_workers
jsonl_filerz$Warning: Line count mismatch - scp: z, transcript: zQwen/Qwen3-0.6Br   
Processing)totaldesc)r[   wc                    s    i | ]\}}  j||qS r   )submitrG   )rO   ipair)executor	processorr   r   rT   g   s     zmain_hydra.<locals>.<dictcomp>r)   F)ensure_ascii
r   r   )	processedfailedz
Processing completed:z  Total lines: z  Successfully processed: z
  Failed: r%   z
Sample errors:z  - z
First 10 errors:z
  ... and z more errors)getr4   	cpu_countopen	readlinesr-   printr	   from_pretrainedr   listzipr
   r   	enumerater   resultr   jsondumpwriteappendupdateset_postfix)rM   kwargsrY   rZ   r[   r\   f1f2	scp_linestranscript_linesr   
data_pairsprocessed_countfailed_counterror_messagespbarf_outfuturesfuturert   r   r   )re   rf   rS   r   
main_hydraD   st   
 


r   __main__)hydraru   r4   r   concurrent.futuresr   r   ior   typingr   r   r   urllib.requestr   	soundfiler1   
modelscoper	   r
   	omegaconfr   r   r   r   mainr   rH   r   r   r   r   <module>   s$    4C
