o
    iW                     @   s   d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlm	Z	 d dl
mZmZ e	ddG dd dejjjZe	ddG dd dejjjZdS )	    N)tables)extract_fbankload_audio_text_image_videodataset_classesFunASRc                       sr   e Zd ZdZ					ddededef fdd	Zd
d Zdd Z	dd Z
dd Zdd ZddefddZ  ZS )r   z
    FunASR dataset
    N        index_dsint_pad_valuefloat_pad_valuec                    s  t    tj|}||fi || _|dd }	|	r,tj|	}
|
d(i |d}	|	| _|dd }|rGtj|}|d(i |d}|| _|dd }|d uretj	|}|d(i |d}nd }|| _	|| _
|d u rsdn|j| _d| _|| _|| _|| _|d	d
| _|dd| _|d| _|d| _d| _|dd| _td| _|dd| _|dd| _|dd| _|dd| _|dd| _|dd| _|dd | _|d!d | _ |d"d#| _!|d$d%| _"|d&d'| _#d S ))Npreprocessor_speechpreprocessor_speech_confpreprocessor_noisepreprocessor_noise_confprompt_classesprompt_confi>  soundsosz<|startoftranscript|>eosz<|endoftext|>
batch_size
batch_typer   retryd   z)(<\|startofspeech\|>.*?<\|endofspeech\|>)max_token_lengthi  batch_size_scale_ratio_maxg      ?batch_size_token_maxi	  multiturn_num_max   max_source_lengthi  max_target_lengthi   do_thinkT
sys_promptuse_dynamic_output_ratior   min_mask_token_len   min_non_mask_token_len    )$super__init__r   index_ds_classesgetr	   preprocessor_classesr   r   r   frontendfs	data_type	tokenizerr
   r   r   r   r   r   prompt_ids_lenr   recompilepatternr   r   r   r   r   r   r    r!   r"   min_output_mask_token_lenmin_output_non_mask_token_len)selfpathr	   r-   r0   r
   r   kwargsindex_ds_classr   preprocessor_speech_classr   preprocessor_noise_classprompt_classes_textr   	__class__r'   ]/home/ubuntu/.local/lib/python3.10/site-packages/funasr/datasets/fun_asr_datasets/datasets.pyr)      sZ   

FunASR.__init__c                 C      | j | }| j |S N)r	   get_source_lenr7   indexitemr'   r'   r@   rD   T      
FunASR.get_source_lenc                 C   rB   rC   )r	   get_target_lenrE   r'   r'   r@   rJ   X   rH   FunASR.get_target_lenc                 C   s   ddg}| dd }d }|d ur1| dkr!|d |d n| dkr1|d |d	 t|d
kr=t|}nt|dkrNtj|g ddd }d|v r\|d|d }|S d|v rh|d|d }|S )Nu   语音转写：zSpeech transcription:languagezhu   语音转写成中文：zTranscribe speech into Chinese:enu   语音转写成英文：zTranscribe speech into English:      )皙?rQ   皙?rR   )weightsr   u    语音转写：<|startofspeech|><|startofspeech|>z&Speech transcription:<|startofspeech|>)r+   lowerappendlenrandomchoicechoicesreplace)r7   rG   user_prompttasksrL   taskr'   r'   r@   get_random_user_prompt\   s(   


zFunASR.get_random_user_promptc                 C   
   t | jS rC   )rW   r	   r7   r'   r'   r@   __len__r      
FunASR.__len__c           1      C   s6  d }t | jD ]}|dkrtd|  d}|dkr|}ntdt| jd }| j| }|d }|d }|d }	|	dd}
t|d	k sNt|	d	k rWt
d
|  qg g g g g g g f\}}}}}}}tt|||	D ]?\}\}}}|| jkr n1t|| jkrtdt| d| j d|   n| jd ur|dd }| j|}| j||}nd}|dkrd| d| | d}| jsd| | d}nd| d}| js|d7 }| j|}g }g }d}d}g }g } g }!t|D ]\}"}#|#ds| j|#}$||$7 }q|#dddd}#|#drz^t|#d	d  | jd}%| jd urU|
sUz	| |% }%W n tyT }& ztd|&  W Y d }&~&nd }&~&ww t |%| j!| j"dd\} }!|!| j#krxtd|! d| j# d|  d}W n' ty }& zt
dt$|& dt%&  d|  d}W Y d }&~&qd }&~&ww 	 d	|!d  d d  d   }'d	|'d d  d   }'|'d	 d  d	 }dg| }(t|}||(7 }q|rqo|dkr||t| g7 }||g7 }n
|dg7 }|dg7 }|d ur"t'd!d" |D r"t(d#d" |D d })|| j|)7 }d$gt| }*|d  d%}nd$gt| }*| d%}| j|}+t|+| j)krMtd&t|+ d'| j) d(|  t*j+, | j-k rt|+},t.| j/|,}-t.| j0|,}.|,|. |-krwt*j+|-|,|. }/n|,|. }/|/dkrd$g|/ |+d |/< |||+ 7 }||*|+ 7 }t| dkr|1| dd d d d f  |1|! qo|rqtj2|tj3d)}tj2d	gt| tj4d)}0tj2|tj3d)}tj2|tj4d)}tj2|tj4d)}||||0|d*}||d+< t|dkr||d,< ||d-< t|| jkrt
d.t| d/| j d0|  q |S |S )1Nr   zretry: Fr'   systemuser	assistantnoisedr$   zitem is error: zinput_ids > max_token_length: >, rT    z<|im_start|>system
z<|im_end|>
<|im_start|>user
z!<|im_end|>
<|im_start|>assistant
z<|im_start|>user
z<think>

</think>

r   z<|endofspeech|>!)r.   zGenerate noise audio failed: T)r/   r-   is_finalz$speech_lengths > max_source_length: zLoading wav failed! 
   rO   c                 s   s"    | ]}t |tod |v V  qdS prev_contentN
isinstancedict.0rG   r'   r'   r@   	<genexpr>   s    
z%FunASR.__getitem__.<locals>.<genexpr>c                 s   s*    | ]}t |trd |v r|d  V  qdS rp   rr   ru   r'   r'   r@   rw      s    iz
<|im_end|>ztext_length: z > z, drop it: )dtype)	fbank_begfake_token_len	input_idsattention_mask
labels_idsrG   speechspeech_lengthszlen(input_ids): z > max_token_length: z, item: )5ranger   logginginfotorchrandintrW   r	   rG   r+   warning	enumeratezipr   r   r   splitdetect_language
get_promptr!   r    r4   
startswithr0   encoder[   r   r.   r   numpy	Exceptionerrorr   r/   r-   r   str	traceback
format_excanynextr   nprX   randr"   minr5   r6   rV   tensorint64int32)1r7   rF   outputidxbadcase_flag	index_currG   re   rf   rg   	is_noisedr{   labelsfbank
fbank_lens
fbank_maskry   rz   isystem_promptr\   
target_out
asr_promptrL   user_prompt_all_contextsource_inputsplits
source_idsfbank_ifake_token_len_ifbank_beg_ifbank_lens_ir~   r   ksub_str	sub_tokendata_srceolens
fake_token
prev_valuesource_mask
target_idsmax_lenr5   r6   	end_indexr|   r'   r'   r@   __getitem__u   sP  















FunASR.__getitem__samplesc                 C   s`  t | jD ]}d}i }|D ]2}|d u rq| D ]&}||vr"g ||< t|| ttfr5|| ||  q|| ||  qq| D ]0\}}t|d t	j
rt|d jt	jks`|d jt	jkrd| j}n| j}t	jjjj|d|d||< qD| jdkr|d j\}	}
|	dkr|	|
 | jkrtd| d	|	 d
|
 d|	|
  d| j d |d d }q |S |S )NFr   T)batch_firstpadding_valueexampler{   r$   z	Warning, z	th, b*t: *=z > batch_size_sample_max: z, drop last datar   )r   r   keysrs   listtupleextendrV   itemsr   Tensorrx   r   r   r
   r   nnutilsrnnpad_sequencer   shaper   r   r   )r7   r   r   r   outputssamplekey	data_list	pad_valuebtr'   r'   r@   collator5  s@    


(zFunASR.collator)NNNr   r   rC   )__name__
__module____qualname____doc__r   intfloatr)   rD   rJ   r_   rb   r   r   r   __classcell__r'   r'   r>   r@   r      s*    A Ar*   c                       sB   e Zd Zdef fddZdd Zdd Zdd	 Zd
d Z  Z	S )r   r8   c                    sX  t    |dd| _|dd| _|dd| _|dd| _t|d	d
}|dd}|ds|ds|dd}|dd}|sKd}d}t	|dd6}|
 }t|d | d }	|||	 |d |	  }
td| d| d| d|
 d| 
 W d    n1 sw   Y  n|g}
g }d}d}|
D ]}t	| dd}|D ]}z	t| }W n  ty } ztd| d| d|  W Y d }~qd }~ww |d }t|ddttfrt|ddgd }t|ddd }nt|dd}t|dd}t|}t|}|dkr|dk rq|dk rEtd | d!| d"| d|  t|d#krDt|d# d$ }q|| jkrLq|| jk rSq|| jkrZqg g g }}}t|D ]\\}}z
|d% }|d$ }W n ty   td&| d|  Y qfw |d'kr|| qf|d(kr|| qf|d)krd*|v r|d* }||d*|ig qf|| qft|dkrd+g}|t| }||||| d,}d-|v rt|d-d.ttfs|d- n|d- d |d-< d/|v r|d/ |d/< d0|v r|d0 |d0< d1|v r|d1 |d1< d2|v r%|d2 |d2< d3|v r0|d3 |d3< d4|v r;|d4 |d4< d5|v rF|d5 |d5< d6|v rQ|d6 |d6< |d7d8r\||d9< ||d: d; d< | 7 }|||d
  d= d= d= 7 }|| qW d    n	1 sw   Y  q|| _td>t| j d?|d@dA|dBdC| dC|
 dD d S )ENr   i@  min_source_length
   r   i   min_target_lengthr   audio_downsample_rate   is_trainingTz.jsonlz.jsondata_split_numr$   data_split_izutf-8)encodingzis_training: z, data_split_num: z, data_split_i: z, 
file_list: z, 
file_list_all: r   zdrop it, json error: z, line: z, file_json: messagesspeech_lengthtext_lengthzspeech_length: z, text_length: z, data: rO   contentrolezdrop it, KeyError: re   rf   rg   rq   zYou are a helpful assistant.)re   rf   rg   
source_lenr   	key_01234hist_contexthotwordsasr_hotwordsvad_segs	word_listone_pass_resultone_pass_werrh   	save_metaFmetag      Y@i  i'  i  z

total_num of samplers: z, total_whrs: z.5fz, total_token_for_llm_B: z.5grj   z

)r(   r)   r+   r   r   r   r   r   endswithopen	readlinesrW   r   r   stripjsonloadsr   r   rs   r   r   r   r   KeyErrorrV   contents)r7   r8   r9   r   r   r   r   finfile_list_allnum_per_slice	file_listr   
total_whrstotal_token_for_llm_B	file_jsonline	data_dictr   datar   r   re   rf   rg   r   rG   r   r   rq   
contents_ir>   r'   r@   r)   c  s   
 

















^,rA   c                 C   r`   rC   )rW   r   ra   r'   r'   r@   rb     rc   rd   c                 C   s   | j | }|S rC   )r   )r7   rF   r  r'   r'   r@   r     s   
r   c                 C   s0   | dd}|dk rt|d t|d  }|S )Nr   r   r   re   rf   )r+   rW   )r7   r  r   r'   r'   r@   rD     s   rI   c                 C   s   dS )Nr   r'   )r7   r  r'   r'   r@   rJ     s   rK   )
r   r   r   r   r)   rb   r   rD   rJ   r   r'   r'   r>   r@   r   `  s     	)r   r   r2   r   rX   r   r   r   funasr.registerr   funasr.utils.load_utilsr   r   registerr   r  Datasetr   r'   r'   r'   r@   <module>   s    
  
T