o
    i|                     @   s   d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlmZ d dl	Z	d dl
mZ d dlmZ d dlmZ d dlmZmZ d dlmZ d dlmZmZ d dlmZmZ d d	lmZ d d
lmZ e	je	j e	j!dZ"e#ddG dd dej$Z%dS )    N)Union)compute_accuracy)tables)force_gatherable	to_device)DatadirWriter)extract_fbankload_audio_text_image_video)
AutoConfigAutoModelForCausalLM)CTC)forced_align)bf16fp16fp32model_classes
FunASRNanoc                       sR  e Zd Z								d4dededededed	ed
edef fddZ							d5dej	dej	dej	dej	dej	dej	dej	fddZ
dd Zdd Zdd Zi fdefddZ				d6d efd!d"Zd7d$ee d%ed&efd'd(Zd)ed*eeej	f fd+d,Z				d6d efd-d.Z				d6d efd/d0Zed8d1efd2d3Z  ZS )9r   NP   Faudio_encoderaudio_encoder_confaudio_adaptoraudio_adaptor_confllmllm_conf
input_sizelength_normalized_lossc	           !         s  t    |dd }
|dd| _|
dkr>ddlm} ||dd}t|jd	r,|jjnd
}t|jdr9|jjj	n|jj	}nt
j|}|d-d|i|}| }|dd}|ri| D ]\}}d|_q]|  || _d | _|dd }d }|di }t|}tj|fi |}|dd}|r| D ]\}}d|_q|  |ddr|  |dd| _|t| j | _| jjd
 }t
j|}|dkr||d< |d ur|n|d |d< |d-i |}|dd}|r| D ]\}}d|_q|  || _|dd| _d | _ t
j|	dd }|d urd|	v r%|	dd n|	d d }d|	v r6|	dd n|	d d }|d urV|d urVt
j!|}|d-i |}|| _"|d us_J d|	dd}|	di }|dkrt||d< |d-i || _ |dd }|d urt#j$|dd}| j j%|dd }t&'d!| d"|  |dd}|r| j  D ]\}}d|_q| j   |	d#i }|d$|d% | _(|	d&d'| _)t*d-||| j(d(|| _+|	d)d| _,d | _-|| _.t/t0j1d*d} t&'d+|  d, d S ).Nhubactivation_checkpointFmsr   	AutoModelmaster)modelmodel_revisionencoder_output_sizer"   r   freezeTinit_param_pathload_kwargs	llm_dtyper   encoder_dimllm_dimuse_low_frame_ratectc_decoderctc_tokenizerdataset_confctc_tokenizer_confzctc_tokenizer must be setctc_vocab_sizeic  ctc_decoder_confcpu)map_location)strictzLoading ctc_decoder ckpt: z
, status: ctc_confblank_id   
ctc_weightg333333?)odimr$   r7   detach_ctc_decoderRANKzrank: z, model is builded. )2super__init__get#audio_encoder_activation_checkpointfunasrr    hasattrr"   r$   encoderr   encoder_classesoutput_sizenamed_parametersrequires_gradevalr   r   r
   from_pretrainedr   from_configgradient_checkpointing_enabler)   to	dtype_mapget_input_embeddingsweightshapeadaptor_classesr   r,   r-   tokenizer_classesr.   torchloadload_state_dictlogginginfor7   r9   r   ctcr;   error_calculatorr   intosenviron)!selfr   r   r   r   r   r   r   r   kwargsr   r    r"   audio_encoder_output_sizeencoder_classr&   _paramr'   r+   llm_load_kwargsconfigadaptor_classctc_decoder_classr.   r0   ctc_tokenizer_classr1   r2   	src_stateflagr6   rank	__class__r=   T/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/fun_asr_nano/model.pyr?      s   












zFunASRNano.__init__speechspeech_lengths	input_idsattention_mask
labels_ids	fbank_beg
fbank_maskc           %      K   s(  |j \}	}
i }d||dk < | jj |}|d ur+t| dkr*|d d df }|j \}}}| jrEddlm} || j	||dd\}}n| 	||\}}| 
||\}}|j \}	}
}|d}d||dk < d||dk < d}t|	D ]}t|j d D ]}|||f  }|dkr|||f }||d |d d f }z|||||| d d f< W n^ ty } zQtt| dt   td| d	|j  d
| d| d|j  d| d| d|  ||  }||d |d d f }|||||| d d f< W Y d }~nd }~ww |d7 }q{qr||d< || |d< |  |d< |d |d  |d< t|  jj}tj|dv r<|nd| jdkrEdndt| j d% d||dk< d||dk < | j|t| j ||d}|j}W d    n	1 svw   Y  t  * t!|j"d}t#|d d d df |d d dd f dd} | |d< W d    n	1 sw   Y  t$|% |d< |	|d< |
|	 |d < |  |d!< |d  |d!  |d"< |dkd}!t&|!'  }"|!  |	 }#|"|d#< |#|d$< | j(rt'|dk }	t)|||	f|j\}}}$|||$fS )%Nr   r8   )
checkpointF)use_reentrantfake_token_len, batch_idx: , inputs_embeds: , fbank_beg_idx: , speech_token_len: z, encoder_out: z, encoder_out_lens: , fake_token_len: , speech_lengths: batch_size_speechbatch_size_x_framesbatch_size_real_framespadding_framescudaxpumpsr3   r   Tdevice_typeenableddtyper%   )inputs_embedsrr   labels)ignore_labelaccloss
batch_sizebatch_size_x_tokensbatch_size_real_tokenspadding_tokensdialog_turns_maxdialog_turns_avg)*rQ   r   r"   rO   lensizerA   torch.utils.checkpointrv   encoder   r@   rangeitem	ExceptionrW   errorstr	traceback
format_excrX   sumnext
parametersdevicetyperT   autocastr)   rN   rM   r   no_gradargmaxlogitsr   clonedetachmaxr[   r   r   )%r^   ro   rp   rq   rr   rs   rt   ru   r_   r   	token_numstatsr   r   framesrb   rv   encoder_outencoder_out_lensdimsrx   
speech_idx	batch_idxturn_idfbank_beg_idxspeech_token_lenspeech_tokener   model_outputsr   predsacc_attdialog_turnsr   r   rP   r=   r=   rn   forward   s   





6

.

zFunASRNano.forwardc                 K   s(   |  ||\}}| ||\}}||fS N)r   r   )r^   ro   rp   r_   xolensr   r   r=   r=   rn   forward_export  s   zFunASRNano.forward_exportc                 C   s   |  ||\}}||fS r   )r   )r^   ro   rp   r   r   r=   r=   rn   r     s   zFunASRNano.encodec                 C   s   g g g }}}t |D ]5\}}|d }|d }|dkr"|| q|dkr8d|v r2|d }	||	g}|| q|dkrA|| q|t| }|||d}
|
S )Nrolecontentsystemuseraudio	assistant)r   r   r   )	enumerateappendr   )r^   datar   r   r   ir   r   r   r   contentsr=   r=   rn   data_template  s(   
zFunASRNano.data_templater   c           /      K   s<  |d }|d }|d }t d}	d}
d}d|v r)|d dd}
|d dd}g g g g g g g f\}}}}}}}g }tt|||D ]\}\}}}||d	d
krV nt||ddkrc nt|ttfrn|\}}|dkr|ddrd| d| }|sd| }n$d| d| d}|sd| d}n|ddrd| }nd| d}|
s|d7 }|dd d ur||d 7 }|		|}g }g }d}d}g g }} t|D ]\}!}"|"
ds||"}#||#7 }|dgt|# 7 }q|"dddd}"|"
dr|"dd  }"|"
dr|}"zt }$t|"fd|ji|}%t }&|&|$ d|d< W n" tyO }' ztdt|' d t   W Y d }'~'nd }'~'ww t|%|d!d"|dd#\}} t }(|(|& d|d$< |   |j |j d% |d&< | jrd| d  d' d( d(  })d|)d' d( d(  })|)d d( d }n| d  }dg| }*t|}||*7 }|dgt|* 7 }q||t| g7 }||g7 }d)gt| }+| d*}||},|| }|||, 7 }||+|, 7 }||7 }t|dkr	||dd d d d f  ||  qCtj|tj d+}tjdgt| tj!d+}-tj|tj d+}tj|tj"d+}tj|tj!d+}tj|tj!d+}tj|tj d+}tj|,tj d+},t|dkrmtj#j$j%j&|dd,d-}tj#j$j%j&|ddd-} ng }g } || |d d d f |d. |d d d f |d. |-d. ||d d d f |,d d d f d/
}.|.S )0Nr   r   r   z)(<\|startofspeech\|>.*?<\|endofspeech\|>)Tr/   do_think
sys_promptmultiturn_num_max   max_token_lengthi  r   infer_with_assistant_inputFz<|im_start|>system
z<|im_end|>
<|im_start|>user
z<|im_start|>user
z!<|im_end|>
<|im_start|>assistant
z<think>

</think>

	prev_textr%   z<|startofspeech|> <|endofspeech|>!r8   fsz0.3f	load_datazLoading wav failed! ry   	data_typesound)r   frontendis_finalextract_feat  batch_data_time      r   z
<|im_end|>r   g        )batch_firstpadding_valuer   )
ro   rp   ru   rt   rx   rq   rr   rs   
source_ids
target_ids)'recompiler@   r   zipr   
isinstancelisttuplesplit
startswithr   replacetimeperf_counterr	   r   r   rW   r   r   r   r   r   r   r   frame_shiftlfr_nr,   r   rT   tensorint64int32float32nnutilsrnnpad_sequence)/r^   r   	tokenizerr   	meta_datar_   r   r   r   patternr   r   rq   r   fbank
fbank_lensru   rt   rx   input_source_idsr   system_promptuser_prompt
target_outr   source_inputsplitsr   fbank_mask_ifake_token_len_ifbank_beg_iro   rp   ksub_str	sub_tokentime1data_srctime2r   time3r   
fake_tokensource_maskr   rr   outputr=   r=   rn   data_load_speech4  s  
	 






*








zFunASRNano.data_load_speechkeyc                  K   s  i }| dddkrtd| |d }| j|||fd|i|}	t|	|d }
|
d }t|dkrd|v rDd	|v rD|d }|d	 }nC|
d
 d d df }| ddr[|tj}n| ddrg|tj	}| 
||\}}| ||\}}||d< ||d< ||d< ||d< |
d }|
d }|
d }|
d }| dds|}d||dk < | jj |}|j\}}}d||dk < d||dk < d}t|D ]}t|jd D ]}|||f  }|dkr`|||f }||d |d d f }z|||||| d d f< W n^ ty[ } zQtt| dt   td| d|j d| d| d|j d| d| d|  ||  }||d |d d f }|||||| d d f< W Y d }~nd }~ww |d7 }qq|||
||fS ) Nr   r8   z!batch decoding is not implementedr   r  r   ro   audio_embeddingaudio_embedding_lensrp   r   Fr   r   r   audio_adaptor_outaudio_adaptor_out_lensrq   r   rt   rx   teacherforcingry   rz   r{   r|   r}   z, adaptor_out: z, adaptor_out_lens: r~   r   )r@   NotImplementedErrorr   r  r   r   rM   rT   float16bfloat16r   r   r   r"   rO   rQ   r   r   r   rW   r   r   r   r   rX   ) r^   data_indata_lengthsr  r   r   r_   r  r   r  batchro   r   r   rp   adaptor_outadaptor_out_lensrq   r   rt   rx   r   r   r   r   r   r   r   r   r   r   r   r=   r=   rn   inference_prepare  s   	


6
zFunASRNano.inference_prepareThotwordslanguageitnc                 C   sd   t |dkrd|}d}|d| d7 }nd}|d u r!|d7 }n|d| 7 }|s.|d	7 }|d
 S )Nr   ry   u   请结合上下文信息，更加准确地完成语音转写任务。如果没有相关信息，我们会留空。


**上下文信息：**


u   热词列表：[z]
r   u   语音转写u   语音转写成u   ，不进行文本规整u   ：)r   join)r^   r(  r)  r*  promptr=   r=   rn   
get_prompt&  s   

zFunASRNano.get_promptr,  r   c                 C   sf   t |trdddd| d| dddddgS t |tjr1dddd| d	|d
dddgS d S )Nr   zYou are a helpful assistant.)r   r   r   z<|startofspeech|>!r   r   nullz"<|startofspeech|>!!<|endofspeech|>)r   r   r   )r   r   rT   Tensor)r^   r,  r   r=   r=   rn   generate_chatml5  s   
zFunASRNano.generate_chatmlc              	      s    |dg |dd |ddfdd|D }|d u rAg }|D ]}tjtj  |dd fd	d
tdD   q%j|f||||d|S )Nr(  r)  r*  Tc                    s   g | ]}  |qS r=   )r0  ).0r   )r,  r^   r=   rn   
<listcomp>S  s    z(FunASRNano.inference.<locals>.<listcomp>	rand_key_r   c                 3   s    | ]}t  V  qd S r   )randomchoice)r1  rb   )charsr=   rn   	<genexpr>Y  s    z'FunASRNano.inference.<locals>.<genexpr>   )r#  r  r   r   )	r-  r@   stringascii_lettersdigitsr   r+  r   inference_llm)r^   r"  r#  r  r   r   r_   rb   r=   )r6  r,  r^   rn   	inferenceG  s&   	(zFunASRNano.inferencec           /   	   K   s  | j |||||fi |\}}}	}
}g }| jd ur|d }|d }| ||\}}| j|}| \}}}t|d ttfrC|d }t||k rM|| }t	|D ]:}||d || 
 d d f }|jdd}tj|dd}|| jk}||  }| j|}||| ||d qQ|dd}|dkr|d	d
rd	n|}|dd
rdn|}t|ddj}tj|dv r|nd|dkrdnd
t| d |d d }| jt| | _|t| }|di }|dd
s|	dd }| jjd;|||dd| jjjp	| jjjd|} |j| |dddd }!d }"nK|	d }#d|#|#dk< |	dd }| jd;|||#| jjjp?| jjjd|}$t|$jdd d |
j d d f }%|j|%d
|dddd }!|$j!
 }"W d    n	1 suw   Y  |d d!|! }!d }&|d"d urt"| d#st#|d"| _$| j$d d$ }&g }'t%&d%d!|!}(|d t%&d&d'|!'d(d'|(|d)})|"d ur|"|)d*< |'|) t(||'D ]v\}*}+|*d+ 'd,d!|+d-< tj)| j*|+d- tj+d.},t,|*d/ |,| j|+d0< tj)| j*|+d+ tj+d.},t,|*d/ |,| j|+d1< |+d1 |+d0 fD ],}-|-D ]&}.| j|.d2 g|.d2< |.d3 d4 d5 d6 |.d3< |.d7 d4 d5 d6 |.d7< qqq|&d url|!'d8d'|&d+ |d < |'d8d'|&d9 |d < |(|&d: |d < |'|fS )<Nr   r   r   r%   )dim)r  text
ctc_logitsr)   r   r   Fr   r   r   r   r3   Tr   r   
llm_kwargsr  rr   
max_lengthi   )r   rr   max_new_tokenspad_token_idskip_special_tokens)rE  rs   r   )r   rr   r   rD  r8   )add_special_tokensrE  r   r   
output_dirwriter
best_recogz[^\w\s\u3000\u4e00-\u9fff]+z\s+ z/sil)r  r?  text_tnlabelr   r?  z<|nospeech|>ctc_textr   r@  ctc_timestamps
timestampstoken
start_time   
   r   end_time
rL  rK  r=   )-r'  r-   rY   log_softmaxr   r   r   r   r   r   r   r   rT   unique_consecutiver7   tolistr.   decoder   r@   r   r   r   rN   r   rM   generatere   rD  eos_token_idbatch_decoder   rQ   r   rC   r   rH  r   subr   r   r   r   r   r   )/r^   r"  r#  r  r   r   r_   r   r   r$  r   r  ctc_resultsr   r   decoder_outdecoder_out_lensr@  bndr   r   yseqmask	token_intr?  r)   r   rL  rA  rr   generated_idsresponser   rs   r   r   ibest_writerresultsresponse_cleanresult_i
ctc_resultresultr   rO  	timestampr=   r=   rn   r<  d  s   	




$

,


zFunASRNano.inference_llmr"   c                 K   s.   ddl m} |jd| dd|\} }| |fS )Nr   r   T)r"   trust_remote_coder=   )rB   r    build_model)r"   r_   r    r=   r=   rn   rJ     s   zFunASRNano.from_pretrained)NNNNNNr   F)NNNNNNN)NNNN)NTr   )__name__
__module____qualname__r   dictr[   boolr?   rT   r/  r   r   r   r   r  r   r'  r-  r   r0  r=  r<  staticmethodrJ   __classcell__r=   r=   rl   rn   r      s    	 
o 
Y
 
 )&rW   r\   r4  r   r9  r   r   typingr   rT   torch.nnr   funasr.metrics.compute_accr   funasr.registerr   funasr.train_utils.device_funcsr   r   funasr.utils.datadir_writerr   funasr.utils.load_utilsr   r	   transformersr
   r   rY   r   tools.utilsr   r!  r   r   rN   registerModuler   r=   r=   r=   rn   <module>   s*    
