o
    ic                     @   s~  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z
d dlmZmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dl m!Z! d dl"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z)m*Z* d dl+m,Z,m-Z- eej.edkrd dl/m0Z0 nedddZ0e1ddG dd de!eZ2edddd Z3dS )    N)DictTuple)contextmanager)LooseVersion)	lru_cache)tables)postprocess_utils)
Paraformer)DatadirWriter)
Hypothesis)force_gatherable)BiCifParaformer)LabelSmoothingLoss)add_sos_eos)ts_prediction_lfr6_standard)make_pad_maskpad_list)load_audio_text_image_videoextract_fbankz1.6.0)autocastTc                 c   s    d V  d S N )enabledr   r   X/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/seaco_paraformer/model.pyr   '   s   
r   model_classesSeacoParaformerc                       s   e Zd ZdZ fddZdejdejdejdejdeejee	ejf ejf f
d	d
Z
dd Zdd ZdejdejdejdejdejdejdejfddZ		d&ddZdd Z				d'defd d!Zd(d"d#Zd$d% Z  ZS ))r   z
    Author: Speech Lab of DAMO Academy, Alibaba Group
    SeACo-Paraformer: A Non-Autoregressive ASR System with Flexible and Effective Hotword Customization Ability
    https://arxiv.org/abs/2308.03266
    c           
         s  t  j|i | |dd| _|dd| _|dd}|dd}|d	d}|d
d}| jdkrVtjj| j| jdd||d| _|rRtj	| jd | j| _
nd | _
n| jdkrftj| j| j| _n	td| j |dd }|d ur|d}tj|}	|	d| j| jd|| _tj	| j| j| _t| j| j||d| _|dd| _|dd| _|dd| _|d| _d S )N	inner_dim   bias_encoder_typelstmbias_encoder_dropout_rate        bias_encoder_bidFseaco_lsm_weightseaco_length_normalized_lossT   )batch_firstdropoutbidirectionalmeanzUnsupport bias encoder type: {}seaco_decoderseaco_decoder_conf)
vocab_sizeencoder_output_size)sizepadding_idx	smoothingnormalize_lengthtrain_decoderseaco_weightg{Gz?NO_BIASi   	predictorr   )super__init__getr   r   torchnnLSTMbias_encoderLinear	lstm_proj	Embeddingr,   
bias_embedloggingerrorformatr   decoder_classesr*   hotword_output_layerr   	ignore_idcriterion_seacor2   r3   r4   predictor_name)
selfargskwargsr    r"   r#   r$   r*   r+   seaco_decoder_class	__class__r   r   r7   4   sV   


zSeacoParaformer.__init__speechspeech_lengthstexttext_lengthsreturnc              	   K   s  t | dkr|dddf }t | dkr |dddf }|jd |jd   kr;|jd   kr;|jd ksHn J |j|j|j|jf|d}|d}|d}t | dkrg|dddf }|jd }	|ddd| f }|ddd| f }| ||\}
}| jdkrt|| j| j	| j
\}}|| j }t }| |
||||||}| jr| |
|||\}}}}}||| j  }t| |d< ||d< n|}t| |d	< t| |d
< | jr|| j  }	t|||	f|j\}}}|||fS )zFrontend + Encoder + Decoder + Calc loss

        Args:
                speech: (Batch, Length, ...)
                speech_lengths: (Batch, )
                text: (Batch, Length)
                text_lengths: (Batch,)
           Nr   hotword_padhotword_lengthsseaco_label_padloss_attacc_att
loss_seacoloss)lenr.   shaper8   maxencodepredictor_biasr   soseosrF   dict_calc_seaco_lossr2   _calc_att_lossr3   r9   clonedetachlength_normalized_losssumr   device)rI   rO   rP   rQ   rR   rK   rU   rV   rW   
batch_sizeencoder_outencoder_out_lens_ys_pad
ys_lengthsstatsrZ   rX   rY   r[   weightr   r   r   forwardl   sV   :





	

zSeacoParaformer.forwardc                 C   s   || S r   r   )rI   cif_attendeddec_attendedr   r   r   _merge   s   zSeacoParaformer._mergec                 C   sN   t ||ddd d d d d f  |j}| j|d || jd}|d d S )NrT   maxlenrF      )r   r.   torj   r5   rF   )rI   rl   rm   encoder_out_maskpredictor_outsr   r   r   calc_predictor   s   $
zSeacoParaformer.calc_predictorrl   rm   ro   rp   rU   rV   rW   c                 C   s  t ||ddd d d d d f  |j}| j|||| jdd }	| j|||	|dd\}
}| ||}|d	|j
d dd|j}|j
d }t|g 	|j
d |j}| |||	|\}}| |||
|\}}| ||}| |d d d df }| ||}|S )NrT   rw   ry   r   T)return_hidden)r   r.   r{   rj   r5   rF   decoder_hotword_representationsqueezerepeatr]   r9   Tensorintr*   rv   rE   rG   )rI   rl   rm   ro   rp   rU   rV   rW   r|   pre_acoustic_embedsdecoder_outrn   selectedcontextual_infonum_hot_word_contextual_lengthrt   ru   merged
dha_outputrX   r   r   r   rd      s:   $


 
"z SeacoParaformer._calc_seaco_loss2         ?c                    s  j ||||ddd\}}	}
tj|dd}|d urdd |D }dd |D }t|d|j}|t| |j}|	d
|jd d	d	|j}|jd	 }t|g 
|jd |j}|dkr||k rȈj|||	|}|d dd}t|t||d	 d	  }|}|t|d	  || }|	d
|jd d	d	|j}|jd	 }t|g 
|jd |j}||||\}}
|||	|\}}
||}|}tj|dd} fd
d}|||}|S |S )NT)r   return_bothr   dimc                 S   s   g | ]}t |qS r   )r\   .0ir   r   r   
<listcomp>      z:SeacoParaformer._seaco_decode_with_ASF.<locals>.<listcomp>c                 S   s   g | ]	}t | qS r   )r9   r   longr   r   r   r   r     s    r   rT   c                    s   t  g|jd  }|dd }|jk d}d| | }d| }|| j|| j}}||	ddd |	ddd }| | |d d d d d d f d|   }|S )Nr   r   rT   )
r9   r   r]   r^   r4   r   	unsqueezer{   rj   reshape)
dec_outputr   lmbddha_idsdha_maskablogitsr3   rI   r   r   
_merge_res9  s    *z:SeacoParaformer._seaco_decode_with_ASF.<locals>._merge_res)r   r9   log_softmaxr   r{   rj   r   r   r   r   r   r]   r*   forward_asf6ri   topkmintolistappendr\   rv   rE   )rI   rl   rm   sematic_embedsys_pad_lenshw_listnfilterr3   r   decoder_hiddenrn   decoder_pred
hw_lengthshw_list_hw_list_padr   r   r   r   hotword_scores
dec_filter
add_filterrt   ru   r   r   dha_predr   merged_predr   r   r   _seaco_decode_with_ASF   sp   	 

 


z&SeacoParaformer._seaco_decode_with_ASFc           
      C   s   | j dkr
td 	 | j|}tjjjj	||
 tjddd}| |\}}tjjjj|ddd }| jd urA| |}n|}td|jd  }||dd	 | 
  D f }	|	S )
Nr   zUnsupported bias encoder typeTF)r&   enforce_sorted)r&   r   c                 S   s   g | ]}|d  qS )rT   r   r   r   r   r   r   g  r   z;SeacoParaformer._hotword_representation.<locals>.<listcomp>)r   rA   rB   r   embedr9   r:   utilsrnnpack_padded_sequencecputypeint64r<   pad_packed_sequencer>   nparanger]   r   rg   )
rI   rU   rV   hw_embedpacked_rnn_outputrn   
rnn_output	hw_hidden_indr   r   r   r   r   J  s$   



"z'SeacoParaformer._hotword_representationNkeyc           .   
      s  | dddko jd k}| dddko| dd d u} jd u r:|s&|r:td  jd0i | | dd _i }	t }
t	||j
| d	d
d}t }||
 d|	d< t|| dd|d\}}t }|| d|	d< |  |j |j d |	d< |j|d d}|j|d d} j| dd ||d _ ||\}}t|tr|d } ||}|d |d }}|  }t|dk rg fS  j|||| jd} jdkr |||\}}}}nd }g }| \}}}t|D ]X}||d || d d f }||d || d d f } jd ur5 j||| dd| ddd} | d  j } n.|j dd}!|jddd }"tj|"dd}"tj! j"g|!#   j$g |!j%d}!t&|!|"d g} t'| D ]\}#}$d }%| d!d urt( d"st)| d! _* j*|#d  d# }%d}&t|$j+t,r|$j+d|& }'n	|$j+d|& # }'t,t- fd$d%|'}'|d urD|.|'}(|/|(})|d urt0|| d || d&  || d || d&  t11|(| d'dd(\}}*t23|(|*\}+},}|| |+|,d)}-|%d urd*4|(|%d+ || < |,|%d, || < |+|%d- || < n.t23|(\}+}|| |+d.}-|%d urCd*4|(|%d+ || < |+|%d- || < n|| |'d/}-|5|- qgq||	fS )1Ndecoding_ctc_weightr!   gh㈵>	lm_weightlm_filezenable beam_searchnbestrT   fsi>  )r   audio_fsz0.3f	load_data	data_typesound)r   frontendextract_feati  batch_data_timerj   )rj   hotword)	tokenizerr   r   )r   CifPredictorV3maxlenratiominlenratio)x	am_scoresr   r   r   r   )yseqscore
output_dirwriter
best_recogc                    s   |  j ko|  jko|  jkS r   )rb   ra   blank_id)r   rI   r   r   <lambda>  s    z+SeacoParaformer.inference.<locals>.<lambda>   
begin_time)
vad_offset)r   rQ   	timestamp tokenr   rQ   )r   rQ   )r   	token_intr   )6r8   ctcbeam_searchrA   infoinit_beam_searchr   timeperf_counterr   r   r   ri   itemframe_shiftlfr_nr{   generate_hotwords_listhotword_listr_   
isinstancetupler~   roundr   r9   r^   r   rH   calc_predictor_timestampr.   rangeargmaxtensorra   r   rb   rj   r   	enumeratehasattrr
   r   r   listfilter
ids2tokenstokens2textr   copyr   sentence_postprocessjoinr   ).rI   data_indata_lengthsr   r   r   rK   
is_use_ctc	is_use_lm	meta_datatime1audio_sample_listtime2rO   rP   time3rl   rm   r}   r   pre_token_lengthr   rn   	us_alphasus_peaksresultsr   ndr   r   r   
nbest_hypsr   r   	nbest_idxhypibest_writerlast_posr   r   rQ   r   text_postprocessedtime_stamp_postprocessedresult_ir   r   r   	inferencej  s   



	

$








6zSeacoParaformer.inferencec                 C   s  dd }d }|j d ur&tj|j }tj|d}tj|r$t|}nd }|d u r.d }|S tj|r|drt	d g }g }	t
|d9}
|
 D ] }| }| }|d ur`|||}|	| ||| qM|| jg |	d W d    n1 sw   Y  t	d||	 |S |d	r!t	d
 t j}tj|st| tj|tj|}t|}t|d|j |}g }g }	t
|d9}
|
 D ] }| }| }|d ur|||}|	| ||| q|| jg |	d W d    n	1 sw   Y  t	d||	 |S |dsmt	d g }g }	|  D ] }|	| |  }|d urM|||}||| q6|| jg |	d t	d|	 |S d }|S )Nc                 S   s   t d}d}| D ]3}| }||v r||| d 7 }q	||r8|D ]}||v r2||| d 7 }q#|d7 }q#q	|d7 }q	|  S )Nz^[\u4E00-\u9FA50-9]+$ r   z<unk> )recompilelowermatchstripsplit)txtseg_dictpatternout_txtwordcharr   r   r   seg_tokenize  s   



z<SeacoParaformer.generate_hotwords_list.<locals>.seg_tokenizer(  z.txtz.Attempting to parse hotwords from local txt...rz<s>z9Initialized hotword list from file: {}, hotword list: {}.httpz(Attempting to parse hotwords from url...wbz&Attempting to parse hotwords as str...zHotword list: {}.)	cmvn_fileospathdirnamer  existsload_seg_dictendswithrA   r   codecsopen	readlinesr%  r&  r   
tokens2idsra   rC   
startswithtempfileTemporaryDirectorynamemakedirsbasenamerequestsr8   writecontent)rI   hotword_list_or_filer   r   r-  r(  	model_dirseg_dict_filer   hotword_str_listfinlinehwr   work_dirtext_file_path
local_filer   r   r   r     s   

@



0











z&SeacoParaformer.generate_hotwords_listc                 K   s2   d|vrd|d< ddl m} |dd| i|}|S )Nmax_seq_leni   rT   )export_rebuild_modelmodelr   )export_metarP  )rI   rK   rP  modelsr   r   r   exporta  s
   zSeacoParaformer.export)r   r   )NNNN)NN)__name__
__module____qualname____doc__r7   r9   r   r   r   strrs   rv   r~   rd   r   r   r  r  r   rT  __classcell__r   r   rM   r   r   ,   sZ    8
G	
2
`#
 
`rT   )maxsizec                 C   s   i }t | ts	J t| ddd)}| }|D ]}|  }|d }|dd  }d|||< qW d    |S 1 s<w   Y  |S )Nr.  utf8)encodingr   rT   r   )r   rY  r9  r:  r%  r&  r  )rG  r(  flinesrJ  sr   valuer   r   r   r6  m  s   
r6  )T)4r2  r!  r   r  r9   r8  rA   r=  rB  numpyr   typingr   r   
contextlibr   distutils.versionr   	functoolsr   funasr.registerr   funasr.utilsr   funasr.models.paraformer.modelr	   funasr.utils.datadir_writerr
   funasr.models.paraformer.searchr   funasr.train_utils.device_funcsr   $funasr.models.bicif_paraformer.modelr   "funasr.losses.label_smoothing_lossr   +funasr.models.transformer.utils.add_sos_eosr   funasr.utils.timestamp_toolsr   *funasr.models.transformer.utils.nets_utilsr   r   funasr.utils.load_utilsr   r   __version__torch.cuda.ampr   registerr   r6  r   r   r   r   <module>   sL   
    D