o
    ´©i¢Y  ã                   @   s>  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZ	d dl
mZmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlm Z  d dl!m"Z"m#Z# d dl$m%Z%m&Z& eej'ƒedƒkrˆd dl(m)Z) neddd„ƒZ)e *dd¡G dd„ deƒƒZ+dS )é    N)ÚDictÚTuple)Úcontextmanager)ÚLooseVersion)Útables)Úpostprocess_utils)Úth_accuracy)Ú
Paraformer)ÚDatadirWriter)Ú
Hypothesis)Úforce_gatherable)Úadd_sos_eos)Úmake_pad_maskÚpad_list)Úload_audio_text_image_videoÚextract_fbankz1.6.0)ÚautocastTc                 c   s    d V  d S ©N© )Úenabledr   r   ú]/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/contextual_paraformer/model.pyr   #   s   €
r   Úmodel_classesÚContextualParaformerc                       sÌ   e Zd ZdZ‡ fdd„Zdejdejdejdejdeejee	ejf ejf f
d	d
„Z
dejdejdejdejdejdejfdd„Zdd„ Z		d dd„Z				d!defdd„Zd"dd„Zdd„ Z‡  ZS )#r   z 
    Author: Speech Lab of DAMO Academy, Alibaba Group
    FunASR: A Fundamental End-to-End Speech Recognition Toolkit
    https://arxiv.org/abs/2305.11013
    c           	         s  t ƒ j|i |¤Ž | dd¡| _| dd¡}| dd¡}| dd¡}| d	d
¡}| dd
¡}| dd
¡}|dkrNtjj||dd|d| _tj | j	|¡| _
n|dkr\tj | j	|¡| _
nt d |¡¡ | jdkrrd | _g | _d| _|| _|| _| jdkrƒtj ¡ | _|| _d S )NÚtarget_buffer_lengthéÿÿÿÿÚ	inner_dimé   Úbias_encoder_typeÚlstmÚuse_decoder_embeddingFÚcrit_attn_weightç        Úcrit_attn_smoothÚbias_encoder_dropout_rateé   T)Úbatch_firstÚdropoutÚmeanzUnsupport bias encoder type: {}r   )ÚsuperÚ__init__Úgetr   ÚtorchÚnnÚLSTMÚbias_encoderÚ	EmbeddingÚ
vocab_sizeÚ
bias_embedÚloggingÚerrorÚformatÚhotword_bufferÚlength_recordÚcurrent_buffer_lengthr   r    ÚL1LossÚ	attn_lossr"   )	ÚselfÚargsÚkwargsr   r   r   r    r"   r#   ©Ú	__class__r   r   r)   0   s2   
ÿ


zContextualParaformer.__init__ÚspeechÚspeech_lengthsÚtextÚtext_lengthsÚreturnc                 K   s¤  |  ¡ }|  ¡ }|jd }| d¡}| d¡}|  ||¡\}	}
d\}}tƒ }| jdkrE|  |	|
||¡\}}|dur=| ¡ nd|d< ||d< |  |	|
||||¡\}}}}}}| jdkrb||| j	  }n| j| d	| j |  || j	  }|dur†||| j
  }| ¡  ¡ |d
< |durŽ| ¡ nd|d< ||d< ||d< ||d< |dur¨| ¡  ¡ nd|d< t | ¡ ¡|d< | jrÁt|| j  ¡ ƒ}t|||f|jƒ\}}}|||fS )zäFrontend + Encoder + Decoder + Calc loss

        Args:
                speech: (Batch, Length, ...)
                speech_lengths: (Batch, )
                text: (Batch, Length)
                text_lengths: (Batch,)
        r   Úhotword_padÚhotword_lengths©NNr!   NÚloss_ctcÚcer_ctcr$   Ú
loss_idealÚloss_attÚaccÚcerÚwerÚloss_preÚloss)ÚsqueezeÚshaper*   ÚencodeÚdictÚ
ctc_weightÚ_calc_ctc_lossÚdetachÚ_calc_att_clas_lossÚpredictor_weightr    Úcpur+   ÚcloneÚlength_normalized_lossÚintÚpredictor_biasÚsumr   Údevice)r:   r?   r@   rA   rB   r<   Ú
batch_sizerD   rE   Úencoder_outÚencoder_out_lensrG   rH   ÚstatsrJ   Úacc_attÚcer_attÚwer_attrN   rI   rO   Úweightr   r   r   ÚforwardS   sL   



ÿÿ
ÿþÿ
zContextualParaformer.forwardra   rb   Úys_padÚys_pad_lensrD   rE   c                 C   sê  t || d¡dd d …d d d …f   |j¡}| jdkr-t|| j| j| jƒ\}}|| j }| j	|||| jd\}	}
}}| j
rE| j |¡}n|  |¡}|  |¡\}\}}t d|jd ¡ ¡ }||dd„ | ¡  ¡  ¡ D ƒf }| d¡ |jd dd¡ |j¡}d }| jdkr”|  |||||	|¡\}}n|	}| j|||||d}|d |d }}	 d }|d u r²|}|  ||¡}t| d	| j¡|| jd
}|  | |
¡|
¡}| jsÖ| j d u rÛd\}}n|j!d	d}|   | ¡ | ¡ ¡\}}||||||fS )Nr$   ©Úmaxlen)Ú	ignore_idr   c                 S   s   g | ]}|d  ‘qS )r$   r   ©Ú.0Úir   r   r   Ú
<listcomp>»   ó    z<ContextualParaformer._calc_att_clas_loss.<locals>.<listcomp>r!   ©Úcontextual_infor   )Úignore_labelrF   ©Údim)"r   ÚsizeÚtor_   r]   r   ÚsosÚeosrm   Ú	predictorr   ÚdecoderÚembedr1   r.   ÚnpÚarangerQ   ÚtolistrV   rY   rP   ÚrepeatÚsampling_ratioÚsamplerÚcriterion_attr   Úviewr0   Úcriterion_preÚtype_asÚtrainingÚerror_calculatorÚargmax)r:   ra   rb   ri   rj   rD   rE   Úencoder_out_maskÚ_Úpre_acoustic_embedsÚpre_token_lengthÚhw_embedÚ_indÚselectedrt   Údecoder_out_1stÚsematic_embedsÚdecoder_outsÚdecoder_outrI   rJ   rd   rN   re   rf   Úys_hatr   r   r   rW   Ÿ   sf   $
þ


ÿ
""

ú	û	ý
z(ContextualParaformer._calc_att_clas_lossc                 C   s¸  t || ¡ dd d …d d …d f   |j¡}||d d …d d …df  }| jr.| jjj| }n| j |¡}t	 
¡ ‡ | j|||||d}	|	d |	d }
}|
 d¡}| | j¡}| d¡}||k|@  d¡}t	 |¡}| ¡ \}}t|ƒD ].}|| ||  ¡   ¡ | j  ¡ }|dkr¢|| jdt	 || ¡d |…  |j¡dd qt| d¡}| | d¡}| d¡ |j¡}W d   ƒ n1 sÂw   Y  | | d¡| |d¡ }|| |
| fS )	Nrk   r   rs   r$   r   )rw   ÚindexÚvalueFé   )r   Úmaxry   r_   Úshare_embeddingr}   Úoutput_layerrg   r~   r+   Úno_gradr‹   Únerm   r^   Ú	ones_likerx   ÚrangeÚfloatrƒ   ÚlongÚscatter_ÚrandpermÚeqÚmasked_fillÚ	unsqueeze)r:   ra   rb   ri   rj   rŽ   rt   Útgt_maskÚys_pad_embedr•   r–   r   Úpred_tokensÚnonpad_positionsÚseq_lensÚsame_numÚ
input_maskÚbszÚseq_lenÚliÚ
target_numÚinput_mask_expand_dimr”   r   r   r   r„   õ   sZ   $	ÿ
û


þÿû€
ãÿ
þzContextualParaformer.samplerNç      ð?c                 C   s,  |d u r9t  dg¡ ¡  |j¡g}t|dƒ}| jr | j |¡}n|  	|¡}|  
|¡\}\}	}
|	 |jd dd¡}nCdd„ |D ƒ}tdd„ |D ƒdƒ |j¡}| jrX| j |¡}n|  	|¡}t jjjj||ddd}|  
|¡\}
\}	}
|	 |jd dd¡}| j||||||d	}|d }t j|d
d}||fS )Nr$   r   c                 S   s   g | ]}t |ƒ‘qS r   )Úlenrn   r   r   r   rq   =  rr   zCContextualParaformer.cal_decoder_with_predictor.<locals>.<listcomp>c                 S   s   g | ]	}t  |¡ ¡ ‘qS r   )r+   ÚTensorr£   rn   r   r   r   rq   >  s    TF)r%   Úenforce_sorted)rt   Ú
clas_scaler   rv   )r+   r·   r£   ry   r_   r   r   r}   r~   r1   r.   r‚   rQ   r,   ÚutilsÚrnnÚpack_padded_sequenceÚlog_softmax)r:   ra   rb   r”   rj   Úhw_listr¹   Úhw_list_padr   Úh_nr   Ú
hw_lengthsr•   r–   r   r   r   Úcal_decoder_with_predictor*  s>   	

ÿ

ÿú	z/ContextualParaformer.cal_decoder_with_predictorÚkeyc           .   
      s  |  dd¡dkoˆ jd k}|  dd¡dko|  dd ¡d u}ˆ jd u r:|s&|r:t d¡ ˆ jd/i |¤Ž |  dd¡ˆ _i }	t ¡ }
t	||j
|  d	d
¡d}t ¡ }||
 d›|	d< t||  dd¡|d\}}t ¡ }|| d›|	d< | ¡  ¡ |j |j d |	d< |j|d d}|j|d d}ˆ j|  dd ¡||dˆ _ˆ  ||¡\}}t|tƒr¯|d }ˆ  ||¡}|d |d |d |d f\}}}}| ¡  ¡ }t |¡dk rÖg S ˆ j||||ˆ j|  dd¡d}|d |d }}g }| ¡ \}}}t|ƒD ]}||d || …d d …f }||d || …d d …f } ˆ jd ur8ˆ j|| |  dd¡|  dd¡d }!|!d ˆ j… }!n.| jd!d"}"| jd!d"d }#tj|#d!d"}#tjˆ j g|" !¡  ˆ j"g |"j#d}"t$|"|#d#g}!t%|!ƒD ]–\}$}%d }&|  d$¡d urt&ˆ d%ƒs†t'|  d$¡ƒˆ _(ˆ j(|$d › d& }&d!}'t|%j)t*ƒr¡|%j)d|'… }(n	|%j)d|'…  !¡ }(t*t+‡ fd'd(„|(ƒƒ}(|d uró| ,|(¡})| -|)¡}*t. /|)¡\}+},|| |+d)œ}-|&d uròd* 0|)¡|&d+ || < |*|&d, || < |+|&d- || < n|| |(d.œ}-| 1|-¡ qjqü||	fS )0NÚdecoding_ctc_weightr!   gñhãˆµøä>Ú	lm_weightÚlm_filezenable beam_searchÚnbestr$   Úfsi€>  )rÈ   Úaudio_fsz0.3fÚ	load_dataÚ	data_typeÚsound)rË   ÚfrontendÚextract_featiè  Úbatch_data_timer_   )r_   Úhotword)Ú	tokenizerrÍ   r   rš   é   r¹   rµ   )r¾   r¹   ÚmaxlenratioÚminlenratio)ÚxÚ	am_scoresrÓ   rÔ   r   rv   )ÚyseqÚscoreÚ
output_dirÚwriterÚ
best_recogc                    s   | ˆ j ko| ˆ jko| ˆ jkS r   )r{   rz   Úblank_id)rÕ   ©r:   r   r   Ú<lambda>Ë  s    z0ContextualParaformer.inference.<locals>.<lambda>)rÃ   rA   ú ÚtokenrA   Útext_postprocessed)rÃ   Ú	token_intr   )2r*   ÚctcÚbeam_searchr2   ÚinfoÚinit_beam_searchrÇ   ÚtimeÚperf_counterr   rÈ   r   r^   ÚitemÚframe_shiftÚlfr_nry   Úgenerate_hotwords_listÚhotword_listrR   Ú
isinstanceÚtupleÚcalc_predictorÚroundr£   r+   r›   rÂ   rx   r¡   r‹   Útensorrz   r   r{   r_   r   Ú	enumerateÚhasattrr
   rÚ   r×   ÚlistÚfilterÚ
ids2tokensÚtokens2textr   Úsentence_postprocessÚjoinÚappend).r:   Údata_inÚdata_lengthsrÃ   rÑ   rÍ   r<   Ú
is_use_ctcÚ	is_use_lmÚ	meta_dataÚtime1Úaudio_sample_listÚtime2r?   r@   Útime3ra   rb   Úpredictor_outsrŽ   r   ÚalphasÚpre_peak_indexr•   r–   rj   ÚresultsÚbÚnÚdrp   rÕ   rÖ   Ú
nbest_hypsr×   rØ   Ú	nbest_idxÚhypÚibest_writerÚlast_posrâ   rà   rA   rá   r   Úresult_ir   rÝ   r   Ú	inferenceX  s¼   ÿ
ÿ
ÿÿÿ
ü
ú

ü$ÿÿ



€Ý%zContextualParaformer.inferencec                 C   sê  dd„ }dd„ }d }|j d ur*tj |j ¡}tj |d¡}tj |¡r(||ƒ}nd }|d u r2d }	|	S tj |¡r˜| d¡r˜t d¡ g }	g }
t	 
|d¡9}| ¡ D ] }| ¡ }| ¡ }|d urd|||ƒ}|
 |¡ |	 | |¡¡ qQ|	 | jg¡ |
 d	¡ W d   ƒ n1 sˆw   Y  t d
 ||
¡¡ |	S | d¡r%t d¡ t ¡ j}tj |¡s³t |¡ tj |tj |¡¡}t |¡}t
|dƒ |j¡ |}g }	g }
t	 
|d¡9}| ¡ D ] }| ¡ }| ¡ }|d urð|||ƒ}|
 |¡ |	 | |¡¡ qÝ|	 | jg¡ |
 d	¡ W d   ƒ n	1 sw   Y  t d
 ||
¡¡ |	S | d¡sqt d¡ g }	g }
| ¡  ¡ D ] }|
 |¡ | ¡  ¡ }|d urQ|||ƒ}|	 | |¡¡ q:|	 | jg¡ |
 d	¡ t d |
¡¡ |	S d }	|	S )Nc                 S   s†   i }t | tƒs	J ‚t| ddd)}| ¡ }|D ]}| ¡  ¡ }|d }|dd … }d |¡||< qW d   ƒ |S 1 s<w   Y  |S )NÚrÚutf8)Úencodingr   r$   rß   )rî   ÚstrÚopenÚ	readlinesÚstripÚsplitrú   )Úseg_dict_fileÚseg_dictÚfÚlinesÚlineÚsrÃ   r™   r   r   r   Úload_seg_dictâ  s   ü
þùzBContextualParaformer.generate_hotwords_list.<locals>.load_seg_dictc                 S   s†   t  d¡}d}| D ]3}| ¡ }||v r||| d 7 }q	| |¡r8|D ]}||v r2||| d 7 }q#|d7 }q#q	|d7 }q	| ¡  ¡ S )Nz^[\u4E00-\u9FA50-9]+$Ú rß   z<unk> )ÚreÚcompileÚlowerÚmatchr  r  )Útxtr  ÚpatternÚout_txtÚwordÚcharr   r   r   Úseg_tokenizeî  s   


ü
zAContextualParaformer.generate_hotwords_list.<locals>.seg_tokenizer  z.txtz.Attempting to parse hotwords from local txt...r  z<s>z9Initialized hotword list from file: {}, hotword list: {}.Úhttpz(Attempting to parse hotwords from url...Úwbz&Attempting to parse hotwords as str...zHotword list: {}.)Ú	cmvn_fileÚosÚpathÚdirnamerú   ÚexistsÚendswithr2   rå   Úcodecsr  r  r  r  rû   Ú
tokens2idsrz   r4   Ú
startswithÚtempfileÚTemporaryDirectoryÚnameÚmakedirsÚbasenameÚrequestsr*   ÚwriteÚcontent)r:   Úhotword_list_or_filerÑ   rÍ   r!  r,  r  Ú	model_dirr  rí   Úhotword_str_listÚfinr  Úhwr¾   Úwork_dirÚtext_file_pathÚ
local_filer   r   r   rì   á  sš   

@Â


÷
ÿÿ0Ö





÷
ÿÿñ




ÿz+ContextualParaformer.generate_hotwords_listc                 K   s2   d|vrd|d< ddl m} |dd| i|¤Ž}|S )NÚmax_seq_leni   r$   )Úexport_rebuild_modelÚmodelr   )Úexport_metarI  )r:   r<   rI  Úmodelsr   r   r   ÚexportL  s
   zContextualParaformer.export)Nrµ   )NNNNrF   )Ú__name__Ú
__module__Ú__qualname__Ú__doc__r)   r+   r·   r   r   r  rh   rW   r„   rÂ   rõ   r  rì   rM  Ú__classcell__r   r   r=   r   r   (   sR    #þýüû
ùLþýüûú
ùV;
ù1ú
ü 

k)T),r0  r#  rç   r+   r5  r2   r8  r=  Únumpyr   Útypingr   r   Ú
contextlibr   Údistutils.versionr   Úfunasr.registerr   Úfunasr.utilsr   Úfunasr.metrics.compute_accr   Úfunasr.models.paraformer.modelr	   Úfunasr.utils.datadir_writerr
   Úfunasr.models.paraformer.searchr   Úfunasr.train_utils.device_funcsr   Ú+funasr.models.transformer.utils.add_sos_eosr   Ú*funasr.models.transformer.utils.nets_utilsr   r   Úfunasr.utils.load_utilsr   r   Ú__version__Útorch.cuda.ampr   Úregisterr   r   r   r   r   Ú<module>   s8   
