o
    iX                     @   s  d dl mZmZ d dlZd dlZd dlZd dlZd dlm	  m
Z d dlmZ d dlm	Z	 d dlmZ d dlmZmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZ d dlmZ d dlmZ d dlm Z  ddl!m"Z" G dd dej	j#Z$G dd dej	j#Z%G dd de	j#Z&G dd de	j'Z'dej(dfddZ)G dd de	j#Z*e+ddG dd de	j#Z,e+dd G d!d  d e	j#Z-dS )"    )IterableOptionalN)Tensor)nn)autocast)compute_accuracyth_accuracy)LabelSmoothingLoss)force_gatherable)load_audio_text_image_videoextract_fbank)DatadirWriter)CTC)tables)
Hypothesis   )ctc_forced_alignc                   @   sF   e Zd ZdZdddZddejfdejded	ej	fd
dZ
dd ZdS )SinusoidalPositionEncoder P   皙?c                 C   s   d S N )selfd_modeldropout_rater   r   S/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/sense_voice/model.py__int__   s   z!SinusoidalPositionEncoder.__int__N	positionsdepthdtypec           
      C   s   | d}||}|j}ttjdg||d|d d  }ttj|d |d||  }t||dg}t|g dt|g d	 }tj	t
|t|gdd
}	|	|S )Nr   i'  )r    device   r   r!   )r   r$   r   )r   r   r$   dim)sizetyper!   torchlogtensorexparangereshapecatsincos)
r   r   r   r    
batch_sizer!   log_timescale_incrementinv_timescalesscaled_timeencodingr   r   r   encode   s   



z SinusoidalPositionEncoder.encodec                 C   sP   |  \}}}tjd|d |jdd d d f }| |||j|j}|| S )Nr   r#   )r'   r)   r-   r!   r7   r    to)r   xr2   	timesteps	input_dimr   position_encodingr   r   r   forward2   s   "z!SinusoidalPositionEncoder.forward)r   r   )__name__
__module____qualname____doc__r   r)   float32r   intr    r7   r=   r   r   r   r   r      s    

r   c                       s2   e Zd ZdZej f fdd	Zdd Z  Z	S )PositionwiseFeedForwardzPositionwise feed forward layer.

    Args:
        idim (int): Input dimenstion.
        hidden_units (int): The number of hidden units.
        dropout_rate (float): Dropout rate.

    c                    sF   t t|   tj||| _tj||| _tj|| _	|| _
dS )z,Construct an PositionwiseFeedForward object.N)superrD   __init__r)   r   Linearw_1w_2Dropoutdropout
activation)r   idimhidden_unitsr   rL   	__class__r   r   rF   D   s
   
z PositionwiseFeedForward.__init__c              	   C   s   |  | | | |S )zForward function.)rI   rK   rL   rH   )r   r9   r   r   r   r=   L   s   zPositionwiseFeedForward.forward)
r>   r?   r@   rA   r)   r   ReLUrF   r=   __classcell__r   r   rO   r   rD   :   s    	rD   c                       s\   e Zd ZdZ					d fdd	Zdd	d
Zdd ZdddZdddZdddZ	  Z
S )MultiHeadedAttentionSANMzMulti-Head Attention layer.

    Args:
        n_head (int): The number of heads.
        n_feat (int): The number of features.
        dropout_rate (float): Dropout rate.

    r   N      r   c              	      s   t    || dksJ || | _|| _t||| _t||d | _d| _tj	|d| _
tj|||dd|dd| _|d d }|dkrJ|| }|d | }t||fd	| _dS )
z)Construct an MultiHeadedAttention object.r      N)pr   F)stridepaddinggroupsbiasr"           )rE   rF   d_khr   rG   
linear_outlinear_q_k_vattnrJ   rK   Conv1d
fsmn_blockConstantPad1dpad_fn)r   n_headin_featn_featr   kernel_size
sanm_shfit	lora_list	lora_rank
lora_alphalora_dropoutleft_paddingright_paddingrO   r   r   rF   [   s    

z!MultiHeadedAttentionSANM.__init__c                 C   s   |  \}}}|d ur t||ddf}|d ur|| }|| }|dd}| |}| |}|dd}||7 }| |}|d urG|| }|S )Nr$   r   r"   )r'   r)   r.   	transposere   rc   rK   )r   inputsmaskmask_shfit_chunkbtdr9   r   r   r   forward_fsmn   s   


z%MultiHeadedAttentionSANM.forward_fsmnc                 C   s   |  \}}}| |}tj|t| j| j dd\}}}t|||| j| jfdd}	t|||| j| jfdd}
t|||| j| jfdd}|	|
||fS )a	  Transform query, key and value.

        Args:
            query (torch.Tensor): Query tensor (#batch, time1, size).
            key (torch.Tensor): Key tensor (#batch, time2, size).
            value (torch.Tensor): Value tensor (#batch, time2, size).

        Returns:
            torch.Tensor: Transformed query tensor (#batch, n_head, time1, d_k).
            torch.Tensor: Transformed key tensor (#batch, n_head, time2, d_k).
            torch.Tensor: Transformed value tensor (#batch, n_head, time2, d_k).

        r$   r%   r   r"   )	r'   r`   r)   splitrC   r^   r]   r.   rq   )r   r9   ru   rv   rw   q_k_vqkvq_hk_hv_hr   r   r   forward_qkv   s   
"z$MultiHeadedAttentionSANM.forward_qkvc           
      C   s   | d}|dur0|dur|| }|dd}td }|||}tj|dd|d}ntj|dd}| |}t||}	|		dd
 |d| j| j }	| |	S )	a  Compute attention context vector.

        Args:
            value (torch.Tensor): Transformed value (#batch, n_head, time2, d_k).
            scores (torch.Tensor): Attention score (#batch, n_head, time1, time2).
            mask (torch.Tensor): Mask (#batch, 1, time2) or (#batch, time1, time2).

        Returns:
            torch.Tensor: Transformed value (#batch, time1, d_model)
                weighted by the attention score (#batch, time1, time2).

        r   Nr   infr$   r%   r\   r"   )r'   	unsqueezeeqfloatmasked_fillr)   softmaxrK   matmulrq   
contiguousviewr^   r]   r_   )
r   valuescoresrs   mask_att_chunk_encodern_batch	min_valuera   p_attnr9   r   r   r   forward_attention   s$   

 
z*MultiHeadedAttentionSANM.forward_attentionc                 C   sZ   |  |\}}}}| |||}	|| jd  }t||dd}
| ||
||}||	 S )  Compute scaled dot product attention.

        Args:
            query (torch.Tensor): Query tensor (#batch, time1, size).
            key (torch.Tensor): Key tensor (#batch, time2, size).
            value (torch.Tensor): Value tensor (#batch, time2, size).
            mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
                (#batch, time1, time2).

        Returns:
            torch.Tensor: Output tensor (#batch, time1, d_model).

              r$   )r   rx   r]   r)   r   rq   r   )r   r9   rs   rt   r   r~   r   r   r}   fsmn_memoryr   att_outsr   r   r   r=      s   z MultiHeadedAttentionSANM.forwardc                 C   s  |  |\}}}}|dur|dks|dkr|dur|ddddd|d  ddf }	|ddddd|d  ddf }
tj|d |fdd}tj|d |fdd}tj|d |	fdd|d< tj|d |
fdd|d< |dkr|d dddd||d   dddf |d< |d dddd||d   dddf |d< n)|ddddd|d  ddf |ddddd|d  ddf d	}|}| |d}|| jd
  }t||dd}| ||d}|| |fS )r   Nr   r$   r"   r|   r%   r}   r   )r|   r}   r   r   )r   r)   r/   rx   r]   r   rq   r   )r   r9   cache
chunk_size	look_backr~   r   r   r}   
k_h_stride
v_h_stride	cache_tmpr   r   r   r   r   r   forward_chunk   s,   &&22$$z&MultiHeadedAttentionSANM.forward_chunk)r   NrT   rU   r   r   NNNNr   )r>   r?   r@   rA   rF   rx   r   r   r=   r   rR   r   r   rO   r   rS   Q   s    
&

&rS   c                       s$   e Zd Z fddZdd Z  ZS )	LayerNormc                    s   t  j|i | d S r   )rE   rF   )r   argskwargsrO   r   r   rF     s   zLayerNorm.__init__c                 C   sL   t | | j| jd ur| j nd | jd ur| j nd | j}||S r   )F
layer_normr   normalized_shapeweightr[   epstype_as)r   inputoutputr   r   r   r=     s   
zLayerNorm.forward)r>   r?   r@   rF   r=   rR   r   r   rO   r   r     s    r   c                 C   sf   |d u r|   }td|d| j}tj| dd}||k }| }|d ur.|||S ||S )Nr   r   r$   r%   )maxr)   r-   r8   r!   r   detachr(   )lengthsmaxlenr    r!   
row_vectormatrixrs   r   r   r   sequence_mask"  s   "r   c                       s8   e Zd Z			d fdd	ZdddZdd
dZ  ZS )EncoderLayerSANMTFr\   c	           	         sz   t t|   || _|| _t|| _t|| _t	|| _
|| _|| _|| _|| _| jr5t|| || _|| _|| _dS )z!Construct an EncoderLayer object.N)rE   r   rF   	self_attnfeed_forwardr   norm1norm2r   rJ   rK   in_sizer'   normalize_beforeconcat_afterrG   concat_linearstochastic_depth_rater   )	r   r   r'   r   r   r   r   r   r   rO   r   r   rF   .  s   


zEncoderLayerSANM.__init__Nc           
   
   C   sn  d}d}| j r| jdkrtd | jk }dd| j  }|r0|dur,tj||gdd}||fS |}| jr:| |}| jretj|| j	||||dfdd}	| j
| jkr]||| |	  }n-|| |	 }n%| j
| jkr|||| | j	||||d  }n|| | j	||||d }| js| |}|}| jr| |}||| | |  }| js| |}|||||fS )	  Compute encoded features.

        Args:
            x_input (torch.Tensor): Input tensor (#batch, time, size).
            mask (torch.Tensor): Mask tensor for the input (#batch, time).
            cache (torch.Tensor): Cache tensor of the input (#batch, time - 1, size).

        Returns:
            torch.Tensor: Output tensor (#batch, time, size).
            torch.Tensor: Mask tensor (#batch, time).

        Fg      ?r   r   Nr%   )rt   r   r$   )trainingr   r)   randitemr/   r   r   r   r   r   r'   r   rK   r   r   )
r   r9   rs   r   rt   r   
skip_layerstoch_layer_coeffresidualx_concatr   r   r   r=   I  sj   
	
	


zEncoderLayerSANM.forwardr   c                 C   s   |}| j r
| |}| j| jkr | j||||\}}|| }n| j||||\}}| j s3| |}|}| j r=| |}|| | }| j sL| |}||fS )r   )r   r   r   r'   r   r   r   r   )r   r9   r   r   r   r   ra   r   r   r   r     s    




zEncoderLayerSANM.forward_chunk)TFr\   )NNNr   )r>   r?   r@   rF   r=   r   rR   r   r   rO   r   r   -  s    
Nr   encoder_classesSenseVoiceEncoderSmallc                '       s   e Zd ZdZdddddddddd	ed
dddddddfdededededededededededee de	de	deded ed!ed"ed#ef& fd$d%Z
d&efd'd(Zd)ejd*ejfd+d,Z  ZS )-r   z
    Author: Speech Lab of DAMO Academy, Alibaba Group
    SCAMA: Streaming chunk-aware multihead attention for online end-to-end speech recognition
    https://arxiv.org/abs/2006.01713
          i      r   r   r\   conv2dTFlinearr   r$      sanm
input_sizeoutput_sizeattention_headslinear_units
num_blocks	tp_blocksr   positional_dropout_rateattention_dropout_rater   input_layerr   r   positionwise_layer_typepositionwise_conv_kernel_sizepadding_idxri   rj   selfattention_layer_typec              	      s   t    | _t | _|| _t| ft||	||f||	||ft	 fddt
dD | _t	 fddt
|d D | _t	 fddt
|D | _t| _t| _d S )Nc                    s$   g | ]}t    qS r   r   .0i)r   encoder_selfattn_layerencoder_selfattn_layer_args0r   r   positionwise_layerpositionwise_layer_argsr   r   
<listcomp>      z3SenseVoiceEncoderSmall.__init__.<locals>.<listcomp>r   c                    $   g | ]}t    qS r   r   r   r   r   encoder_selfattn_layer_argsr   r   r   r   r   r   	  r   c                    r   r   r   r   r   r   r   r     r   )rE   rF   _output_sizer   embedr   rD   rS   r   
ModuleListrange	encoders0encoderstp_encodersr   
after_normtp_norm)r   r   r   r   r   r   r   r   r   r   r   r   pos_enc_classr   r   r   r   r   ri   rj   r   r   rO   )r   r   r   r   r   r   r   r   r   rF     sR   
		

zSenseVoiceEncoderSmall.__init__returnc                 C   s   | j S r   )r   r   r   r   r   r   &  s   z"SenseVoiceEncoderSmall.output_sizexs_padilensc           	      C   s  |j d }t|||jddddddf }||  d 9 }| |}t| jD ]\}}|||}|d |d }}q(t| jD ]\}}|||}|d |d }}q@| |}|	d
d }t| jD ]\}}|||}|d |d }}qg| |}||fS )zEmbed positions in tensor.r   )r   r!   Ng      ?r   )shaper   r!   r   r   	enumerater   r   r   squeezesumrC   r   r   )	r   r   r   r   masks	layer_idxencoder_layerencoder_outsolensr   r   r   r=   )  s"   
"





zSenseVoiceEncoderSmall.forward)r>   r?   r@   rA   r   rC   r   r   strboolrF   r   r)   r   r=   rR   r   r   rO   r   r     s    		
bmodel_classesSenseVoiceSmallc                       s,  e Zd ZdZ														d4d	ed
ededededededededededededef fddZe	d5defddZ
dejdejdejdejfd d!Zdejdejdejfd"d#Zd$ejd%ejd&ejd'ejfd(d)Zd$ejd&ejfd*d+Zdd,gddfd-efd.d/Zd0d1 Zd2d3 Z  ZS )6r  z*CTC-attention hybrid Encoder-Decoder modelNr   r$   r   r   r"   Fspecaugspecaug_conf	normalizenormalize_confencoderencoder_confctc_confr   
vocab_size	ignore_idblank_idsoseoslength_normalized_lossc                    s  t    |d urtj|}|di |}|d ur'tj|}|di |}tj|}|dd|i|}| }|d u r@i }td|	|d|}|| _	|d urS|n|	d | _
|d ur^|n|	d | _|	| _|
| _|| _|| _|| _d | _|| _|| _|| _dddddd	d
d| _ddddd	d
d| _ddd| _ddd| _tjdt| j t| j || _dddddd| _t| j| j|dd| jd| _ d S )Nr   )odimencoder_output_sizer   r   rV   r      r         )autozhenyuejakonospeech)i4a  i5a  i8a  i<a  i@a  ia        )withitnwoitn)ia  ia  ia  ia  ia  ia  ia  )unkhappysadangryneutral
lsm_weightr\   )r'   r   	smoothingnormalize_lengthr   )!rE   rF   r   specaug_classesgetnormalize_classesr   r   r   r  r  r  r	  r
  r  r  r  error_calculatorctcr  r  lid_dictlid_int_dicttextnorm_dicttextnorm_int_dictr)   r   	Embeddinglenr   emo_dictr	   criterion_att)r   r  r  r  r  r  r  r  r   r	  r
  r  r  r  r  r   specaug_classnormalize_classencoder_classr  r+  rO   r   r   rF   P  sX   

zSenseVoiceSmall.__init__modelc                 K   s.   ddl m} |jd| dd|\} }| |fS )Nr   )	AutoModelT)r7  trust_remote_coder   )funasrr8  build_model)r7  r   r8  r   r   r   from_pretrained  s   zSenseVoiceSmall.from_pretrainedspeechspeech_lengthstexttext_lengthsc                 K   s  t | dkr|dddf }t | dkr |dddf }|jd }| |||\}}d\}	}
d\}}t }| |ddddddf |d |ddddf |d \}	}
| |ddddddf |ddddf \}}|	| }|	durt|		 nd|d< |durt|	 nd|d< |durt|	 nd|d< ||d	< | j
rt|d  }t|||f|j\}}}|||fS )
zEncoder + Decoder + Calc loss
        Args:
                speech: (Batch, Length, ...)
                speech_lengths: (Batch, )
                text: (Batch, Length)
                text_lengths: (Batch,)
        r   Nr   r   r   loss_ctc	loss_richlossacc_rich)r1  r'   r   r7   dict_calc_ctc_loss_calc_rich_ce_lossr)   cloner   r  rC   r   r
   r!   )r   r=  r>  r?  r@  r   r2   encoder_outencoder_out_lensrA  cer_ctcrB  rD  statsrC  r   r   r   r   r=     s,   
66
zSenseVoiceSmall.forwardc                    s2   j dur jr  ||\}} jdur ||\}}t fdd|dddf D |j} |}t fdd|dddf D |j} |}tj||fdd}|d7 } tdd	gg|j	|
ddd}	tj||	fdd}
tj|
|fdd}|d7 } ||\}}||fS )
zFrontend + Encoder. Note that this method is used by asr_inference.py
        Args:
                speech: (Batch, Length, ...)
                speech_lengths: (Batch, )
                ind: int
        Nc                    s<   g | ]}t d dkrt| jv r jt| ndgqS )r   g?r   )r)   r   rC   r-  )r   lidr   r   r   r     s    z*SenseVoiceSmall.encode.<locals>.<listcomp>r   c                    s   g | ]
} j t| gqS r   )r/  rC   )r   styler   r   r   r     s    rV   r   r%   r"   )r  r   r  r)   
LongTensorr8   r!   r   r/   repeatr'   r  )r   r=  r>  r?  r   lidslanguage_querystylesstyle_queryevent_emo_queryinput_queryrI  rJ  r   r   r   r7     s8   



zSenseVoiceSmall.encoderI  rJ  ys_padys_pad_lensc                 C   sR   |  ||||}d }| js%| jd ur%| j |j}| j| | dd}||fS )NT)is_ctc)r+  r   r*  argmaxdatacpu)r   rI  rJ  rW  rX  rA  rK  ys_hatr   r   r   rF    s   zSenseVoiceSmall._calc_ctc_lossc                 C   sB   | j |}| || }t|d| j| | jd}||fS )Nr$   )ignore_label)r+  ctc_lor3  r   r   r   r	  r
  )r   rI  rW  decoder_outrB  rD  r   r   r   rG    s   z"SenseVoiceSmall._calc_rich_ce_losswav_file_tmp_namekeyc           7   	   K   s  i }t |tjr1|dddkr1||}}	t|jdk r'|d d d d d f }|	d u r0|jd }	nKt }
t||j	|dd|dd|d}t }||
 d	|d
< t
||dd|d\}}	t }|| d	|d< |	  |j |j d |d< |j|d d}|	j|d d}	|dd}| t|| jv r| j| ndgg|j|ddd}|dd}|dd }|dd}|d u r|rdnd}| t| j| gg|j|ddd}tj||fdd}|	d7 }	| tddgg|j|ddd}tj||fdd}tj||fdd}|	d7 }	| ||	\}}t |tr3|d }| j|}|ddrQtd |d d d d | jd f< g }| \}}}t |d ttfrh|d }t||k rs|| }t|D ]k}||d ||  d d f }|j dd}tj!|dd}d } |d d urt"| d!st#|d | _$| j$d" } || j%k}!||! & }"|'|"}#| d ur|#| d# || < |rdd$l(m)}$ g }%|*|#d%d  }&|+|&}'g }(|'D ]})|)r|(,|) q|(-d& qt|(dkr|| |#d'}*|-|* qw| j.||d%||  d d f }+|+ d/ },d|+|,| j%k| j%f< t0|+1d t|(1d2 |+j|| d% 2 t3t|(1d2 |+j| j4d(}-|$|-dd || f },d}.d}/|| d% }0|,D ]>\}1}2|.tt|2 }3|1dkrt5|.d) d* d d}4t6|3d) d* d |0d) d* d }5|%-|&|/ |4|5g |/d7 }/|3}.q| 7|%\}%}6|| |#|%|6d+}*|-|* qw|| |#d'}*|-|* qw||fS ),N	data_typesoundfbankrV   r   fsi>  )rf  audio_fsrc  	tokenizerz0.3f	load_data)rc  frontendextract_feat  batch_data_timer!   r#   languager  r   use_itnF	text_normoutput_timestampr  r  r%   r"   ban_emo_unkr   r  r$   
output_dirwriter1best_recogr?  )groupbyr   |   )rb  r?  )r
  <      )rb  r?  	timestampwords)8
isinstancer)   r   r(  r1  r   timeperf_counterr   rf  r   r   r   frame_shiftlfr_nr8   r   rO  r,  r!   rP  r'   r.  r/   r  tupler+  log_softmaxr   r2  listr   rZ  unique_consecutivehasattrr   rt  r  tolistdecode	itertoolsrv  text2tokens
tokens2idsextendappendr   r\  r   r   longr+   r
  r   minpost)7r   data_indata_lengthsrb  rh  rj  r   	meta_datar=  r>  time1audio_sample_listtime2time3rn  rR  ro  textnormrq  textnorm_queryrU  rV  rI  rJ  
ctc_logitsresultsru   nrw   r   r9   yseqibest_writerrs   	token_intr?  rv  rz  tokenstoken_back_to_id	token_idstok_lsresult_ilogits_speechpredalign_starttoken_idts_max
pred_token
pred_frame_endts_leftts_rightr{  r   r   r   	inference)  s   






""





&
"zSenseVoiceSmall.inferencec           
      C   s  g }g }d }t |D ]t\}}|\}}}	t|d }t|	d }	|dkr$q
|dkr5|||	g || nG|drM|dd  }|||	g || n/|d urp| rp| rp| rp| rp|| }|	|d d< ||d< n|||	g || |}q
||fS )Nrl  u   ▁r   r   r$   )r   rC   r  
startswithisalphaisascii)
r   rz  timestamp_new	words_new	prev_wordr   rv   wordstartendr   r   r   r    s0   

(

zSenseVoiceSmall.postc                 K   s2   ddl m} d|vrd|d< |dd| i|}|S )Nr   )export_rebuild_modelmax_seq_leni   r7  r   )export_metar  r  r  )r   r   r  modelsr   r   r   export  s
   zSenseVoiceSmall.export)NNNNNNNr   r$   r$   r   r   r"   Fr   )r>   r?   r@   rA   r   rE  rC   r   rF   staticmethodr<  r)   r   r=   r7   rF  rG  r  r  r  r  rR   r   r   rO   r   r  L  s    	
H
2
6


 ).typingr   r   typesr}  numpynpr)   torch.nn.functionalr   
functionalr   r   torch.cuda.ampr   funasr.metrics.compute_accr   r   "funasr.losses.label_smoothing_lossr	   funasr.train_utils.device_funcsr
   funasr.utils.load_utilsr   r   funasr.utils.datadir_writerr   funasr.models.ctc.ctcr   funasr.registerr   funasr.models.paraformer.searchr   utils.ctc_alignmentr   Moduler   rD   rS   r   rB   r   r   registerr   r  r   r   r   r   <module>   s>    ! C 
 
