o
    i6V                     @   s  d dl mZ d dl mZ d dl mZ d dl mZ d dl mZ d dlZd dlZd dlm	Z	 d dl
m	  mZ d dlZd dlmZ d dlmZ d d	lmZmZ d d
lmZmZ d dlmZ d dlmZ d dlmZ d dlmZ d dl m!Z! d dl"m#Z# d dl"m$Z$ d dl"m%Z% d dl"m&Z& d dl"m'Z' d dl"m(Z( d dl)m*Z* d dl+m,Z, G dd de	j-Z.e,/ddG dd de	j-Z0G dd de	j-Z1e,/dde,/dd G d!d  d e	j-Z2dS )"    )List)Optional)Sequence)Tuple)UnionN)	to_device)make_pad_mask)MultiHeadedAttentionMultiHeadedAttentionSANM)SinusoidalPositionEncoderStreamSinusoidalPositionEncoder)	LayerNorm)Conv1dLinear)MultiLayeredConv1d)PositionwiseFeedForward)repeat)Conv2dSubsampling)Conv2dSubsampling2)Conv2dSubsampling6)Conv2dSubsampling8)TooShortUttError)check_short_utt)CTC)tablesc                       s8   e Zd Z			d fdd	ZdddZdd
dZ  ZS )EncoderLayerSANMTF        c	           	         sz   t t|   || _|| _t|| _t|| _t	|| _
|| _|| _|| _|| _| jr5t|| || _|| _|| _dS z!Construct an EncoderLayer object.N)superr   __init__	self_attnfeed_forwardr   norm1norm2nnDropoutdropoutin_sizesizenormalize_beforeconcat_afterLinearconcat_linearstochastic_depth_ratedropout_rate)	selfr&   r'   r   r    r-   r(   r)   r,   	__class__ N/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/sanm/encoder.pyr   -   s   


zEncoderLayerSANM.__init__Nc           
   
   C   sn  d}d}| j r| jdkrtd | jk }dd| j  }|r0|dur,tj||gdd}||fS |}| jr:| |}| jretj|| j	||||dfdd}	| j
| jkr]||| |	  }n-|| |	 }n%| j
| jkr|||| | j	||||d  }n|| | j	||||d }| js| |}|}| jr| |}||| | |  }| js| |}|||||fS )	  Compute encoded features.

        Args:
            x_input (torch.Tensor): Input tensor (#batch, time, size).
            mask (torch.Tensor): Mask tensor for the input (#batch, time).
            cache (torch.Tensor): Cache tensor of the input (#batch, time - 1, size).

        Returns:
            torch.Tensor: Output tensor (#batch, time, size).
            torch.Tensor: Mask tensor (#batch, time).

        Fg      ?r      Ndim)mask_shfit_chunkmask_att_chunk_encoder)trainingr,   torchranditemcatr(   r!   r)   r   r&   r'   r+   r%   r"   r    )
r.   xmaskcacher7   r8   
skip_layerstoch_layer_coeffresidualx_concatr1   r1   r2   forwardH   sj   
	
	


zEncoderLayerSANM.forwardr   c                 C   s   |}| j r
| |}| j| jkr | j||||\}}|| }n| j||||\}}| j s3| |}|}| j r=| |}|| | }| j sL| |}||fS )r3   )r(   r!   r&   r'   r   forward_chunkr"   r    )r.   r?   rA   
chunk_size	look_backrD   attnr1   r1   r2   rG      s    




zEncoderLayerSANM.forward_chunk)TFr   )NNN)NNr   )__name__
__module____qualname__r   rF   rG   __classcell__r1   r1   r/   r2   r   ,   s    
Nr   encoder_classesSANMEncoderc                3       sN  e Zd ZdZdddddddded	d
dddg d
dddddddddfdededededededededee de	d e	d!ed"ed#ed$e
e d%e	d&ed'ed(e
e d)ed*ed+ed,ed-ed.ef2 fd/d0Zd1efd2d3Z		d@d4ejd5ejd6ejd7ed1eejejeej f f
d8d9Zi fd:ejd;efd<d=Z		d@d4ejd5ejd;ed7efd>d?Z  ZS )ArP   z
    Author: Zhifu Gao, Shiliang Zhang, Ming Lei, Ian McLoughlin
    San-m: Memory equipped self-attention for end-to-end speech recognition
    https://arxiv.org/abs/2006.01713
          i      g?r   conv2dTFlinearr4   r9      r   N      sanmencoderzseq2seq/encoder
input_sizeoutput_sizeattention_headslinear_units
num_blocksr-   positional_dropout_rateattention_dropout_rateinput_layerr(   r)   positionwise_layer_typepositionwise_conv_kernel_sizepadding_idxinterctc_layer_idxinterctc_use_conditioningkernel_size
sanm_shfit	lora_list	lora_rank
lora_alphalora_dropoutselfattention_layer_type!tf2torch_tensor_name_prefix_torchtf2torch_tensor_name_prefix_tfc                    s  t    | _|	dkr+tjtjtjtjtj	 |
|| _
nr|	dkr7t| _
nf|	dkrCt| _
nZ|	dkrOt| _
nN|	dkr[t| _
nB|	dkrptjtjj|dt | _
n-|	d u rkr|d | _
n!tj| _
n|	dkrt | _
n|	d	krt | _
ntd
|	 | _|dkrt|f	n|dkrt||f	n|dkrt||f	ntd|dkrt||fn|dkrt||||||||f
||||||||f
td 	f	dd| _t|d  	fdd| _| jr"t| _|| _t|dkr<dt |k r:t!||k s<J || _"d | _#t| _$|| _%|| _&d S )NrU   rT   conv2d2conv2d6conv2d8embed)re   pe	pe_onlinezunknown input_layer: conv1dzconv1d-linearzSupport only linear or conv1d.selfattnrY   r4   c                    s   t    S Nr   lnum)	r)   r-   encoder_selfattn_layerencoder_selfattn_layer_args0r[   r(   r\   positionwise_layerpositionwise_layer_argsr1   r2   <lambda>C      z&SANMEncoder.__init__.<locals>.<lambda>c                    s   t    S ry   rz   r{   )r)   r-   r}   encoder_selfattn_layer_argsr(   r\   r   r   r1   r2   r   P  r   r   )'r   r   _output_sizer;   r#   
Sequentialr*   r   r$   ReLUrt   r   r   r   r   	Embeddingr   r   
ValueErrorr(   r   r   r   NotImplementedErrorr	   r
   r   	encoders0encoders
after_normrf   lenminmaxrg   conditioning_layerr%   ro   rp   )r.   r[   r\   r]   r^   r_   r-   r`   ra   rb   pos_enc_classr(   r)   rc   rd   re   rf   rg   rh   ri   rj   rk   rl   rm   rn   ro   rp   r/   )
r)   r-   r}   r   r~   r[   r(   r\   r   r   r2   r      s   





 
zSANMEncoder.__init__returnc                 C   s   | j S ry   )r   r.   r1   r1   r2   r\   f     zSANMEncoder.output_sizexs_padilensprev_statesctcc                 C   s  t |dddddf  |j}||  d  }| jdu r"|}nIt| jts:t| jts:t| jts:t| jt	rft
| j|d\}}|r]td|d dd| d |d|| ||\}}n| |}| ||}|d |d }}g }	t| jdkr| ||}|d |d }}nAt| jD ];\}
}|||}|d |d }}|
d | jv r|}| jr| |}|	|
d |f | jr||}|| | }q| jr| |}|dd}t|	dkr||	f|dfS ||dfS )	zEmbed positions in tensor.

        Args:
            xs_pad: input tensor (B, L, D)
            ilens: input length (B)
            prev_states: Not to be used now.
        Returns:
            position embedded tensor and mask
        N      ?r4   zhas z) frames and is too short for subsampling z(it needs more than z frames), return empty resultsr   )r   todevicer\   rt   
isinstancer   r   r   r   r   r'   r   r   r   rf   r   	enumerater(   r   appendrg   softmaxr   squeezesum)r.   r   r   r   r   masksshort_status
limit_sizeencoder_outsintermediate_outs	layer_idxencoder_layerencoder_outctc_outolensr1   r1   r2   rF   i  s^   $











zSANMEncoder.forwardfeatsrA   c                 C   st   t |dkr|S t|d |jd|d< tj|d |fdd}|d d |d d |d d   d d d f |d< |S )Nr   r   r   r4   r5   rH      )r   r   r   r;   r>   )r.   r   rA   overlap_featsr1   r1   r2   _add_overlap_chunk  s   4zSANMEncoder._add_overlap_chunkc                 C   sl  ||   d 9 }| jd u r|}n| ||}|d r$t|d |jd}n| ||}| |d d d d }|d |d }}g }t| jdkrX| |d d d d }|d |d }}nDt	| jD ]>\}}	|	|d d d d }|d |d }}|d | jv r|}
| j
r| |
}
||d |
f | jr||
}|| | }q]| j
r| |}t|dkr||fd d fS ||d fS )Nr   
tail_chunkr   r   r   r4   )r\   rt   r   r   r   r   r   rf   r   r   r(   r   r   rg   r   r   )r.   r   r   rA   r   r   r   r   r   r   r   r   r1   r1   r2   rG     s<   




zSANMEncoder.forward_chunk)NN)rK   rL   rM   __doc__r   intfloatr   strboolr   r   r\   r;   Tensorr   r   rF   npndarraydictr   rG   rN   r1   r1   r/   r2   rP      s    		
 $
Gc                       s$   e Zd Z fddZdd Z  ZS )EncoderLayerSANMExportc                    s>   t    |j| _|j| _|j| _|j| _|j| _|j| _dS r   )r   r   r   r    r!   r"   r&   r'   )r.   modelr/   r1   r2   r     s   
zEncoderLayerSANMExport.__init__c                 C   sV   |}|  |}| ||}| j| jkr|| }|}| |}| |}|| }||fS ry   )r!   r   r&   r'   r"   r    )r.   r?   r@   rD   r1   r1   r2   rF     s   


zEncoderLayerSANMExport.forward)rK   rL   rM   r   rF   rN   r1   r1   r/   r2   r     s    r   SANMEncoderChunkOptExportSANMEncoderExportc                       s   e Zd Z					ddedejf fdd	Zd
d Zddej	dej	defddZ
dd Zdd Zdd Zdd Zdd Z  ZS )r      0  rZ   TNonnx
ctc_linearc                    s  t    |j| _t| jtrd | _|| _|| _|j| _ddlm	} ||dd| _
ddlm} t|drSt| jjD ]\}	}
t|
jtrJ||
j|
_t|
| jj|	< q:t| jjD ]\}	}
t|
jtri||
j|
_t|
| jj|	< qY|| _|jd jj| _|jd jjj| _|| _d S )Nr   )sequence_maskF)flip)MultiHeadedAttentionSANMExportr   )r   r   rt   r   r   r   	feats_dimr   funasr.utils.torch_functionr   r   funasr.models.sanm.attentionr   hasattrr   r   r   r
   r   r   
model_nameh	num_heads
linear_outout_featureshidden_sizer   )r.   r   max_seq_lenr   r   r   r   r   r   idr/   r1   r2   r     s.   
	

zSANMEncoderExport.__init__c                 C   sz   |d d d d d f }t |jdkr!d|d d d d d d f  }nt |jdkr5d|d d d d d f  }|d }||fS )Nr   r4      g     )r   shape)r.   r@   mask_3d_btdmask_4d_bhltr1   r1   r2   prepare_mask/  s   zSANMEncoderExport.prepare_maskFspeechspeech_lengthsonlinec                 C   s   |s	|| j d  }| |}| |}| jd u r|}n| |}| j||}|d |d }}| j||}|d |d }}| j|}| jd urW| |}t	j
|dd}||fS )Nr   r   r4   r   r5   )r   r   r   rt   r   r   r   r   r   Fr   )r.   r   r   r   r@   r   r   r   r1   r1   r2   rF   9  s    





zSANMEncoderExport.forwardc                 C   s   | j jd jS )Nr   )r   r   r'   r   r1   r1   r2   get_output_sizeR  s   z!SANMEncoderExport.get_output_sizec                 C   s   t dd| j}|S )Nr4   d   )r;   randnr   )r.   r   r1   r1   r2   get_dummy_inputsU  s   z"SANMEncoderExport.get_dummy_inputsc                 C   s   dgS )Nr   r1   r   r1   r1   r2   get_input_namesY  r   z!SANMEncoderExport.get_input_namesc                 C   s   g dS )N)r   encoder_out_lenspredictor_weightr1   r   r1   r1   r2   get_output_names\  s   z"SANMEncoderExport.get_output_namesc                 C   s   ddiddiddidS )Nr4   feats_lengthenc_out_lengthpre_out_length)r   r   r   r1   r   r1   r1   r2   get_dynamic_axes_  s   z"SANMEncoderExport.get_dynamic_axes)r   r   rZ   TN)F)rK   rL   rM   r   r#   Moduler   r   r;   r   rF   r   r   r   r   r   rN   r1   r1   r/   r2   r     s$    (
)3typingr   r   r   r   r   loggingr;   torch.nnr#   torch.nn.functional
functionalr   numpyr   funasr.train_utils.device_funcsr   *funasr.models.transformer.utils.nets_utilsr   r   r	   r
   #funasr.models.transformer.embeddingr   r   $funasr.models.transformer.layer_normr   0funasr.models.transformer.utils.multi_layer_convr   r   3funasr.models.transformer.positionwise_feed_forwardr   &funasr.models.transformer.utils.repeatr   +funasr.models.transformer.utils.subsamplingr   r   r   r   r   r   funasr.models.ctc.ctcr   funasr.registerr   r   r   registerrP   r   r   r1   r1   r1   r2   <module>   sJ    
  -

