o
    i.O                     @   s   d dl mZ d dlZd dlZd dlZd dlmZ d dlm	  m
Z
 d dlmZ d dlm  mZ d dlmZ dd Zdd Zd	d
 ZeddeddG dd dejZeddG dd dejZG dd dejZdS )    )TupleN)pad_sequence)tablesc                 C   s0  t | ddd}| }W d    n1 sw   Y  g }g }tt|D ]O}||  }|d dkrQ||d   }|d dkrP|dt|d  }t|}q%q%|d d	krt||d   }|d dkrt|dt|d  }t|}q%q%t|tj	}	t|tj	}
t|	|
g}t
j|t
j	d
}|S )Nrzutf-8)encodingr   z
<AddShift>   z<LearnRateCoef>   z	<Rescale>dtype)open	readlinesrangelensplitlistnparrayastypefloat32torch	as_tensor)	cmvn_fileflines
means_list	vars_listi	line_itemadd_shift_linerescale_linemeansvarscmvn r#   Q/home/ubuntu/.local/lib/python3.10/site-packages/funasr/frontends/wav_frontend.py	load_cmvn   s4   
r%   c                 C   sf   | j }| j}| j\}}|ddd|f }|ddd|f }| ||7 } | ||9 } | tjS )z"
    Apply CMVN with mvn data
    r   r   N   )devicer
   shapetotyper   r   inputsr"   r'   r
   framedimr    r!   r#   r#   r$   
apply_cmvn)   s   
r/   c                 C   s
  g }| j d }tt|| }| d |d d d}t|| f} ||d d  }| j d }|| df}||| f}	|| | d }
|||
|   }|dkrwd| d|  |d |
 |  d ||
  }t| g| dd  gt|  } | |	|}| 	tj
S )Nr   r   r&   )r(   intr   ceilrepeatr   vstack
as_stridedcloner*   r   )r,   lfr_mlfr_n
LFR_inputsTT_lfrleft_paddingfeat_dimstridessizeslast_idxnum_paddingLFR_outputsr#   r#   r$   	apply_lfr:   s    

,"rC   frontend_classeswav_frontendWavFrontendc                       s   e Zd ZdZ													
		d&dededededededededededededef fddZdefddZ	de
jdee
je
jf fdd Zde
jd!e
jdee
je
jf fd"d#Zde
jd!e
jdee
je
jf fd$d%Z  ZS )'rF   (Conventional frontend structure for ASR.N>  hammingP      
   r0   r         ?Tr   fswindown_melsframe_lengthframe_shiftfilter_length_minfilter_length_maxr7   r8   dither
snip_edgesupsacle_samplesc                    s|   t    || _|| _|| _|| _|| _|| _|| _|	| _	|
| _
|| _|| _|| _|| _| jd u r6d | _d S t| j| _d S N)super__init__rN   rO   rP   rQ   rR   rS   rT   r7   r8   r   rU   rV   rW   r%   r"   selfr   rN   rO   rP   rQ   rR   rS   rT   r7   r8   rU   rV   rW   kwargs	__class__r#   r$   rZ   R   s   
$zWavFrontend.__init__returnc                 C      | j | j S rX   rP   r7   r\   r#   r#   r$   output_sizes      zWavFrontend.output_sizeinputc                 K   s,  | d}g }g }t|D ]c}|| }|| d | }	| jr"|	d }	|	d}	tj|	| jt| j|| j	 d | j
| jd| j| j	| jd	}
| jdksN| jdkrVt|
| j| j}
| jd urat|
| j}
|
 d}||
 || qt|}|dkr|d d d d d d f }||fS t|ddd}||fS )	Nr                )num_mel_binsrQ   rR   rU   energy_floorwindow_typesample_frequencyrV   r   Tbatch_firstpadding_value)sizer   rW   	unsqueezekaldifbankrP   minrQ   rN   rR   rU   rO   rV   r7   r8   rC   r"   r/   appendr   r   r   )r\   rf   input_lengthsr]   
batch_sizefeats
feats_lensr   waveform_lengthwaveformmatfeat_length	feats_padr#   r#   r$   forwardv   sB   





zWavFrontend.forwardrw   c                 C   s   | d}g }g }t|D ]9}|| }|| d | }|d }|d}tj|| j| j| j| jd| j	| j
d}	|	 d}
||	 ||
 qt|}t|ddd}||fS )Nr   rg   ri   rj   rQ   rR   rU   rk   rl   rm   Trn   )rq   r   rr   rs   rt   rP   rQ   rR   rU   rO   rN   rv   r   r   r   r\   rf   rw   rx   ry   rz   r   r{   r|   r}   r~   r   r#   r#   r$   forward_fbank   s0   




zWavFrontend.forward_fbankc           
      C   s   | d}g }g }t|D ];}||d || d d f }| jdks&| jdkr.t|| j| j}| jd ur9t|| j}| d}|| || qt	|}t
|ddd}	|	|fS )Nr   r   Tri   rn   )rq   r   r7   r8   rC   r"   r/   rv   r   r   r   )
r\   rf   rw   rx   ry   rz   r   r}   r~   r   r#   r#   r$   forward_lfr_cmvn   s   




zWavFrontend.forward_lfr_cmvnNrH   rI   rJ   rK   rL   r0   r0   r   r   rM   TT)__name__
__module____qualname____doc__strr1   floatboolrZ   rd   r   Tensorr   r   r   r   __classcell__r#   r#   r^   r$   rF   M   s|    	
!
*
WavFrontendOnlinec                       sr  e Zd ZdZ													
		d6dededededededededededededef fddZdefddZ	e
dejdejdejfd d!Ze
	"d7dejdeded#edeejejef f
d$d%Ze
d&ed'ed(edefd)d*Zi fd+ejd,ejd-edeejejejf fd.d/Zd"i fd+ejd,ejd#ed-efd0d1Zd+ejd,ejfd2d3Zi fd-efd4d5Z  ZS )8r   z6Conventional frontend structure for streaming ASR/VAD.NrH   rI   rJ   rK   rL   r0   r   rM   Tr   rN   rO   rP   rQ   rR   rS   rT   r7   r8   rU   rV   rW   c                    s   t    || _|| _|| _|| _|| _t| j| j d | _t| j| j d | _	|| _
|| _|	| _|
| _|| _|| _|| _|| _| jd u rLd | _d S t| j| _d S )Nrh   )rY   rZ   rN   rO   rP   rQ   rR   r1   frame_sample_lengthframe_shift_sample_lengthrS   rT   r7   r8   r   rU   rV   rW   r%   r"   r[   r^   r#   r$   rZ      s"   
$zWavFrontendOnline.__init__r`   c                 C   ra   rX   rb   rc   r#   r#   r$   rd      re   zWavFrontendOnline.output_sizer,   r"   c                 C   s   | j }| j}| j\}}t|ddd|f |df}t|ddd|f |df}| t|||7 } | t|||9 } | tj	S )z*
        Apply CMVN with mvn data
        r   r   Nr&   )
r'   r
   r(   r   tiler   
from_numpyr*   r)   r   r+   r#   r#   r$   r/     s   
  zWavFrontendOnline.apply_cmvnFis_finalc                 C   s:  g }| j d }tt||d d  | }|}| j d }| }	|| df}
||| f}|| | d }||||   }|ri|dkrhd| d|  |d | |  d ||  }t| g| dd gt|  } n|dkru||| f}|}t|d || }| d| ||
}|	|dddf }| 	tj
||fS )z%
        Apply lfr with data
        r   r   r&   r0   N)r(   r1   r   r2   r   r4   ru   r5   r6   r*   r   )r,   r7   r8   r   r9   r:   r;   
splice_idxr=   
ori_inputsr>   r?   r@   rA   rB   lfr_splice_cacher#   r#   r$   rC     s0   

,"zWavFrontendOnline.apply_lfrsample_lengthr   r   c                 C   s,   t | | | d }|dkr| |kr|S dS )Nr   r   )r1   )r   r   r   	frame_numr#   r#   r$   compute_frame_num6  s   z#WavFrontendOnline.compute_frame_numrf   rw   cachec                 K   s`  | d}tj|d |fdd}| |jd | j| j}|d d |jd || j   d f |d< td}td}td}	|rg }g }
g }	t|D ]B}|| }|	|d |d | j | j   |d }|
d}tj|| j| j| j| jd| j| jd}| d}|
	| |		| qLt|}t|	}	t|
d	dd
}||d< t|	|d< |||	fS )Nr   input_cacher   r.   r0   rg   ri   r   Trn   fbanksfbanks_lens)rq   r   catr   r(   r   r   emptyr   rv   rr   rs   rt   rP   rQ   rR   rU   rO   rN   stackr   r   copydeepcopy)r\   rf   rw   r   r]   rx   r   	waveformsr   rz   ry   r   r|   r}   r~   r#   r#   r$   r   =  s^   
 








zWavFrontendOnline.forward_fbankc                 K   s   | d}g }g }g }	t|D ]J}
||
d ||
 d d f }| jdks(| jdkr9| || j| j|\}|d |
< }| jd urE| || j}| d}|| || |	| qt	
|}t|ddd}t	
|	}	|||	fS )Nr   r   r   Tri   rn   )rq   r   r7   r8   rC   r   r/   r"   rv   r   r   r   )r\   rf   rw   r   r   r]   rx   ry   rz   lfr_splice_frame_idxsr   r}   lfr_splice_frame_idxr~   r   r#   r#   r$   r   x  s(   







z"WavFrontendOnline.forward_lfr_cmvnc                 K   s  | dd}| di }t|dkr| | |jd }|dks$J d| j|||d\}}}	|jd r<tj|d |fdd	|d
< |d sit|D ]}
|d ||
 dd d f j	dd	
| jd d d qJ|	d |d d jd  | jkrt|d }tj||fdd	}|	|d jd 7 }	t|d
 jd | j | j d }|d  dkr| jd d nd}| j||	||d\}}	}| jdkrtd|d< ||	fS |d | }|d
 d d || j || j f |d< |d | j | j }|d
 d d d |f |d
< ||	fS |d
 d d d | j| j  f |d< t|D ]}
tj|d |
 ||
 fdd	|d |
< qtd|	fS |rp|d  dkrJ|n|d |d
< t|d }tj|tjd|jd  }	| j||	||d\}}	}||	fS )Nr   Fr   r   r   zOwe support to extract feature online only when the batch size is equal to 1 now)r   reserve_waveformsr   r   r   r&   r	   )getr   
init_cacher(   r   r   r   r   rv   rr   r3   r7   r   r1   r   r   numelr   r   zeros)r\   rf   rw   r]   r   r   rx   r   ry   feats_lengthsr   lfr_splice_cache_tensorframe_from_waveformsminus_framer   reserve_frame_idxr   _r#   r#   r$   r     s   


," 
'zWavFrontendOnline.forwardc                 C   s@   t d|d< t d|d< g |d< d |d< d |d< d |d< |S )Nr   r   r   r   r   r   r   )r   r   )r\   r   r#   r#   r$   r     s   zWavFrontendOnline.init_cacher   )F)r   r   r   r   r   r1   r   r   rZ   rd   staticmethodr   r   r/   r   rC   r   dictr   r   r   r   r   r#   r#   r^   r$   r      s    	
)!

?
 Rc                       sv   e Zd ZdZ					ddededed	ed
ef
 fddZdefddZdejdejde	ejejf fddZ
  ZS )WavFrontendMel23rG   rH   rK   rL   r   rN   rQ   rR   r7   r8   c                    s2   t    || _|| _|| _|| _|| _d| _d S )N   )rY   rZ   rN   rQ   rR   r7   r8   rP   )r\   rN   rQ   rR   r7   r8   r]   r^   r#   r$   rZ     s   
	
zWavFrontendMel23.__init__r`   c                 C   s   | j d| j d  S )Nr&   r   rb   rc   r#   r#   r$   rd     s   zWavFrontendMel23.output_sizerf   rw   c                 C   s   | d}g }g }t|D ]D}|| }|| d | }| }t|| j| j}	t|	}	tj|	| j	d}	|	d d | j
 }	t|	}	|	 d}
||	 ||
 qt|}t|ddd}||fS )Nr   )context_sizeTri   rn   )rq   r   numpyeend_ola_featurestftrQ   rR   	transformsplicer7   r8   r   r   rv   r   r   r   r#   r#   r$   r     s$   





zWavFrontendMel23.forward)rH   rK   rL   r   r   )r   r   r   r   r1   rZ   rd   r   r   r   r   r   r#   r#   r^   r$   r     s4    r   )typingr   r   r   r   r   torch.nnnntorchaudio.compliance.kaldi
compliancers   torch.nn.utils.rnnr   !funasr.frontends.eend_ola_feature	frontendsr   funasr.registerr   r%   r/   rC   registerModulerF   r   r   r#   r#   r#   r$   <module>   s*   

 
  "