o
    ip                     @   sZ   d dl mZ d dlZd dlmZ d dlmZ d dlmZ e	ddG dd dej
ZdS )    )TupleN)tables)pad_sequencefrontend_classesWhisperFrontendc                       s   e Zd ZdZ					ddeded	ed
edef
 fddZdefddZ	dde	j
de	j
de	j
fddZde	j
de	j
dee	j
e	j
f fddZ  ZS )r   zySpeech Representation Using Encoder Outputs from OpenAI's Whisper Model:

    URL: https://github.com/openai/whisper
    >  NTP   Ffswhisper_modeldo_pad_trimn_melspermutec                    s   t    |dksJ || _dd l}ddlm}m}	m}
 |	| _|	| _	|| _
|
| _t| j
| j d | _d| _|| _|dksA|dkrDd| _|d	d }|| _|d ur[dd
lm} || _n|jj| _|| _|ri|j| _|| _d S )Nr   r   )
HOP_LENGTHN_FFT	N_SAMPLESi     zlarge-v3large   filters_path)mel_filters)super__init__r	   whisperwhisper.audior   r   r   n_fft
win_length
hop_lengthpad_samplesintframe_shiftlfr_nr   getr   +funasr.models.sense_voice.whisper_lib.audior   audior   pad_or_trimr   )selfr	   r
   r   r   r   kwargsr   r   r   r   r   r   	__class__ U/home/ubuntu/.local/lib/python3.10/site-packages/funasr/frontends/whisper_frontend.pyr      s0   
	

zWhisperFrontend.__init__returnc                 C   s   | j S N)r   )r%   r)   r)   r*   output_size9   s   zWhisperFrontend.output_sizer#   ilensc           
      C   s   t | j|j}t j|| j| j|dd}|dd df  d }| j	d ur2| 
|j| j| j	}n| 
|j| j}|| }t j|dd }|d urQ|| j }	nd }	t |||ddjdd	d d d d d f d
 }|d d }||	fS )NT)windowreturn_complex.   g|=)minr   )dimg       @g      @)torchhann_windowr   todevicestftr   r   absr   r   r   clamplog10maximumviewsizemax)
r%   r#   r.   r/   r9   
magnitudesfiltersmel_speclog_specolensr)   r)   r*   log_mel_spectrogram<   s"   
.z#WhisperFrontend.log_mel_spectrograminputinput_lengthsc                 K   s   | d}g }g }|tj}t|D ]/}| jr"| || | j}n|| }| |d d d f |d \}}	|	|d  |	|	 qt
|}|dkrZ|d d d d d d f }
nt|ddd}
| jrk|
ddd}
|
|fS )Nr   r   Tg        )batch_firstpadding_valuer2   )r?   r7   r5   float32ranger   r$   r   rF   append	as_tensorr   r   )r%   rG   rH   r&   
batch_sizefeats
feats_lensifeatfeat_len	feats_padr)   r)   r*   forward[   s$   
 
zWhisperFrontend.forward)r   NTr   Fr,   )__name__
__module____qualname____doc__r   strboolr   r-   r5   TensorrF   r   rV   __classcell__r)   r)   r'   r*   r   
   sD    (
)typingr   r5   torch.nnnnfunasr.registerr   torch.nn.utils.rnnr   registerModuler   r)   r)   r)   r*   <module>   s    
