o
    iC:                     @   sZ  d dl Z d dlZd dlmZ d dlmZmZmZmZ d dl	Z
d dlZd dlm  mZ d dlmZmZ d dlmZmZmZmZ d dlZd dlmZ d dlmZmZ d dlmZ d	d
 ZdZdZdZdZ dZ!e!e Z"ee"e Z#e d Z$eee Z%eee$Z&d>ddZ'efde(fddZ)efde*de(fddZ+e"fddde(de(fddZ,e"fddde(de(fd d!Z-edd"efd#e(d$ejfd%d&Z.ed dfd'ee*e
j/ejf d#e(d(e(d)eee*ej0f  fd*d+Z1eG d,d- d-Z2G d.d/ d/ej3Z3G d0d1 d1ej4Z4G d2d3 d3ej5Z5d?d5d6Z6G d7d8 d8ej7Z8G d9d: d:ej7Z9e:d;d<G d=d< d<ej7Z;dS )@    N)	dataclass)DictIterableOptionalList)Tensornn)CalledProcessErrorrunPopenPIPE)	lru_cache)r   Union)tablesc                 C   s   | | dksJ | | S )Nr    )xyr   r   R/home/ubuntu/.local/lib/python3.10/site-packages/funasr/models/qwen_audio/audio.py	exact_div   s   r   i>  i  P               c                 C   sD   t dD ]\}}}| d|  ||d   d }d||  }|} q|S )Nz[(1,3,1)] + [(1,3,2)] r   r   )eval)L_indilationpaddingkernel_sizestrideL_outr   r   r   get_T_after_cnn%   s
   r!   src                 C   sf   ddddddddd	d
dddt |dg}t|tttdd}|j| d\}}t|tj tj	d S )Nffmpeg-nostdin-threads0-izpipe:-fs16le-ac1-acodec	pcm_s16le-ar)stdinstdoutstderrbufsize)input      @)
strr   r   communicatenp
frombufferint16flattenastypefloat32)contentr"   cmdpout_r   r   r   load_bytesio_audio-   s&   rC   filec                 C   s   ddddd| dddd	d
ddt |dg}z
t|dddj}W n ty4 } ztd|j  |d}~ww t|tj	
 tjd S )a?  
    Open an audio file and read as mono waveform, resampling as necessary

    Parameters
    ----------
    file: str
        The audio file to open

    sr: int
        The sample rate to resample the audio if necessary

    Returns
    -------
    A NumPy array containing the audio waveform, in float32 dtype.
    r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   -T)capture_outputcheckzFailed to load audio: Nr5   )r6   r
   r1   r	   RuntimeErrorr2   decoder8   r9   r:   r;   r<   r=   )rD   r"   r?   rA   er   r   r   
load_audioD   s"   rK   r/   )axislengthrL   c                C   s   t | rC| j| |kr| j|t j|| jdd} | j| |k rAdg| j }d|| j|  f||< t| dd |ddd D } | S | j| |krS| j	t
||d	} | j| |k rqdg| j }d|| j|  f||< t| |} | S )
O
    Pad or trim the audio array to N_SAMPLES, as expected by the encoder.
    devicedimindex)r   r   r   c                 S   s   g | ]	}|D ]}|qqS r   r   ).0sizespadr   r   r   
<listcomp>w   s    zpad_or_trim.<locals>.<listcomp>Nr/   indicesrL   )torch	is_tensorshapeindex_selectarangerP   ndimFrV   takeranger8   )arrayrM   rL   
pad_widthsr   r   r   pad_or_triml   s   
 
re   c                C   sZ   t | r| j| |kr| j|t j|| jdd} | S | j| |kr+| jt||d} | S )rN   rO   rQ   rX   )rZ   r[   r\   r]   r^   rP   ra   rb   )rc   rM   rL   r   r   r   trim   s   
rf   )maxsizen_melsreturnc                 C   sp   |dksJ d| t tjtjtd}t|d|  	| W  d   S 1 s1w   Y  dS )a  
    load the mel filterbank matrix for projecting STFT into a Mel spectrogram.
    Allows decoupling librosa dependency; saved using:

        np.savez_compressed(
            "mel_filters.npz",
            mel_80=librosa.filters.mel(sr=16000, n_fft=400, n_mels=80),
        )
    r   zUnsupported n_mels: zmel_filters.npzmel_N)
r8   loadospathjoindirname__file__rZ   
from_numpyto)rP   rh   fr   r   r   mel_filters   s   $rt   audior   rP   c           
      C   s   t | st| trt| } t | } |dur| |} |dkr(t| d|f} t 	t
| j}t j| t
t|dd}|dddf  d }t| j|}|| }t j|dd	 }	t |	|	 d
 }	|	d d }	|	S )ap  
    Compute the log-Mel spectrogram of

    Parameters
    ----------
    audio: Union[str, np.ndarray, torch.Tensor], shape = (*)
        The path to audio or either a NumPy array or Tensor containing the audio waveform in 16 kHz

    n_mels: int
        The number of Mel-frequency filters, only 80 is supported

    padding: int
        Number of zero samples to pad to the right

    device: Optional[Union[str, torch.device]]
        If given, the audio tensor is moved to this device before STFT

    Returns
    -------
    torch.Tensor, shape = (80, n_frames)
        A Tensor that contains the Mel spectrogram
    Nr   T)windowreturn_complex.r/   r   g|=)ming       @g      @)rZ   r[   
isinstancer6   rK   rq   rr   r`   rV   hann_windowN_FFTrP   stft
HOP_LENGTHabsrt   clamplog10maximummax)
ru   rh   r   rP   rv   r|   
magnitudesfiltersmel_speclog_specr   r   r   log_mel_spectrogram   s"   



r   c                   @   s^   e Zd ZU eed< eed< eed< eed< eed< eed< eed< eed< eed	< eed
< dS )ModelDimensionsrh   n_audio_ctxn_audio_staten_audio_headn_audio_layern_vocab
n_text_ctxn_text_staten_text_headn_text_layerN)__name__
__module____qualname__int__annotations__r   r   r   r   r      s   
 r   c                       s&   e Zd Zdedef fddZ  ZS )	LayerNormr   ri   c                    s   t  ||jS N)superforwardtypedtypeselfr   	__class__r   r   r      s   zLayerNorm.forward)r   r   r   r   r   __classcell__r   r   r   r   r      s    r   c                   @   s   e Zd ZdedefddZdS )Linearr   ri   c                 C   s2   t || j|j| jd u rd S | j|jS r   )r`   linearweightrr   r   biasr   r   r   r   r      s   zLinear.forwardN)r   r   r   r   r   r   r   r   r   r      s    r   c                       s2   e Zd Zdededee def fddZ  ZS )Conv1dr   r   r   ri   c                    s.   t  |||j|d u rd S ||jS r   )r   _conv_forwardrr   r   )r   r   r   r   r   r   r   r      s
   
zConv1d._conv_forward)r   r   r   r   r   r   r   r   r   r   r   r      s    *r   '  c                 C   s   |d dksJ t ||d d  }t| t|d  }t| ddt jf |t jddf  }tjt|t|gddS )z*Returns sinusoids for positional embeddingr   r   r   NrR   )	r8   logrZ   expr^   newaxiscatsincos)rM   channelsmax_timescalelog_timescale_incrementinv_timescalesscaled_timer   r   r   	sinusoids   s
   *r   c                
       st   e Zd Zdedef fddZ			ddedee dee d	ee fd
dZddedededee fddZ	  Z
S )MultiHeadAttentionn_staten_headc                    sH   t    || _t||| _t||dd| _t||| _t||| _d S )NF)r   )r   __init__r   r   querykeyvaluerA   )r   r   r   r   r   r   r     s   
zMultiHeadAttention.__init__Nr   xamaskkv_cachec           
      C   s   |  |}|d u s|d u s| j|vr)| |d u r|n|}| |d u r%|n|}n
|| j }|| j }| ||||\}}	| ||	fS r   )r   r   r   qkv_attentionrA   )
r   r   r   r   r   qkvwvqkr   r   r   r     s   


zMultiHeadAttention.forwardr   r   r   c                 C   s  |j \}}}|| j d }|jg |j d d | jdR  dddd| }|jg |j d d | jdR  dddd| }|jg |j d d | jdR  dddd}|| }	|d ure|	|7 }	tj|	dd|j}
|
| ddddjdd|		 fS )	Ng      пr   r/   r   r      r   )	start_dim)
r\   r   viewpermuter`   softmaxrr   r   r;   detach)r   r   r   r   r   n_batchn_ctxr   scaler   wr   r   r   r   "  s   440$z MultiHeadAttention.qkv_attentionNNNr   )r   r   r   r   r   r   r   dictr   r   r   r   r   r   r   r     s    
(r   c                
       sZ   e Zd Zddededef fddZ			dded	ee d
ee dee fddZ	  Z
S )ResidualAttentionBlockFr   r   cross_attentionc                    s|   t    t||| _t|| _|rt||nd | _|r t|nd | _|d }t	t
||t t
||| _t|| _d S )N   )r   r   r   attnr   attn_ln
cross_attncross_attn_lnr   
Sequentialr   GELUmlpmlp_ln)r   r   r   r   n_mlpr   r   r   r   2  s   

 zResidualAttentionBlock.__init__Nr   r   r   r   c                 C   sZ   || j | |||dd  }| jr!|| j| |||dd  }|| | | }|S )N)r   r   r   )r   )r   r   r   r   r   r   )r   r   r   r   r   r   r   r   r   ?  s
   zResidualAttentionBlock.forward)Fr   )r   r   r   r   boolr   r   r   r   r   r   r   r   r   r   r   1  s    r   encoder_classesQwenAudioEncoderc                       st   e Zd Z			ddedededededed	ed
ef fddZddededefddZdededefddZ	  Z
S )r      Trh   r   r   r   n_layer
output_dimavg_pooladd_audio_bos_eos_tokenc	           
         s   t    t|ddd| _tdddd| _| dt| t fddt	|D | _
t| _|rAtjddd	| _nd | _t|| _|rUtd|| _nd | _|| _ | _d S )
Nr   r   )r   r   r   )r   r   r   positional_embeddingc                    s   g | ]}t  qS r   )r   )rT   rB   r   r   r   r   rW   a  s    z-QwenAudioEncoder.__init__.<locals>.<listcomp>)r   )r   r   r   conv1conv2register_bufferr   r   
ModuleListrb   blocksr   ln_post	AvgPool1d
avg_poolerr   proj	Embeddingaudio_bos_eos_tokenr   r   )
r   rh   r   r   r   r   r   r   r   kwargsr   r   r   r   O  s"   


zQwenAudioEncoder.__init__Nr   padding_maskaudio_lengthsc                 C   sL  |j | jjj| jjjd}|dur,|dddf d }| }|ddddd|f }t| |}t| |}|	ddd}|
d}|
d}| jd| | _|jdd | jjkspJ d|jdd | jjf || j  |j}|dur|j | jjj| jjjd}|
d}|ddd|ddf }||d|}|d}	d||	< |	|dd|d| jdd|| jd|}
tj|
|jd}||
td	}| jD ]}|||d
}q| jr|	ddd}| |}|	ddd}| |}| |}| jdur| jjd dddf }| jjd dddf }nd\}}|||fS )zt
        x : torch.Tensor, shape = (batch_size, n_mels, n_ctx)
            the mel spectrogram of the audio
        r   rP   Nr   r   r   zincorrect audio shape: r/   )r   z-inf)r   NN)rr   r   r   r   rP   r   r`   gelur   r   sizer   input_positional_embeddingr\   r   allexpandr   reshaperZ   
zeros_likemasked_fillfloatr   r   r   r   r   )r   r   r   r   input_mel_lenmax_mel_len_in_batchbszsrc_lenbatch_src_lenpadding_mask_key_padding_masknew_padding_maskblockboseosr   r   r   r   q  sV   








zQwenAudioEncoder.forwardinput_audiosinput_audio_lengthsaudio_span_tokensc                 C   s   |d d df   }t|}t|d|gj| jjj| jjj	d}t
t|D ]}d||d || d  f< q)| |||\}}	}
g }t
t|D ])}|| }|| d |d  }|	d urgt|	||
g}t||ksoJ || qK|S )Nr   r   r   )tolistr   rZ   onesr   rr   r   r   r   rP   rb   lenitemconcatappend)r   r  r  r  real_input_audio_lensmax_len_in_batchr   rS   r   r
  r  output_audiosi
audio_spanru   r   r   r   encode  s"   zQwenAudioEncoder.encode)r   TTr   )r   r   r   r   r   r   r   r   r   r  r   r   r   r   r   r   M  s.    		"8)r   )r   )<base64gzipdataclassesr   typingr   r   r   r   numpyr8   rZ   torch.nn.functionalr   
functionalr`   r   
subprocessr	   r
   r   r   rl   	functoolsr   r   funasr.registerr   r   SAMPLE_RATEr{   N_MELSr}   CHUNK_LENGTH	N_SAMPLESN_FRAMESN_SAMPLES_PER_TOKENFRAMES_PER_SECONDTOKENS_PER_SECONDr!   r   rC   r6   rK   re   rf   rt   ndarrayrP   r   r   r   r   r   r   Moduler   r   registerr   r   r   r   r   <module>   sj    



(
2	
	.
