o
    }oi/                     @   s   d dl mZmZmZ d dlZd dlmZ d dlmZ d dl	m
Z
mZ d dlmZmZmZmZ d dlmZ G dd	 d	e
ZG d
d de
ZdS )    )DictOptionalTupleN)	rearrange)make_seq_mask_like)NeuralModule	typecheck)AudioSignalLengthsType
NeuralTypeSpectrogramType)loggingc                
       s   e Zd ZdZddedededef fddZed	efd
dZde	j
fddZed	eeef fddZed	eeef fddZe 	dde	j
dee	j
 d	ee	j
e	j
f fddZde	j
d	e	j
fddZ  ZS )AudioToSpectrograma  Transform a batch of input multi-channel signals into a batch of
    STFT-based spectrograms.

    Args:
        fft_length: length of FFT
        hop_length: length of hops/shifts of the sliding window
        power: exponent for magnitude spectrogram. Default `None` will
               return a complex-valued spectrogram
        magnitude_power: Transform magnitude of the spectrogram as x^magnitude_power.
        scale: Positive scaling of the spectrogram.
          ?
fft_length
hop_lengthmagnitude_powerscalec                    s   t    |d dkrtd| d|| _|| _d| _t| j}| 	d| |d d | _
|dkr:td| || _|dkrHtd	| || _td
| jj td| td| td| td| d S )N   r   fft_length =  must be divisible by 2constantwindow   4Magnitude power needs to be positive: current value *Scale needs to be positive: current value Initialized %s with:	fft_length:      %s	hop_length:      %s	magnitude_power: %s	scale:           %s)super__init__
ValueErrorr   r   pad_modetorchhann_window
win_lengthregister_buffernum_subbandsr   r   r   debug	__class____name__selfr   r   r   r   r   r+    ]/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/audio/modules/transforms.pyr"   &   s(   
zAudioToSpectrogram.__init__returnc                 C      | j S Nr   r.   r0   r0   r1   r'   C      zAudioToSpectrogram.win_lengthxc                 C   sT   |  \}}}t|d}tj|| j| j| j| jd| jdddd
}t|d||d}|S )zApply STFT as in torchaudio.transforms.Spectrogram(power=None)

        Args:
            x_spec: Input time-domain signal, shape (..., T)

        Returns:
            Time-domain signal ``x_spec = STFT(x)``, shape (..., F, N).
        zB C T -> (B C) TTF)
inputn_fftr   r'   r   centerr$   
normalizedonesidedreturn_complexz(B C) F N -> B C F NBC)	sizer   r%   stftr   r   r'   r   r$   )r.   r8   r@   rA   Tx_specr0   r0   r1   rC   G   s    

zAudioToSpectrogram.stftc                 C      t dt t dt dddS )+Returns definitions of module output ports.r@   rA   rD   r@   Toptionalr9   input_lengthr   r	   r
   r6   r0   r0   r1   input_typesf      
zAudioToSpectrogram.input_typesc                 C      t dt t dt dS )rG   r@   rA   DrD   rI   outputoutput_lengthr   r   r
   r6   r0   r0   r1   output_typesn      

zAudioToSpectrogram.output_typesNr9   rM   c                 C   s  | d| d}}||d|}tjj|jjdd0 | | }| j	dkr;t
| | j	td|   }| jdkrE| j| }W d   n1 sOw   Y  |durp| j|d}t||ddd	}||d
}||fS | dtj||jd  }||fS )a  Convert a batch of C-channel input signals
        into a batch of complex-valued spectrograms.

        Args:
            input: Time-domain input signal with C channels, shape (B, C, T)
            input_length: Length of valid entries along the time dimension, shape (B,)

        Returns:
            Output spectrogram with F subbands and N time frames, shape (B, C, F, N)
            and output length with shape (B,).
        r   Fenabledr                 ?NrM   lengthsliketime_dim
valid_ones        device)rB   viewr%   ampautocastrf   typerC   floatr   powabsexpangler   get_output_lengthr   masked_filloneslong)r.   r9   rM   r@   rD   rU   rV   length_maskr0   r0   r1   forwardv   s&   
$

zAudioToSpectrogram.forwardc                 C   s   |j | jddd }|S )zGet length of valid frames for the output.

        Args:
            input_length: number of valid samples, shape (B,)

        Returns:
            Number of valid frames, shape (B,)
        floor)rounding_moder   )divr   addrs   r.   rM   rV   r0   r0   r1   rp      s   
z$AudioToSpectrogram.get_output_lengthr   r   r4   )r,   
__module____qualname____doc__intrk   r"   propertyr'   r%   TensorrC   r   strr   rO   rX   r   r   r   ru   rp   __classcell__r0   r0   r/   r1   r      s(     +r   c                	       s   e Zd ZdZddedededef fddZed	efd
dZde	j
fddZed	eeef fddZed	eeef fddZe dde	j
dee	j
 d	e	j
fddZde	j
d	e	j
fddZ  ZS )SpectrogramToAudioa  Transform a batch of input multi-channel spectrograms into a batch of
    time-domain multi-channel signals.

    Args:
        fft_length: length of FFT
        hop_length: length of hops/shifts of the sliding window
        magnitude_power: Transform magnitude of the spectrogram as x^(1/magnitude_power).
        scale: Spectrogram will be scaled with 1/scale before the inverse transform.
    r   r   r   r   r   c                    s   t    |d dkrtd| d|| _|| _t| j}| d| |d d | _	|dkr7td| || _
|dkrEtd| || _td	| jj td
| td| td| td| d S )Nr   r   r   r   r   r   r   r   r   r   r   r   r    )r!   r"   r#   r   r   r%   r&   r'   r(   r)   r   r   r   r*   r+   r,   r-   r/   r0   r1   r"      s&   
zSpectrogramToAudio.__init__r2   c                 C   r3   r4   r5   r6   r0   r0   r1   r'      r7   zSpectrogramToAudio.win_lengthrE   c                 C   sT   |  \}}}}t|d}tj|| j| j| j| jdddddd
}t|d||d}|S )zApply iSTFT as in torchaudio.transforms.InverseSpectrogram

        Args:
            x_spec: Input complex-valued spectrogram, shape (..., F, N)

        Returns:
            Time-domain signal ``x = iSTFT(x_spec)``, shape (..., T).
        zB C F N -> (B C) F NTFN)
r9   r:   r   r'   r   r;   r<   r=   lengthr>   z(B C) T -> B C Tr?   )rB   r   r%   istftr   r   r'   r   )r.   rE   r@   rA   FNr8   r0   r0   r1   r      s    

zSpectrogramToAudio.istftc                 C   rF   )rG   rR   rI   TrJ   rL   rW   r6   r0   r0   r1   rO      rP   zSpectrogramToAudio.input_typesc                 C   rQ   )rG   rH   rI   rT   rN   r6   r0   r0   r1   rX     rY   zSpectrogramToAudio.output_typesNr9   rM   c           	      C   sL  | d| d| d}}}|| jks!J d| d| j ||d||}| s1tdtjj|jj	dd4 |
 }| jd	krJ|| j }| jd	krct| d	| j td
|   }| |}W d   n1 srw   Y  |dur| j|d}t||ddd}||d}||fS | dtj||jd  }||fS )a  Convert input complex-valued spectrogram to a time-domain
        signal. Multi-channel IO is supported.

        Args:
            input: Input spectrogram for C channels, shape (B, C, F, N)
            input_length: Length of valid entries along the time dimension, shape (B,)

        Returns:
            Time-domain signal with T time-domain samples and C channels, (B, C, T)
            and output length with shape (B,).
        r   rZ   zNumber of subbands F=z  not matching self.num_subbands=z%Expected `input` to be complex dtype.Fr[   r   r]   Nr^   r_   rd   re   )rB   r)   rg   
is_complexr#   r%   rh   ri   rf   rj   cfloatr   r   rl   rm   rn   ro   r   rp   r   rq   rr   rs   )	r.   r9   rM   r@   r   r   rU   rV   rt   r0   r0   r1   ru   
  s,   " 


(zSpectrogramToAudio.forwardc                 C   s   | d| j }|S )zGet length of valid samples for the output.

        Args:
            input_length: number of valid frames, shape (B,)

        Returns:
            Number of valid samples, shape (B,)
        r   )submulr   rs   rz   r0   r0   r1   rp   9  s   
z$SpectrogramToAudio.get_output_lengthr{   r4   )r,   r|   r}   r~   r   rk   r"   r   r'   r%   r   r   r   r   r   rO   rX   r   r   ru   rp   r   r0   r0   r/   r1   r      s     
$.r   )typingr   r   r   r%   einopsr   1nemo.collections.asr.parts.preprocessing.featuresr   nemo.core.classesr   r   nemo.core.neural_typesr	   r
   r   r   
nemo.utilsr   r   r   r0   r0   r0   r1   <module>   s    