o
    i-                     @   s   d Z ddlZddlZddlmZ dddZG d	d
 d
eZdddZG dd deZdddZ	G dd deZ
								d ddZG dd deZdS )!z=Spec Augment module for preprocessing i.e., data augmentation    N)	FuncTransP   FPILc                 C   s$  ddl m} ddlm} |}|dkrq| jd }|| |kr| S t||| }t|| || d }	|| d| | jd |	f|}
|| |d | jd ||	 f|}|ri|
| d|	< || |	d< | S t	
|
|fdS |dkrddl}ddlm} ||| |	 S td	| d
 )a  time warp for spec augment

    move random center frame by the random width ~ uniform(-window, window)
    :param numpy.ndarray x: spectrogram (time, freq)
    :param int max_time_warp: maximum time frames to warp
    :param bool inplace: overwrite x with the result
    :param str mode: "PIL" (default, fast, not differentiable) or "sparse_image_warp"
        (slow, differentiable)
    :returns numpy.ndarray: time warped spectrogram (time, freq)
    r   )Image)BICUBICr      Nsparse_image_warp)spec_augmentzunknown resize mode: z+, choose one from (PIL, sparse_image_warp).)r   r   	PIL.Imager   shaperandom	randrange	fromarrayresizenumpyconcatenatetorchespnet.utilsr	   	time_warp
from_numpyNotImplementedError)xmax_time_warpinplacemoder   r   windowtcenterwarpedleftrightr   r	    r!   Q/home/ubuntu/.local/lib/python3.10/site-packages/espnet/transform/spec_augment.pyr   
   s6   
$(r   c                       &   e Zd ZeZejZ fddZ  ZS )TimeWarpc                       |s|S t  |S Nsuper__call__selfr   train	__class__r!   r"   r)   ;      zTimeWarp.__call__)__name__
__module____qualname__r   _func__doc__r)   __classcell__r!   r!   r-   r"   r$   7       r$         Tc                 C   s   |r| }n|   }|jd }tjjd||dfd}|D ]0\}}	td|| }
|	|
7 }	|
|
| kr2q|r?d|dd|
|	f< q| |dd|
|	f< q|S )zfreq mask for spec agument

    :param numpy.ndarray x: (time, freq)
    :param int n_mask: the number of masks
    :param bool inplace: overwrite
    :param bool replace_with_zero: pad zero on mask if true else use mean
    r   r   r8   sizeNcopyr   r   r   randintr   mean)r   Fn_maskreplace_with_zeror   clonednum_mel_channelsfsfmask_endf_zeror!   r!   r"   	freq_maskA   s   
rH   c                       r#   )FreqMaskc                    r%   r&   r'   r*   r-   r!   r"   r)   d   r/   zFreqMask.__call__)r0   r1   r2   rH   r3   r4   r)   r5   r!   r!   r-   r"   rI   `   r6   rI   (   c                 C   s   |r| }n|   }|jd }tjjd||dfd}|D ]/\}}	|| dkr&qtd|| }
|
|
| kr5q|	|
7 }	|rBd||
|	< q| ||
|	< q|S )zfreq mask for spec agument

    :param numpy.ndarray spec: (time, freq)
    :param int n_mask: the number of masks
    :param bool inplace: overwrite
    :param bool replace_with_zero: pad zero on mask if true else use mean
    r   r8   r9   r;   )specTr@   rA   r   rB   len_spectrotsr   rF   t_zeror!   r!   r"   	time_maskj   s    
rP   c                       r#   )TimeMaskc                    r%   r&   r'   r*   r-   r!   r"   r)      r/   zTimeMask.__call__)r0   r1   r2   rP   r3   r4   r)   r5   r!   r!   r-   r"   rQ      r6   rQ      d   c	           	      C   sV   t | tjsJ | jdksJ t| |||d} t| ||||d} t| ||||d} | S )aI  spec agument

    apply random time warping and time/freq masking
    default setting is based on LD (Librispeech double) in Table 2
        https://arxiv.org/pdf/1904.08779.pdf

    :param numpy.ndarray x: (time, freq)
    :param str resize_mode: "PIL" (fast, nondifferentiable) or "sparse_image_warp"
        (slow, differentiable)
    :param int max_time_warp: maximum frames to warp the center frame in spectrogram (W)
    :param int freq_mask_width: maximum width of the random freq mask (F)
    :param int n_freq_mask: the number of the random freq mask (m_F)
    :param int time_mask_width: maximum width of the random time mask (T)
    :param int n_time_mask: the number of the random time mask (m_T)
    :param bool inplace: overwrite intermediate array
    :param bool replace_with_zero: pad zero on mask if true else use mean
    r8   )r   r   )r   rA   )
isinstancer   ndarrayndimr   rH   rP   )	r   resize_moder   max_freq_widthn_freq_maskmax_time_widthn_time_maskr   rA   r!   r!   r"   r	      s$   r	   c                       r#   )SpecAugmentc                    r%   r&   r'   r*   r-   r!   r"   r)      r/   zSpecAugment.__call__)r0   r1   r2   r	   r3   r4   r)   r5   r!   r!   r-   r"   r\      r6   r\   )r   Fr   )r7   r8   TF)rJ   r8   TF)r   r   rR   r8   rS   r8   TT)r4   r   r   espnet.transform.functionalr   r   r$   rH   rI   rP   rQ   r	   r\   r!   r!   r!   r"   <module>   s(    
-



 
0