o
    }oip'                     @   st   d dl Z d dlZd dlZd dlmZ d dlmZmZ d dl	m
Z
mZmZ G dd dejeZG dd dejeZdS )    N)Typing	typecheck)LengthsType
NeuralTypeSpectrogramTypec                       s   e Zd ZdZdZdZedd Zedd Z							
		d$de	de	de	de	e
B dejd
B de
def fddZe e dd Zdd ZdejdejdejfddZdejde	dejd e	e
B de
d!e	dejfd"d#Z  ZS )%SpecAugmenta  
    Zeroes out(cuts) random continuous horisontal or
    vertical segments of the spectrogram as described in
    SpecAugment (https://arxiv.org/abs/1904.08779).

    params:
    freq_masks - how many frequency segments should be cut
    time_masks - how many time segments should be cut
    freq_width - maximum number of frequencies to be cut in one segment
    time_width - maximum number of time steps to be cut in one segment.
        Can be a positive integer or a float value in the range [0, 1].
        If positive integer value, defines maximum number of time steps
        to be cut in one segment.
        If a float value, defines maximum percentage of timesteps that
        are cut adaptively.
    use_vectorized_code - GPU-based implementation with batched masking and GPU rng,
        setting it to False reverts to the legacy implementation.
        Fast implementation is inspired by torchaudio:
        https://github.com/pytorch/audio/blob/ea437b31ce316ea3d66fe73768c0dcb94edb79ad/src/torchaudio/functional/functional.py#L816
          c                 C   s   t dt t tdt dS ))Returns definitions of module input typesBDTr   )
input_speclength)r   r   tupler   self r   h/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/asr/parts/submodules/spectr_augment.pyinput_types2   s   
zSpecAugment.input_typesc                 C      dt dt iS z*Returns definitions of module output typesaugmented_specr   r   r   r   r   r   r   output_types:      zSpecAugment.output_typesr   
   N        T
freq_masks
time_masks
freq_width
time_widthrng
mask_valueuse_vectorized_codec                    sz   t    |d u rt n|| _|| _|| _|| _|| _|| _	|| _
t|tr,d| _d S |dks4|dk r8tdd| _d S )NFg      ?r   z9If `time_width` is a float value, must be in range [0, 1]T)super__init__randomRandom_rngr   r    r!   r"   r$   r%   
isinstanceintadaptive_temporal_width
ValueError)r   r   r    r!   r"   r#   r$   r%   	__class__r   r   r'   ?   s   




zSpecAugment.__init__c                 C   s   | j r	| ||S | ||S N)r%   _forward_vectorized_forward_legacyr   r   r   r   r   r   forward^   s   zSpecAugment.forwardc              	   C   s,  |j \}}}|  }tj|j dd}|| j }t|D ]e}	t| jD ]}| j	d|}
| j	d| j}d||	|
|
| d d f< q$| j
rStdt||	 | j }n| j}td||	 | }t| jD ]}| j	d|}
| j	d|}d||	d d |
|
| f< qdqt||j}|j|| jd}|S )NF)shape
fill_valuer   Tr   maskvalue)r6   cpunumpynpfullr!   ranger   r*   randintr-   maxr,   r"   r    torch
from_numpytodevicemasked_fillr$   )r   r   r   
batch_sizenum_freq_bins_lengths_cpu	fill_maskfreq_start_upper_boundidxstartwidthtime_max_widthtime_start_upper_boundmasked_specr   r   r   r3   f   s*   
zSpecAugment._forward_legacyr   r   returnc                 C   s@   | j || j|| j| j| jd}| j || j|| j| j| jd}|S )N)r   	num_masksr   rO   axisr$   )_apply_masksr    r"   	TIME_AXISr$   r   r!   	FREQ_AXISr4   r   r   r   r2      s"   	zSpecAugment._forward_vectorizedrT   rO   rU   c                 C   s  || j | jfv sJ d| j  d| j d|t|tr%|| j kr%J d|jd }|j| }|| jkrEt|trEtj|| |dd}tj||f|j	tj
d| }	|	 }	tj||f|j	tj
d}
|| jkrr|
|d|	  }
n|
||	  }
|
 }
|
|	 }tj||j	d	}||
d
k||d
k @ }|jdd}tj|tjd}|| jkr|d d d d d f }n|d d d d d f }||d d d d d d f< |j||dS )Nz4Axis can be only be equal to frequency             (z) or time (z). Received: axis=z6Float width supported             only with time axis.r   )rA   r   )rE   dtype)rE   )dim)rY   r8   )rX   rW   r+   floatr6   rB   clamp	unsqueezerandrE   float32longarangeany
zeros_likeboolrF   )r   r   rT   r   rO   r$   rU   rG   axis_length
mask_width
mask_startmask_endindicesmask_tensorr9   mask_rangesr   r   r   rV      sL   




zSpecAugment._apply_masks)r   r   r   r   Nr   T)__name__
__module____qualname____doc__rX   rW   propertyr   r   r,   r\   r(   r)   re   r'   r   rB   no_gradr5   r3   Tensorr2   rV   __classcell__r   r   r/   r   r      sd    

 r   c                       sP   e Zd ZdZedd Zedd Zd fd
d	Ze e	
 dd Z  ZS )
SpecCutoutaS  
    Zeroes out(cuts) random rectangles in the spectrogram
    as described in (https://arxiv.org/abs/1708.04552).

    params:
    rect_masks - how many rectangular masks should be cut
    rect_freq - maximum size of cut rectangles along the frequency dimension
    rect_time - maximum size of cut rectangles along the time dimension
    c                 C   r   )r
   r   r   r   r   r   r   r   r      r   zSpecCutout.input_typesc                 C   r   r   r   r   r   r   r   r      r   zSpecCutout.output_typesr         Nc                    s:   t t|   |d u rt n|| _|| _|| _|| _d S r1   )	r&   ru   r'   r(   r)   r*   
rect_masks	rect_time	rect_freq)r   rx   ry   rz   r#   r/   r   r   r'      s
   
zSpecCutout.__init__c           	   	   C   s   |j }t|d D ]A}t| jD ]9}| jd|d | j }| jd|d | j }| jd| j}| jd| j}d||||| ||| f< qq	|S )Nr   r   r	   r   )r6   r?   rx   r*   r@   rz   ry   )	r   r   shrM   irect_xrect_yw_xw_yr   r   r   r5      s    	zSpecCutout.forward)r   rv   rw   N)rm   rn   ro   rp   rq   r   r   r'   r   rB   rr   r5   rt   r   r   r/   r   ru      s    


	ru   )r(   r<   r=   rB   torch.nnnnnemo.core.classesr   r   nemo.core.neural_typesr   r   r   Moduler   ru   r   r   r   r   <module>   s    B