o
    }oi                     @   sV   d dl Z d dlZd dlZd dlmZmZ d dlmZmZm	Z	 dgZ
G dd deZdS )    N)NeuralModule	typecheck)LengthsType
NeuralTypeSpectrogramTypeSSLPretrainWithMaskedPatchc                       sV   e Zd ZdZedd Zedd Z		dded	ef fd
dZ	e
 dd Z  ZS )r   a  
    Zeroes out fixed size time patches of the spectrogram.
    All samples in batch are guaranteed to have the same amount of masked time steps.
    Note that this may be problematic when we do pretraining on a unbalanced dataset.

    For example, say a batch contains two spectrograms of length 87 and 276.
    With mask_fraction=0.7 and patch_size=10, we'll obrain mask_patches=7.
    Each of the two data will then have 7 patches of 10-frame mask.

    Args:
        patch_size (int): up to how many time steps does one patch consist of.
            Defaults to 10.
        mask_fraction (float): how much fraction in each sample to be masked (number of patches is rounded up).
            Range from 0.0 to 1.0. Defaults to 0.7.
    c                 C   s   t dt t tdt dS )z)Returns definitions of module input typesBCDTr	   )
input_speclength)r   r   tupler   self r   g/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/audio/modules/ssl_pretrain_masking.pyinput_types+   s   
z&SSLPretrainWithMaskedPatch.input_typesc                 C   s   dt dt iS )z*Returns definitions of module output typesaugmented_specr   )r   r   r   r   r   r   output_types3   s   z'SSLPretrainWithMaskedPatch.output_types
   ffffff?
patch_sizemask_fractionc                    sN   t    |dkrtd| |dks|dk rtd| || _|| _d S )Nr   z,patch_size must be positive, got patch_size=g      ?        zAmask_fraction must be in the range [0.0, 1.0], got mask_fraction=)super__init__
ValueErrorr   r   )r   r   r   	__class__r   r   r   8   s   

z#SSLPretrainWithMaskedPatch.__init__c              
   C   s  |}| j r[t| D ]M\}}t|| j }t|| j }|| j t|| j dk }|| j| k r7|| j }t||}	|	D ]}
d||dddd|
| j |
d | j f< q?q|S | j| j }t	j
|d|jd}|| | jk}t|d }|| }|S )a-  
        Apply Patched masking on the input_spec.


        During the training stage, the mask is generated randomly, with
        approximately `self.mask_fraction` of the time frames being masked out.

        In the validation stage, the masking pattern is fixed to ensure
        consistent evaluation of checkpoints and to prevent overfitting. Note
        that the same masking pattern is applied to all data, regardless of
        their lengths. On average, approximately `self.mask_fraction` of the
        time frames will be masked out.

        r   r   N   )devicezT -> 1 1 1 T)training	enumeratetolistranger   intr   randomsampletorcharangesizer#   einops	rearrangefloat)r   r   r   r   idxcur_lenpatcheslen_fractionmask_patchesmasked_patchesmpchunk_lengthmaskr   r   r   forwardG   s&   
.	z"SSLPretrainWithMaskedPatch.forward)r   r   )__name__
__module____qualname____doc__propertyr   r   r(   r0   r   r   r:   __classcell__r   r   r   r   r      s    

)r)   r.   r+   nemo.core.classesr   r   nemo.core.neural_typesr   r   r   __all__r   r   r   r   r   <module>   s   