o
    Si1D                     @   sV  d dl Z d dlZd dlZd dlmZmZmZmZmZm	Z	m
Z
 d dlZd dlZd dlmZmZ d dlmZ d dlmZ g dZG dd dejjZG d	d
 d
ejjZG dd dejjZdejdededededejfddZdejdedejfddZe	dZ deeee f  dede fddZ!d$dejdedejfd d!Z"G d"d# d#ejjZ#dS )%    N)AnyDictOptionalSequenceTupleTypeVarUnion)CutSetFeatureExtractor)dereverb_wpe_torch)Pathlike)	GlobalMVNSpecAugmentRandomizedSmoothingDereverbWPEc                
       s   e Zd ZdZdef fddZe		ddedee dee	 d	d fd
dZ
eded	d fddZdefddZ	ddejdeej d	ejfddZdejd	ejfddZ  ZS )r   z,Apply global mean and variance normalizationfeature_dimc                    s8   t    || _| dt| | dt| d S )N
norm_means	norm_stds)super__init__r   register_buffertorchzerosones)selfr   	__class__ T/home/ubuntu/.local/lib/python3.10/site-packages/lhotse/dataset/signal_transforms.pyr      s   
zGlobalMVN.__init__Ncutsmax_cuts	extractorreturnc                 C   sB   |j ||d}dd | D }|d j\}| |}|| |S )N)r    r!   c                 S   s   i | ]
\}}|t |qS r   )r   	as_tensor).0namevaluer   r   r   
<dictcomp>#   s    z'GlobalMVN.from_cuts.<locals>.<dictcomp>r   )compute_global_feature_statsitemsshapeload_state_dict)clsr   r    r!   statsr   
global_mvnr   r   r   	from_cuts   s   
zGlobalMVN.from_cuts
stats_filec                 C   s,   t |}|d j\}| |}|| |S )Nr   )r   loadr*   r+   )r,   r0   r-   r   r.   r   r   r   	from_file)   s
   

zGlobalMVN.from_filec                 C   s   t |  | d S N)r   save
state_dict)r   r0   r   r   r   to_file1   s   zGlobalMVN.to_filefeaturessupervision_segmentsc                 C   s   || j  | j S r3   )r   r   )r   r7   r8   r   r   r   forward4   s   zGlobalMVN.forwardc                 C   s   || j  | j S r3   )r   r   )r   r7   r   r   r   inverse;   s   zGlobalMVN.inverse)NNr3   )__name__
__module____qualname____doc__intr   classmethodr	   r   r
   r/   r   r2   r6   r   Tensor	IntTensorr9   r:   __classcell__r   r   r   r   r      s6    
r   c                       s`   e Zd ZdZ			ddeeeeeef  f de	def fdd	Z
d
ejdejfddZ  ZS )r   z
    Randomized smoothing - gaussian noise added to an input waveform, or a batch of waveforms.
    The summed audio is clipped to ``[-1.0, 1.0]`` before returning.
    皙?T333333?sigmasample_sigmapc                    s&   t    || _|| _|| _d| _dS )a  
        RandomizedSmoothing's constructor.

        :param sigma: standard deviation of the gaussian noise. Either a constant float, or a schedule,
            i.e. a list of tuples that specify which value to use from which step.
            For example, ``[(0, 0.01), (1000, 0.1)]`` means that from steps 0-999, the sigma value
            will be 0.01, and from step 1000 onwards, it will be 0.1.
        :param sample_sigma: when ``False``, then sigma is used as the standard deviation in each forward step.
            When ``True``, the standard deviation is sampled from a uniform distribution of
            ``[-sigma, sigma]`` for each forward step.
        :param p: the probability of applying this transform.
        r   N)r   r   rF   rG   rH   step)r   rF   rG   rH   r   r   r   r   E   s
   

zRandomizedSmoothing.__init__audior"   c                 O   s   t | jtr
| j}nt| j| j}|  jd7  _| jr:|jd ftdd |jdd  D  }|dt	| d  }|t
| }t|d| j d}|| }tj|| ddd	S )
N   r   c                 s       | ]}d V  qdS rK   Nr   r$   _r   r   r   	<genexpr>k       z.RandomizedSmoothing.forward.<locals>.<genexpr>   g      ?)rH   g      )minmax)
isinstancerF   floatschedule_value_for_steprI   rG   r*   tupler   rand
randn_likerandom_mask_along_batch_axisrH   clip)r   rJ   argskwargsrF   
mask_shapenoise
noise_maskr   r   r   r9   ]   s   (zRandomizedSmoothing.forward)rD   TrE   )r;   r<   r=   r>   r   rV   r   r   r?   boolr   r   rA   r9   rC   r   r   r   r   r   ?   s    r   c                       s   e Zd ZdZ							d!d	ee d
ededededef fddZ	d"dej	deej
 dej	fddZ	d#dej	dededej	fddZdeeef fddZdeeef fdd Z  ZS )$r   a  
    SpecAugment performs three augmentations:
    - time warping of the feature matrix
    - masking of ranges of features (frequency bands)
    - masking of ranges of frames (time)

    The current implementation works with batches, but processes each example separately
    in a loop rather than simultaneously to achieve different augmentation parameters for
    each example.
    P   rR      
   d   333333??time_warp_factornum_feature_masksfeatures_mask_sizenum_frame_masksframes_mask_sizemax_frames_mask_fractionc                    s   t    d|  krdksJ  J |dksJ |dksJ |dks%J |dks+J || _|| _|| _|| _|| _|| _|| _dS )a  
        SpecAugment's constructor.

        :param time_warp_factor: parameter for the time warping; larger values mean more warping.
            Set to ``None``, or less than ``1``, to disable.
        :param num_feature_masks: how many feature masks should be applied. Set to ``0`` to disable.
        :param features_mask_size: the width of the feature mask (expressed in the number of masked feature bins).
            This is the ``F`` parameter from the SpecAugment paper.
        :param num_frame_masks: the number of masking regions for utterances. Set to ``0`` to disable.
        :param frames_mask_size: the width of the frame (temporal) masks (expressed in the number of masked frames).
            This is the ``T`` parameter from the SpecAugment paper.
        :param max_frames_mask_fraction: limits the size of the frame (temporal) mask to this value times the length
            of the utterance (or supervision segment).
            This is the parameter denoted by ``p`` in the SpecAugment paper.
        :param p: the probability of applying this transform.
            It is different from ``p`` in the SpecAugment paper!
        r   rK   N)	r   r   ri   rj   rk   rl   rm   rn   rH   )r   ri   rj   rk   rl   rm   rn   rH   r   r   r   r      s   

zSpecAugment.__init__Nr7   r8   r"   c           	      O   s   t |jdksJ d| }|du r(t|dD ]}| || ||< q|S |D ]\}}}|| }| j||||f ddd||||f< q*t|dD ]}| j|| ddd||< qO|S )a  
        Computes SpecAugment for a batch of feature matrices.

        Since the batch will usually already be padded, the user can optionally
        provide a ``supervision_segments`` tensor that will be used to apply SpecAugment
        only to selected areas of the input. The format of this input is described below.

        :param features: a batch of feature matrices with shape ``(B, T, F)``.
        :param supervision_segments: an int tensor of shape ``(S, 3)``. ``S`` is the number of
            supervision segments that exist in ``features`` -- there may be either
            less or more than the batch size.
            The second dimension encoder three kinds of information:
            the sequence index of the corresponding feature matrix in `features`,
            the start frame index, and the number of frames for each segment.
        :return: an augmented tensor of shape ``(B, T, F)``.
           zESpecAugment only supports batches of single-channel feature matrices.Nr   TF)warpmask)lenr*   clonerangesize_forward_single)	r   r7   r8   r]   r^   sequence_idxstart_frame
num_frames	end_framer   r   r   r9      s$   
zSpecAugment.forwardTrp   rq   c                 C   s   t   | jkr	|S |r| jdur| jdkrt|| jd}|rR| }t|| j| j|dd}| j|	d }t
| jt|| j }t
| j|| }t||||dd}|S )zO
        Apply SpecAugment to a single feature matrix of shape (T, F).
        NrK   )factorrR   )	mask_size
mask_times
mask_valueaxisr   )randomrH   ri   	time_warpmeanmask_along_axis_optimizedrk   rj   rn   ru   rS   rl   mathceilrm   )r   r7   rp   rq   r   max_tot_mask_framesrl   max_mask_framesr   r   r   rv      s<   
zSpecAugment._forward_singlec              	   K   s$   t | j| j| j| j| j| j| jdS )N)ri   rj   rk   rl   rm   rn   rH   )dictri   rj   rk   rl   rm   rn   rH   )r   r^   r   r   r   r5     s   zSpecAugment.state_dictr5   c                 C   st   | d| j| _| d| j| _| d| j| _| d| j| _| d| j| _| d| j| _| d| j| _d S )Nri   rj   rk   rl   rm   rn   rH   )getri   rj   rk   rl   rm   rn   rH   )r   r5   r   r   r   r+     s"   zSpecAugment.load_state_dict)rc   rR   rd   re   rf   rg   rh   r3   )TT)r;   r<   r=   r>   r   r?   rV   r   r   rA   rB   r9   rb   rv   r   strr   r5   r+   rC   r   r   r   r   r   y   sT    ,
0
)r   r7   r|   r}   r~   r   r"   c                 C   sN  |dvrt d| d} | dgt|  dd  } ttdt|d|f}td|| ||  }|	 
 }|	 |	  
 }|dkrs|dkr^|| dd||f< | 
dS t||D ]\}	}
|| dd|	|
f< qcn-|dkr|| dddd||f< | 
dS t||D ]\}	}
|| dddd|	|
f< q| 
d} | S )a  
    Apply Frequency and Time masking along axis.
    Frequency and Time masking as described in the SpecAugment paper.

    :param features: input tensor of shape ``(T, F)``
    :mask_size: the width size for masking.
    :mask_times: the number of masking regions.
    :mask_value: Value to assign to the masked regions.
    :axis: Axis to apply masking on (1 -> time, 2 -> frequency)
    )rK   rR   z.Only Frequency and Time masking are supported!r   NrK   )
ValueError	unsqueezereshapelistru   r   randintr?   rY   longsqueezezip)r7   r|   r}   r~   r   values
min_valuesmask_starts	mask_ends
mask_startmask_endr   r   r   r   %  s,   
 


r   r{   c                 C   s  |  d}|| |d kr| S tj|d || }tj|| || d }||kr-| S | dd} tjjj| ddddd|ddf ||  dfddd}tjjj| dddd|dddf || |  dfddd}tj	||fdd	
d
dS )
aT  
    Time warping as described in the SpecAugment paper.
    Implementation based on Espresso:
    https://github.com/freewym/espresso/blob/master/espresso/tools/specaug_interpolate.py#L51

    :param features: input tensor of shape ``(T, F)``
    :param factor: time warping parameter.
    :return: a warped tensor of shape ``(T, F)``
    r   rK   Nro   bicubicF)ru   modealign_cornersrR   dim)ru   npr   r   r   r   nn
functionalinterpolatecatr   )r7   r{   tcenterwarpedleftrightr   r   r   r   R  s*   

r   TschedulerI   c                 C   sP   t |  \}}|d |ksJ d| d|  d|d  dt||d }|| S )Nr   z.Cannot determine the scheduled value for step z with schedule: zG. Did you forget to add the first part of the schedule for steps below ?rK   )r   bisectbisect_right)r   rI   
milestonesr   idxr   r   r   rW   v  s   rW         ?tensorrH   c                 C   sB   | j d ftdd | j dd D  }t||ktj}|S )a  
    For a given tensor with shape (N, d1, d2, d3, ...), returns a mask with shape (N, 1, 1, 1, ...),
    that randomly masks the samples in a batch.

    E.g. for a 2D input matrix it looks like:

        >>> [[0., 0., 0., ...],
        ...  [1., 1., 1., ...],
        ...  [0., 0., 0., ...]]

    :param tensor: the input tensor.
    :param p: the probability of masking an element.
    r   c                 s   rL   rM   r   rN   r   r   r   rP     rQ   z/random_mask_along_batch_axis.<locals>.<genexpr>rK   N)r*   rX   r   rY   tofloat32)r   rH   r_   rq   r   r   r   r[     s   (r[   c                       sB   e Zd ZdZddedef fddZdejd	ejfd
dZ  Z	S )r   a@  
    Dereverberation with Weighted Prediction Error (WPE).
    The implementation and default values are borrowed from `nara_wpe` package:
    https://github.com/fgnt/nara_wpe

    The method and library are described in the following paper:
    https://groups.uni-paderborn.de/nt/pubs/2018/ITG_2018_Drude_Paper.pdf
          n_fft
hop_lengthc                    s   t    || _|| _d S r3   )r   r   r   r   )r   r   r   r   r   r   r     s   

zDereverbWPE.__init__rJ   r"   c                    sP   |j dkrtj fdd|D ddS |j dksJ tj fdd|D ddS )a  
        Expects audio to be 2D or 3D tensor.
        2D means a batch of single-channel audio, shape (B, T).
        3D means a batch of multi-channel audio, shape (B, D, T).
        B => batch size; D => number of channels; T => number of audio samples.
        rR   c                    s$   g | ]}t |d  j jdqS )r   r   r   )r   r   r   r   r$   ar   r   r   
<listcomp>  s    z'DereverbWPE.forward.<locals>.<listcomp>r   r   ro   c                    s   g | ]}t | j jd qS )r   )r   r   r   r   r   r   r   r     s    )ndimr   r   stack)r   rJ   r]   r^   r   r   r   r9     s   
	

zDereverbWPE.forward)r   r   )
r;   r<   r=   r>   r?   r   r   rA   r9   rC   r   r   r   r   r     s    	r   )r   )$r   r   r   typingr   r   r   r   r   r   r   numpyr   r   lhotser	   r
   lhotse.augmentationr   lhotse.utilsr   __all__r   Moduler   r   r   rA   r?   rV   r   r   r   rW   r[   r   r   r   r   r   <module>   s@    $/: -
-!"