o
    }oi0                     @   sH   d dl Z d dlZd dlZd dlmZ G dd deZG dd deZdS )    N)AudioNoiseBatchc                       s   e Zd Z								ddededed	ed
edededef fddZdejdededejfddZddejdedejfddZ	de
de
fddZ  ZS )SpeakerNoiseAugmentation                    @      4@      ?probnoise_ratiomin_r_speechmax_r_speechmin_r_noisemax_r_noisemin_mix_ratemax_mix_ratec	           	         s  t    || _|| _|| _|| _|| _|| _|| _|| _	d| j  kr(dks1n t
d| j d| j  kr<dksEn t
d| j | j| jksWt
d| j d| j | j| jksit
d| j d| j d| j  krz| j	  krzdksn t
d| j d	| j	 d S )
Nr      zprob must be in [0, 1], got: z$noise_ratio must be in [0, 1], got: z<min_r_speech must be no greater than max_r_speech, got: min=z	 and max=z:min_r_noise must be no greater than max_r_noise, got: min=zTmin_mix_rate must be no greater than max_mix_rate, and both must be in [0, 1], got: z and )super__init__r	   r
   r   r   r   r   r   r   
ValueError)	selfr	   r
   r   r   r   r   r   r   	__class__ i/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/asr/modules/ssl_modules/augmentation.pyr      s4   
$z!SpeakerNoiseAugmentation.__init__noise	noise_lenmax_audio_lenreturnc                 C   s6   |d | }||k r| || d }|d | }|S )Nr   )repeat)r   r   r   r   r   r   r   repeat_noise>   s
   z%SpeakerNoiseAugmentation.repeat_noiser   c                 C   sF   | d}||k rd|| f}tjjj|||d}|S |d | }|S )Nr   )value)sizetorchnn
functionalpad)r   r   r   r    r   r%   r   r   r   pad_or_trim_noiseE   s   
z*SpeakerNoiseAugmentation.pad_or_trim_noisebatchc              	      s  |j }|j}|d}|d}|j}|j}|j}|j}	t|D ]# tj	
 | jkr,q d| j  kr=| j  k r=dkrUn ntj	t|  | j t|  | j }
ntdt|  | j }
tj	|  |
 }tj	
 | jk sw|dkrtj	| j| j}n&tj	| j| j}tj	 fddt|D }||  | < || | < |  |
krd}| | |  |  |
|| < |
| < n
tj	|  |
 }t| d |  f d |   }|  dkrt| d |  f d |   nd}|dkrt|d|d  |  nd}| |||
 f }t|  }|| ||||
 < |  | | < |  |	 < || < |  | < q t |j!|j |j||||	dS )Nr   r   c                       g | ]}| kr|qS r   r   .0xir   r   
<listcomp>l       z5SpeakerNoiseAugmentation.__call__.<locals>.<listcomp>   
   	sample_idaudio	audio_lenr   r   noisy_audionoisy_audio_len)"r4   r5   r!   r   r   r6   r7   rangenprandomrandr	   r   r   randintintmaxr
   uniformr   r   r   r   choicecloner&   r   r"   summathsqrt
zeros_liker   r3   )r   r'   audio_signalaudio_lengths
batch_sizer   r   r   r6   r7   mix_lenmix_start_idxenergy_ratiojnoise_start_idxaudio_energynoise_energy	mix_scale
noise_clipnoise_signalr   r,   r   __call__N   s^   

& "
&6(z!SpeakerNoiseAugmentation.__call__)r   r   r   r   r   r   r   r   )r   )__name__
__module____qualname__floatr   r"   Tensorr=   r   r&   r   rS   __classcell__r   r   r   r   r      s:    	&	r   c                       s   e Zd Z												dd	ed
ededededededededededef fddZdedefddZdd Z  Z	S )MultiSpeakerNoiseAugmentationr   r   r   r   r   r         r	   r
   r   r   r   r   r   r   min_num_segmentsmax_num_segmentsmin_num_speakersmax_num_speakersc              
      s8   t  j||||||||d |	| _|
| _|| _|| _d S )N)r	   r
   r   r   r   r   r   r   )r   r   r]   r^   r_   r`   )r   r	   r
   r   r   r   r   r   r   r]   r^   r_   r`   r   r   r   r      s   

z&MultiSpeakerNoiseAugmentation.__init__r'   r   c              	   C   s  |j }|j}|d}|j}|j}|j}|j}t|D ]}	tj	
 | jkr'qd| j  kr8| j  k r8dkrDn n
tj	| j| j}
n| j}
tdt||	 |
 }tj	| j| jd }tj	| j| jd }t||}tj	|d| g| }tj	
 | jk s|dkrd}tj	| j| j}nd}tj	| j| j}| |	||||}t||	 }d}||	 | }t|D ]'}|}||k rtj	||}|| |||||  < |||  }||| 7 }qt ||	d ||	 f d ||	  }t |d ||	  d ||	  }|dkrt!"|d|d  |  nd}|| }||	 | ||	< ||	 ||	< |||	< ||	 ||	< qt#|j$|j |j||||dS )Nr   r   r   speechr0   r1   r2   )%r4   r5   r!   r   r   r6   r7   r8   r9   r:   r;   r	   r   r   r?   r>   r=   r<   r]   r^   r_   r`   minmultinomialr
   r   r   r   r   get_noise_segmentsr"   rE   rB   rC   rD   r   r3   )r   r'   rF   rG   rH   r   r   r6   r7   r-   mix_raterI   num_segmentsnum_speakerssegment_lensmoderK   noise_segmentsrR   min_start_idxmax_start_idxrL   	start_idxrN   rO   rP   r   r   r   rS      sf   
&
&"(z&MultiSpeakerNoiseAugmentation.__call__c                    s~  |j }|j}|j}|j}	|d}
|d}g }|dkrC| | |  |	  ||}d}|D ]}|||||   ||7 }q/|S |dkrNtd|  fddt	|
D }t
jj|t||
d dd	}d}|D ]Q}|| }||| kr| | || || ||}n|| |krt
j|| | nd}|| |||   }|| |d7 }|t|krt
jt|}qk|S )
Nr   r   r   ra   z.mode must be either 'noise' or 'speech', got: c                    r(   r   r   r)   	batch_idxr   r   r.     r/   zDMultiSpeakerNoiseAugmentation.get_noise_segments.<locals>.<listcomp>F)replace)r4   r5   r   r   r!   r&   r   appendr   r8   r9   r:   r@   rb   r<   rA   len)r   ro   r'   rh   rg   ri   rF   rG   r   r   rH   r   rj   noise_paddedrm   segment_lenspeaker_candidatessidseg_lenbidaudio_segmentr   rn   r   rd      sF   


$
z0MultiSpeakerNoiseAugmentation.get_noise_segments)r   r   r   r   r   r   r   r   r   r[   r   r\   )
rT   rU   rV   rW   r=   r   r   rS   rd   rY   r   r   r   r   rZ      sP    	
HrZ   )	rC   numpyr9   r"   %nemo.collections.asr.data.ssl_datasetr   objectr   rZ   r   r   r   r   <module>   s   ~