o
    }oi@                    @   s  d dl Z d dlZd dlZd dlmZmZmZ d dlZd dl	Z
d dlZd dlmZ d dlmZ d dlmZmZmZ d dlmZ d dlmZ d dlmZmZmZmZmZmZmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z& d d	l'm(Z( d d
l)m*Z*m+Z+m,Z, d dl-m.Z. zd dl/Z0d dl1m2Z2m3Z3m4Z4 dZ5W n e6y   dZ5Y nw zd dl7m8Z8m9Z9m:Z:m;Z; dZ<W n e6y   dZ<Y nw G dd de=Z>G dd de>Z?dS )    N)DictListTuple)	OmegaConf)convolve)cosinehamminghann)tqdm)process_augmentations)DataAnnotatorSpeechSamplerbuild_speaker_samples_mapget_background_noiseget_cleaned_base_pathget_random_offset_indexget_speaker_idsget_speaker_samplesget_split_points_in_alignmentsload_speaker_samplenormalize_audioper_speaker_normalizeperturb_audioread_audio_from_bufferread_noise_manifest)read_manifest)get_overlap_range
is_overlapmerge_float_intervals)logging)CardioidFamilyDirectionVectorDirectivityPatternTF)att2t_SabineEstimatorbeta_SabineEstimationsimulateRIRt2nc                   @   s.  e Zd ZdZdd Zdedededefdd	Zd
d Zdd Z	dd Z
dee fddZdee dedeee ef fddZdd Zdedee defddZdWdedefd d!Zd"edeeef fd#d$Zd%ed&ed'edeeef fd(d)ZdXd+efd,d-Z	dWd.ed/ed0ed1ed2edeeejf fd3d4Zd5ed6ee d7eeef d1efd8d9Zd5ededed:ed;ed<ed=edefd>d?Zd@ej dAedefdBdCZ!dDee dEefdFdGZ"ded:ed@ejdHejdeejejef f
dIdJZ#	KdYdLedMedNed6ee d7eeef dOedPej$dQefdRdSZ%dZdefdUdVZ&dTS )[MultiSpeakerSimulatora  
    Multispeaker Audio Session Simulator - Simulates multispeaker audio sessions using single-speaker audio files and
    corresponding word alignments.

    Change Log:
    v1.0: Dec 2022
        - First working verison, supports multispeaker simulation with overlaps, silence and RIR
        v1.0.1: Feb 2023
            - Multi-GPU support for speed up
            - Faster random sampling routine
            - Fixed sentence duration bug
            - Silence and overlap length sampling algorithms are updated to guarantee `mean_silence` approximation
        v1.0.2: March 2023
            - Added support for segment-level gain perturbation and session-level white-noise perturbation
            - Modified speaker sampling mechanism to include as many speakers as possible in each data-generation run
            - Added chunking mechanism to avoid freezing in multiprocessing processes

    v1.1.0 March 2023
        - Faster audio-file loading with maximum audio duration parameter
        - Re-organized MultiSpeakerSimulator class and moved util functions to util files.
        v1.1.1 March 2023
            - Changed `silence_mean` to use exactly the same sampling equation as `overlap_mean`.


    Args:
        cfg: OmegaConf configuration loaded from yaml file.

    Parameters:
      manifest_filepath (str): Manifest file with paths to single speaker audio files
      sr (int): Sampling rate of the input audio files from the manifest
      random_seed (int): Seed to random number generator

    session_config:
      num_speakers (int): Number of unique speakers per multispeaker audio session
      num_sessions (int): Number of sessions to simulate
      session_length (int): Length of each simulated multispeaker audio session (seconds). Short sessions
                            (e.g. ~240 seconds) tend to fall short of the expected overlap-ratio and silence-ratio.

    session_params:
      max_audio_read_sec (int): The maximum audio length in second when loading an audio file.
                                The bigger the number, the slower the reading speed. Should be greater than 2.5 second.
      sentence_length_params (list): k,p values for a negative_binomial distribution which is sampled to get the
                                     sentence length (in number of words)
      dominance_var (float): Variance in speaker dominance (where each speaker's dominance is sampled from a normal
                             distribution centered on 1/`num_speakers`, and then the dominance values are together
                             normalized to 1)
      min_dominance (float): Minimum percentage of speaking time per speaker (note that this can cause the dominance of
                             the other speakers to be slightly reduced)
      turn_prob (float): Probability of switching speakers after each utterance

      mean_silence (float): Mean proportion of silence to speaking time in the audio session. Should be in range [0, 1).
      mean_silence_var (float): Variance for mean silence in all audio sessions.
                                This value should be 0 <= mean_silence_var < mean_silence * (1 - mean_silence).
      per_silence_var (float):  Variance for each silence in an audio session, set large values (e.g., 20) for de-correlation.
      per_silence_min (float): Minimum duration for each silence, default to 0.
      per_silence_max (float): Maximum duration for each silence, default to -1 for no maximum.
      mean_overlap (float): Mean proportion of overlap in the overall non-silence duration. Should be in range [0, 1) and
                            recommend [0, 0.15] range for accurate results.
      mean_overlap_var (float): Variance for mean overlap in all audio sessions.
                                This value should be 0 <= mean_overlap_var < mean_overlap * (1 - mean_overlap).
      per_overlap_var (float): Variance for per overlap in each session, set large values to de-correlate silence lengths
                               with the latest speech segment lengths
      per_overlap_min (float): Minimum per overlap duration in seconds
      per_overlap_max (float): Maximum per overlap duration in seconds, set -1 for no maximum
      start_window (bool): Whether to window the start of sentences to smooth the audio signal (and remove silence at
                            the start of the clip)
      window_type (str): Type of windowing used when segmenting utterances ("hamming", "hann", "cosine")
      window_size (float): Length of window at the start or the end of segmented utterance (seconds)
      start_buffer (float): Buffer of silence before the start of the sentence (to avoid cutting off speech or starting
                            abruptly)
      split_buffer (float): Split RTTM labels if greater than twice this amount of silence (to avoid long gaps between
                            utterances as being labelled as speech)
      release_buffer (float): Buffer before window at end of sentence (to avoid cutting off speech or ending abruptly)
      normalize (bool): Normalize speaker volumes
      normalization_type (str): Normalizing speakers ("equal" - same volume per speaker, "var" - variable volume per
                                speaker)
      normalization_var (str): Variance in speaker volume (sample from standard deviation centered at 1)
      min_volume (float): Minimum speaker volume (only used when variable normalization is used)
      max_volume (float): Maximum speaker volume (only used when variable normalization is used)
      end_buffer (float): Buffer at the end of the session to leave blank

    outputs:
      output_dir (str): Output directory for audio sessions and corresponding label files
      output_filename (str): Output filename for the wav and RTTM files
      overwrite_output (bool): If true, delete the output directory if it exists
      output_precision (int): Number of decimal places in output files

    background_noise:
      add_bg (bool): Add ambient background noise if true
      background_manifest (str): Path to background noise manifest file
      snr (int): SNR for background noise (using average speaker power), set `snr_min` and `snr_max` values to enable random SNR
      snr_min (int):  Min random SNR for background noise (using average speaker power), set `null` to use fixed SNR
      snr_max (int):  Max random SNR for background noise (using average speaker power), set `null` to use fixed SNR

    segment_augmentor:
      add_seg_aug (bool): Set True to enable augmentation on each speech segment (Default: False)
      segmentor:
        gain:
            prob (float): Probability range (uniform distribution) gain augmentation for individual segment
            min_gain_dbfs (float): minimum gain in terms of dB
            max_gain_dbfs (float): maximum gain in terms of dB

    session_augmentor:
      add_sess_aug: (bool) set True to enable audio augmentation on the whole session (Default: False)
      segmentor:
        white_noise:
            prob (float): Probability of adding white noise (Default: 1.0)
            min_level (float): minimum gain in terms of dB
            max_level (float): maximum gain in terms of dB

    speaker_enforcement:
      enforce_num_speakers (bool): Enforce that all requested speakers are present in the output wav file
      enforce_time (list): Percentage of the way through the audio session that enforcement mode is triggered (sampled
                           between time 1 and 2)

    segment_manifest: (parameters for regenerating the segment manifest file)
      window (float): Window length for segmentation
      shift (float): Shift length for segmentation
      step_count (int): Number of the unit segments you want to create per utterance
      deci (int): Rounding decimals for segment manifest file
    c                 C   s  || _ t|| _t|| _t| j jj| _t	| j| _
g | _d | _d| _g | _g | _d| _g | _dd t| j jjjD | _d| _d | _d | _| j jjj| _| j jjdd| _d | _d | _t j!" rht #dnt #d	| _$i | _%| j jjd
d| _&| j jj'dd r| j jj'j(rt)| j jj'j*d| _'nd | _'| j jj+dd r| j jj+j,rt)| j jj+j*d| _+nd | _+| -  | j.| j jjj/| j jjj| j
0 | j jj1d| _2| j dd| _3| j jdd| _4| 5 | _6d S )N    c                 S      g | ]}d qS r    .0nr,   r,   ]/home/ubuntu/.local/lib/python3.10/site-packages/nemo/collections/asr/data/data_simulation.py
<listcomp>       z2MultiSpeakerSimulator.__init__.<locals>.<listcomp>r   turn_prob_ming      ?cudacpuadd_missing_overlapF	augmentor)	augmenter)num_sessnum_speakersall_speaker_idsrandom_seednum_workers   multiprocessing_chunksizei'  )7_paramsr   	annotatorr   samplerr   data_simulatormanifest_filepath	_manifestr   _speaker_samples_noise_samples	_sentence_text_words_alignments_min_alignment_count_merged_speech_intervalsrangesession_configr:   _furthest_sample_missing_overlapbase_manifest_filepathsegment_manifest_filepathsession_paramsmax_audio_read_sec_max_audio_read_secget_turn_prob_min_volume_speaker_idstorchr4   is_availabledevice_device_audio_read_buffer_dictr6   segment_augmentoradd_seg_augr   r7   session_augmentoradd_sess_aug_check_args_init_speaker_permutationsnum_sessionskeysr<   _permutated_speaker_indsr=   r?   _init_chunk_countchunk_countselfcfgr,   r,   r0   __init__   sb   

 







zMultiSpeakerSimulator.__init__r9   r:   r;   r<   c                 C   s   t j| tt|}tt || | }|| }t|D ]@}||k r(|}	n|}	|	dkr9td|	 d| d|dkrJt j	t|d|	 }
nt 
|
t j	t|d|	 f}
||	8 }qtd| d td| d	| d
 |
||S )a  
        Initialize the speaker permutations for the number of speakers in the session.
        When generating the simulated sessions, we want to include as many speakers as possible.
        This function generates a set of permutations that can be used to sweep all speakers in
        the source dataset to make sure we maximize the total number of speakers included in
        the simulated sessions.

        Args:
            num_sess (int): Number of sessions to generate
            num_speakers (int): Number of speakers in each session
            all_speaker_ids (list): List of all speaker IDs

        Returns:
            permuted_inds (np.array):
                Array of permuted speaker indices to use for each session
                Dimensions: (num_sess, num_speakers)
        r   zseq_len is z
 at count z and should be greater than 0NzTotal z  speakers in the source dataset.z%Initialized speaker permutations for z sessions with z speakers each.)nprandomseedlenlistintceilrN   
ValueErrorpermutationhstackr   inforeshape)rl   r9   r:   r;   r<   all_speaker_id_countsperm_set_counttarget_countcountseq_lenpermuted_indsr,   r,   r0   re      s"   "
z0MultiSpeakerSimulator._init_speaker_permutationsc                 C   s   t t| jjjj| j S )z
        Initialize the chunk count for multi-processing to prevent over-flow of job counts.
        The multi-processing pipeline can freeze if there are more than approximately 10,000 jobs
        in the pipeline at the same time.
        )rt   ro   ru   r@   rC   rO   rf   r?   rl   r,   r,   r0   ri   *  s   z'MultiSpeakerSimulator._init_chunk_countc                 C   s>  | j jjjdk rtd| j jjjdk s| j jjjdkr td| j jjjdk s0| j jjjdkr4td| j jjj| jk rQ| j jjj	dkrQt
d | j| j jj_| j jjjdk r]td| j jjjd dkrktd	d| j jjjd   k r~dkstd
 td
| j jjjdk s| j jjjdkrtd| j jjjdk s| j jjjdkrtd| j jjjdk rtd| j jjjdkr| j jjj| j jjjd| j jjj  krtd| j jjjdk rtd| j jjjdk rtd| j jjjdkr| j jjj| j jjjd| j jjj  krtd| j jjjdk rtd| j jjjdk s0| j jjjdkr4td| j jjjd dk sJ| j jjjd dkrNtd| j jjjd dk sd| j jjjd dkrhtd| j jjj| j jjj dkr{td| j jjjdvr| j jjjdurtdt| jdkrtddS )zO
        Checks YAML arguments to ensure they are within valid ranges.
        r>   zMAt least one speaker is required for making audio sessions (num_speakers < 1)r   z$Turn probability is outside of [0,1]TzTurn probability is less than {self._turn_prob_min} while enforce_num_speakers=True, which may result in excessive session lengths. Forcing turn_prob to 0.5.g      @z4Max audio read time must be greater than 2.5 secondszfk (number of success until the exp. ends) in Sentence length parameter value must be a positive numberzQp (success probability) value in sentence length parameter must be in range (0,1]z Mean overlap is outside of [0,1]z Mean silence is outside of [0,1]z$Mean silence variance is not below 0zJMean silence variance should be lower than mean_silence * (1-mean_silence)zPer silence variance is below 0z*Mean overlap variance is not larger than 0zJMean overlap variance should be lower than mean_overlap * (1-mean_overlap)z)Per overlap variance is not larger than 0z%Minimum dominance is outside of [0,1]z-Speaker enforcement start is outside of [0,1]z+Speaker enforcement end is outside of [0,1]z<Number of speakers times minimum dominance is greater than 1)r   r	   r   NIncorrect window type providedz>Manifest file is empty. Check that the source path is correct.)r@   rC   rO   r:   	ExceptionrT   	turn_probrX   speaker_enforcementenforce_num_speakersr   warningrU   sentence_length_paramsmean_overlapmean_silencemean_silence_varper_silence_varmean_overlap_varper_overlap_varmin_dominanceenforce_timewindow_typerr   rE   r   r,   r,   r0   rd   2  s   





z!MultiSpeakerSimulator._check_argsc                 C   s&   d| _ g | _g | _i | _tj  dS )za
        Clear the system memory. Cache data for audio files and alignments are removed.
        N)rH   rJ   rK   r_   r[   r4   empty_cacher   r,   r,   r0   clean_up  s
   zMultiSpeakerSimulator.clean_upreturnc                 C   s   d| j jjj }tjj|| j jjj| j jjjd}tj	|dtj
d}t|}|dkr@tt|D ]}||  | j jjj7  < q1|| d| j jjj| j jjj   }tt|D ]}||  | j jjj7  < |dkrv|| ||d   ||< qX|S )z
        Get the dominance value for each speaker, accounting for the dominance variance and
        the minimum per-speaker dominance.

        Returns:
            dominance (list): Per-speaker dominance
              ?locscalesizer   a_mina_maxr>   )r@   rC   rO   r:   ro   rp   normalrT   dominance_varclipinfsumrN   rr   r   )rl   dominance_mean	dominancetotalir,   r,   r0   _get_speaker_dominance  s0   




z,MultiSpeakerSimulator._get_speaker_dominancebase_speaker_dominancefactorc                 C   s   g }t | jjjjD ]}| j| dkr|| q
t|dkrmt	|}t t|d ddD ]}|| ||d   ||< q.|D ]
}|| | ||< q?|t
| }t dt|D ]}|| ||d   ||< qXd}||fS |}d}||fS )a  
        Increase speaker dominance for unrepresented speakers (used only in enforce mode).
        Increases the dominance for these speakers by the input factor (and then re-normalizes the probabilities to 1).

        Args:
            base_speaker_dominance (list): Dominance values for each speaker.
            factor (int): Factor to increase dominance of unrepresented speakers by.
        Returns:
            dominance (list): Per-speaker dominance
            enforce (bool): Whether to keep enforce mode turned on
        r   r>   TF)rN   r@   rC   rO   r:   rP   appendrr   ro   copyr   )rl   r   r   increase_percentr   r   enforcer,   r,   r0   _increase_speaker_dominance  s&   

z1MultiSpeakerSimulator._increase_speaker_dominancec                 C   s   | j jjjdkrt| j jjj| _dS | j jjjdkrGtj	j
d| j jjj| j jjjd| _tjt| j| j jjj| j jjjd | _dS dS )zc
        Set the volume for each speaker (either equal volume or variable speaker volume).
        equalvariabler   r   r   N)r@   rC   rT   normalization_typero   onesrO   r:   rY   rp   r   normalization_varr   array
min_volume
max_volumetolistr   r,   r,   r0   _set_speaker_volume  s    




z)MultiSpeakerSimulator._set_speaker_volumeprev_speakerr   c                 C   s   | j jjjdkr|du rd}|S |}|S tjdd| j jjjkr'|dur'|S |}||krJtjdd}d}||| krF|d7 }||| ks<||ks-|S )a6  
        Get the next speaker (accounting for turn probability and dominance distribution).

        Args:
            prev_speaker (int): Previous speaker turn.
            dominance (list): Dominance values for each speaker.
        Returns:
            prev_speaker/speaker_turn (int): Speaker turn
        r>   Nr   )	r@   rC   rO   r:   ro   rp   uniformrT   r   )rl   r   r   speaker_turnrandr,   r,   r0   _get_next_speaker  s"   
z'MultiSpeakerSimulator._get_next_speakerFwindow_amountstartc                 C   s   | j jjjdkrt|d }n"| j jjjdkrt|d }n| j jjjdkr-t|d }ntdt	|
| j}|rB|d| S ||d S )aw  
        Get window curve to alleviate abrupt change of time-series signal when segmenting audio samples.

        Args:
            window_amount (int): Window length (in terms of number of samples).
            start (bool): If true, return the first half of the window.

        Returns:
            window (tensor): Half window (either first half or second half)
        r   r)   r	   r   r   N)r@   rC   rT   r   r   r	   r   r   r[   
from_numpytor^   )rl   r   r   windowr,   r,   r0   _get_window  s   z!MultiSpeakerSimulator._get_windowfirst_alignmentc                 C   s   t | jjjj| jjj }t | jjjj| jjj }||k r&d}d}||fS ||| k r6|| }d}||fS || | }||fS )aP  
        Get the start cutoff and window length for smoothing the start of the sentence.

        Args:
            first_alignment (int): Start of the first word (in terms of number of samples).
        Returns:
            start_cutoff (int): Amount into the audio clip to start
            window_amount (int): Window length
        r   )rt   r@   rC   rT   window_sizesrstart_buffer)rl   r   r   r   start_cutoffr,   r,   r0   _get_start_buffer_and_window2  s   
z2MultiSpeakerSimulator._get_start_buffer_and_windowcurrent_sample_cursorremaining_dur_samplesremaining_len_audio_filec                 C   s   t | jjjj| jjj }t | jjjj| jjj }|| |kr'|| }d}n|| | |kr5|| | }||k rA|}d}||fS ||| k rK|| }||fS )aN  
        Get the end buffer and window length for smoothing the end of the sentence.

        Args:
            current_sample_cursor (int): Current location in the target file (in terms of number of samples).
            remaining_dur_samples (int): Remaining duration in the target file (in terms of number of samples).
            remaining_len_audio_file (int): Length remaining in audio file (in terms of number of samples).
        Returns:
            release_buffer (int): Amount after the end of the last alignment to include
            window_amount (int): Window length
        r   )rt   r@   rC   rT   r   r   release_buffer)rl   r   r   r   r   r   r,   r,   r0   _get_end_buffer_and_windowJ  s    z0MultiSpeakerSimulator._get_end_buffer_and_windowr   num_missingc                 C   sb   t t| jD ]}| j| dkr|d7 }q|dkr/t| jjjj|  d| jjjj  dS dS )z
        Check if any speakers were not included in the clip and display a warning.

        Args:
            num_missing (int): Number of missing speakers.
        r   r>   zFspeakers were included in the clip instead of the requested amount of N)	rN   rr   rP   warningswarnr@   rC   rO   r:   )rl   r   kr,   r,   r0   _check_missing_speakersk  s   
z-MultiSpeakerSimulator._check_missing_speakersaudio_manifestsentence_word_countmax_word_count_in_sentencemax_samples_in_sentencerandom_offsetc              	   C   sl  |rt jjdt|d d}nd}t|d |d  | jjj }| |\}	}
| jjj	j
s/d}
t| j}|| }|| }d\}}}d}|}d}||k r||k r|t|d k rt|d | | jjj |	 }|| |krmnv|d | }|dkr||dkr|ng| j| | jt|d | jjj t|	d | jjj  |d |   |dkr|d7 }|d7 }qG| jdkr|  j|7  _n	|  jd	| 7  _|d7 }|d7 }|}||7 }||k r||k r|t|d k sW| jjj	jd
ur+|
dkr| j|
dd}| j| j| _t| jt||	|	|
  |fd| _t| j||	|
 |	|  fd| j| _nt| j||	|	|  fd| j| _|t|d k r| jjj	jd
ur| ||t||	| d
 \}}t| j||	| |	| |  fd| j| _|dkr| j|dd}|	| | }|	| | | }t||| |}t| j|fd| j| _~|| t| jfS )aV  
        Add audio file to current sentence (up to the desired number of words).
        Uses the alignments to segment the audio file.
        NOTE: 0 index is always silence in `audio_manifest['words']`, so we choose `offset_idx=1` as the first word

        Args:
            audio_manifest (dict): Line from manifest file for current audio file
            audio_file (tensor): Current loaded audio file
            sentence_word_count (int): Running count for number of words in sentence
            max_word_count_in_sentence (int): Maximum count for number of words in sentence
            max_samples_in_sentence (int): Maximum length for sentence in terms of samples

        Returns:
            sentence_word_count+current_word_count (int): Running word count
            len(self._sentence) (tensor): Current length of the audio file
        r>   words)lowhigh
alignmentsr   )r   r   r   r(   r    NT)r   F)ro   rp   randintrr   rt   r@   rC   r   r   rT   start_windowrH   rJ   r   rK   floatrI   r   r   r   r^   r[   catmultiplyr   )rl   r   
audio_filer   r   r   r   
offset_idxr   r   start_window_amountsentence_samplesr   remaining_durationprev_dur_samplesdur_samplescurr_dur_samplescurrent_word_countword_idxsilence_countwordr   r   end_window_amount	sig_startsig_endwindowed_audio_filer,   r,   r0   	_add_file|  s   



$
	
zMultiSpeakerSimulator._add_filer   speaker_idsspeaker_wav_align_mapc              	   C   sr  t j| jjjjd | jjjjd d }tjdtj	| j
d| _d| _g g | _| _d\}}||k r||k rt|||| jd}t|| jd| j| jd}	t|| j|	| j
| j| jdd	\}
}}| jjjjrmt|
|| j| j
d
}
| ||
|||\}}||k r||k s6| jjjjrtt| jdkrt| j| j| jjjj| jjjt| jd}t | j||| j!| j
d| _dS dS dS )a\  
        Build a new sentence by attaching utterance samples together until the sentence has reached a desired length.
        While generating the sentence, alignment information is used to segment the audio.

        Args:
            speaker_turn (int): Current speaker turn.
            speaker_ids (list): LibriSpeech speaker IDs for each speaker in the current session.
            speaker_wav_align_map (dict): Dictionary containing speaker IDs and their corresponding wav filepath and alignments.
            max_samples_in_sentence (int): Maximum length for sentence in terms of samples
        r   r>   )dtyper]   r(   r   r   )r   r   r   min_alignment_count)r   audio_read_buffer_dict
offset_minrU   r   T)r   buffer_dictoffset_indexr]   rU   r   read_subsetr]   )r   r   split_bufferr   sentence_audio_len)sentence_audiosplitsr   volumer]   N)"ro   rp   negative_binomialr@   rC   rT   r   r[   zerosfloat64r^   rH   rI   rJ   rK   r   rL   r   r_   rV   r   r`   ra   r   r   	normalizemaxabsr   r   r   rr   r   rY   )rl   r   r   r   r   slr   r   r   r   r   r   r  r,   r,   r0   _build_sentence  sp   	
"#
z%MultiSpeakerSimulator._build_sentencelengthsession_len_samplesprev_len_samplesr   c                 C   st  || }| j j| }	| j ||	}
||kr|dur|
r| j |	}|| }|dk r:|d| 8 }|  jd| 7  _d}|| j| k r[|| j| | 8 }|  j| j| | 7  _| j| }|| }|}|| }d}t||g||grt||g||g}t|d |d  d}||k r|  j|| 7  _| j  j	|7  _	|S | j 
|}|| | |kr|st|| |}|S || }|S )a%  
        Returns new overlapped (or shifted) start position after inserting overlap or silence.

        Args:
            speaker_turn (int): The integer index of the current speaker turn.
            prev_speaker (int): The integer index of the previous speaker turn.
            start (int): Current start of the audio file being inserted.
            length (int): Length of the audio file being inserted.
            session_len_samples (int): Maximum length of the session in terms of number of samples
            prev_len_samples (int): Length of previous sentence (in terms of number of samples)
            enforce (bool): Whether speaker enforcement mode is being used
        Returns:
            new_start (int): New starting position in the session accounting for overlap or silence
        Nr   r>   )rB   running_speech_len_samplessilence_vs_overlap_selectorsample_from_overlap_modelrQ   rP   r   r   r  running_overlap_len_samplessample_from_silence_model)rl   r   r   r   r  r  r  r   running_len_samplesnon_silence_len_samplesadd_overlapdesired_overlap_amount	new_start
prev_startprev_endnew_endoverlap_amountoverlap_rangesilence_amountr,   r,   r0   _add_silence_or_overlapW  s>   
	z-MultiSpeakerSimulator._add_silence_or_overlapr   snrc                 C   s6   |j d | jjj | jj| jj|| jt| j	d}|S )z
        Get meta data for the current session.

        Args:
            array (np.ndarray): audio array
            snr (float): signal-to-noise ratio

        Returns:
            dict: meta data
        r   )durationsilence_meanoverlap_meanbg_snrr   speaker_volumes)
shaper@   rC   r   rB   sess_silence_meansess_overlap_meanrZ   rs   rY   )rl   r   r   	meta_datar,   r,   r0   _get_session_meta_data  s   z,MultiSpeakerSimulator._get_session_meta_data	rttm_listr  c           
      C   s   g }|D ]}dd |  D }|t|d t|d g qt|| _tdd | jD }|| jjj | }t	|| jjj }t	|| jjj }	||	fS )a-  
        Calculate the total speech and silence duration in the current session using RTTM file.

        Args:
            rttm_list (list):
                List of RTTM timestamps
            running_len_samples (int):
                Total number of samples generated so far in the current session

        Returns:
            sess_speech_len_rttm (int):
                The total number of speech samples in the current session
            sess_silence_len_rttm (int):
                The total number of silence samples in the current session
        c                 S   s   g | ]}|qS r,   r,   )r.   tokenr,   r,   r0   r1     r2   zHMultiSpeakerSimulator._get_session_silence_from_rttm.<locals>.<listcomp>r   r>   c                 S   s   g | ]
}|d  |d  qS )r>   r   r,   )r.   xr,   r,   r0   r1     s    )
splitr   r   r   rM   r   r@   rC   r   rt   )
rl   r+  r  all_sample_listx_rawr-  total_speech_in_secstotal_silence_in_secssess_speech_lensess_silence_lenr,   r,   r0   _get_session_silence_from_rttm  s    
z4MultiSpeakerSimulator._get_session_silence_from_rttm	is_speechc                 C   sx   || }|t |kr&tjj|d|t | f}tjj|d|t | f}|||  | j7  < d|||< |||fS )a  
        Add a sentence to the session array containing time-series signal.

        Args:
            start (int): Starting position in the session
            length (int): Length of the sentence
            array (torch.Tensor): Session array
            is_speech (torch.Tensor): Session array containing speech/non-speech labels

        Returns:
            array (torch.Tensor): Session array in torch.Tensor format
            is_speech (torch.Tensor): Session array containing speech/non-speech labels in torch.Tensor format
        r   r>   )rr   r[   nn
functionalpadrH   )rl   r   r  r   r6  endr,   r,   r0   _add_sentence_to_array  s   
z,MultiSpeakerSimulator._add_sentence_to_arrayr)   idxbasepathfilenamenoise_samplesr]   enforce_counterc	                 C   s"  | j jj}	tj|	|  || _|  }
t|
}| 	  d\}}d}| j
  || _dd t| j jjjD | _d| _tj| j jjjd | j jjjd }| j jjj}t| j jjj| j jj }t|| j}t|| j}| j  | j  ||k s|r~||| kr|r| ||\}
}|r|d7 }|  ||
}|| }|rt!d}n|| j jj"j#| j jj k rn| $|||| t%| j&}| j'|||||||d}| j(||||d	\}}}| j
j)| j*| j+|| j jj || j jj || d
}| j
j,d -| | j
j.| j/t0j12||d || j jj || j jj || t0j12||d t0j12||d d}| j
j,d 3| | j
j4| j*| j+||| t!|| j jj d}| j
j,d -| t5||}| j6| j
j,d |d\| j_7| j_8|| j|< |}|}||k s|s| j jj9j:rt;|| j jj| j9|j<d}| j jj=j>rt%| jdkrt?||dk d }t@t%||| j| jA| j jj=jB| j jj=jC| j jj=jD|	| | jd	\}}||7 }ntEdd}tF|}tG|r|H I }tJKt0j12||d || j jj | j
jL||| jM||dd ~| N  ||fS )a  
        _generate_session function without RIR simulation.
        Generate a multispeaker audio session and corresponding label files.

        Args:
            idx (int): Index for current session (out of total number of sessions).
            basepath (str): Path to output directory.
            filename (str): Filename for output files.
            speaker_ids (list): List of speaker IDs that will be used in this session.
            speaker_wav_align_map (dict): Dictionary containing speaker IDs and their corresponding wav filepath and alignments.
            noise_samples (list): List of randomly sampled noise source files that will be used for generating this session.
            device (torch.device): Device to use for generating this session.
            enforce_counter (int): In enforcement mode, dominance is increased by a factor of enforce_counter for unrepresented speakers
        r   Nc                 S   r*   r+   r,   r-   r,   r,   r0   r1     r2   z;MultiSpeakerSimulator._generate_session.<locals>.<listcomp>r   r>   r   r   r   r   r  r  r  r   )r   r  r   r6  )r   r   r   r:  
speaker_idrttm.wav.rttm.ctm)textwav_filenamer   r  rB  rttm_filepathctm_filepathjson)r   r   session_namerB  r   ctm)r+  r  r   r)   		len_arraypower_arrayr?  r   snr_minsnr_maxbackground_noise_snrrq   r]   z9No background noise samples found in self._noise_samples.N/Ar   r   r=  r>  r)  )Or@   rC   r<   ro   rp   rq   r^   r   r   r   rA   init_annotation_listsrG   rN   rO   r:   rP   _missing_silencer   r   r   r   rt   session_lengthr   r[   r  r   rB   get_session_silence_meanget_session_overlap_meanr   r   r   rT   
end_bufferr  rr   rH   r  r;  create_new_rttm_entryrJ   rK   annote_listsextendcreate_new_json_entryrI   ospathjoinr   create_new_ctm_entrymaximumr5  r  running_silence_len_samplesrb   rc   r   r]   background_noiseadd_bgmeanr   r_   rQ  rR  r   rv   r   	is_tensorr5   numpysfwritewrite_annotation_filesr*  r   )rl   r<  r=  r>  r   r   r?  r]   r@  r<   speaker_dominancer   r  r  r   r   r   r  r   r6  r   r   r  r   r:  new_rttm_entriesnew_json_entrynew_ctm_entriesavg_power_arraybgr   r,   r,   r0   _generate_session  s   







	
X




"z'MultiSpeakerSimulator._generate_sessionNc                 C   s  t d |du r| jjj}tj| | jjjj	}t
|| jjjjd}t| jtj|d tjj| jd}g }| jjjj}t| jjjj| jjjjd}g }t|D ]F}	| jjjjd|	  }
t|	| j| jd}t || jd	}| j!j"|d
}t#j$% rt#&d|	t#j$'   }n| j(}|)|	||
||||f qT| jdkrd| _*d| _t| j+D ]}g }|| j, t-|d | j, |}}t||D ]/}	dd t| jjjj.D | _/i | _0| jdkr|)|j1| j2g||	 R   q|)||	  q| jdkrtj3|}n|}t4|d|d  d| j+ d|d dd|ddt5|dD ]+}| jdkr/|6 \}}
n| j!j"|d
| _7| j2| \}}
| j8j9||
d | :  q q|;  | j8j<|d t d|  dS )z
        Generate several multispeaker audio sessions and corresponding list files.

        Args:
            random_seed (int): random seed for reproducibility
        zGenerating Diarization SessionsN)overwrite_outputzparams.yaml)max_workers)rh  background_manifest_)sess_idxspeaker_samplespermutated_speaker_inds)r   r{  )noise_manifestzcuda:r>   c                 S   r*   r+   r,   r-   r,   r,   r0   r1     r2   z;MultiSpeakerSimulator.generate_sessions.<locals>.<listcomp>[/z] Waiting jobs from z 2z to jobs)descunitr   )r=  r>  )r=  z6Data simulation has been completed, results saved at: )=r   ry   r@   rC   r<   ro   rp   rq   outputs
output_dirr   rv  r   savera  rb  rc  
concurrentfuturesProcessPoolExecutorr=   rO   rf   r   rg  rh  rx  rN   output_filenamer   rF   rh   r   rB   sample_noise_manifestr[   r4   r\   r]   device_countr^   r   rE   rj   r?   minr:   rP   r_   submitru  as_completedr
   rr   resultrG   rA   add_to_filename_listsr   shutdownwrite_filelist_files)rl   r<   r  r=  tpr  rf   source_noise_manifestqueuerz  r>  r   r   r?  r]   	chunk_idxstt_idxend_idx	generatorfuturer,   r,   r0   generate_sessions  s   






 (
z'MultiSpeakerSimulator.generate_sessions)Fr+   r)   N)'__name__
__module____qualname____doc__rn   rt   r   re   ri   rd   r   r   r   r   boolr   r   r   r   r   r   r   dictr[   Tensorr   strr   rs   r  r  ro   ndarrayr*  r5  r;  r]   ru  r  r,   r,   r,   r0   r'   A   s    zC+e
"
%

!
 

T	
G
!
	
 <r'   c                       s   e Zd ZdZ fddZdd Zdd Zdeej	e
f fd	d
Zde
dej	deee
f fddZ	dde
dedededededejde
fddZ  ZS )RIRMultiSpeakerSimulatora  
    RIR Augmented Multispeaker Audio Session Simulator - simulates multispeaker audio sessions using single-speaker
    audio files and corresponding word alignments, as well as simulated RIRs for augmentation.

    Args:
        cfg: OmegaConf configuration loaded from yaml file.

    Parameters (in addition to the base MultiSpeakerSimulator parameters):
    rir_generation:
      use_rir (bool): Whether to generate synthetic RIR
      toolkit (str): Which toolkit to use ("pyroomacoustics", "gpuRIR")
      room_config:
        room_sz (list): Size of the shoebox room environment (1d array for specific, 2d array for random range to be
                        sampled from)
        pos_src (list): Positions of the speakers in the simulated room environment (2d array for specific, 3d array
                        for random ranges to be sampled from)
        noise_src_pos (list): Position in room for the ambient background noise source
      mic_config:
        num_channels (int): Number of output audio channels
        pos_rcv (list): Microphone positions in the simulated room environment (1d/2d array for specific, 2d/3d array
                        for range assuming num_channels is 1/2+)
        orV_rcv (list or null): Microphone orientations (needed for non-omnidirectional microphones)
        mic_pattern (str): Microphone type ("omni" - omnidirectional) - currently only omnidirectional microphones are
                           supported for pyroomacoustics
      absorbtion_params: (Note that only `T60` is used for pyroomacoustics simulations)
        abs_weights (list): Absorption coefficient ratios for each surface
        T60 (float): Room reverberation time (`T60` is the time it takes for the RIR to decay by 60DB)
        att_diff (float): Starting attenuation (if this is different than att_max, the diffuse reverberation model is
                          used by gpuRIR)
        att_max (float): End attenuation when using the diffuse reverberation model (gpuRIR)
    c                    s   t  | |   d S r  )superrn   _check_args_rirrk   	__class__r,   r0   rn   "  s   z!RIRMultiSpeakerSimulator.__init__c                 C   s  | j jjjdvrtd| j jjjdkrtstd| j jjjdkr(ts(tdt| j jjj	j
dkr7td| j jjjjd	krDtd
t| j jjj	jdk rStd| j jjj	jD ]}t|dkrftdqZt| j jjjjd	krvtd| j jjj	jD ]}t|dkrtdq}| j jjjt| j jjj	jkrtd| j jjjjt| j jjjjkrtd| j jjjjs| j jjjjdkrtd| j jjjjdurt| j jjjjt| j jjjjkrtd| j jjjjD ]}t|dkrtdqdS dS )zR
        Checks RIR YAML arguments to ensure they are within valid ranges
        )pyroomacousticsgpuRIR)Toolkit must be pyroomacoustics or gpuRIRr  zOpyroomacoustics should be installed to run this simulator with RIR augmentationr  zFgpuRIR should be installed to run this simulator with RIR augmentation   z"Incorrect room dimensions providedr   z2Number of channels should be greater or equal to 1r)   z%Less than 2 provided source positionsz8Three coordinates must be provided for sources positionszNo provided mic positionsz4Three coordinates must be provided for mic positionszJNumber of speakers is not equal to the number of provided source positionszNNumber of channels is not equal to the number of provided microphone positionsomniz?Microphone orientations must be provided if mic_pattern != omniNzTA different number of microphone orientations and microphone positions were providedz3Three coordinates must be provided for orientations)r@   rC   rir_generationtoolkitr   PRAImportErrorGPURIRrr   room_configroom_sz
mic_confignum_channelspos_srcpos_rcvrO   r:   orV_rcvmic_pattern)rl   sublistr,   r,   r0   r  &  sZ   z(RIRMultiSpeakerSimulator._check_args_rirc                 C   sp  t | jjjjj}|jdkr3t |j	d }t
|j	d D ]}t j||df ||df ||< qn|}t | jjjjj}|jdkryt |j	d |j	d f}t
|j	d D ]}t
|j	d D ]}t j|||df |||df ||< qaqXn|}| jjjjrt || jjjjjf}t | jjjjj}|jdkrt |j	d |j	d f}t
|j	d D ]}t
|j	d D ]}t j|||df |||df ||< qqn|}| jjjjj}	|	rt |	}	| jjjjj}
| jjjjj}| jjjjj}| jjjjj}| jjjjj}| jjj}t|||d}t||}t||}t||}t|||||||||	|
d
}|j	d d }||fS )z
        Create simulated RIR using the gpuRIR library

        Returns:
            RIR (tensor): Generated RIR
            RIR_pad (int): Length of padding added when convolving the RIR with an audio file
        r)   r   r>   r  )abs_weights)Tdiffr  r  ) ro   r   r@   rC   r  r  r  ndimr  r&  rN   rp   r   r  rg  rh  vstacknoise_src_posr  r  r  r  absorbtion_paramsr  T60att_diffatt_maxr   r$   r#   r&   r%   )rl   room_sz_tmpr  r   pos_src_tmpr  jmic_pos_tmpmic_posr  r  r  r  r  r  r   betar  Tmaxnb_imgRIRRIR_padr,   r,   r0   _generate_rir_gpuRIRY  sZ   
$
(
(




z-RIRMultiSpeakerSimulator._generate_rir_gpuRIRr   c              	   C   s  | j jjjj}| j jj}t| j jjjj	}|j
dkr?t|jd }t|jd D ]}tj||df ||df ||< q*n|}t| j jjjj}|j
dkrt|jd |jd f}t|jd D ]}t|jd D ]}tj|||df |||df ||< qmqdn|}t||\}	}
tj||t|	|
d}| j jjjrt|| j jjjjf}|D ]}|| q| j jjjj}| j jjjjdkrtj}tdddd	}t||d
}t| j jjjj }|j
dkrt|jd |jd f}t|jd D ] }t|jd D ]}tj|||df |||df ||< qqn|}|j!|j"|d |#  d}|j$D ]}|D ]}|jd d |krG|jd d }q4q0|j$|fS )z
        Create simulated RIR using the pyroomacoustics library

        Returns:
            RIR (tensor): Generated RIR
            RIR_pad (int): Length of padding added when convolving the RIR with an audio file
        r)   r   r>   r  )fs	materials	max_orderr  Z   T)azimuth
colatitudedegrees)orientationpattern_enum)directivity)%r@   rC   r  r  r  r   ro   r   r  r  r  r  r&  rN   rp   r   r  prainverse_sabineShoeBoxMaterialrg  rh  r  r  
add_sourcer  r  r"   OMNIr!   r    r  add_microphone_arrayTcompute_rirrir)rl   rt60r   r  r  r   r  r  r  e_absorptionr  roomposr  dir_vecdir_objr  r  rir_padchannelr,   r,   r0   _generate_rir_pyroomacoustics  sf   	

$
(*

z6RIRMultiSpeakerSimulator._generate_rir_pyroomacousticsr   r  c              	   C   s   g }d}t | jjjjjD ]F}| jjjjdkr(t||||dt|f 	 }n| jjjjdkrAt||| | dt| 	 }t||krKt|}|
t| q||fS )a  
        Augment one sentence (or background noise segment) using a synthetic RIR.

        Args:
            input (torch.tensor): Input audio.
            speaker_turn (int): Current speaker turn.
            RIR (torch.tensor): Room Impulse Response.
        Returns:
            output_sound (list): List of tensors containing augmented audio
            length (int): Length of output audio channels (or of the longest if they have different lengths)
        r   r  Nr  )rN   r@   rC   r  r  r  r  r   rr   r   r   r[   tensor)rl   inputr   r  output_soundr  r  out_channelr,   r,   r0   _convolve_rir  s   ""z&RIRMultiSpeakerSimulator._convolve_rirr)   r<  r=  r>  r   r   r?  r]   r@  c	           &      C   s  | j jj}	tj|	|  || _|  }
t|
}| 	  d\}}d}| j
  || _dd t| j jjjD | _| j jjjdkrH|  \}}n| j jjjdkrW|  \}}ntdtj| j jjjd | j jjjd	 }| j jjj}t| j jjj| j jj }t|| j jjjj f}t|}||k s|r||| kr|r| !||\}
}|r|d	7 }| "||
}|| | }|rt#d
}n|| j jj$j%| j jj k rn| &|||| | '| j(||\}}| j)|||||||d}|| }|t*|krtj+j,-|ddd|t*| f}tj+j,-|d|t*| f}d	|||< t| j jjjj D ]}t*|| }|||| |f  || 7  < q&| j
.| j/| j0|| j jj || j jj || }| j
j1d 2| | j
3| j4t5j67||d || j jj || j jj || t5j67||d t5j67||d }| j
j1d 8| | j
9||| || j jj } | j
j1d 2|  t:||}|| j|< |}|}||k s|s| j jj;j<rt=|| j jj| j;}| j jj>j?rLt*| jdkrt@||d	k d }!tAt*||!| j| jB| j jj>jC| j jj>jD| j jj>jE|	| | jd	\}"}#||"7 }|jFd }| G||!\}"}#| '|"d|\}$}%t| j jjjj D ]}|dd|f  |$| d| 7  < q5nd}#tH|}tI|r^|J K }tLMt5j67||d || j jj | j
jN||| jO||#dd ~| P  ||fS )aR  
        Generate a multispeaker audio session and corresponding label files.

        Args:
            idx (int): Index for current session (out of total number of sessions).
            basepath (str): Path to output directory.
            filename (str): Filename for output files.
            speaker_ids (list): List of speaker IDs that will be used in this session.
            speaker_wav_align_map (dict): Dictionary containing speaker IDs and their corresponding wav filepath and alignments.
            noise_samples (list): List of randomly sampled noise source files that will be used for generating this session.
            device (torch.device): Device to use for generating this session.
            enforce_counter (int): In enforcement mode, dominance is increased by a factor of enforce_counter for unrepresented speakers
        r   Nc                 S   r*   r+   r,   r-   r,   r,   r0   r1     r2   z>RIRMultiSpeakerSimulator._generate_session.<locals>.<listcomp>r  r  r  r   r>   r   rA  rC  rD  rE  rF  rK  rM  r)   rN  r   rT  rU  rV  )Qr@   rC   r<   ro   rp   rq   r^   r   r   r   rA   rW  rG   rN   rO   r:   rP   r  r  r  r  r   r   r   r   r   rt   rY  r   r[   r  r  r  r   r   r   rT   r\  r  r  rH   r  rr   r7  r8  r9  r]  rJ   rK   r^  r_  r`  rI   ra  rb  rc  r   rd  re  rb   rc   r   rg  rh  ri  r   r_   rQ  rR  r   r&  _get_backgroundr   rj  r5   rk  rl  rm  rn  r*  r   )&rl   r<  r=  r>  r   r   r?  r]   r@  r<   ro  r   r  r  r   r  r  r   r   r  r   r6  r   r   augmented_sentencer  r   r:  r  len_chrp  rq  rr  rs  rt  r   augmented_bgry  r,   r,   r0   ru    s   






 $	
Q




("z*RIRMultiSpeakerSimulator._generate_sessionr  )r  r  r  r  rn   r  r  r   r[   r  rt   r  rs   r  r  r  r]   ru  __classcell__r,   r,   r  r0   r    s2     39 C!	r  )@r  ra  r   typingr   r   r   rk  ro   	soundfilerl  r[   	omegaconfr   scipy.signalr   scipy.signal.windowsr   r   r	   r
   0nemo.collections.asr.parts.preprocessing.perturbr   6nemo.collections.asr.parts.utils.data_simulation_utilsr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   /nemo.collections.asr.parts.utils.manifest_utilsr   .nemo.collections.asr.parts.utils.speaker_utilsr   r   r   
nemo.utilsr   r  r  pyroomacoustics.directivitiesr    r!   r"   r  r  r  r#   r$   r%   r&   r  objectr'   r  r,   r,   r,   r0   <module>   sP   D         I