o
    %ݫi^                     @   s,   d Z ddlZddlmZ G dd deZdS )aI   Specifies the inference interfaces for diarization modules.

Authors:
 * Aku Rouhe 2021
 * Peter Plantinga 2021
 * Loren Lugosch 2020
 * Mirco Ravanelli 2020
 * Titouan Parcollet 2021
 * Abdel Heba 2021
 * Andreas Nautsch 2022, 2023
 * Pooneh Mousavi 2023
 * Sylvain de Langen 2023
 * Adel Moumen 2023
 * Pradnya Kandarkar 2023
    N)
Pretrainedc                   @   sP   e Zd ZdZg dZdd Zdd Zdd Zd	d
 Zdd Z	dd Z
dd ZdS )Speech_Emotion_Diarizationa  A ready-to-use SED interface (audio -> emotions and their durations)

    Arguments
    ---------
    See ``Pretrained``

    Example
    -------
    >>> from speechbrain.inference.diarization import Speech_Emotion_Diarization
    >>> tmpdir = getfixture("tmpdir")
    >>> sed_model = Speech_Emotion_Diarization.from_hparams(source="speechbrain/emotion-diarization-wavlm-large", savedir=tmpdir,) # doctest: +SKIP
    >>> sed_model.diarize_file("speechbrain/emotion-diarization-wavlm-large/example.wav") # doctest: +SKIP
    )
input_normwav2vec
output_mlpc                 C   s4   |  |}|d}tdg}| |||g}|S )a&  Get emotion diarization of a spoken utterance.

        Arguments
        ---------
        path : str
            Path to audio file which to diarize.

        Returns
        -------
        list of dictionary: List[Dict[List]]
            The emotions and their temporal boundaries.
        r   g      ?)
load_audio	unsqueezetorchtensordiarize_batch)selfpathwaveformbatch
rel_lengthframe_class r   U/home/ubuntu/.local/lib/python3.10/site-packages/speechbrain/inference/diarization.pydiarize_file'   s
   

z'Speech_Emotion_Diarization.diarize_filec                 C   sn   t |jdkr|d}|du rtj|jd | jd}|| j|| j}}| j||}| j	|}|S )a)  Encodes audios into fine-grained emotional embeddings

        Arguments
        ---------
        wavs : torch.Tensor
            Batch of waveforms [batch, time, channels].
        wav_lens : torch.Tensor
            Lengths of the waveforms relative to the longest one in the
            batch, tensor of shape [batch]. The longest one should have
            relative length 1.0 and others len(waveform) / max_length.
            Used for ignoring padding.

        Returns
        -------
        torch.Tensor
            The encoded batch
           r   N)device)
lenshaper   r	   onesr   tomodsr   wav2vec2)r   wavswav_lensoutputsr   r   r   encode_batch;   s   
z'Speech_Emotion_Diarization.encode_batchc           
      C   s`   |  ||}| j|}| j|}| j|}tj|dd\}}| jj	|}| 
||}	|	S )a  Get emotion diarization of a batch of waveforms.

        The waveforms should already be in the model's desired format.
        You can call:
        ``normalized = EncoderDecoderASR.normalizer(signal, sample_rate)``
        to get a correctly converted signal in most cases.

        Arguments
        ---------
        wavs : torch.Tensor
            Batch of waveforms [batch, time, channels].
        wav_lens : torch.Tensor
            Lengths of the waveforms relative to the longest one in the
            batch, tensor of shape [batch]. The longest one should have
            relative length 1.0 and others len(waveform) / max_length.
            Used for ignoring padding.
        batch_id : torch.Tensor
            id of each batch (file names etc.)

        Returns
        -------
        list of dictionary: List[Dict[List]]
            The emotions and their temporal boundaries.
        )dim)r    hparamsavg_poolr   r   log_softmaxr	   maxlabel_encoderdecode_torchpreds_to_diarization)
r   r   r   batch_idr   averaged_outscoreindexpredsresultsr   r   r   r   Z   s   z(Speech_Emotion_Diarization.diarize_batchc           
   	   C   s   i }t t|D ]D}|| }g }t t|D ]%}t| jjd | d}t|| jjd  d}	||| ||	|| g q| |}dd |D ||| < q|S )al  Convert frame-wise predictions into a dictionary of
        diarization results.

        Arguments
        ---------
        prediction : torch.Tensor
            Frame-wise predictions
        batch_id : str
            The id for this batch

        Returns
        -------
        dictionary
            A dictionary with the start/end of each emotion
        g{Gz?   c                 S   s$   g | ]}|d  |d |d dqS )r   r0      )startendemotionr   ).0kr   r   r   
<listcomp>   s    zCSpeech_Emotion_Diarization.preds_to_diarization.<locals>.<listcomp>)ranger   roundr#   stridewindow_lengthappend!merge_ssegs_same_emotion_adjacent)
r   
predictionr*   r/   ipredloljr2   r3   r   r   r   r)   |   s   
z/Speech_Emotion_Diarization.preds_to_diarizationc                 C   s   |  |||S )z1Get emotion diarization for a batch of waveforms.)r   )r   r   r   r*   r   r   r   forward   s   z"Speech_Emotion_Diarization.forwardc                 C   s   ||krdS dS )a  Returns True if segments are overlapping.

        Arguments
        ---------
        end1 : float
            End time of the first segment.
        start2 : float
            Start time of the second segment.

        Returns
        -------
        overlapped : bool
            True of segments overlapped else False.

        Example
        -------
        >>> from speechbrain.processing import diarization as diar
        >>> diar.is_overlapped(5.5, 3.4)
        True
        >>> diar.is_overlapped(5.5, 6.4)
        False
        FTr   )r   end1start2r   r   r   is_overlapped   s   z(Speech_Emotion_Diarization.is_overlappedc                 C   s   g }|d }d}t dt|D ]5}|| }| |d |d r=|d |d kr=|d |d< |t|d kr<d}|| q|| |}q|du rP||d  |S )a)  Merge adjacent sub-segs if they are the same emotion.
        Arguments
        ---------
        lol : list of list
            Each list contains [utt_id, sseg_start, sseg_end, emo_label].
        Returns
        -------
        new_lol : list of list
            new_lol contains adjacent segments merged from the same emotion ID.
        Example
        -------
        >>> from speechbrain.utils.EDER import merge_ssegs_same_emotion_adjacent
        >>> lol=[['u1', 0.0, 7.0, 'a'],
        ... ['u1', 7.0, 9.0, 'a'],
        ... ['u1', 9.0, 11.0, 'n'],
        ... ['u1', 11.0, 13.0, 'n'],
        ... ['u1', 13.0, 15.0, 'n'],
        ... ['u1', 15.0, 16.0, 'a']]
        >>> merge_ssegs_same_emotion_adjacent(lol)
        [['u1', 0.0, 9.0, 'a'], ['u1', 9.0, 15.0, 'n'], ['u1', 15.0, 16.0, 'a']]
        r   Fr   r0   r1   Tr!   )r8   r   rF   r<   )r   rA   new_lolssegflagr?   	next_ssegr   r   r   r=      s$   

z<Speech_Emotion_Diarization.merge_ssegs_same_emotion_adjacentN)__name__
__module____qualname____doc__MODULES_NEEDEDr   r    r   r)   rC   rF   r=   r   r   r   r   r      s    " r   )rN   r	    speechbrain.inference.interfacesr   r   r   r   r   r   <module>   s    