o
    ½e¦i^  ã                   @   s,   d Z ddlZddlmZ G dd„ deƒZdS )aI   Specifies the inference interfaces for diarization modules.

Authors:
 * Aku Rouhe 2021
 * Peter Plantinga 2021
 * Loren Lugosch 2020
 * Mirco Ravanelli 2020
 * Titouan Parcollet 2021
 * Abdel Heba 2021
 * Andreas Nautsch 2022, 2023
 * Pooneh Mousavi 2023
 * Sylvain de Langen 2023
 * Adel Moumen 2023
 * Pradnya Kandarkar 2023
é    N)Ú
Pretrainedc                   @   sP   e Zd ZdZg d¢Zdd„ Zdd„ Zdd„ Zd	d
„ Zdd„ Z	dd„ Z
dd„ ZdS )ÚSpeech_Emotion_Diarizationa  A ready-to-use SED interface (audio -> emotions and their durations)

    Arguments
    ---------
    See ``Pretrained``

    Example
    -------
    >>> from speechbrain.inference.diarization import Speech_Emotion_Diarization
    >>> tmpdir = getfixture("tmpdir")
    >>> sed_model = Speech_Emotion_Diarization.from_hparams(source="speechbrain/emotion-diarization-wavlm-large", savedir=tmpdir,) # doctest: +SKIP
    >>> sed_model.diarize_file("speechbrain/emotion-diarization-wavlm-large/example.wav") # doctest: +SKIP
    )Ú
input_normÚwav2vecÚ
output_mlpc                 C   s4   |   |¡}| d¡}t dg¡}|  |||g¡}|S )a&  Get emotion diarization of a spoken utterance.

        Arguments
        ---------
        path : str
            Path to audio file which to diarize.

        Returns
        -------
        list of dictionary: List[Dict[List]]
            The emotions and their temporal boundaries.
        r   g      ð?)Ú
load_audioÚ	unsqueezeÚtorchÚtensorÚdiarize_batch)ÚselfÚpathÚwaveformÚbatchÚ
rel_lengthÚframe_class© r   ú_/home/ubuntu/transcripts/venv/lib/python3.10/site-packages/speechbrain/inference/diarization.pyÚdiarize_file'   s
   

z'Speech_Emotion_Diarization.diarize_filec                 C   sn   t |jƒdkr| d¡}|du rtj|jd | jd}| | j¡| | j¡}}| j ||¡}| j 	|¡}|S )a)  Encodes audios into fine-grained emotional embeddings

        Arguments
        ---------
        wavs : torch.Tensor
            Batch of waveforms [batch, time, channels].
        wav_lens : torch.Tensor
            Lengths of the waveforms relative to the longest one in the
            batch, tensor of shape [batch]. The longest one should have
            relative length 1.0 and others len(waveform) / max_length.
            Used for ignoring padding.

        Returns
        -------
        torch.Tensor
            The encoded batch
        é   r   N)Údevice)
ÚlenÚshaper   r	   Úonesr   ÚtoÚmodsr   Úwav2vec2)r   ÚwavsÚwav_lensÚoutputsr   r   r   Úencode_batch;   s   
z'Speech_Emotion_Diarization.encode_batchc           
      C   s`   |   ||¡}| j |¡}| j |¡}| j |¡}tj|dd\}}| jj 	|¡}|  
||¡}	|	S )a‰  Get emotion diarization of a batch of waveforms.

        The waveforms should already be in the model's desired format.
        You can call:
        ``normalized = EncoderDecoderASR.normalizer(signal, sample_rate)``
        to get a correctly converted signal in most cases.

        Arguments
        ---------
        wavs : torch.Tensor
            Batch of waveforms [batch, time, channels].
        wav_lens : torch.Tensor
            Lengths of the waveforms relative to the longest one in the
            batch, tensor of shape [batch]. The longest one should have
            relative length 1.0 and others len(waveform) / max_length.
            Used for ignoring padding.
        batch_id : torch.Tensor
            id of each batch (file names etc.)

        Returns
        -------
        list of dictionary: List[Dict[List]]
            The emotions and their temporal boundaries.
        éÿÿÿÿ)Údim)r    ÚhparamsÚavg_poolr   r   Úlog_softmaxr	   ÚmaxÚlabel_encoderÚdecode_torchÚpreds_to_diarization)
r   r   r   Úbatch_idr   Úaveraged_outÚscoreÚindexÚpredsÚresultsr   r   r   r   Z   s   z(Speech_Emotion_Diarization.diarize_batchc           
   	   C   sž   i }t t|ƒƒD ]D}|| }g }t t|ƒƒD ]%}t| jjd | dƒ}t|| jjd  dƒ}	| || ||	|| g¡ q|  |¡}dd„ |D ƒ||| < q|S )al  Convert frame-wise predictions into a dictionary of
        diarization results.

        Arguments
        ---------
        prediction : torch.Tensor
            Frame-wise predictions
        batch_id : str
            The id for this batch

        Returns
        -------
        dictionary
            A dictionary with the start/end of each emotion
        g{®Gáz”?é   c                 S   s$   g | ]}|d  |d |d dœ‘qS )r   r0   é   )ÚstartÚendÚemotionr   )Ú.0Úkr   r   r   Ú
<listcomp>—   s    ÿzCSpeech_Emotion_Diarization.preds_to_diarization.<locals>.<listcomp>)Úranger   Úroundr#   ÚstrideÚwindow_lengthÚappendÚ!merge_ssegs_same_emotion_adjacent)
r   Ú
predictionr*   r/   ÚiÚpredÚlolÚjr2   r3   r   r   r   r)   |   s   
ÿz/Speech_Emotion_Diarization.preds_to_diarizationc                 C   s   |   |||¡S )z1Get emotion diarization for a batch of waveforms.)r   )r   r   r   r*   r   r   r   Úforwardœ   s   z"Speech_Emotion_Diarization.forwardc                 C   s   ||krdS dS )a  Returns True if segments are overlapping.

        Arguments
        ---------
        end1 : float
            End time of the first segment.
        start2 : float
            Start time of the second segment.

        Returns
        -------
        overlapped : bool
            True of segments overlapped else False.

        Example
        -------
        >>> from speechbrain.processing import diarization as diar
        >>> diar.is_overlapped(5.5, 3.4)
        True
        >>> diar.is_overlapped(5.5, 6.4)
        False
        FTr   )r   Úend1Ústart2r   r   r   Úis_overlapped    s   z(Speech_Emotion_Diarization.is_overlappedc                 C   s¤   g }|d }d}t dt|ƒƒD ]5}|| }|  |d |d ¡r=|d |d kr=|d |d< |t|ƒd kr<d}| |¡ q| |¡ |}q|du rP| |d ¡ |S )a)  Merge adjacent sub-segs if they are the same emotion.
        Arguments
        ---------
        lol : list of list
            Each list contains [utt_id, sseg_start, sseg_end, emo_label].
        Returns
        -------
        new_lol : list of list
            new_lol contains adjacent segments merged from the same emotion ID.
        Example
        -------
        >>> from speechbrain.utils.EDER import merge_ssegs_same_emotion_adjacent
        >>> lol=[['u1', 0.0, 7.0, 'a'],
        ... ['u1', 7.0, 9.0, 'a'],
        ... ['u1', 9.0, 11.0, 'n'],
        ... ['u1', 11.0, 13.0, 'n'],
        ... ['u1', 13.0, 15.0, 'n'],
        ... ['u1', 15.0, 16.0, 'a']]
        >>> merge_ssegs_same_emotion_adjacent(lol)
        [['u1', 0.0, 9.0, 'a'], ['u1', 9.0, 15.0, 'n'], ['u1', 15.0, 16.0, 'a']]
        r   Fr   r0   r1   Tr!   )r8   r   rF   r<   )r   rA   Únew_lolÚssegÚflagr?   Ú	next_ssegr   r   r   r=   ½   s$   ÿ
€
z<Speech_Emotion_Diarization.merge_ssegs_same_emotion_adjacentN)Ú__name__Ú
__module__Ú__qualname__Ú__doc__ÚMODULES_NEEDEDr   r    r   r)   rC   rF   r=   r   r   r   r   r      s    " r   )rN   r	   Ú speechbrain.inference.interfacesr   r   r   r   r   r   Ú<module>   s    