o
    pib                     @   s   d dl Z d dlmZmZmZmZmZmZ d dlZ	d dl
Z
d dlmZ d dlm  mZ d dlmZ d dlmZ d dlmZ d dlmZmZ d dlmZ d dlmZ d d	lmZ d d
lm Z  d dl!m"Z"m#Z#m$Z$ G dd dZ%G dd de%Z&dS )    N)CallableListOptionalTextTupleUnion)	rearrange)is_oom_error)	AudioFile)ModelSpecifications)
Resolution)map_with_specifications)Powerset)fix_reproducibility)SegmentSlidingWindowSlidingWindowFeaturec                   @   s   e Zd ZdS )BaseInferenceN)__name__
__module____qualname__ r   r   Q/home/ubuntu/.local/lib/python3.10/site-packages/pyannote/audio/core/inference.pyr   *   s    r   c                   @   s  e Zd ZdZ								d0dededee d	ee d
ee	j
ge	j
f dededeej defddZdejdd fddZdejdee	j
ee	j
 f fddZdejdedee deeee f fddZ	d1dedee deeeee	j
f  eee	j
f f fddZ	d1dedeeee f dee deeeee	j
f  eee	j
f f fd d!Zed"d#de	jdfd$ed%ed&eeef d'ed(ed)ed*edefd+d,Ze	-d2d$ed&eeef defd.d/Z dS )3	Inferencea  Inference

    Parameters
    ----------
    model : Model
        Model. Will be automatically set to eval() mode and moved to `device` when provided.
    window : {"sliding", "whole"}, optional
        Use a "sliding" window and aggregate the corresponding outputs (default)
        or just one (potentially long) window covering the "whole" file or chunk.
    duration : float, optional
        Chunk duration, in seconds. Defaults to duration used for training the model.
        Has no effect when `window` is "whole".
    step : float, optional
        Step between consecutive chunks, in seconds. Defaults to warm-up duration when
        greater than 0s, otherwise 10% of duration. Has no effect when `window` is "whole".
    pre_aggregation_hook : callable, optional
        When a callable is provided, it is applied to the model output, just before aggregation.
        Takes a (num_chunks, num_frames, dimension) numpy array as input and returns a modified
        (num_chunks, num_frames, other_dimension) numpy array passed to overlap-add aggregation.
    skip_aggregation : bool, optional
        Do not aggregate outputs when using "sliding" window. Defaults to False.
    skip_conversion: bool, optional
        In case a task has been trained with `powerset` mode, output is automatically
        converted to `multi-label`, unless `skip_conversion` is set to True.
    batch_size : int, optional
        Batch size. Larger values (should) make inference faster. Defaults to 32.
    device : torch.device, optional
        Device used for inference. Defaults to `model.device`.
        In case `device` and `model.device` are different, model is sent to device.
    slidingNF    modelwindowdurationsteppre_aggregation_hookskip_aggregationskip_conversiondevice
batch_sizec
                 C   s  || _ |d u r| j j}|| _| j   | j | j | j j}
|dvr&td|dkr8tdd |
D r8td || _	t
t|
j}|pE|}||krXtd|dd	|dd
 || _|| _t }|
D ]}|jrt|sttt|j|j}nt }||| j qct|
tr|d | _nt|| _|| _|| _t
t|
j| _|p| jd dkrd| j n| jd }|| jkrtd|dd| jdd|| _|	| _d S )N)r   wholez&`window` must be "sliding" or "whole".r&   c                 s   s    | ]	}|j tjkV  qd S N)
resolutionr   FRAME).0sr   r   r   	<genexpr>l   s    
z%Inference.__init__.<locals>.<genexpr>zUsing "whole" `window` inference with a frame-based model might lead to bad results and huge memory consumption: it is recommended to set `window` to "sliding".zModel was trained with gzs chunks, and you requested z>s chunks for inference: this might lead to suboptimal results.r           皙?z*Step between consecutive chunks is set to zs, while chunks are only z^s long, leading to gaps between consecutive chunks. Either decrease step or increase duration.) r   r$   evaltospecifications
ValueErroranywarningswarnr   nextiterr   r#   listpowersetr   lenclassespowerset_max_classesnnIdentityappend
isinstancer   
conversion
ModuleListr"   r!   warm_upr    r%   )selfr   r   r   r    r!   r"   r#   r$   r%   r2   training_durationrB   r+   cr   r   r   __init__N   sb   



 


zInference.__init__returnc                 C   sD   t |tjstdt|j d| j| | j| || _| S )zSend internal model to `device`z5`device` must be an instance of `torch.device`, got ``)	rA   torchr$   	TypeErrortyper   r   r1   rB   )rE   r$   r   r   r   r1      s   zInference.tochunksc                 C   s   t  0 z| || j}W n ty, } zt|r&td| jdd|d}~ww W d   n1 s7w   Y  dt j	dt
jfdd}t| jj||| jS )	ah  Forward pass

        Takes care of sending chunks to right device and outputs back to CPU

        Parameters
        ----------
        chunks : (batch_size, num_channels, num_samples) torch.Tensor
            Batch of audio chunks.

        Returns
        -------
        outputs : (tuple of) (batch_size, ...) np.ndarray
            Model output.
        zbatch_size (z dzP) is probably too large. Try with a smaller value until memory error disappears.NoutputrB   c                 [   s   ||    S r'   )cpunumpy)rO   rB   kwargsr   r   r   	__convert   s   z"Inference.infer.<locals>.__convert)rK   inference_moder   r1   r$   RuntimeErrorr	   MemoryErrorr%   Tensorr>   Moduler   r2   rB   )rE   rN   outputs	exception_Inference__convertr   r   r   infer   s"   
zInference.inferwaveformsample_ratehookc              	      s"  j jj}tj }|j\}	ddtt dt	ffdd}t
j j|j j}|krBt|d||d}	|	j\}
}}nd}
|k pO| | dk  ro|dd|
| df }|j\}}|| }t|d|f}d	d
 }t
j j|}|dur|d|
  d ddd}td|
jD ](}|	||j  }|}t
j j|||}|dur||j |
  d q r܈|d }t
j j|||}|dur||
  |
  d dttj dtjfdd}t
j j||}	ddtjdt	dtt dtf fdd}t
j j|||S )a  Slide model on a waveform

        Parameters
        ----------
        waveform: (num_channels, num_samples) torch.Tensor
            Waveform.
        sample_rate : int
            Sample rate.
        hook: Optional[Callable]
            When a callable is provided, it is called everytime a batch is
            processed with two keyword arguments:
            - `completed`: the number of chunks that have been processed so far
            - `total`: the total number of chunks

        Returns
        -------
        output : (tuple of) SlidingWindowFeature
            Model output. Shape is (num_chunks, dimension) for chunk-level tasks,
            and (num_frames, dimension) for frame-level tasks.
        Nr2   rI   c                    s"   |j tjkrtd j jdS | S )Nr.   startr   r    )r(   r   CHUNKr   r   r    )receptive_fieldr2   )rE   r   r   __frames   s   z!Inference.slide.<locals>.__frames   z*channel chunk frame -> chunk channel framer   c                  [   s   t  S r'   )r9   )rR   r   r   r   __empty_list  s   z%Inference.slide.<locals>.__empty_list)	completedtotalc                 [   s   |  | d S r'   )r@   )rO   batch_outputrR   r   r   r   __append_batch"  s   
z'Inference.slide.<locals>.__append_batchrO   c                 [   s
   t | S r'   )npvstack)rO   rR   r   r   r   __vstackA  s   
z!Inference.slide.<locals>.__vstackrY   framesc                    s   j s|jtjks|jrjd u rtdjjd}t	| |S jd ur)| } j
t	| tdjjd|jddd} rM|jtd dd|_|S )Nr.   r`   T)rD   hammingmissingloose)mode)r"   r(   r   rb   permutation_invariantr!   r   r   r    r   	aggregaterD   cropr   data)rY   rn   r2   
aggregatedhas_last_chunknum_samplesr^   rE   r   r   __aggregateH  s6   	




z$Inference.slide.<locals>.__aggregater'   )rI   N)r   audioget_num_samplesr   roundr    shaper   r   r   r   r2   rc   r   unfoldFpadrk   aranger%   r\   r   ndarrayr   )rE   r]   r^   r_   window_size	step_size__Inference__framesrn   rN   
num_chunks
last_chunklast_window_sizelast_pad_Inference__empty_listrY   _Inference__append_batchrG   batchbatch_outputslast_outputs_Inference__vstack_Inference__aggregater   rx   r   slide   s   




+zInference.slidefilec                 C   sh   t | j | j|\}}| jdkr| j|||dS | |d }dtjdtjfdd}t	| jj
||S )a  Run inference on a whole file

        Parameters
        ----------
        file : AudioFile
            Audio file.
        hook : callable, optional
            When a callable is provided, it is called everytime a batch is processed
            with two keyword arguments:
            - `completed`: the number of chunks that have been processed so far
            - `total`: the total number of chunks

        Returns
        -------
        output : (tuple of) SlidingWindowFeature or np.ndarray
            Model output, as `SlidingWindowFeature` if `window` is set to "sliding"
            and `np.ndarray` if is set to "whole".

        r   r_   NrY   rI   c                 [      | d S Nr   r   rY   rR   r   r   r   __first_sample     z*Inference.__call__.<locals>.__first_sample)r   r$   r   r|   r   r   r\   rk   r   r   r2   )rE   r   r_   r]   r^   rY   _Inference__first_sampler   r   r   __call__w  s   


zInference.__call__chunkc                    s
  t j jdkrLt ts'tdd  D }tdd  D }t||d jj	 \}}j
|||d}dtdtf fd	d
}	tjj|	|S t tr\jj	 \}}ntjfdd D dd}|d }dtjdtjfdd}
tjj|
|S )a  Run inference on a chunk or a list of chunks

        Parameters
        ----------
        file : AudioFile
            Audio file.
        chunk : Segment or list of Segment
            Apply model on this chunk. When a list of chunks is provided and
            window is set to "sliding", this is equivalent to calling crop on
            the smallest chunk that contains all chunks. In case window is set
            to "whole", this is equivalent to concatenating each chunk into one
            (artifical) chunk before processing it.
        hook : callable, optional
            When a callable is provided, it is called everytime a batch is processed
            with two keyword arguments:
            - `completed`: the number of chunks that have been processed so far
            - `total`: the total number of chunks

        Returns
        -------
        output : (tuple of) SlidingWindowFeature or np.ndarray
            Model output, as `SlidingWindowFeature` if `window` is set to "sliding"
            and `np.ndarray` if is set to "whole".

        Notes
        -----
        If model needs to be warmed up, remember to extend the requested chunk with the
        corresponding amount of time so that it is actually warmed up when processing the
        chunk of interest:
        >>> chunk_of_interest = Segment(10, 15)
        >>> extended_chunk = Segment(10 - warm_up, 15 + warm_up)
        >>> inference.crop(file, extended_chunk).crop(chunk_of_interest, returns_data=False)
        r   c                 s       | ]}|j V  qd S r'   )ra   r*   rG   r   r   r   r,         z!Inference.crop.<locals>.<genexpr>c                 s   r   r'   )endr   r   r   r   r,     r   )ra   r   r   rO   rI   c                    s&   | j }t j|j|jd}t| j|S )Nr`   )sliding_windowr   ra   r   r    r   rv   )rO   rR   rn   shifted_frames)r   r   r   __shift  s
   zInference.crop.<locals>.__shiftc                    s    g | ]}j j |d  qS )r   )r   r|   ru   r   )r   rE   r   r   
<listcomp>  s     z"Inference.crop.<locals>.<listcomp>re   )dimNrY   c                 [   r   r   r   r   r   r   r   r     r   z&Inference.crop.<locals>.__first_sample)r   r$   r   rA   r   minmaxr   r|   ru   r   r   r   r2   rK   catr\   rk   r   )rE   r   r   r_   ra   r   r]   r^   rY   _Inference__shiftr   r   )r   r   rE   r   ru     s*   
+



zInference.crop)r.   r.   g-q=scoresrn   rD   epsilonro   rp   skip_averagec                 C   s  | j j\}}}	| j}
t|
j|j|jd}|rt|	ddnt
|df}t
|df}t|d | jj | }||d|< t|d | jj | }|||| d< || jj| jj |d | jj  d|j  d }tj||	ftjd}tj||	ftjd}tj||	ftjd}| D ]R\}}dt| }tj|dd	d
 ||jd|j  }||||   || | | 7  < ||||   || | 7  < t||||  ||||| < q|r|}n|t|| }|||d	k< t||S )a  Aggregation

        Parameters
        ----------
        scores : SlidingWindowFeature
            Raw (unaggregated) scores. Shape is (num_chunks, num_frames_per_chunk, num_classes).
        frames : SlidingWindow
            Frames resolution.
        warm_up : (float, float) tuple, optional
            Left/right warm up duration (in seconds).
        missing : float, optional
            Value used to replace missing (ie all NaNs) values.
        skip_average : bool, optional
            Skip final averaging step.

        Returns
        -------
        aggregated_scores : SlidingWindowFeature
            Aggregated scores. Shape is (num_frames, num_classes)
        r`   re   r   Ng      ?)dtypeFr.   )copynan)rv   r   r   r   ra   r   r    rk   ro   reshapeonesr~   closest_framezerosfloat32isnan
nan_to_nummaximumr   )r   rn   rD   r   ro   rp   r   r   num_frames_per_chunknum_classesrN   hamming_windowwarm_up_windowwarm_up_leftwarm_up_right
num_framesaggregated_outputoverlapping_chunk_countaggregated_maskr   scoremaskstart_frameaverager   r   r   rt     s~   	
	





zInference.aggregater/   r/   c           
      C   s   | j jdks
J d| j j\}}}| j}t||d  }t||d  }t||j |j }|| | |k rItdt	|d dd|jdd	 | j d
d
||| f }t
|j|d |j  |jd|d  |d  |j d}	t||	S )a  Trim left and right warm-up regions

        Parameters
        ----------
        scores : SlidingWindowFeature
            (num_chunks, num_frames, num_classes)-shaped scores.
        warm_up : (float, float) tuple
            Left/right warm up ratio of chunk duration.
            Defaults to (0.1, 0.1), i.e. 10% on both sides.

        Returns
        -------
        trimmed : SlidingWindowFeature
            (num_chunks, trimmed_num_frames, num_speakers)-shaped scores
           zLInference.trim expects (num_chunks, num_frames, num_classes)-shaped `scores`r   re   zTotal `warm_up` is so large (d   r-   zL% of each chunk) that resulting trimmed scores does not cover a whole step (zs)N)ra   r    r   )rv   ndimr   r   r~   r    r   r5   r6   sumr   ra   r   )
r   rD   r   r   rN   num_frames_leftnum_frames_rightnum_frames_stepnew_data
new_chunksr   r   r   trimn  s*   
zInference.trim)r   NNNFFNr   r'   )r   )!r   r   r   __doc__r   r   r   floatr   rk   r   boolrK   r$   intrH   r1   rW   r   r   r\   r   r   r
   r   r   r   ru   staticmethodr   r   rt   r   r   r   r   r   r   .   s    "	

[$#
  
.
Q
{
r   )'r5   typingr   r   r   r   r   r   rQ   rk   rK   torch.nnr>   torch.nn.functional
functionalr   einopsr   "lightning.pytorch.utilities.memoryr	   pyannote.audio.core.ior
   pyannote.audio.core.modelr   r   pyannote.audio.core.taskr   pyannote.audio.utils.multi_taskr   pyannote.audio.utils.powersetr   $pyannote.audio.utils.reproducibilityr   pyannote.corer   r   r   r   r   r   r   r   r   <module>   s"    