o
    ui9#                     @   sD   d dl Z d dlZd dlZd dlmZ G dd dZG dd dZdS )    N)tqdmc                       s4   e Zd ZdZd
 fdd	Zdd Zddd	Z  ZS )
ConvTDFNetzG
    ConvTDFNet - Convolutional Temporal Frequency Domain Network.
       c                    s   t t|   d| _|| _d| | _|| _|| _| jd d | _|| jd  | _	t
j| jdd| _|| _|dkr=| jd n| j}t
d|| j| j | jg| _|d | _dS )a  
        Initialize ConvTDFNet.

        Args:
            target_name (str): The target name for separation.
            L (int): Number of layers.
            dim_f (int): Dimension in the frequency domain.
            dim_t (int): Dimension in the time domain (log2).
            n_fft (int): FFT size.
            hop (int, optional): Hop size. Defaults to 1024.

        Returns:
            None
                 T)window_lengthperiodic*N)superr   __init__dim_cdim_fdim_tn_ffthopn_bins
chunk_sizetorchhann_windowwindowtarget_namezerosfreq_padn)selfr   Lr   r   r   r   out_c	__class__ ?/home/ubuntu/sommelier/podcast-pipeline/models/separate_fast.pyr      s   
zConvTDFNet.__init__c                 C   s   | d| jg}tj|| j| j| jddd}t|}|g d}| ddd| j	| j
g d| j| j	| j
g}|ddddd| jf S )z
        Perform Short-Time Fourier Transform (STFT).

        Args:
            x (torch.Tensor): Input waveform.

        Returns:
            torch.Tensor: STFT of the input waveform.
        T)r   
hop_lengthr   centerreturn_complex)r      r   r   r   N)reshaper   r   stftr   r   r   view_as_realpermuter   r   r   r   )r   xr    r    r!   r(   1   s   

zConvTDFNet.stftNc                 C   s   |du r| j |jd dddgn|}t||gd}| jdkr"dnd}|d|d| j| jgdd| j| jg}|	g d	}|
 }t|}tj|| j| j| jd
d}|d|| jgS )a  
        Perform Inverse Short-Time Fourier Transform (ISTFT).

        Args:
            x (torch.Tensor): Input STFT.
            freq_pad (torch.Tensor, optional): Frequency padding. Defaults to None.

        Returns:
            torch.Tensor: Inverse STFT of the input.
        Nr   r   r
      r   r"   )r   r   r&   r   T)r   r#   r   r$   )r   repeatshaper   catr   r'   r   r   r*   
contiguousview_as_complexistftr   r   r   r   )r   r+   r   cr    r    r!   r3   K   s    
zConvTDFNet.istft)r   )N)__name__
__module____qualname____doc__r   r(   r3   __classcell__r    r    r   r!   r      s
    r   c                   @   s0   e Zd ZdZdd Zdd Zdd Zdd	 Zd
S )	PredictorzR
    Predictor class for source separation using ConvTDFNet and ONNX Runtime.
    c                 C   sr   || _ tdd|d |d |d d| _|dkr$tj|d d	gd
| _dS |dkr5tj|d dgd
| _dS td)a,  
        Initialize the Predictor.

        Args:
            args (dict): Configuration arguments.
            device (str): Device to run the model ('cuda' or 'cpu').

        Returns:
            None

        Raises:
            ValueError: If the provided device is not 'cuda' or 'cpu'.
        vocals   r   r   r   )r   r   r   r   r   cuda
model_pathCUDAExecutionProvider)	providerscpuCPUExecutionProviderz%Device must be either 'cuda' or 'cpu'N)argsr   model_ortInferenceSessionmodel
ValueError)r   rC   devicer    r    r!   r   n   s"   

zPredictor.__init__c                 C   s   |j d }| jd }| jd d }|dksJ d||kr|}i }| jd dks,||k r.|}d}td||D ]/}|d7 }|dkrBdn|}t|| | |}	|| }
|dd|
|	f  ||< |	|kre nq6| j||d	}|S )
z
        Separate the sources from the input mix.

        Args:
            mix (np.ndarray): Input mixture signal.

        Returns:
            np.ndarray: Separated sources.

        Raises:
            AssertionError: If margin is zero.
        r"   marginchunksD  r   zMargin cannot be zero!r   N)margin_size)r/   rC   rangemincopy
demix_base)r   mixsamplesrJ   r   segmented_mixcounterskips_marginendstartsourcesr    r    r!   demix   s*   

zPredictor.demixc                 C   s  g }t t|d}|d |D ],}|| }g }|jd }| j}	|	jd }
|	jd|
  }|||  }tt	d|
f|t	d|ft	d|
ffd}g }d}||| k rut
|dd|||	j f }|| ||7 }||| k sVtjt
|tjd}t  | j}|	|}| jd r|dd	|   id  d
 |dd	|  id d
  }|	t|}n|	t|dd	|  id }|dddd|
|
 f dddd ddd| f }|dkrdn|}|t| ddd d krdn| }|dkrd}||dd||f  |d W d   n	1 s3w   Y  || qtj|dd}|  |S )z
        Base function for source separation.

        Args:
            mixes (dict): Dictionary of segmented mixtures.
            margin_size (int): Size of the margin.

        Returns:
            np.ndarray: Separated sources.
        )totalzSource separationr   r   r   N)dtypedenoiseinputg      ?r"   )axis)r   lenset_descriptionr/   rD   r   r   npconcatenater   arrayappendr   tensorfloat32no_gradrG   r(   rC   runrA   numpyr3   	transposer'   listkeysupdateclose)r   mixesrM   chunked_sourcesprogress_barrR   cmixrZ   n_samplerG   trimgen_sizepadmix_p	mix_wavesiwaves_ortspek	spec_pred	tar_waves
tar_signalrY   rX   _sourcesr    r    r!   rQ      sf   



* 



" *
zPredictor.demix_basec              	   C   s   |j dkrt||g}|jd | jd d  }|jd | jd d  dkrAt|dd| jd d |jd | jd d   ff}|j}| |j}|d j}|dkri|| d| jd d |  ddf |fS || |fS )z
        Predict the separated sources from the input mix.

        Args:
            mix (np.ndarray): Input mixture signal.

        Returns:
            tuple: Tuple containing the mixture minus the separated sources and the separated sources.
        r   rK   rL   r   )r   r   N)ndimrc   asfortranarrayr/   rC   rx   Tr[   )r   rR   tailrZ   optr    r    r!   predict   s(   


,zPredictor.predictN)r5   r6   r7   r8   r   r[   rQ   r   r    r    r    r!   r:   i   s    "(Gr:   )r   rk   rc   onnxruntimerE   r   r   r:   r    r    r    r!   <module>   s   \