o
    s·¯iø9  ã                   @   sÈ   d dl Z d dlmZ d dlm  mZ d dlmZmZmZm	Z	 ddl
mZ G dd„ deƒZG dd„ deƒZG d	d
„ d
ejƒZG dd„ dejƒZG dd„ dejƒZG dd„ dejƒZG dd„ dejƒZdS )é    N)ÚLSTMÚLinearÚBatchNorm1dÚ	Parameteré   )Ú	BaseModelc                   @   s   e Zd Zdd„ ZdS )ÚXUMXc                 O   s   t dƒ‚)NzXXUMX is broken in torch 2.0, use torch<2.0 with asteroid<0.7 to use it until it's fixed.)ÚRuntimeError)ÚselfÚargsÚkwargs© r   úI/home/ubuntu/.local/lib/python3.10/site-packages/asteroid/models/x_umx.pyÚ__init__
   s   ÿzXUMX.__init__N)Ú__name__Ú
__module__Ú__qualname__r   r   r   r   r   r   	   s    r   c                       s\   e Zd ZdZ													
	d‡ fdd„	Zdd„ Zdd„ Zdd„ Zdd„ Z‡  Z	S )Ú
BrokenXUMXaÐ  CrossNet-Open-Unmix (X-UMX) for Music Source Separation introduced in [1].
        There are two notable contributions with no effect on inference:
            a) Multi Domain Losses
                - Considering not only spectrograms but also time signals
            b) Combination Scheme
                - Considering possible combinations of output instruments
        When starting to train X-UMX, you can optionally use the above by setting
        ``loss_use_multidomain'' and ``loss_combine_sources'' which are both set in conf.yml.

    Args:
        sources (list): The list of instruments, e.g., ["bass", "drums", "vocals"],
            defined in conf.yml.
        window_length (int): The length in samples of window function to use in STFT.
        in_chan (int): Number of input channels, should be equal to
            STFT size and STFT window length in samples.
        n_hop (int): STFT hop length in samples.
        hidden_size (int): Hidden size parameter of LSTM layers.
        nb_channels (int): set number of channels for model (1 for mono
            (spectral downmix is applied,) 2 for stereo).
        sample_rate (int): sampling rate of input wavs
        nb_layers (int): Number of (B)LSTM layers in network.
        input_mean (torch.tensor): Mean for each frequency bin calculated
            in advance to normalize the mixture magnitude spectrogram.
        input_scale (torch.tensor): Standard deviation for each frequency bin
            calculated in advance to normalize the mixture magnitude spectrogram.
        max_bin (int): Maximum frequency bin index of the mixture that X-UMX
            should consider. Set to None to use all frequency bins.
        bidirectional (bool): whether we use LSTM or BLSTM.
        spec_power (int): Exponent for spectrogram calculation.
        return_time_signals (bool): Set to true if you are using a time-domain
            loss., i.e., applies ISTFT. If you select ``MDL=True'' via
            conf.yml, this is set as True.

    References
        [1] "All for One and One for All: Improving Music Separation by Bridging
        Networks", Ryosuke Sawata, Stefan Uhlich, Shusuke Takahashi and Yuki Mitsufuji.
        https://arxiv.org/abs/2010.04228 (and ICASSP 2021)
    é   é   é   é   éD¬  é   NTr   Fc              	      s  t ƒ  |¡ || _|| _|| _|| _|| _|| _|| _|| _	|d d | _
|r+|| _n| j
| _|| _|| _|	d urGt |	d | j…  ¡ ¡ }	nt | j¡}	|
d ur`t d|
d | j…  ¡ ¡ }
nt | j¡}
t|||dd}t||dkd}t ||¡| _|rƒ|d n|}i }i }i }i }|D ]V}t| j||d||< t||||dd	d
||< t| j
||d||< t|	 ¡ ƒ|d |¡< t|
 ¡ ƒ|d |¡< tt | j
¡ ¡ ƒ|d |¡< tt | j
¡ ¡ ƒ|d |¡< qt |¡| _t |¡| _ t |¡| _!t "|¡| _#t$|j%||dd| _&d S )Nr   r   g      ð?T)Úwindow_lengthÚn_fftÚn_hopÚcenter)Ú
spec_powerÚmono)Únb_binsÚhidden_sizeÚnb_channelsFgš™™™™™Ù?)Ú
input_sizer!   Ú
num_layersÚbidirectionalÚbatch_firstÚdropout)Únb_output_binsr!   r"   úinput_mean_{}úinput_scale_{}úoutput_mean_{}úoutput_scale_{})Úwindowr   Ú
hop_lengthr   )'Úsuperr   r   Úin_chanr   ÚsourcesÚ_return_time_signalsr"   Ú	nb_layersr%   r(   Úmax_binr!   r   ÚtorchÚ
from_numpyÚfloatÚzerosÚonesÚ_STFTÚ_SpectrogramÚnnÚ
SequentialÚencoderÚ_InstrumentBackboneEncr   Ú_InstrumentBackboneDecr   ÚcloneÚformatÚ
ModuleDictÚ	layer_encÚ
layer_lstmÚ	layer_decÚParameterDictÚ
mean_scaleÚ_ISTFTr-   Údecoder)r
   r1   r   r0   r   r!   r"   Úsample_rater3   Ú
input_meanÚinput_scaler4   r%   r   Úreturn_time_signalsÚstftÚspecÚlstm_hidden_sizeÚsrc_encÚsrc_lstmÚsrc_decrH   Úsrc©Ú	__class__r   r   r   8   sz   
ý
ú

ýÿÿzBrokenXUMX.__init__c                 C   s`   |   |¡\}}|  | ¡ ¡}|  ||¡}| jr*| ddddd¡}|  ||¡}||fS d}||fS )aÇ  Model forward

        Args:
            wav (torch.Tensor): waveform tensor. 1D, 2D or 3D tensor, time last.

        Returns:
            masked_mixture (torch.Tensor): estimated spectrograms masked by
                X-UMX's output of shape $(sources, frames, batch_size, channels, bins)$
            time_signals (torch.Tensor): estimated time signals of shape $(sources, batch_size, channels, time_length)$ if `return_time_signals` is `True`
        r   r   r   é   r   N)r>   Úforward_maskerrA   Úapply_masksr2   ÚpermuterJ   )r
   ÚwavÚmixtureÚangÚ	est_masksÚmasked_mixturerP   Útime_signalsr   r   r   Úforward™   s   þzBrokenXUMX.forwardc                 C   st  |j j}|dd | j…f }|g}tdt| jƒƒD ]	}| | ¡ ¡ qt| jƒD ],\}}||  | j	d 
|¡ 7  < ||  | j	d 
|¡ 9  < | j| || |ƒ||< q't|ƒt| jƒ }d}t| jƒD ]\}}| j| |ƒ}	|t || |	d gd¡7 }qd|t| jƒ }g }
| jD ]&}| j| ||ƒ}|| j	d 
|¡ 9 }|| j	d	 
|¡ 7 }|
 t |¡¡ qŠtj|
dd
}|S )N.r   r)   r*   g        r   éÿÿÿÿr,   r+   ©Údim)ÚdataÚshaper4   ÚrangeÚlenr1   ÚappendrA   Ú	enumeraterH   rB   rD   ÚsumrE   r5   ÚcatrF   ÚFÚreluÚstack)r
   Ú
input_specÚshapesÚxÚinputsÚirU   Úcross_1Úcross_2Útmp_lstm_outÚ	mask_listÚx_tmpr_   r   r   r   rY   ¶   s.   
zBrokenXUMX.forward_maskerc                    s(   t  ‡ ‡fdd„tt| jƒƒD ƒ¡}|S )Nc                    s   g | ]}ˆˆ |  ‘qS r   r   )Ú.0ru   ©r_   r]   r   r   Ú
<listcomp>Ý   s    z*BrokenXUMX.apply_masks.<locals>.<listcomp>)r5   rp   rh   ri   r1   )r
   r]   r_   Úmasked_tf_repr   r|   r   rZ   Ü   s   $zBrokenXUMX.apply_masksc                 C   sN   | j | j| j| jdœ}| j| j| jdd| j| j| j	| j
ddœ
}i |¥|¥}|S )z-Arguments needed to re-instantiate the model.)r   r0   r   rK   NF)
r1   r!   r"   rL   rM   r4   r3   r%   r   rN   )r   r0   r   rK   r1   r!   r"   r4   r3   r%   r   )r
   Ú	fb_configÚ
net_configÚ
model_argsr   r   r   Úget_model_argsà   s,   üöÿþzBrokenXUMX.get_model_args)r   r   r   r   r   r   r   NNNTr   F)
r   r   r   Ú__doc__r   rb   rY   rZ   r‚   Ú__classcell__r   r   rV   r   r      s(    *ña&r   c                       ó.   e Zd ZdZ		d‡ fdd„	Zdd„ Z‡  ZS )	r?   a—  Encoder structure that maps the mixture magnitude spectrogram to
    smaller-sized features which are the input for the LSTM layers.

    Args:
        nb_bins (int): Number of frequency bins of the mixture.
        hidden_size (int): Hidden size parameter of LSTM layers.
        nb_channels (int): set number of channels for model
            (1 for mono (spectral downmix is applied,) 2 for stereo).
    r   r   c                    s<   t ƒ  ¡  || _|| _t t| j| |ddt|ƒ¡| _d S )NF)Úbias)	r/   r   r4   r!   r<   r=   r   r   Úenc)r
   r    r!   r"   rV   r   r   r   	  s   

þz_InstrumentBackboneEnc.__init__c                 C   sB   |\}}}}|   | d|| j ¡¡}| ||| j¡}t |¡}|S ©Nrc   )r‡   Úreshaper4   r!   r5   Útanh©r
   rs   rr   Ú	nb_framesÚ
nb_samplesr"   Ú_r   r   r   rb     s
   
z_InstrumentBackboneEnc.forward©r   r   ©r   r   r   rƒ   r   rb   r„   r   r   rV   r   r?   þ   s    ür?   c                       r…   )	r@   a‹  Decoder structure that maps output of LSTM layers to
    magnitude estimate of an instrument.

    Args:
        nb_output_bins (int): Number of frequency bins of the instrument estimate.
        hidden_size (int): Hidden size parameter of LSTM layers.
        nb_channels (int): Number of output bins depending on STFT size.
            It is generally calculated ``(STFT size) // 2 + 1''.
    r   r   c              
      sX   t ƒ  ¡  || _t t|d |ddt|ƒt ¡ t|| j| ddt| j| ƒ¡| _d S )Nr   F)Úin_featuresÚout_featuresr†   )	r/   r   r(   r<   r=   r   r   ÚReLUÚdec)r
   r(   r!   r"   rV   r   r   r   -  s   
ÿ
ùz_InstrumentBackboneDec.__init__c                 C   s:   |\}}}}|   | d|jd ¡¡}| |||| j¡}|S rˆ   )r”   r‰   rg   r(   r‹   r   r   r   rb   ?  s   z_InstrumentBackboneDec.forwardr   r   r   r   rV   r   r@   "  s    ür@   c                       ó&   e Zd Zd‡ fdd„	Zdd„ Z‡  ZS )	r:   r   r   Tc                    s8   t t| ƒ ¡  tt |¡dd| _|| _|| _|| _	d S )NF)Úrequires_grad)
r/   r:   r   r   r5   Úhann_windowr-   r   r   r   )r
   r   r   r   r   rV   r   r   r   G  s
   
z_STFT.__init__c                 C   sf   |  ¡ \}}}| || d¡}tj|| j| j| j| jddddd	}| ¡  	||| jd d dd¡}|S )z€
        Input: (nb_samples, nb_channels, nb_timesteps)
        Output:(nb_samples, nb_channels, nb_bins, nb_frames, 2)
        rc   FTÚreflect)r   r.   r-   r   Ú
normalizedÚonesidedÚpad_modeÚreturn_complexr   r   )
Úsizer‰   r5   rO   r   r   r-   r   Ú
contiguousÚview)r
   rs   r   r"   Únb_timestepsÚstft_fr   r   r   rb   N  s   ÷ z_STFT.forward©r   r   T©r   r   r   r   rb   r„   r   r   rV   r   r:   F  ó    r:   c                       s&   e Zd Zd‡ fdd„	Zdd„ Z‡  ZS )r;   r   Tc                    s   t t| ƒ ¡  || _|| _d S ©N)r/   r;   r   r   r   )r
   r   r   rV   r   r   r   l  s   
z_Spectrogram.__init__c                 C   sŠ   |  ¡  ¡ }t |tdf |tdf ¡}| dd¡}| d¡ d¡ | jd ¡}| j	r;tj
|ddd}tj
|ddd}| dddd¡|gS )	zÞ
        Input: complex STFT
            (nb_samples, nb_channels, nb_bins, nb_frames, 2)
        Output: Power/Mag Spectrogram and the corresponding phase
            (nb_frames, nb_samples, nb_channels, nb_bins)
        r   r   r   r   rc   g       @T)Úkeepdim)ÚdetachrA   r5   Úatan2ÚEllipsisÚ	transposeÚpowrl   r   r   Úmeanr[   )r
   r¡   Úphaser   r   r   rb   q  s   z_Spectrogram.forward)r   Tr£   r   r   rV   r   r;   k  s    r;   c                       r•   )	rI   r   r   Tc                    s*   t t| ƒ ¡  || _|| _|| _|| _d S r¥   )r/   rI   r   r-   r   r.   r   )r
   r-   r   r.   r   rV   r   r   r   ‹  s
   
z_ISTFT.__init__c                 C   sŒ   |j \}}}}}|t |¡ }|t |¡ }	tj||	gdd}
|
 || | ||d¡}
tj|
| j| j| j	| j
d}| ||||j d ¡}|S )Nrc   rd   r   )r   r.   r-   r   )rg   r5   ÚcosÚsinrp   rŸ   Úistftr   r.   r-   r   )r
   rP   r^   r1   ÚbsizeÚchannelsÚfbinsÚframesÚx_rÚx_irs   r\   r   r   r   rb   ’  s   ÿz_ISTFT.forwardr¢   r£   r   r   rV   r   rI   Š  r¤   rI   )r5   Útorch.nnr<   Útorch.nn.functionalÚ
functionalrn   r   r   r   r   Úbase_modelsr   r   r   ÚModuler?   r@   r:   r;   rI   r   r   r   r   Ú<module>   s     o$$%