o
    9wirB                     @   s   d dl Z d dlZd dlZd dlZd dlmZ d dlmZ ddl	m
Z
 ddlmZmZ ddlmZ G dd	 d	ejZd
d Zdd ZG dd dejZG dd dejZG dd dejZdS )    N)nn)
functional   )capture_init)center_trimunfold)
LayerScalec                       s*   e Zd ZdZd	 fdd	Zdd Z  ZS )
BLSTMz
    BiLSTM with same hidden units as input dim.
    If `max_steps` is not None, input will be splitting in overlapping
    chunks and the LSTM applied separately on each chunk.
    r   NFc                    sX   t    |d u s|d dksJ || _tjd|||d| _td| || _|| _d S )N   r   T)bidirectional
num_layershidden_size
input_size   )	super__init__	max_stepsr   LSTMlstmLinearlinearskip)selfdimlayersr   r   	__class__ J/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/demucs/demucs.pyr      s   

zBLSTM.__init__c              	   C   s  |j \}}}|}d}| jd ur5|| jkr5| j}|d }t|||}	|	j d }
d}|	ddddd||}|ddd}| |d }| |}|ddd}|rg }||d||}	|d }t|
D ]C}|dkr|||	d d |d d d | f  qc||
d kr||	d d |d d |d f  qc||	d d |d d || f  qct	
|d}|dd |f }|}| jr|| }|S )	NFr   Tr   r      .)shaper   r   permutereshaper   r   rangeappendtorchcatr   )r   xBCTyframedwidthstrideframesnframesoutlimitkr   r   r   forward"   s<   

&$&zBLSTM.forward)r   NF)__name__
__module____qualname____doc__r   r5   __classcell__r   r   r   r   r	      s    r	   c                 C   sL   | j   }|| d }| j  j|  _| jdur$| j j|  _dS dS )zTRescale initial weight scale. It is unclear why it helps but it certainly does.
          ?N)weightstddetachdatabias)conv	referencer=   scaler   r   r   rescale_convF   s   
rD   c                 C   s6   |   D ]}t|tjtjtjtjfrt|| qd S N)modules
isinstancer   Conv1dConvTranspose1dConv2dConvTranspose2drD   )modulerB   subr   r   r   rescale_moduleP   s
   
rN   c                	       sB   e Zd ZdZ			dded	ed
edef fddZdd Z  ZS )DConva  
    New residual branches in each encoder layer.
    This alternates dilated convolutions, potentially with LSTMs and attention.
    Also before entering each residual branch, dimension is projected on a smaller subspace,
    e.g. of dim `channels // compress`.
    r
   r   -C6?TFr   channelscompressdepthinitc              
      s<  t    |d dksJ || _|| _t|| _|dk}dd }|r&dd }t|| }|
r2tj}ntj	}t
g | _t| jD ][}|rHd| nd}||d  }tj|||||d||| t|d| d|d| tdt||g}|r|dt|||d	 |	r|dt|dd
dd tj| }| j| q@dS )a  
        Args:
            channels: input/output channels for residual branch.
            compress: amount of channel compression inside the branch.
            depth: number of layers in the residual branch. Each layer has its own
                projection, and potentially LSTM and attention.
            init: initial scale for LayerNorm.
            norm: use GroupNorm.
            attn: use LocalAttention.
            heads: number of heads for the LocalAttention.
            ndecay: number of decay controls in the LocalAttention.
            lstm: use LSTM.
            gelu: Use GELU activation.
            kernel: kernel size for the (dilated) convolutions.
            dilate: if true, use dilation, increasing with the depth.
        r   r   r   c                 S      t  S rE   r   Identitydr   r   r   <lambda>y       z DConv.__init__.<locals>.<lambda>c                 S   s   t d| S )Nr   r   	GroupNormrX   r   r   r   rZ   {       )dilationpaddingr   )headsndecay   T)r   r   r   N)r   r   rQ   rR   absrS   intr   GELUReLU
ModuleListr   r$   rH   GLUr   insert
LocalStater	   
Sequentialr%   )r   rQ   rR   rS   rT   normattnra   rb   r   gelukerneldilatenorm_fnhiddenactrY   r_   r`   modslayerr   r   r   r   ]   s<   



zDConv.__init__c                 C   s   | j D ]}||| }q|S rE   )r   )r   r(   rv   r   r   r   r5      s   
zDConv.forward)r
   r   rP   TFr
   r
   FTr   T)	r6   r7   r8   r9   re   floatr   r5   r:   r   r   r   r   rO   V   s     :rO   c                	       s<   e Zd ZdZddedededef fdd	Zd
d Z  ZS )rk   a  Local state allows to have attention based only on data (no positional embedding),
    but while setting a constraint on the time window (e.g. decaying penalty term).

    Also a failed experiments with trying to provide some frequency based attention.
    r
   r   rQ   ra   nfreqsrb   c                    s   t    || dksJ ||f|| _|| _|| _t||d| _t||d| _t||d| _	|r>t||| d| _
|rdt||| d| _| jj jd9  _| jjd us[J d| jjjd d < t|||  |d| _d S )Nr   r   g{Gz?)r   r   ra   rx   rb   r   rH   contentquerykeyquery_freqsquery_decayr<   r?   r@   proj)r   rQ   ra   rx   rb   r   r   r   r      s    
zLocalState.__init__c                 C   s  |j \}}}| j}tj||j|jd}|d d d f |d d d f  }| |||d|}| |||d|}	t	d|	|}
|
|	j d d  }
| j
rtjd| j
d |j|jd}tdtj | |ddd }| |||d|| j
d  }|
t	d||7 }
| jrtjd| jd |j|jd}| |||d|}t|d }|ddd |  | jd  }|
t	d||7 }
|
tj||
jtjdd tj|
dd	}| |||d|}t	d
||}| j
rt	d||}t||gd}||d|}|| | S )N)devicedtyper    zbhct,bhcs->bhtsr   r;   r   zfts,bhfs->bhtsir   zbhts,bhct->bhcszbhts,fts->bhfs)r!   ra   r&   aranger   r   r{   viewr|   einsumrx   cosmathpir}   rb   r~   sigmoidrd   masked_fill_eyeboolsoftmaxrz   r'   r#   r   )r   r(   r)   r*   r+   ra   indexesdeltaquerieskeysdotsperiodsfreq_kernelfreq_qdecaysdecay_qdecay_kernelweightsrz   resulttime_sigr   r   r   r5      s8    " "zLocalState.forward)r
   r   r
   )r6   r7   r8   r9   re   r   r5   r:   r   r   r   r   rk      s     rk   c                       sp   e Zd Ze																					
					d fdd	Zdd Zdd Zd fdd	Z  ZS )Demucsr   @          @   Tr      r
   r   rP   皙?D  (   c           &   
      sh  t    || _|| _|| _|
| _|	| _|| _|| _|| _	|| _
|| _|| _t | _t | _t | _|r@tjdd}d}nt }d}|rLtj}ntj}|}d}t|D ]}dd } ||krg fdd} g }!|!t||||	| || g7 }!||k}"||k}#|d@ r|!t|||||"|#dg7 }!|r|!t||| d| || |g7 }!| jtj|!  g }$|dkr|}%nt| j| }%|r|$tj||| d|
 d |
d	| || |g7 }$|d@ r|$t|||||"|#dg7 }$|$tj||%||	|d	g7 }$|dkr|$| |%| g7 }$| jdtj|$  |}t|| }qW|}|r$t||| _nd
| _|r2t | |d d
S d
S )ab	  
        Args:
            sources (list[str]): list of source names
            audio_channels (int): stereo or mono
            channels (int): first convolution channels
            depth (int): number of encoder/decoder layers
            growth (float): multiply (resp divide) number of channels by that
                for each layer of the encoder (resp decoder)
            depth (int): number of layers in the encoder and in the decoder.
            rewrite (bool): add 1x1 convolution to each layer.
            lstm_layers (int): number of lstm layers, 0 = no lstm. Deactivated
                by default, as this is now replaced by the smaller and faster small LSTMs
                in the DConv branches.
            kernel_size (int): kernel size for convolutions
            stride (int): stride for convolutions
            context (int): kernel size of the convolution in the
                decoder before the transposed convolution. If > 1,
                will provide some context from neighboring time steps.
            gelu: use GELU activation function.
            glu (bool): use glu instead of ReLU for the 1x1 rewrite conv.
            norm_starts: layer at which group norm starts being used.
                decoder layers are numbered in reverse order.
            norm_groups: number of groups for group norm.
            dconv_mode: if 1: dconv in encoder only, 2: decoder only, 3: both.
            dconv_depth: depth of residual DConv branch.
            dconv_comp: compression of DConv branch.
            dconv_attn: adds attention layers in DConv branch starting at this layer.
            dconv_lstm: adds a LSTM layer in DConv branch starting at this layer.
            dconv_init: initial scale for the DConv branch LayerScale.
            normalize (bool): normalizes the input audio on the fly, and scales back
                the output by the same amount.
            resample (bool): upsample x2 the input and downsample /2 the output.
            rescale (float): rescale initial weights of convolutions
                to get their standard deviation closer to `rescale`.
            samplerate (int): stored as meta information for easing
                future evaluations of the model.
            segment (float): duration of the chunks of audio to ideally evaluate the model on.
                This is used by `demucs.apply.apply_model`.
        r   r   r   r   c                 S   rU   rE   rV   rX   r   r   r   rZ   F  r[   z!Demucs.__init__.<locals>.<lambda>c                    s   t  | S rE   r\   rX   norm_groupsr   r   rZ   H  r^   )rS   rT   rR   rn   r   )r`   N)rB   )!r   r   audio_channelssourceskernel_sizecontextr/   rS   resamplerQ   	normalize
sampleratesegmentr   rh   encoderdecoderskip_scalesri   rg   rf   r$   rH   rO   r%   rl   lenrI   rj   re   r	   r   rN   )&r   r   r   rQ   growthrS   rewritelstm_layersr   r/   r   ro   glunorm_startsr   
dconv_modedconv_depth
dconv_comp
dconv_attn
dconv_lstm
dconv_initr   r   rescaler   r   
activationch_scaleact2in_channelsr`   indexrr   encodern   r   decodeout_channelsr   r   r   r      s   
L









zDemucs.__init__c                 C   s   | j r|d9 }t| jD ]}t|| j | j d }td|}qt| jD ]}|d | j | j }q&| j r=t|d }t|S )aX  
        Return the nearest valid length to use with the model so that
        there is no time steps left over in a convolution, e.g. for all
        layers, size of the input - kernel_size % stride = 0.

        Note that input are automatically padded if necessary to ensure that the output
        has the same length as the input.
        r   r   )	r   r$   rS   r   ceilr   r/   maxre   )r   length_idxr   r   r   valid_lengthx  s   	zDemucs.valid_lengthc                 C   sF  |}|j d }| jr(|jddd}|jddd}|jddd}|| d|  }nd}d}| || }t||d ||d  f}| jrKt	|dd}g }| j
D ]}	|	|}|| qP| jrd| |}| jD ]}
|d}t||}|
|| }qg| jrt	|dd}|| | }t||}||dt| j| j|d}|S )Nr    r   T)r   keepdimgh㈵>r   r   )r!   r   meanr=   r   Fpadr   juliusresample_fracr   r%   r   r   popr   r   sizer   r   r   )r   mixr(   r   monor   r=   r   savedr   r   r   r   r   r   r5     s:   






$zDemucs.forwardc                    s~   t | jD ]/}dD ]*}dD ]%}| d| d| }| d| d| }||v r2||vr2||||< qq	qt j||d d S )N)r   r   )r@   r<   .z.3.z.2.)strict)r$   rS   r   r   load_state_dict)r   stater   r   abnewoldr   r   r   r     s   zDemucs.load_state_dict)r   r   r   r   Tr   r   r
   r   TTr
   r
   r   r   r
   r
   r
   rP   TTr   r   r   )T)	r6   r7   r8   r   r   r   r5   r   r:   r   r   r   r   r      s>     'r   )r   typingtpr   r&   r   torch.nnr   r   statesr   utilsr   r   transformerr   Moduler	   rD   rN   rO   rk   r   r   r   r   r   <module>   s   2
G>