o
    i4m                     @   s  d dl Z d dlZd dlZd dl mZ d dlmZ d dlm	Z
 d dlmZ ddlmZmZ d dlmZmZmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZmZ ddlmZ dd Z d/ded dej!fddZ"G dd dej!Z#G dd dej!Z$G dd dej!Z%G dd dej!Z&G dd dej!Z'G d d! d!ej!Z(G d"d# d#ej!Z)G d$d% d%ej!Z*d&ee+ef fd'd(Z,d)ee+ef fd*d+Z-d,ee+ef fd-d.Z.dS )0    N)nn)
functional)
transforms)Activation1d   )WNConv1dWNConvTranspose1d)LiteralDictAny)prepare_audio)	SnakeBeta)
BottleneckDiscreteBottleneck)create_pretransform_from_configcreate_bottleneck_from_config)Pretransformc                 O   s(   | dd tjjj| g|R i |S )Nuse_reentrantF)
setdefaulttorchutils
checkpoint)functionargskwargs r   i/home/ubuntu/.local/lib/python3.10/site-packages/solospeech/vae_modules/stable_vae/models/autoencoders.pyr      s   r   F
activation)elusnakenonereturnc                 C   sT   | dkr	t  }n| dkrt|}n| dkrt  }ntd|  |r(t|}|S )Nr   r   r    zUnknown activation )r   ELUr   Identity
ValueErrorr   )r   	antialiaschannelsactr   r   r   get_activation   s   


r(   c                       &   e Zd Zd fdd	Zdd Z  ZS )ResidualUnitFc              
      sn   t    || _|d d }tt|rdnd||dt||d||dt|r'dnd||dt||dd	| _d S )
N      r   r   r%   r&      )in_channelsout_channelskernel_sizedilationpaddingr   )r/   r0   r1   )super__init__r2   r   
Sequentialr(   r   layers)selfr/   r0   r2   	use_snakeantialias_activationr3   	__class__r   r   r5   '   s   

zResidualUnit.__init__c                 C   s   |}|  |}|| S Nr7   )r8   xresr   r   r   forward7   s   
zResidualUnit.forwardFF__name__
__module____qualname__r5   rA   __classcell__r   r   r;   r   r*   &   s    r*   c                       r)   )EncoderBlockFc                    st   t    tt||d|dt||d|dt||d|dt|r!dnd||dt||d| |t|d d	| _	d S )
Nr   r/   r0   r2   r9      	   r   r   r-   r,   r/   r0   r1   strider3   )
r4   r5   r   r6   r*   r(   r   mathceilr7   )r8   r/   r0   rM   r9   r:   r;   r   r   r5   @   s    

zEncoderBlock.__init__c                 C   
   |  |S r=   r>   r8   r?   r   r   r   rA   O      
zEncoderBlock.forwardrB   rC   r   r   r;   r   rH   ?   s    rH   c                       r)   )DecoderBlockFc                    s   t    |rttj|ddt||d| dddd}nt||d| |t|d d}tt	|r4d	nd
||d|t
||d|dt
||d|dt
||d|d| _d S )Nnearest)scale_factormoder,   r   Fsame)r/   r0   r1   rM   biasr3   rL   r   r   r-   rI   rJ   rK   )r4   r5   r   r6   Upsampler   r   rN   rO   r(   r*   r7   )r8   r/   r0   rM   r9   r:   use_nearest_upsampleupsample_layerr;   r   r   r5   S   s:   


zDecoderBlock.__init__c                 C   rP   r=   r>   rQ   r   r   r   rA   p   rR   zDecoderBlock.forwardFFFrC   r   r   r;   r   rS   R   s    rS   c                       s<   e Zd Zdddg dg dddf fdd	Zd	d
 Z  ZS )OobleckEncoderr,          r   r,         r,   ra   rb   rb   Fc           
         s   t    dg| }t|| _t||d | dddg}t| jd D ]}	|t||	 | ||	d  | ||	 |dg7 }q#|t|rCdnd||d	 | d
t|d	 | |dddg7 }tj	| | _
d S )Nr   r   r.   rJ   r/   r0   r1   r3   )r/   r0   rM   r9   r   r   r-   )r4   r5   lendepthr   rangerH   r(   r   r6   r7   )
r8   r/   r&   
latent_dimc_multsstridesr9   r:   r7   ir;   r   r   r5   t   s   
	

0zOobleckEncoder.__init__c                 C   rP   r=   r>   rQ   r   r   r   rA      rR   zOobleckEncoder.forwardrC   r   r   r;   r   r]   s   s    r]   c                	       s@   e Zd Zdddg dg dddddf	 fdd		Zd
d Z  ZS )OobleckDecoderr,   r^   r_   r`   rc   FTc
              
      s   t    dg| }t|| _t||d | dddg}
t| jd ddD ]}|
t|| | ||d  | ||d  |||dg7 }
q%|
t|rIdnd	||d | d
t|d | |dddd|	rdt	 nt
 g7 }
tj|
 | _d S )Nr   re   r.   rJ   rd   r   )r/   r0   rM   r9   r:   rZ   r   r   r-   F)r/   r0   r1   r3   rX   )r4   r5   rf   rg   r   rh   rS   r(   r   Tanhr#   r6   r7   )r8   r0   r&   ri   rj   rk   r9   r:   rZ   
final_tanhr7   rl   r;   r   r   r5      s(   






zOobleckDecoder.__init__c                 C   rP   r=   r>   rQ   r   r   r   rA      rR   zOobleckDecoder.forwardrC   r   r   r;   r   rm      s    'rm   c                       r)   )DACEncoderWrapperr   c                    s   t    ddlm} |dd }|d dt|d   }|dd|i|| _|| _|d ur8tj	| jj
|dd	nt | _|dkrSt||dd
ddd| jjd< d S d S )Nr   )Encoderri   d_modelr,   rk   d_latentr   )r1   @   r.   rJ   )r1   r3   r   )r4   r5   dac.model.dacrq   poprf   encoderri   r   Conv1denc_dimr#   proj_outr   getblock)r8   r/   r   
DACEncoderri   encoder_out_dimr;   r   r   r5      s   
&$zDACEncoderWrapper.__init__c                 C   s   |  |}| |}|S r=   )rw   rz   rQ   r   r   r   rA      s   

zDACEncoderWrapper.forwardr   rC   r   r   r;   r   rp      s    rp   c                       r)   )DACDecoderWrapperr   c                    s:   t    ddlm} |di |||d| _|| _d S )Nr   )Decoder)input_channeld_outr   )r4   r5   ru   r   decoderri   )r8   ri   r0   r   
DACDecoderr;   r   r   r5      s   

zDACDecoderWrapper.__init__c                 C   rP   r=   )r   rQ   r   r   r   rA      rR   zDACDecoderWrapper.forwardr   rC   r   r   r;   r   r      s    	r   c                       st   e Zd Z						ddedef fddZddd	Zdd
dZdd Zdd Z	dd Z
dddZdddZ  ZS )AudioAutoencoderr,   NF
bottleneckpretransformc                    s   t    || _|| _|| _|| _|| _|| _| j| _|	d ur"|	| _|
d ur)|
| _|| _	|| _
|| _|| _|| _| j	d uo@| j	j| _d S r=   )r4   r5   downsampling_ratiosample_rateri   io_channelsr/   r0   
min_lengthr   rw   r   r   	soft_clipis_discrete)r8   rw   r   ri   r   r   r   r   r   r/   r0   r   r;   r   r   r5      s$   
zAudioAutoencoder.__init__c              
   K   s  i }| j d urx|sx| j jr9|r2g }t|jd D ]}|| j |||d   qtj|dd}nF| j |}n?t 3 |rcg }t|jd D ]}|| j |||d   qItj|dd}n| j |}W d    n1 ssw   Y  | j	d ur|rg }	t|jd D ]}|	| 	|||d   qtj|	dd}	n| 	|}	n|}	| j
d ur| j
j|	fddi|\}	}
||
 |r|	|fS |	S )Nr   r   dimreturn_infoT)r   enable_gradrh   shapeappendencoder   catno_gradrw   r   update)r8   audior   skip_pretransformiterate_batchr   infoaudiosrl   latentsbottleneck_infor   r   r   r     s@    
 
	

zAudioAutoencoder.encodec              
   K   s  | j d ur0|r*g }t|jd D ]}|| j |||d   qtj|dd}n| j |}|rTg }t|jd D ]}|| |||d   q;tj|dd}n	| j|fi |}| jd ur| jj	r|rg }t|jd D ]}|| j|||d   qqtj|dd}nF| j|}n?t
 3 |rg }t|jd D ]}|| j|||d   qtj|dd}n| j|}W d    n1 sw   Y  | jrt|}|S )Nr   r   r   )r   rh   r   r   decoder   r   r   r   r   r   r   tanh)r8   r   r   r   decodedrl   decodedsr   r   r   r   ?  sB   
 
 
 	
zAudioAutoencoder.decodec                 K   s:   t | jts
J d| jj|fi |}| j|fi |S )z_
        Decode discrete tokens to audio
        Only works with discrete autoencoders
        z3decode_tokens only works with discrete autoencoders)
isinstancer   r   decode_tokensr   )r8   tokensr   r   r   r   r   r   j  s   zAudioAutoencoder.decode_tokensc                 C   s   |  |g|gS )a  
        Preprocess single audio tensor (Channels x Length) to be compatible with the encoder.
        If the model is mono, stereo audio will be converted to mono.
        Audio will be silence-padded to be a multiple of the model's downsampling ratio.
        Audio will be resampled to the model's sample rate. 
        The output will have batch size 1 and be shape (1 x Channels x Length)
        )!preprocess_audio_list_for_encoder)r8   r   in_srr   r   r   preprocess_audio_for_encoderw  s   z-AudioAutoencoder.preprocess_audio_for_encoderc              	   C   sR  t |}t|tr|g| }t ||ksJ dg }d}t|D ]Z}|| }|| }t |jdkr>|jd dkr>|d}nt |jdkrJ|d}t |jdksUJ d|| jkrit	|| j
|j}	|	|}|| |jd |krz|jd }q || j|| j  | j  }
t|D ]}t|| |||
| j|| jdd||< qt|S )	a  
        Preprocess a [list] of audio (Channels x Length) into a batch tensor to be compatable with the encoder. 
        The audio in that list can be of different lengths and channels. 
        in_sr can be an integer or list. If it's an integer it will be assumed it is the input sample_rate for every audio.
        All audio will be resampled to the model's sample rate. 
        Audio will be silence-padded to the longest length, and further padded to be a multiple of the model's downsampling ratio. 
        If the model is mono, all audio will be converted to mono. 
        The output will be a tensor of shape (Batch x Channels x Length)
        z:list of sample rates must be the same length of audio_listr   rJ   r   r,   zAAudio should be shape (Channels x Length) with no batch dimensionre   )r   	target_srtarget_lengthtarget_channelsdevice)rf   r   intrh   r   squeeze	unsqueezer   TResampletor   r   r   r   r/   r   stack)r8   
audio_list
in_sr_list
batch_size	new_audio
max_lengthrl   r   r   resample_tfpadded_audio_lengthr   r   r   r     s:   







z2AudioAutoencoder.preprocess_audio_list_for_encoderr_   r^   c                 K   s  |s| j |fi |S | j}|jd }|jd }||9 }||9 }|| }	g }
td|| d |	D ]}|dddd||| f }|
| q0|| |kr`|dddd| df }|
| t|
}
|
jd }|| }t|| j|f	|j
}t|D ]j}|
|ddf }|  |}||d kr|}||jd  }n||	 | }|||  }|| d }d}|jd }|dkr||7 }||7 }||d k r||8 }||8 }|dddd||f |dddd||f< q|S )aM  
        Encode audios into latents. Audios should already be preprocesed by preprocess_audio_for_encoder.
        If chunked is True, split the audio into chunks of a given maximum size chunk_size, with given overlap.
        Overlap and chunk_size params are both measured in number of latents (not audio samples) 
        # and therefore you likely could use the same values with decode_audio. 
        A overlap of zero will cause discontinuity artefacts. Overlap should be => receptive field size. 
        Every autoencoder will have a different receptive field size, and thus ideal overlap.
        You can determine it empirically by diffing unchunked vs chunked output and looking at maximum diff.
        The final chunk may have a longer overlap in order to keep chunk_size consistent for all chunks.
        Smaller chunk_size uses less memory, but more compute.
        The chunk_size vs memory tradeoff isn't linear, and possibly depends on the GPU and CUDA version
        For example, on a A6000 chunk_size 128 is overall faster than 256 and 512 even though it has more chunks
        r,   r   r   N)r   r   r   rh   r   r   r   zerosri   r   r   )r8   r   chunkedoverlap
chunk_sizer   samples_per_latent
total_sizer   hop_sizechunksrl   chunk
num_chunksy_sizey_finalx_chunky_chunkt_endt_startolchunk_start	chunk_endr   r   r   encode_audio  sL   






2zAudioAutoencoder.encode_audioc                 K   s  |s| j |fi |S || }|jd }|jd }g }	td|| d |D ]}
|dddd|
|
| f }|	| q%|
| |krU|dddd| df }|	| t|	}	|	jd }| j}|| }t|| j|f	|j
}t|D ]j}
|	|
ddf }|  |}|
|d kr|}||jd  }n|
| | }|||  }|d | }d}|jd }|
dkr||7 }||7 }|
|d k r||8 }||8 }|dddd||f |dddd||f< qw|S )a  
        Decode latents to audio. 
        If chunked is True, split the latents into chunks of a given maximum size chunk_size, with given overlap, both of which are measured in number of latents. 
        A overlap of zero will cause discontinuity artefacts. Overlap should be => receptive field size. 
        Every autoencoder will have a different receptive field size, and thus ideal overlap.
        You can determine it empirically by diffing unchunked vs chunked audio and looking at maximum diff.
        The final chunk may have a longer overlap in order to keep chunk_size consistent for all chunks.
        Smaller chunk_size uses less memory, but more compute.
        The chunk_size vs memory tradeoff isn't linear, and possibly depends on the GPU and CUDA version
        For example, on a A6000 chunk_size 128 is overall faster than 256 and 512 even though it has more chunks
        r,   r   r   N)r   r   rh   r   r   r   r   r   r0   r   r   )r8   r   r   r   r   r   r   r   r   r   rl   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   decode_audio  sH   






2zAudioAutoencoder.decode_audio)r,   NNNNFr\   )F)Fr_   r^   )rD   rE   rF   r   r   r5   r   r   r   r   r   r   r   rG   r   r   r;   r   r      s&    	
,
-+

+Fr   encoder_configc           
      C   s  |  dd }|d usJ d|dkrtdi | d }nQ|dkr@ddlm} | d }tt| dg d	|d< |di |}n-|d
krP| d }tdi |}n|dkrfddlm} | d }|di |}nt	d| |  dd}|s|
 D ]}	d|	_qy|S )NtypezEncoder type must be specifiedoobleckconfigseanetr   )SEANetEncoderratios)r,   r,   r,   r,   r,   dac
local_attnr   )TransformerEncoder1DzUnknown encoder type requires_gradTFr   )r{   r]   encodec.modulesr   listreversedrp   local_attentionr   r$   
parametersr   )
r   encoder_typerw   r   seanet_encoder_config
dac_configr   local_attn_configr   paramr   r   r   create_encoder_from_config4  s8   r   decoder_configc           	      C   s   |  dd }|d usJ d|dkrtdi | d }nA|dkr0ddlm} |di | d }n-|dkr@| d }tdi |}n|d	krVd
dlm} | d }|di |}ntd| |  dd}|so| D ]}d|_	qi|S )Nr   zDecoder type must be specifiedr   r   r   r   )SEANetDecoderr   r   r   )TransformerDecoder1DzUnknown decoder type r   TFr   )
r{   rm   r   r   r   r   r   r$   r   r   )	r   decoder_typer   r   r   r   r   r   r   r   r   r   create_decoder_from_config\  s4   r   r   c                 C   s  | d }t |d }t|d }|dd }|dd }|d us$J d|dd }|d us2J d|d	d }|d us@J d
| dd }|d usNJ d|dd }	|dd }
|dd }|d urit||}|d urqt|}|d dd}t|||||||||	|
|dS )Nmodelrw   r   r   ri   z,latent_dim must be specified in model configr   z4downsampling_ratio must be specified in model configr   z-io_channels must be specified in model configr   z-sample_rate must be specified in model configr/   r0   r   r   F)	r   ri   r   r   r   r   r/   r0   r   )r   r   r{   r   r   r   )r   	ae_configrw   r   r   ri   r   r   r   r/   r0   r   r   r   r   r   create_autoencoder_from_config  sB   
r   )FN)/r   rN   numpynpr   torch.nnr   F
torchaudior   r   alias_free_torchr   	nn.layersr   r   typingr	   r
   r   r   r   blocksr   r   r   r   factoryr   r   pretransformsr   r   Moduler(   r*   rH   rS   r]   rm   rp   r   r   strr   r   r   r   r   r   r   <module>   s:    !",  Q($