o
    㥵i~                     @   s  d dl Z d dlZd dlmZ d dlmZmZmZ d dlZd dl	Z	d dl
Zd dlZd dlZd dlmZ d dlmZ d dlmZ d dlmZmZmZ d dlmZ d d	lmZmZ d d
lmZ d dl m!Z! d dl"m#Z# eG dd dZ$de%de%de%fddZ&eG dd dZ'G dd dej(Z)G dd dej(Z*G dd dej(Z+G dd dej(Z,G dd dej(Z-G d d! d!ej(Z.G d"d# d#ej(Z/G d$d% d%e*Z0d&ej1fd'e%d(e%d)e%d*ej2def
d+d,Z3d-ed.edefd/d0Z4d1d2 Z5d-ejd3ej6e%e%f fd4d5Z7	 dUd-ejd6e%d7e%d8e%de%f
d9d:Z8	;	<dVd-ejd3ej6e%e%f d=e9d>e:fd?d@Z;G dAdB dBej(Z<G dCdD dDej(Z=dEdF Z>dGdH Z?G dIdJ dJej(Z@G dKdL dLej(ZAG dMdN dNej(ZBG dOdP dPej(ZCG dQdR dRej(ZDG dSdT dTeeZEdS )W    N)	dataclass)ListOptionalUnion)AudioSignal)	BaseModel)
CodecMixin)Snake1dWNConv1dWNConvTranspose1d)	OmegaConf)Tensornn)
functional)weight_norm)remove_parametrizationsc                   @   sR   e Zd ZU ejed< ejed< ejed< ejed< ejed< dZejdB ed< dS )VQResultzcodeslatentscodebook_losscommitment_lossNsemantic_distill_z)__name__
__module____qualname__torchr   __annotations__r    r   r   U/home/ubuntu/.local/lib/python3.10/site-packages/fish_speech/models/dac/modded_dac.pyr      s   
 




r   nkreturnc                 C   s    | | dkr| S | | | |  S Nr   r   )r    r!   r   r   r   find_multiple    s   r$   c                   @   s   e Zd ZU dZeed< dZeed< dZeed< dZeed< dZ	eed	< d
Z
eed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dZeed< dd ZdS )	ModelArgsi   
block_size   n_layern_head   dim   intermediate_sizen_local_heads@   head_dim'  	rope_baseh㈵>norm_epsg?dropout_rateattn_dropout_rateTchannels_firstropepos_embed_type   max_relative_positionc                 C   sX   | j dkr	| j| _ | jd u r!d| j }td| d }t|d| _| jdv s*J dd S )Nr.               )r9   	conformerz3pos_embed_type must be either 'rope' or 'conformer')r/   r)   r-   r+   intr$   r:   )self
hidden_dimn_hiddenr   r   r   __post_init__7   s   


zModelArgs.__post_init__N)r   r   r   r&   rB   r   r(   r)   r+   r-   r/   r1   r3   floatr5   r6   r7   r8   boolr:   strr<   rF   r   r   r   r   r%   &   s    
 r%   c                       s2   e Zd Zejf fdd	Zdd Zdd Z  ZS )KVCachec                    sF   t    ||||f}| dtj||d | dtj||d d S )Nk_cachedtypev_cache)super__init__register_bufferr   zeros)rC   max_batch_sizemax_seq_lengthn_headsr1   rM   cache_shape	__class__r   r   rP   E   s   
zKVCache.__init__c                 C   s   |j d |j d ksJ | j}| j}||d d d d |f< ||d d d d |f< |d d d d d | d d d f |d d d d d | d d d f fS )Nr   r>      )shaperK   rN   max)rC   	input_posk_valv_valk_outv_outr   r   r   updateM   s   &&zKVCache.updatec                 C   sT   | j d d d d |d d d f d | jd d d d |d d d f d d S r#   )rK   fill_rN   )rC   
prompt_lenr   r   r   clear_cache[   s   (,zKVCache.clear_cache)	r   r   r   r   bfloat16rP   ra   rd   __classcell__r   r   rW   r   rJ   D   s
    rJ   c                	       sV   e Zd Zdeddf fddZdd Z		dded	ee d
ee defddZ  Z	S )Transformerconfigr"   Nc                    s   t     | _t fddt jD | _t j	 j
d| _ jdkr9t| jj| jj| jj}| d| n| dd  ttj| jj| jjtjd}| d| d| _d| _d	| _d S )
Nc                 3   s    | ]}t  V  qd S N)TransformerBlock).0_rh   r   r   	<genexpr>e   s    
z'Transformer.__init__.<locals>.<genexpr>epsr9   	freqs_cisrL   causal_maskr.   F)rO   rP   rh   r   
ModuleListranger(   layersRMSNormr+   r5   normr:   precompute_freqs_cisr&   r1   r3   rQ   r   trilonesrH   rS   rT   use_kv_cache)rC   rh   rq   rr   rW   rm   r   rP   a   s&   



zTransformer.__init__c                 C   sn   | j j| j j }t|d}|| _|| _| jjj}| jjj	}| j
D ]}t||| j j||||j_q d| _dS )zW
        This method will only be called during inference when using KV cache.
        r'   TN)rh   r+   r)   r$   rT   rS   rw   weightrM   deviceru   rJ   r/   to	attentionkv_cacher{   )rC   rS   rT   r1   rM   r}   br   r   r   setup_caches|   s"   




zTransformer.setup_cachesxr\   maskc                 C   s   | j jdkr| jd usJ d| j| }nd }|d u rD| js6| jr6| jd d |f }|dd | d f }n| jd d |f }|d|f }t| jD ]\}}|||||}qI| 	|}|S )Nr9   zBRoPE frequencies must be initialized for RoPE positional embedding.rY   )
rh   r:   rq   trainingr{   rr   r[   	enumerateru   rw   )rC   r   r\   r   rq   ilayerr   r   r   forward   s    
zTransformer.forwardNN)
r   r   r   r%   rP   r   r   r   r   rf   r   r   rW   r   rg   `   s    rg   c                
       sD   e Zd Zdeddf fddZdededed	edef
d
dZ  ZS )rj   rh   r"   Nc                    sf   t    t|| _t|| _t|j|jd| _	t|j|jd| _
t|jdd| _t|jdd| _d S )Nro   T)inplace)rO   rP   	Attentionr   FeedForwardfeed_forwardrv   r+   r5   ffn_normattention_norm
LayerScaleattention_layer_scaleffn_layer_scalerC   rh   rW   r   r   rP      s   


zTransformerBlock.__init__r   r\   rq   r   c              	   C   s>   ||  | | |||| }|| | | | }|S ri   )r   r   r   r   r   r   )rC   r   r\   rq   r   houtr   r   r   r      s
   zTransformerBlock.forwardr   r   r   r%   rP   r   r   rf   r   r   rW   r   rj      s    	rj   c                       s^   e Zd Zdef fddZdededefddZ		dd
edededee def
ddZ	  Z
S )r   rh   c                    s   t    |j|j dksJ |jd|j  |j }tj|j|dd| _tj|j|j |jdd| _	d | _
|j| _|j| _|j| _|j| _|j| _|j| _| jdkrt|j| _d|j d }tt|| j| _tjj| jddd	 d S d S )
Nr   r>   FbiasrA   rY           {Gz?)meanstd)rO   rP   r+   r)   r/   r1   r   Linearwqkvwor   r7   r:   r<   	Parameterr   rR   rel_pos_embeddingsinitnormal_)rC   rh   total_head_dimnum_pos_embeddingsrW   r   r   rP      s(   

zAttention.__init__qseqlenr"   c                 C   sx   t j||jd}|d|d }t || j dd| j }| j| }|dd}t ||dd}|dd}|S )Nr}   rY   r   r>   r.   )	r   aranger}   	unsqueezeclampr<   r   	transposematmul)rC   r   r   	positionsrelative_positionsrel_embeddings
rel_logitsr   r   r   _compute_conformer_pos_scores   s   
z'Attention._compute_conformer_pos_scoresNr   rq   r   r\   c                 C   s  |j \}}}| j| j }| |j|||gdd\}	}
}|}|	||| j| j}	|
||| j| j}
|||| j| j}| jdkrKt|	|}	t|
|}
t	dd |	|
|f\}	}
}| j
d urg| j
||
|\}
}|
j| j| j dd}
|j| j| j dd}| jdkrdt| j }t|	|
d	d| }| |	|}|| }|d ur|| td
}tj|dd}| jdkr| jrtj|| jd}t||}ntj|	|
|| jr| jnd|d}|dd ||| j| j }| |}|S )Nr.   r+   r9   c                 S   s   |  ddS )NrY   r>   )r   )r   r   r   r   <lambda>  s    z#Attention.forward.<locals>.<lambda>rY   rA         ?r   z-infr   )pr   )	dropout_p	attn_maskr>   )rZ   r/   r1   r   splitviewr)   r:   apply_rotary_embmapr   ra   repeat_interleavemathsqrtr   r   r   r   masked_fillrG   Fsoftmaxr7   r   dropoutscaled_dot_product_attention
contiguousr   )rC   r   rq   r   r\   bszr   rl   kv_sizer   r!   vcontext_seqlenscalescores
rel_scoresattnyr   r   r   r      sN    





	
zAttention.forwardri   )r   r   r   r%   rP   r   rB   r   r   r   rf   r   r   rW   r   r      s    r   c                       s8   e Zd Zdeddf fddZdedefddZ  ZS )	r   rh   r"   Nc                    s^   t    tj|j|jdd| _tj|j|jdd| _tj|j|jdd| _t	|j
| _d S )NFr   )rO   rP   r   r   r+   r-   w1w3w2Dropoutr6   r   r   rW   r   r   rP   5  s
   
zFeedForward.__init__r   c              	   C   s&   |  | t| || | S ri   )r   r   r   silur   r   rC   r   r   r   r   r   <  s   &zFeedForward.forwardr   r   r   rW   r   r   4  s    r   c                       sB   e Zd Zddedef fddZdd Zded	efd
dZ  Z	S )rv   r4   r+   rp   c                    s&   t    || _tt|| _d S ri   )rO   rP   rp   r   r   r   rz   r|   )rC   r+   rp   rW   r   r   rP   A  s   
zRMSNorm.__init__c                 C   s$   |t t j|| ddd| j  S )Nr.   T)r+   keepdim)r   rsqrtr   rp   r   r   r   r   _normF  s   $zRMSNorm._normr   r"   c                 C   s   |  | |}|| j S ri   )r   rG   type_asr|   )rC   r   outputr   r   r   r   I  s   
zRMSNorm.forward)r4   )
r   r   r   rB   rG   rP   r   r   r   rf   r   r   rW   r   rv   @  s    rv   c                	       sN   e Zd Z		ddedeeef deddf fdd	Zd
edefddZ	  Z
S )r   {Gz?Fr+   init_valuesr   r"   Nc                    s*   t    || _t|t| | _d S ri   )rO   rP   r   r   r   r   rz   gamma)rC   r+   r   r   rW   r   r   rP   O  s   
zLayerScale.__init__r   c                 C   s   | j r	|| jS || j S ri   )r   mul_r   r   r   r   r   r   Y  s   zLayerScale.forward)r   F)r   r   r   rB   r   rG   r   rH   rP   r   rf   r   r   rW   r   r   N  s    

r   c                       s   e Zd ZdZ				ddededee ded	ej	f
 fd
dZ
	ddedee defddZ	ddedee defddZ	ddedee def fddZ  ZS )WindowLimitedTransformerz<
    Transformer with window limited attention, causal.
    r*   NTrh   	input_dimwindow_sizecausallook_ahead_convc                    s   t  | || _|| _|j| _|d ur|nt | _||jkr't	||jnt | _
||jkr;t	|j|| _d S t | _d S ri   )rO   rP   r   r   r8   r   Identityr   r+   r   
input_projoutput_proj)rC   rh   r   r   r   r   rW   r   r   rP   b  s   

z!WindowLimitedTransformer.__init__
max_lengthx_lensr"   c                 C   sx   | j r2tt||}t|dd}| jp|}|| d jdd}t|}||k| @ }nt	| d }|S )z=
        Make mask to form window limited attention.
        r.   rY   r   )minr   )
r   r   ry   rz   r   r   r   r   rH   NotImplementedError)rC   r   r   r   row_indicesr   valid_rangecolumn_indicesr   r   r   make_window_limited_mask|  s   

z1WindowLimitedTransformer.make_window_limited_maskc                 C   sd   | j rtt||}nt||}| d }t|D ]\}}d|d||f< q| d }|S )zE
        Make ordinary mask if window size is not specified.
        r   r   N)r   r   ry   rz   rH   r   )rC   r   r   r   r   x_lenr   r   r   	make_mask  s   z"WindowLimitedTransformer.make_maskr   c                    s   | j r	|dd}| |}| |}tj|jd |jd}|jd }| jd ur/| 	||}n| 
||}||j}t |||}| |}| j rQ|dd}|S )NrY   r>   r   )r8   r   r   r   r   r   rZ   r}   r   r   r   r~   rO   r   r   )rC   r   r   r\   r   r   rW   r   r   r     s   




z WindowLimitedTransformer.forward)r*   NTNri   )r   r   r   __doc__r%   rB   r   rH   r   ModulerP   r   r   r   r   rf   r   r   rW   r   r   ]  sR    

r   r2   seq_lenn_elembaserM   c                 C   sz   d|t d|dd |d   |   }t j| |jd}t ||}t t ||}t j|j|j	gdd}|j
|dS )Nr   r   r>   r   r.   r   rL   )r   r   rG   r}   outerpolar	ones_likestackrealimagr~   )r   r   r   rM   freqstrq   cacher   r   r   rx     s   $rx   r   rq   c                 C   s   |   jg | jd d ddR  }|d|dd|dd}t|d |d  |d |d   |d |d  |d |d   gd}|d}|| S )Nr.   r>   rY   r?   ).r   ).rY   )	rG   reshaperZ   r   sizer   r   flattenr   )r   rq   xshapedx_out2r   r   r   r     s   &

r   c                 C   s6   t | tjrtjj| jdd tj| jd d S d S )Nr   )r   r   )
isinstancer   Conv1dr   trunc_normal_r|   	constant_r   )mr   r   r   init_weights  s   r
  paddingsc                 C   sX   |\}}|dkr|dksJ ||f|| | j d ksJ | j d | }| d||f S )zCRemove padding from x, handling properly zero padding. Only for 1d!r   r.   .)rZ   )r   r  padding_leftpadding_rightendr   r   r   unpad1d  s
   r  kernel_sizestridepadding_totalc                 C   s@   | j d }|| | | d }t|d | ||  }|| S )zSee `pad_for_conv1d`.r.   rY   )rZ   r   ceil)r   r  r  r  lengthn_framesideal_lengthr   r   r   get_extra_padding_for_conv1d  s   
r  rR   r   modevaluec                 C   s   | j d }|\}}|dkr|dksJ ||f|dkrKt||}d}||kr4|| d }t| d|f} t| |||}	|	j d | }
|	dd|
f S t| |||S )zTiny wrapper around F.pad, just to allow for reflect padding on small input.
    If this is the case, we insert extra 0 padding to the right
    before the reflection happen.
    r.   r   reflectrY   .N)rZ   r[   r   pad)r   r  r  r  r  r  r  max_pad	extra_padpaddedr  r   r   r   pad1d  s   


r  c                       s@   e Zd Z				d fdd	Zdd Zdd	d
Zdd Z  ZS )CausalConvNetrY   Nc                    sV   t t|   tj||||||d| _|| _|d | d | _|| _| j| j | _	d S )N)r  dilationgroupsrY   )
rO   r   rP   r   r  convr  r  r!  padding)rC   in_channelsout_channelsr  r!  r  r"  r$  rW   r   r   rP   
  s   
zCausalConvNet.__init__c                 C   s:   | j }t|| j| j|}t|||fddd}| | S )Nconstantr   )r  r  )r$  r  r  r  r  r#  r   )rC   r   r  extra_paddingr   r   r   r   "  s   zCausalConvNet.forwardr|   r   c                 C      t | j||d| _| S N)namer+   r   r#  rC   r+  r+   r   r   r   r   *     zCausalConvNet.weight_normc                 C      t | j| _| S ri   r   r#  rC   r   r   r   remove_weight_norm.     z CausalConvNet.remove_weight_norm)rY   rY   rY   Nr|   r   r   r   r   rP   r   r   r2  rf   r   r   rW   r   r   	  s    
r   c                       s:   e Zd Z	d fdd	Zdd Zdd	d
Zdd Z  ZS )CausalTransConvNetrY   Nc                    s4   t t|   tj|||||d| _|| _|| _d S )N)r  r!  )rO   r6  rP   r   ConvTranspose1dr#  r  r  )rC   r%  r&  r  r!  r  r$  rW   r   r   rP   4  s   

zCausalTransConvNet.__init__c                 C   s>   |  |}| j| j }t|}|| }t|||f}| S ri   )r#  r  r  r   r  r  r   )rC   r   r  r  r  r   r   r   r   >  s   

zCausalTransConvNet.forwardr|   r   c                 C   r)  r*  r,  r-  r   r   r   r   F  r.  zCausalTransConvNet.weight_normc                 C   r/  ri   r0  r1  r   r   r   r2  J  r3  z%CausalTransConvNet.remove_weight_norm)rY   rY   Nr4  r5  r   r   rW   r   r6  3  s    

r6  c                  O      t | i | S ri   )r   r   argskwargsr   r   r   CausalWNConv1dO     r<  c                  O   r8  ri   )r6  r   r9  r   r   r   CausalWNConvTranspose1dS  r=  r>  c                       s4   e Zd Zddededef fddZd	d
 Z  ZS )ResidualUnit   rY   Fr+   r!  r   c              
      s^   t    |r	tnt}d| d }tt||||d||dt||||dd| _|| _d S )N   r>      )r  r!  r$  rY   )r  )	rO   rP   r<  r
   r   
Sequentialr	   blockr   )rC   r+   r!  r   
conv_classr  rW   r   r   rP   X  s   

zResidualUnit.__init__c                 C   sh   |  |}|jd |jd  }|dkr0| jr#|dd | f }|| S |d|d | d f }|| S )Nr.   r   .r>   )rD  rZ   r   )rC   r   r   r  r   r   r   r   d  s   
zResidualUnit.forward)r@  rY   Fr   r   r   rB   rH   rP   r   rf   r   r   rW   r   r?  W  s    r?  c                	       sB   e Zd Z					ddededed	ef fd
dZdd Z  ZS )EncoderBlockr@  rY   Fr   Nr+   r  r   	n_t_layerc                    s   t    |r	tnt}|dkrt nt||d|||d ||d dd}tt|d d|d	t|d d|d	t|d d
|d	t	|d ||d |d| |t
|d d|| _d S )Nr   r*   r0   r?   r(   r)   r+   r-   r   r   r   rh   r>   rY   r!  r   	   r  r  r$  )rO   rP   r<  r
   r   r   r   rC  r?  r	   r   r  rD  )rC   r+   r  r   rH  transformer_general_configrE  transformer_modulerW   r   r   rP   p  s<   


zEncoderBlock.__init__c                 C   
   |  |S ri   rD  r   r   r   r   r        
zEncoderBlock.forward)r@  rY   Fr   NrF  r   r   rW   r   rG  o  s     *rG  c                       sT   e Zd Zdg ddg dddfdededed	ed
edef fddZdd Z  Z	S )Encoderr0   r>   r=   r'   r'   )r   r   r=   r=   NFd_modelstridesd_latentn_transformer_layersrN  r   c           
   
      s   t    |r	tnt}|d|dddg| _t||D ]\}}	|d9 }|  jt||||	|dg7  _q|  jt||||dddg7  _tj	| j | _|| _
d S )NrY   rB  r?   r  r$  r>   )r  r   rH  rN  )rO   rP   r<  r
   rD  ziprG  r	   r   rC  enc_dim)
rC   rU  rV  rW  rX  rN  r   rE  r  rH  rW   r   r   rP     s(   
	

zEncoder.__init__c                 C   rP  ri   rQ  r   r   r   r   r     rR  zEncoder.forward)
r   r   r   rB   listr%   rH   rP   r   rf   r   r   rW   r   rS    s*    %rS  c                       sH   e Zd Z						ddeded	ed
edef
 fddZdd Z  ZS )DecoderBlockr@  r'   rY   Fr   Nr   
output_dimr  r   rH  c           	         s   t    |r	tnt}|dkrt nt||d |||d ||d dd}tt||||d| |t	
|d dt|d|d	t|d|d	t|d
|d	| _d S )Nr   r0   r?   rI  rJ  r>   rM  rY   rK  rL  )rO   rP   r>  r   r   r   r   rC  r	   r   r  r?  rD  )	rC   r   r^  r  r   rH  rN  conv_trans_classrO  rW   r   r   rP     s:   
	
zDecoderBlock.__init__c                 C   rP  ri   rQ  r   r   r   r   r     rR  zDecoderBlock.forward)r@  r'   rY   Fr   NrF  r   r   rW   r   r]    s&    +r]  c                       s@   e Zd Zddg ddfdededef fdd	Zd
d Z  ZS )DecoderrY   Fr   r   r   r   Nd_outr   rX  c              
      s   t    |r	tnt}|||dddg}	tt||D ]!\}
\}}|d|
  }|d|
d   }|	t||||||dg7 }	q|	t||||dddt	 g7 }	tj
|	 | _d S )NrB  r?   rY  r>   rY   )r   rH  rN  )rO   rP   r<  r
   r   rZ  r]  r	   r   TanhrC  model)rC   input_channelchannelsratesrb  r   rX  rN  rE  ru   r   r  rH  r   r^  rW   r   r   rP     s,   

zDecoder.__init__c                 C   rP  ri   )rd  r   r   r   r   r      rR  zDecoder.forward)	r   r   r   rB   rH   r\  rP   r   rf   r   r   rW   r   r`    s    'r`  c                       s   e Zd Zdg dddg ddddg dg ddfd	ed
ee dededee dejjdededee dee f fddZ	dd Z
		d#dejdejdefddZdejfddZ				d$dejdejd ejdedef
d!d"Z  ZS )%DACr0   rT  Nr,   )r'   r'   r=   r>   iD  Tra  encoder_dimencoder_rates
latent_dimdecoder_dimdecoder_rates	quantizersample_rater   encoder_transformer_layersdecoder_transformer_layersc                    s   t    || _|| _|| _|| _|| _|d u r |dt|  }|| _t	
|| _t|||||	|d| _|| _t|||||
|d| _|| _| t |  | _| jd | _d S )Nr>   )r   rX  rN  r=   )rO   rP   ri  rj  rl  rm  ro  lenrk  npprod
hop_lengthrS  encoderrn  r`  decoderapplyr
  	get_delaydelayframe_length)rC   ri  rj  rk  rl  rm  rn  ro  r   rp  rq  rN  rW   r   r   rP   %  s>   
	

zDAC.__init__c                 C   sV   |d u r| j }|| j ksJ |jd }t|| j | j | }tj|d|f}|S )Nr.   r   )ro  rZ   r   r  ru  r   r   r  )rC   
audio_dataro  r  	right_padr   r   r   
preprocess[  s   
zDAC.preprocessr|  audio_lengthsn_quantizersc                 K   s   |j dkr
|d}|jd }t|| j | j | }tj|d|f}|du r5t	
|| g|j}| |}| j||fi |}|j}	t	|| j  }
|	|
fS )aa  Encode given audio data and return quantized latent codes

        Parameters
        ----------
        audio_data : Tensor[B x T]
            Audio data to encode
        n_quantizers : int, optional
            Number of quantizers to use, by default None
            If None, all quantizers are used.

        Returns
        -------
        dict
            A dictionary with the following keys:
            "z" : Tensor[B x D x T]
                Quantized continuous representation of input
            "codes" : Tensor[B x N x T]
                Codebook indices for each codebook
                (quantized discrete representation of input)
            "latents" : Tensor[B x N*D x T]
                Projected latents (continuous representation of input before quantization)
            "vq/commitment_loss" : Tensor[1]
                Commitment loss to train encoder to predict vectors closer to codebook
                entries
            "vq/codebook_loss" : Tensor[1]
                Codebook loss to update the codebook
            "length" : int
                Number of samples in input audio
        r>   rY   r.   r   N)ndimr   rZ   r   r  r{  r   r   r  r   
LongTensorr~   r}   rv  rn  r   long)rC   r|  r  r  r;  r  r}  r   
vq_resultsindicesindices_lensr   r   r   encodef  s   
%


z
DAC.encoder  c                 C   s6   |j dkr	|d  }| j|}|| j }| ||fS )Nr>   )r  rn  decoder{  rw  )rC   r  feature_lengthsr   r  r   r   r   r    s
   

z
DAC.decodetemplater   c                 K   s`   |j d }| ||}| j||fi |}t|tr|d n|j}	| |	}
|
dd|f |fS )a%  Model forward pass

        Parameters
        ----------
        audio_data : Tensor[B x 1 x T]
            Audio data to encode
        sample_rate : int, optional
            Sample rate of audio data in Hz, by default None
            If None, defaults to `self.sample_rate`
        n_quantizers : int, optional
            Number of quantizers to use, by default None.
            If None, all quantizers are used.

        Returns
        -------
        dict
            A dictionary with the following keys:
            "z" : Tensor[B x D x T]
                Quantized continuous representation of input
            "codes" : Tensor[B x N x T]
                Codebook indices for each codebook
                (quantized discrete representation of input)
            "latents" : Tensor[B x N*D x T]
                Projected latents (continuous representation of input before quantization)
            "vq/commitment_loss" : Tensor[1]
                Commitment loss to train encoder to predict vectors closer to codebook
                entries
            "vq/codebook_loss" : Tensor[1]
                Codebook loss to update the codebook
            "length" : int
                Number of samples in input audio
            "audio" : Tensor[B x 1 x length]
                Decoded audio data.
        r.   r   .N)rZ   r~  r  r  tupler   r  )rC   r|  r  r   ro  r  r;  r  r  r   r   r   r   r   r     s   
+
zDAC.forwardr   )NNNN)r   r   r   rB   r   r   r   r   rH   rP   r~  r   r  r  r   rf   r   r   rW   r   rh  $  sv    	
6
4rh  )r   )rR   r   )Fr   typingtpdataclassesr   r   r   r   hydralibrosanumpyrs  	soundfilesfr   
audiotoolsr   audiotools.mlr   dac.model.baser   dac.nn.layersr	   r
   r   	omegaconfr   r   r   torch.nnr   r   torch.nn.utils.parametrizationsr   torch.nn.utils.parametrizer   r   rB   r$   r%   r   rJ   rg   rj   r   r   rv   r   r   re   rM   rx   r   r
  Tupler  r  rI   rG   r  r   r6  r<  r>  r?  rG  rS  r]  r`  rh  r   r   r   r   <module>   s    	Nn^



*/*0,