o
    ꁱi                     @   s*  d dl Z d dlZd dlmZ d dlmZmZ d dlmZm	Z	m
Z
mZmZ d dlZd dlZd dlZd dlmZ d dlm  mZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d
dl m!Z!m"Z" e#e$Z%d dl&Z&z d dl'm(Z( dZ)e%*d e+e&,ddd krdZ)e%-d W n e.y   dZ)e%-d Y nw G dd dej/Z0G dd dej1Z2G dd de2Z3e4g dZ5dNdej1de6dej1fdd Z7dOdej1d!e8de6dej1fd"d#Z9	 dPd$ej:d%e+d&e+d'e+de+f
d(d)Z;dQd$ej:d,eje+e+f d-e6d.e<fd/d0Z=d$ej:d,eje+e+f fd1d2Z>G d3d4 d4ej1Z?G d5d6 d6ej1Z@G d7d8 d8ZAG d9d: d:ej1ZBG d;d< d<ej1ZCG d=d> d>ej1ZDG d?d@ d@ej1ZEG dAdB dBej1ZFG dCdD dDej1ZGG dEdF dFej1ZHeG dGdH dHZIG dIdJ dJeZJG dKdL dLeZKeLe!eJ eLe"eK g dMZMdS )R    N)partial)	dataclassfield)DictListOptionalTupleUnion)	AutoModel)PretrainedConfig)logging)PreTrainedModel)ACT2FN   ) VibeVoiceAcousticTokenizerConfig VibeVoiceSemanticTokenizerConfig)fused_rms_norm_affineTz@APEX FusedRMSNorm is available and will be used for optimizationOPTIMIZE_FOR_SPEED0FzJAPEX FusedRMSNorm is disabled by environment variable OPTIMIZE_FOR_SPEED=0z<APEX FusedRMSNorm not available, using native implementationc                       sB   e Zd ZdZdejeeje ej	f f fddZ
dd Z  ZS )ConvLayerNormz
    Convolution-friendly LayerNorm that moves channels to last dimensions
    before running the normalization and moves them back to original position right after.
    normalized_shapec                    s   t  j|fi | d S Nsuper__init__)selfr   kwargs	__class__ Q/home/ubuntu/vibevoice-community/vibevoice/modular/modular_vibevoice_tokenizer.pyr   ,   s   zConvLayerNorm.__init__c                 C   sJ   | dd}tj| | j| j | j | j	|}| dd}|S Nr      )
	transposenn
functional
layer_normfloatr   weightbiasepstype_asr   xr   r   r    forward/   s   .zConvLayerNorm.forward)__name__
__module____qualname____doc__tpr	   intr   torchSizer   r.   __classcell__r   r   r   r    r   '   s    &r   c                       sF   e Zd Zddedef fddZdd	 Zd
d ZdefddZ	  Z
S )RMSNormh㈵>TNdimr*   c                    sZ   t    || _|| _|| _| jr%|d u r|fn|}tt|| _	d S | 
dd  d S )Nr(   )r   r   r:   r*   elementwise_affiner$   	Parameterr5   onesr(   register_parameterr   r:   r*   r;   weight_shaper   r   r    r   6   s   
zRMSNorm.__init__c                 C   s$   |t |djddd| j  S )Nr"   T)keepdim)r5   rsqrtpowmeanr*   r,   r   r   r    _normA   s   $zRMSNorm._normc                 C   s,   |  | |}| jd ur|| j }|S r   )rF   r'   r+   r(   r   r-   outputr   r   r    r.   D   s   

zRMSNorm.forwardreturnc                 C   s   d| j  d| j d| j S )Nzdim=z, eps=z, elementwise_affine=)r:   r*   r;   r   r   r   r    
extra_reprJ   s   zRMSNorm.extra_reprr9   TN)r/   r0   r1   r4   r'   r   rF   r.   strrK   r7   r   r   r   r    r8   5   s
    r8   c                       s0   e Zd Zd
dedef fddZdd	 Z  ZS )ConvRMSNormr9   TNr:   r*   c                    s   t  |||| d S r   r   r?   r   r   r    r   N   s   zConvRMSNorm.__init__c                 C   sf   | dd}tr| js | | |}| jd ur|| j }nt|| j| jj| j	}| dd}|S r!   )
r#   APEX_AVAILABLEr;   rF   r'   r+   r(   r   shaper*   rG   r   r   r    r.   Q   s   


zConvRMSNorm.forwardrL   )r/   r0   r1   r4   r'   r   r.   r7   r   r   r   r    rN   M   s    rN   )noneweight_normspectral_normtime_layer_normr&   time_group_normrQ   modulenormrI   c                 C   s8   |t v sJ |dkrtj| S |dkrtj| S | S )NrR   rS   )CONV_NORMALIZATIONSr$   utilsrR   rS   )rV   rW   r   r   r    apply_parametrization_normb   s   rZ   causalc                 K   s   |t v sJ |dkrt| tjjjsJ t| jfi |S |dkr<|r'tdt| tjjjs1J tj	d| jfi |S t
 S )zReturn the proper normalization module. If causal is True, this will ensure the returned
    module is causal, or return an error if the normalization doesn't support causal evaluation.
    r&   rU   z,GroupNorm doesn't support causal evaluation.r   )rX   
isinstancer$   modulesconv_ConvNdr   out_channels
ValueError	GroupNormIdentity)rV   r[   rW   norm_kwargsr   r   r    get_norm_modulen   s   re   r-   kernel_sizestridepadding_totalc                 C   s@   | j d }|| | | d }t|d | ||  }|| S )zMCalculate extra padding needed for convolution to have the same output lengthrA   r   )rP   mathceil)r-   rf   rg   rh   lengthn_framesideal_lengthr   r   r    get_extra_padding_for_conv1d   s   
rn   zero        paddingsmodevaluec                 C   s   | j d }|\}}|dkr|dksJ ||f|dkrKt||}d}||kr4|| d }t| d|f} t| |||}	|	j d | }
|	dd|
f S t| |||S )z;Pad 1D input with handling for small inputs in reflect moderA   r   reflectr   .N)rP   maxFpad)r-   rq   rr   rs   rk   padding_leftpadding_rightmax_pad	extra_padpaddedendr   r   r    pad1d   s   

r~   c                 C   sX   |\}}|dkr|dksJ ||f|| | j d ksJ | j d | }| d||f S )zCRemove padding from x, handling properly zero padding. Only for 1d!r   rA   .)rP   )r-   rq   rx   ry   r}   r   r   r    unpad1d   s
   r   c                	       L   e Zd ZdZddi ddededejeejf f fdd	Z	d
d Z
  ZS )
NormConv1dz<Wrapper around Conv1d and normalization applied to this convFrQ   r[   rW   rd   r[   rW   rd   c                   D   t    ttj|i ||| _t| j||fi || _|| _d S r   )	r   r   rZ   r$   Conv1dr^   re   rW   	norm_typer   r[   rW   rd   argsr   r   r   r    r         

zNormConv1d.__init__c                 C      |  |}| |}|S r   )r^   rW   r,   r   r   r    r.         

zNormConv1d.forwardr/   r0   r1   r2   boolrM   r3   r   Anyr   r.   r7   r   r   r   r    r          r   c                	       r   )NormConvTranspose1dzEWrapper around ConvTranspose1d and normalization applied to this convFrQ   r   r[   rW   rd   c                   r   r   )	r   r   rZ   r$   ConvTranspose1dconvtrre   rW   r   r   r   r   r    r      r   zNormConvTranspose1d.__init__c                 C   r   r   )r   rW   r,   r   r   r    r.      r   zNormConvTranspose1d.forwardr   r   r   r   r    r      r   r   c                   @   s~   e Zd ZdZdd Zdedejdeej fddZ	dedejd	ejfd
dZ
dejfddZddee deej fddZdS ) VibeVoiceTokenizerStreamingCachezACache for streaming convolution, similar to KV cache in attentionc                 C   s
   i | _ d S r   )cacherJ   r   r   r    r         
z)VibeVoiceTokenizerStreamingCache.__init__layer_idsample_indicesrI   c                 C   s   g }d}|  D ] }||f}|| jvr dS | j| }|| t||jd }qt|dkri|d  dkrig }|D ]&}|jd |k r\||jd  }	tj||	dfddd}
||
 q;|| q;t	j
|ddS t	j
|ddS )z4Get cached states for given layer and sample indicesr   NrA   r"   constantrr   rs   r:   )tolistr   appendru   rP   lenr:   rv   rw   r5   stack)r   r   r   states
max_lengthidxkeystatepadded_statespad_sizepadded_stater   r   r    get   s&   


z$VibeVoiceTokenizerStreamingCache.getr   c                 C   s4   t | D ]\}}||f}||  | j|< qdS )z4Set cached states for given layer and sample indicesN)	enumerater   detachr   )r   r   r   r   ir   r   r   r   r    set   s   z$VibeVoiceTokenizerStreamingCache.setc                 C   sF   t | j D ]}|\}}|| v r | j| }t|| j|< qdS )z6Set all cached states to zero for given sample indicesN)listr   keysr   r5   
zeros_like)r   r   r   r   
sample_idxcached_tensorr   r   r    set_to_zero   s   
z,VibeVoiceTokenizerStreamingCache.set_to_zeroNc                    s    du r|du r| j   dS  dur.|du r. fdd| j  D }|D ]}| j |= q%dS  durH|durJ| D ]} |f}| j |d q:dS dS dS )z4Clear cache for specific layer/samples or everythingNc                    s   g | ]
}|d   kr|qS r   r   ).0kr   r   r    
<listcomp>   s    z:VibeVoiceTokenizerStreamingCache.clear.<locals>.<listcomp>)r   clearr   r   pop)r   r   r   keys_to_remover   r   r   r   r   r    r      s   
z&VibeVoiceTokenizerStreamingCache.clear)NN)r/   r0   r1   r2   r   rM   r5   Tensorr   r   r   r   r   r   r   r   r    r      s    "	r   c                       s   e Zd ZdZddddddi dfdeded	ed
ededededededejeej	f def fddZ
edd Z				d#dejdee deej dededejfddZ	d$dejdedejdedejf
dd Zd$dejdedejfd!d"Z  ZS )%SConv1dzPConv1d with built-in handling of asymmetric or causal padding and normalization.r   TFrQ   rt   in_channelsr`   rf   rg   dilationgroupsr)   r[   rW   rd   pad_modec                    s   t    t|||||||||	|
d
| _|| _|| _|| _|| _|| _|| _	|| _
|d | |d  | _|d | |d  | _d | _d S )N)r   r   r)   r[   rW   rd   r   )r   r   r   r^   r[   r   rf   r   rg   r   r`   context_sizerh   	_layer_id)r   r   r`   rf   rg   r   r   r)   r[   rW   rd   r   r   r   r    r     s   


zSConv1d.__init__c                 C       | j d u rdt|  | _ | j S )Nsconv1d_r   idrJ   r   r   r    r   "     
zSConv1d.layer_idNr-   r   r   	use_cachedebugrI   c           	      C   sh   |j \}}}|r|du r| j||dS | jsJ d|dus"J dt||ks,J d| ||||S )a  
        Forward pass with optional streaming support via cache.
        
        Args:
            x: Input tensor [batch_size, channels, time]
            cache: VibeVoiceTokenizerStreamingCache object for maintaining states
            sample_indices: Indices identifying each sample for cache management
            use_cache: Whether to use cached states for streaming
            debug: Whether to print debug information
            
        Returns:
            Output tensor
        Nr   z8Streaming mode is only supported for causal convolutions2sample_indices must be provided for streaming mode$sample_indices must match batch size)rP   _forward_non_streamingr[   r   _forward_streaming	r   r-   r   r   r   r   BCTr   r   r    r.   (  s   zSConv1d.forwardc                 C   s`  |j \}}}|| j|}|du rD| jdkr2tj||| j|j|jd}|r1td|j  d| j  ntj||d|j|jd}|rDtd |j d dkrUtj	||gdd}	n|}	|ritd	|j  d
|j  d|	j   | 
|	}
|rxtd|
j   | jdkr|	j d }|| jkr|| j }|	dddd|df }n|	}|rtd|j   || j|| |
S )MStreaming forward pass with cache operations kept separate from compiled codeNr   devicedtypez&[DEBUG] Initialized cache with shape: z, context_size=z.[DEBUG] No context needed (kernel_size=stride)r"   r   [DEBUG] Input shape: , Cache shape: , Combined: z[DEBUG] Output shape: [DEBUG] New cache shape: )rP   r   r   r   r5   zerosr   r   printcatr^   r   )r   r-   r   r   r   r   r   r   cached_statesinput_with_contextrH   total_input_lengthnew_cache_start	new_cacher   r   r    r   G  s:   
 




zSConv1d._forward_streamingc                 C   s   |j \}}}| j}| j}| j}| j}	t||||	}
|r)td|j  d|	 d|
  | jrH| jdkr=t	||	|
f| jdd}nt	||	|
f| jd}n|	d }|	| }t	||||
 f| jd}|rftd	|j   | 
|}|rutd
|j   |S )'Standard forward pass without streaming#[DEBUG NON-STREAMING] Input shape: z, padding_total=z, extra_padding=r   r   r   )rr   r"   z%[DEBUG NON-STREAMING] After padding: z$[DEBUG NON-STREAMING] Output shape: )rP   rf   rg   r   rh   rn   r   r[   r   r~   r^   )r   r-   r   r   r   r   rf   rg   r   rh   extra_paddingry   rx   rH   r   r   r    r     s*   

zSConv1d._forward_non_streamingNNFFF)r/   r0   r1   r2   r4   r   rM   r3   r   r   r   propertyr   r5   r   r   r   r.   r   r   r7   r   r   r   r    r     sj    


"

$9r   c                       s   e Zd ZdZddddi dfdeded	ed
ededededej	eej
f def fddZedd Z				d!dejdee deej dededejfddZ	d"dejdedejdedejf
ddZd"dejdedejfdd Z  ZS )#SConvTranspose1dzYConvTranspose1d with built-in handling of asymmetric or causal padding and normalization.r   FrQ         ?Tr   r`   rf   rg   r[   rW   trim_right_ratiord   r)   c
           
   
      s   t    t||||||||	d| _|| _|| _| js$| jdks$J d| jdkr.| jdks0J || _|| _|| _|| _	|| | _
|d | _d | _d S )N)r[   rW   rd   r)   r   zB`trim_right_ratio` != 1.0 only makes sense for causal convolutionsrp   r   )r   r   r   r   r[   r   rf   rg   r   r`   rh   r   r   )
r   r   r`   rf   rg   r[   rW   r   rd   r)   r   r   r    r     s"   




zSConvTranspose1d.__init__c                 C   r   )N
sconvtr1d_r   rJ   r   r   r    r     r   zSConvTranspose1d.layer_idNr-   r   r   r   r   rI   c           	      C   sZ   |j \}}}|r|du r| j||dS |dusJ dt||ks%J d| ||||S )zI
        Forward pass with optional streaming support via cache.
        Nr   r   r   )rP   r   r   r   r   r   r   r    r.     s   zSConvTranspose1d.forwardc                 C   s  |j \}}}|| j|}|du r#tj||d|j|jd}|r#td tj||gdd}	|r>td|j  d|j  d	|	j   | 	|	}
|rMtd
|
j   | j
r_t| j| j }| j| }n
| jd }| j| }|| dkrvt|
||f}
|rtd|
j   |j d dkr|
}n|| j }|
j d |kr|
dddd| df }n|
}|rtd|j   |	j d | jkr|	dddd| j df }n|	}|rtd|j   || j|| |S )r   Nr   r   z3[DEBUG] Initialized empty cache for transposed convr"   r   r   r   r   z+[DEBUG] Full transposed conv output shape: z[DEBUG] After unpadding: z&[DEBUG] Final streaming output shape: r   )rP   r   r   r5   r   r   r   r   r   r   r[   ri   rj   rh   r   r   rg   r   r   )r   r-   r   r   r   r   r   r   cached_input
full_inputfull_outputry   rx   rH   expected_new_outputr   r   r   r    r     sH    



 z#SConvTranspose1d._forward_streamingc                 C   s   |r
t d|j  | |}|rt d|j  | jr+t| j| j }| j| }n
| jd }| j| }|| dkrBt|||f}|rLt d|j  |S )r   r   z-[DEBUG NON-STREAMING] After transposed conv: r"   r   z*[DEBUG NON-STREAMING] Final output shape: )	r   rP   r   r[   ri   rj   rh   r   r   )r   r-   r   yry   rx   r   r   r    r   '  s   


z'SConvTranspose1d._forward_non_streamingr   r   )r/   r0   r1   r2   r4   r   rM   r'   r3   r   r   r   r   r   r5   r   r   r   r.   r   r   r7   r   r   r   r    r     s`    




$Ir   c                       s(   e Zd Z	d fdd	Zdd Z  ZS )FFNFc                    sF   t    || _tj| j||d| _td | _tj|| j|d| _d S )Nr)   gelu)	r   r   	embed_dimr$   Linearlinear1r   r   linear2)r   r   ffn_dimr)   r   r   r    r   D  s
   

zFFN.__init__c                 C   s"   |  |}| |}| |}|S r   )r   r   r   r,   r   r   r    r.   P  s   


zFFN.forwardr   r/   r0   r1   r   r.   r7   r   r   r   r    r   C  s    r   c                       s4   e Zd Z							d	 fdd	Zdd Z  ZS )
	Convlayerr   Tr   rR   c                    s,   t    t|||||||||	|
d
| _d S )N)rg   r   r   r)   r   rW   r[   )r   r   r   r^   )r   r   r`   rf   rg   r   r   r)   r   rW   r[   r   r   r    r   X  s   

zConvlayer.__init__c                 C   s
   |  |S r   )r^   r,   r   r   r    r.   i  r   zConvlayer.forward)r   r   r   Tr   rR   Tr   r   r   r   r    r   W  s    r   c                       s*   e Zd Z		d	 fdd	Zdd Z  ZS )
Block1D   rp   r^   ư>c                    s  t    |dddkr$t||ddd| _t||ddd| _n|dddkrBt||ddd| _t||ddd| _|dkrht|||dd	||d
d|dd|dd|ddd| _n)|dkrt|||||d
d|dd|dd|ddd| _nt	d| t
||dd| |ddd| _|dkrt ntj|| _|dkrtj|t| dd| _tj|t| dd| _d S d | _d | _d S )N	layernormLNr*   r   r*   r8   r^   r   r   r   rt   rW   rQ   r[   Tr)   )r   rf   r   rW   r[   r)   depthwise_convzUnsupported mixer layer: ffn_expansion   Fr   rp   r   )requires_grad)r   r   r   r   rW   ffn_normrN   r   mixerra   r   ffnr$   rc   r]   DropPath	drop_pathr<   r5   r=   gamma	ffn_gamma)r   r:   rf   r  mixer_layerlayer_scale_init_valuer   r   r   r    r   m  sF   












zBlock1D.__init__c                 C   s   |}|  |}| |}| jd ur|| jd }|| | }|}| |}|ddd}| |}|ddd}| jd urG|| jd }|| | }|S )NrA   r   r"   r   )	rW   r  r	  	unsqueezer  r  permuter  r
  )r   r-   residualr   r   r    r.     s   





zBlock1D.forward)r   rp   r^   r   r   r   r   r   r    r   l  s
    ,r   c                       4   e Zd ZdZ fddZd
ddZd
dd	Z  ZS )TokenizerEncoderz
    Encoder component for the VibeVoice tokenizer that converts audio to latent representations.
    
    Args:
        config: Configuration object with model parameters
    c                    s  t    |j| _|j| _|j| _tt|j| _|j| _t	|dd| _
t| j| _|j| _t	|dd}t	|dd}t	|dd}t	|di }t	|d	d
}t	|dd}t	|dd}t	|dd}	t	|dd}
t	|dd}t	|dd}t	|dd}t	|dd}|dkrt}n|dkrtt|
d}ntd| tt| j| j|||| j||d}t | _| j| tt| jD ]0}| jd|  | jd|d   }tt|| j| d | j| | j|||d}| j| qtt|||	| j||||d 	t | _d!d" td|t| jD d tt| jD ]+}| jd|  tj fd#d"t| j| D  }| j|  | j| 7  q|sM||	d$| _ nt! | _ t| j|| j|||d%| _"d S )&Nn_residual_layersr   rf   r   last_kernel_sizerW   rQ   norm_paramsr   rt   r)   Tr   r   layernorm_epsr   layernorm_elementwise_affinedrop_path_raterp   r  r^   r  r   disable_last_normFr8   r;   Unsupported norm type: rW   rd   r[   r   r)   r"   )rf   rg   r[   r   rW   r)   r  r   r*   r[   r   rW   r)   r  c                 S      g | ]}|  qS r   itemr   r-   r   r   r    r         z-TokenizerEncoder.__init__.<locals>.<listcomp>c                        g | ]} |  d qS )r:   r  r   r   jcurdp_ratesin_ch
layer_typer   r    r          r   rf   r[   r   rW   r)   )#r   r   channels	dimension	n_filtersr   reversedratiosdepthsgetattrr  npprod
hop_lengthr[   r   r   rN   ra   r$   
Sequentialr   
ModuleListdownsample_layersr   ranger   r   stagesr5   linspacesumrW   rc   head)r   configrf   r  rW   r  r   r)   r   r  r  r  r  r  r  r   stemr   out_chdownsample_layerstager   r&  r    r     s   

(
 
 zTokenizerEncoder.__init__NFc           
   	   C   s.  t t| jD ]}| j| D ]}t|tr||||||d}q||}q| j| D ]g}t|drt|jdrt|jj	tr|}	|
|}|jj	|||||d}|jd ur\||jd }|	| }|}	||}|ddd}||}|ddd}|jd ur||jd }|	| }q)||}q)q| 
|S Nr   r   r   r   r  r^   rA   r   r"   r   )r:  r   r2  r9  r\   r   r;  hasattrr  r^   rW   r	  r  r  r  r  r
  
r   r-   r   r   r   r   r   layerblockr  r   r   r    forward_features  s0   

$







z!TokenizerEncoder.forward_featuresc                 C   ,   | j |||||d}| j|||||d}|S NrE  rJ  r>  r   r-   r   r   r   r   r   r   r    r.   *     zTokenizerEncoder.forwardr   r/   r0   r1   r2   r   rJ  r.   r7   r   r   r   r    r    s
    
R"r  c                       r  )TokenizerDecoderz
    Decoder component for the VibeVoice tokenizer that converts latent representations back to audio.
    
    Args:
        config: Configuration object with model parameters
    c                    s  t    |j| _|j| _|j| _|j| _|j| _t|dd| _t	
| j| _|j| _t|dd}t|dd}t|dd}t|di }t|d	d
}t|dd}t|dd}t|dd}	t|dd}
t|dd}t|dd}t|dd}t|dd}t|dd}|dkrt}n|dkrtt|d}ntd| tt| j| jdt| jd   |||| j||d }t | _| j| tt| jD ]?}| jdt| jd |   | jdt| jd | d   }tt|| j| d | j| |||| j|
d!	}| j| qtt|||	| j||||d"	t | _d#d$ td|t| jD d tt| jD ]2}| jdt| jd |   tj fd%d$t| j| D  }| j|  | j| 7  q0|sn||	d&| _nt  | _t| j|| j|||d'| _!d S )(Nr  r   rf   r   r  rW   rQ   r  r   rt   r)   Tr   r   r  r   r   r   r  r  rp   r  r^   r  r   r  Fr8   r  r  r"   r  )rf   rg   rW   rd   r)   r[   r   r  c                 S   r  r   r  r   r   r   r    r     r!  z-TokenizerDecoder.__init__.<locals>.<listcomp>c                    r"  r#  r   r$  r&  r   r    r     r+  r   r,  )"r   r   r.  r-  r/  r1  r2  r3  r  r4  r5  r6  r[   r   r   rN   ra   r$   r7  r   r   r8  upsample_layersr   r:  r   r   r;  r5   r<  r=  rW   rc   r>  )r   r?  rf   r  rW   r  r   r)   r   r  r   r  r  r  r  r  r   r@  r   rA  upsample_layerrC  r   r&  r    r   7  s   
 

 
 
 zTokenizerDecoder.__init__NFc           
   	   C   s2  t t| jD ]}| j| D ]}t|ttfr!||||||d}q||}q| j| D ]g}t|drt|j	drt|j	j
tr|}	||}|j	j
|||||d}|jd ur^||jd }|	| }|}	||}|ddd}||}|ddd}|jd ur||jd }|	| }q+||}q+q| |S rD  )r:  r   r2  rR  r\   r   r   r;  rF  r  r^   rW   r	  r  r  r  r  r
  rG  r   r   r    rJ    s0   
$







z!TokenizerDecoder.forward_featuresc                 C   rK  rL  rM  rN  r   r   r    r.     rO  zTokenizerDecoder.forwardr   rP  r   r   r   r    rQ  0  s
    
["rQ  c                   @   sP   e Zd ZU dZejed< dZee	e
ejf  ed< dddZdd	 Zd
d ZdS )VibeVoiceTokenizerEncoderOutputa  
    Output of VibeVoice tokenizer encoder, representing a Gaussian distribution with fixed variance.
    
    Args:
        mean (`torch.FloatTensor`): The mean parameters of the distribution.
        std (`float` or `torch.FloatTensor`): Fixed standard deviation value.
    rE   Nstdfixc                 C   s   |dkr| j | jt| j   }|| jfS |dkrX| j d}| jd }tj|| j j| j jd| }| | j  k rI|	d}| | j  k s;| j |t| j   }||fS | j | jfS )aK  
        Sample from the distribution.
        
        Args:
            dist_type (`str`): Sampling method, either 'fix' or 'gaussian'.
                
        Returns:
            `torch.FloatTensor`: Sampled values.
            `torch.FloatTensor` (optional): Standard deviation used (only when dist_type='gaussian').
        rV  gaussianr   g?r   rA   )
rE   rU  r5   
randn_likesizerandnr   r   r:   r  )r   	dist_typer-   
batch_sizers   rU  r   r   r    sample  s   


z&VibeVoiceTokenizerEncoderOutput.samplec                 C   s   t | j}tj| j|ddS )zFCompute KL divergence between this distribution and a standard normal.rQ   )	reduction)r5   r   rE   rv   mse_loss)r   targetr   r   r    kl  s   z"VibeVoiceTokenizerEncoderOutput.klc                 C   s   | j S )z>Return the distribution mode (which is the mean for Gaussian).rE   rJ   r   r   r    rr     s   z$VibeVoiceTokenizerEncoderOutput.mode)rV  )r/   r0   r1   r2   r5   r   __annotations__rU  r   r	   r'   r]  ra  rr   r   r   r   r    rT    s   
 

rT  c                       s   e Zd ZdZeZdZdZdZddgZ	 fddZ
dd	 Ze dddZe dddZe dddZdddZ  ZS )VibeVoiceAcousticTokenizerModelzRVibeVoice speech tokenizer model combining encoder and decoder for acoustic tokensvibevoice_acoustic_tokenizerTr  rQ  c                    s  t  | | jdt|jdd t|dd| _t|j	t
r+dd |j	dD }n|j	}|jd urEt|jt
rEd	d |jdD }ntt|}t|}|j|_|j|_|j|_||_|j|_|j|_|j|_|j|_|j|_|j|_|j |_ |j!|_!t|}|j|_|j"|_|j#|_||_|j|_|j|_|j|_|j|_|j|_|j|_|j |_ |j!|_!t$|| _%t&|| _'| (| j) d S )
Nfix_stdF)
persistentstd_dist_typerV  c                 S      g | ]}t |qS r   r4   r   dr   r   r    r     r!  z<VibeVoiceAcousticTokenizerModel.__init__.<locals>.<listcomp>-c                 S   ri  r   rj  rk  r   r   r    r     r!  )*r   r   register_bufferr5   tensorrf  r3  rh  r\   encoder_depthsrM   splitdecoder_depthsr   r0  copydeepcopyvae_dimr.  encoder_n_filtersr/  encoder_ratiosr1  r2  	conv_normrW   r   	conv_biasr)   r  r  r  r  r  decoder_n_filtersdecoder_ratiosr  encoderrQ  decoderapply_init_weights)r   r?  rp  rr  encoder_configdecoder_configr   r   r    r     sL   



z(VibeVoiceAcousticTokenizerModel.__init__c                 C      t |tjr!tjj|j| jjd |jdurtj	|j dS dS t |tj
r7tj|j tj	|j dS t |tjrVtjj|j| jjd |jdurXtj	|j dS dS dS z Initialize weights for the model)rU  Nr\   r$   r   initnormal_r(   r?  weight_init_valuer)   zeros_	LayerNormones_r   r   rV   r   r   r    r  +     

z-VibeVoiceAcousticTokenizerModel._init_weightsNFc                 C   s,   | j |||||d}t|ddd| jdS )'Convert audio to latent representationsrE  r   r"   r   )rE   rU  )r|  rT  r  rf  r   audior   r   r   r   latentsr   r   r    encode9  s   z&VibeVoiceAcousticTokenizerModel.encodec                 C   sB   |p| j }|dkr|jddS |dkr|jddS td| d)+Sample from the encoder output distributionrV  r[  rW  zUnsupported dist_type: z, expected 'fix' or 'gaussian')rh  r]  ra   r   encoder_outputr[  r   r   r    sampling?  s   
z(VibeVoiceAcousticTokenizerModel.samplingc                 C   s:   |j d | jjkr
n|ddd}| j|||||d}|S )z,Convert latent representations back to audior   r   r"   rE  )rP   r?  ru  r  r}  )r   r  r   r   r   r   r  r   r   r    decodeK  s
   z&VibeVoiceAcousticTokenizerModel.decodec           
      C   s>   | j |||||d}| |\}}| j|||||d}	|	|fS )EFull forward pass: encode audio to latents, then decode back to audiorE  )r  r  r  )
r   r  r   r   r   r   r  sampled_latents_reconstructedr   r   r    r.   V  s   z'VibeVoiceAcousticTokenizerModel.forwardr   r   )r/   r0   r1   r2   r   config_classbase_model_prefix_supports_flash_attn_2_supports_sdpa_no_split_modulesr   r  r5   no_gradr  r  r  r.   r7   r   r   r   r    rd    s     8
rd  c                       sl   e Zd ZdZeZdZdZdZdgZ	 fddZ
dd Ze dddZe dddZdddZ  ZS )VibeVoiceSemanticTokenizerModelzFVibeVoice speech tokenizer model with only encoder for semantic tokensvibevoice_semantic_tokenizerTr  c                    s   t  | t|jtrdd |jdD }n|j}t|}|j|_	|j
|_|j|_||_|j|_|j|_|j|_|j|_|j|_|j|_|j|_|j|_t|| _| | j d S )Nc                 S   ri  r   rj  rk  r   r   r    r   l  r!  z<VibeVoiceSemanticTokenizerModel.__init__.<locals>.<listcomp>rm  )r   r   r\   rp  rM   rq  rs  rt  ru  r.  rv  r/  rw  r1  r2  rx  rW   r   ry  r)   r  r  r  r  r  r  r|  r~  r  )r   r?  rp  r  r   r   r    r   g  s&   

z(VibeVoiceSemanticTokenizerModel.__init__c                 C   r  r  r  r  r   r   r    r    r  z-VibeVoiceSemanticTokenizerModel._init_weightsNFc                 C   s(   | j |||||d}t|ddddS )r  rE  r   r"   r   rb  )r|  rT  r  r  r   r   r    r    s   z&VibeVoiceSemanticTokenizerModel.encodec                 C   s   |j ddS )r  rQ   r  )r]  r  r   r   r    r    s   z(VibeVoiceSemanticTokenizerModel.samplingc           	      C   s.   | j |||||d}| j|dd\}}d|fS )r  rE  rQ   r  N)r  r  )	r   r  r   r   r   r   r  r  r  r   r   r    r.     s   z'VibeVoiceSemanticTokenizerModel.forwardr   r   )r/   r0   r1   r2   r   r  r  r  r  r  r   r  r5   r  r  r  r.   r7   r   r   r   r    r  ^  s    r  )r   rd  r  )rQ   )FrQ   r   )ro   rp   )Nri   typingr3   	functoolsr   dataclassesr   r   r   r   r   r   r	   rs  numpyr4  r5   torch.nnr$   torch.nn.functionalr%   rv   transformers.models.autor
    transformers.configuration_utilsr   transformers.utilsr   transformers.modeling_utilsr   transformers.activationsr   configuration_vibevoicer   r   
get_loggerr/   loggeros#apex.normalization.fused_layer_normr   rO   infor4   getenvwarningImportErrorr  r   Moduler8   rN   	frozensetrX   rM   rZ   r   re   r   rn   r'   r~   r   r   r   r   r   r   r   r   r   r  rQ  rT  rd  r  register__all__r   r   r   r    <module>   s    


 
(		A $ C  /tF