
    h                     x   d dl Z d dlZd dlmZ d dlmZmZ d dlmZm	Z	m
Z
mZmZ d dlZd dlZd dlZd dlmZ d dlmc mZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d
dl m!Z!m"Z"  ej#        e$          Z%d dl&Z&	 d dl'm(Z( dZ)e%*                    d            e+ e&j,        dd                    d k    rdZ)e%-                    d           n"# e.$ r dZ)e%-                    d           Y nw xY w G d dej/                  Z0 G d dej1                  Z2 G d de2          Z3 e4g d          Z5dIdej1        de6dej1        fdZ7dJdej1        d e8de6dej1        fd!Z9	 dKd"ej:        d#e+d$e+d%e+de+f
d&Z;dLd"ej:        d)ej        e+e+f         d*e6d+e<fd,Z=d"ej:        d)ej        e+e+f         fd-Z> G d. d/ej1                  Z? G d0 d1ej1                  Z@ G d2 d3          ZA G d4 d5ej1                  ZB G d6 d7ej1                  ZC G d8 d9ej1                  ZD G d: d;ej1                  ZE G d< d=ej1                  ZF G d> d?ej1                  ZG G d@ dAej1                  ZHe G dB dC                      ZI G dD dEe          ZJ G dF dGe          ZK ejL        e!eJ            ejL        e"eK           g dHZMdS )M    N)partial)	dataclassfield)DictListOptionalTupleUnion)	AutoModel)PretrainedConfig)logging)PreTrainedModel)ACT2FN   ) VibeVoiceAcousticTokenizerConfig VibeVoiceSemanticTokenizerConfig)fused_rms_norm_affineTz@APEX FusedRMSNorm is available and will be used for optimizationOPTIMIZE_FOR_SPEED0FzJAPEX FusedRMSNorm is disabled by environment variable OPTIMIZE_FOR_SPEED=0z<APEX FusedRMSNorm not available, using native implementationc                   j     e Zd ZdZdej        eej        e         ej	        f         f fdZ
d Z xZS )ConvLayerNormz
    Convolution-friendly LayerNorm that moves channels to last dimensions
    before running the normalization and moves them back to original position right after.
    normalized_shapec                 <     t                      j        |fi | d S Nsuper__init__)selfr   kwargs	__class__s      U/workspace/chatterbox-finetuning/src/vibevoice/modular/modular_vibevoice_tokenizer.pyr   zConvLayerNorm.__init__,   s)    )44V44444    c                 ^   |                     dd          }t          j                            |                                | j        | j                                        | j                                        | j                  	                    |          }|                     dd          }|S Nr      )
	transposenn
functional
layer_normfloatr   weightbiasepstype_asr   xs     r!   forwardzConvLayerNorm.forward/   s    KK1M$$QWWYY0Et{GXGXGZGZ\`\e\k\k\m\mosowxx  A  A  BC  D  DKK1r"   )__name__
__module____qualname____doc__tpr
   intr   torchSizer   r1   __classcell__r    s   @r!   r   r   '   ss         5#rws|UZ2O)P 5 5 5 5 5 5      r"   r   c                   B     e Zd Zddedef fdZd Zd Zd	efd
Z	 xZ
S )RMSNormh㈵>TNdimr-   c                    t                                                       || _        || _        || _        | j        r4||fn|}t          j        t          j        |                    | _	        d S | 
                    dd            d S )Nr+   )r   r   r?   r-   elementwise_affiner'   	Parameterr8   onesr+   register_parameterr   r?   r-   rA   weight_shaper    s        r!   r   zRMSNorm.__init__6   s    "4" 	4%1%9C66|L,uz,'?'?@@DKKK##Hd33333r"   c                     |t          j        |                    d                              dd          | j        z             z  S )Nr%   T)keepdim)r8   rsqrtpowmeanr-   r/   s     r!   _normzRMSNorm._normA   s8    5;quuQxx}}R}>>IJJJJr"   c                     |                      |                                                              |          }| j        
|| j        z  }|S r   )rM   r*   r.   r+   r   r0   outputs      r!   r1   zRMSNorm.forwardD   sB    AGGII&&..q11;"dk)Fr"   returnc                 6    d| j          d| j         d| j         S )Nzdim=z, eps=z, elementwise_affine=)r?   r-   rA   r   s    r!   
extra_reprzRMSNorm.extra_reprJ   s'    ^dh^^dh^^TE\^^^r"   r>   TN)r2   r3   r4   r7   r*   r   rM   r1   strrT   r:   r;   s   @r!   r=   r=   5   s        	4 	4C 	4e 	4 	4 	4 	4 	4 	4K K K  _C _ _ _ _ _ _ _ _r"   r=   c                   0     e Zd Zddedef fdZd Z xZS )	ConvRMSNormr>   TNr?   r-   c                 P    t                                          ||||           d S r   r   rE   s        r!   r   zConvRMSNorm.__init__N   s(    c#5|DDDDDr"   c                 ^   |                     dd          }t          r| j        sL|                     |                                                              |          }| j        
|| j        z  }n&t          || j        | j        j        | j	                  }|                     dd          }|S r$   )
r&   APEX_AVAILABLErA   rM   r*   r.   r+   r   shaper-   rO   s      r!   r1   zConvRMSNorm.forwardQ   s    KK1 	X(? 	XZZ		**22155F{&$+-*1dk4;;LdhWWF!!!Q''r"   rU   )r2   r3   r4   r7   r*   r   r1   r:   r;   s   @r!   rX   rX   M   sh        E EC Ee E E E E E E
 
 
 
 
 
 
r"   rX   )noneweight_normspectral_normtime_layer_normr)   time_group_normr]   modulenormrQ   c                     |t           v sJ |dk    rt          j                            |           S |dk    rt          j                            |           S | S )Nr^   r_   )CONV_NORMALIZATIONSr'   utilsr^   r_   )rb   rc   s     r!   apply_parametrization_normrg   b   s]    &&&&&}x##F+++		 	 x%%f--- r"   causalc                 d   |t           v sJ |dk    r8t          | t          j        j        j                  sJ t          | j        fi |S |dk    rO|rt          d          t          | t          j        j        j                  sJ t          j	        d| j        fi |S t          j
                    S )zReturn the proper normalization module. If causal is True, this will ensure the returned
    module is causal, or return an error if the normalization doesn't support causal evaluation.
    r)   ra   z,GroupNorm doesn't support causal evaluation.r   )re   
isinstancer'   modulesconv_ConvNdr   out_channels
ValueError	GroupNormIdentity)rb   rh   rc   norm_kwargss       r!   get_norm_modulers   n   s     &&&&&|&"*/"9:::::V0@@K@@@	"	"	" 	MKLLL&"*/"9:::::|Av2BBkBBB{}}r"   r0   kernel_sizestridepadding_totalc                     | j         d         }||z
  |z   |z  dz   }t          j        |          dz
  |z  ||z
  z   }||z
  S )zMCalculate extra padding needed for convolution to have the same output lengthrH   r   )r\   mathceil)r0   rt   ru   rv   lengthn_framesideal_lengths          r!   get_extra_padding_for_conv1dr}      sV     WR[F$}4>BHIh''!+v5}9TUL&  r"   zero        paddingsmodevaluec                 d   | j         d         }|\  }}|dk    r|dk    sJ ||f            |dk    rjt          ||          }d}||k    r||z
  dz   }t          j        | d|f          } t          j        | |||          }	|	j         d         |z
  }
|	dd|
f         S t          j        | |||          S )z;Pad 1D input with handling for small inputs in reflect moderH   r   reflectr   .N)r\   maxFpad)r0   r   r   r   rz   padding_leftpadding_rightmax_pad	extra_padpaddedends              r!   pad1dr      s    WR[F"*L-1!!3!3!3lM5R!3!33ylM22	W&(1,Ia!Y((Aq(D%00l2*c4C4i  uQ$...r"   c                     |\  }}|dk    r|dk    sJ ||f            ||z   | j         d         k    sJ | j         d         |z
  }| d||f         S )zCRemove padding from x, handling properly zero padding. Only for 1d!r   rH   .)r\   )r0   r   r   r   r   s        r!   unpad1dr      su    "*L-1!!3!3!3lM5R!3!33=(QWR[8888
'"+
%CS,##$$r"   c            	       d     e Zd ZdZddi ddededej        eej        f         f fdZ	d	 Z
 xZS )

NormConv1dz<Wrapper around Conv1d and normalization applied to this convFr]   rh   rc   rr   rh   rc   rr   c                    t                                                       t          t          j        |i ||          | _        t          | j        ||fi || _        || _        d S r   )	r   r   rg   r'   Conv1drl   rs   rc   	norm_typer   rh   rc   rr   argsr   r    s         r!   r   zNormConv1d.__init__   sa    .ry$/I&/I/I4PP	#DIvtKK{KK	r"   c                 Z    |                      |          }|                     |          }|S r   )rl   rc   r/   s     r!   r1   zNormConv1d.forward   s%    IIaLLIIaLLr"   r2   r3   r4   r5   boolrV   r6   r   Anyr   r1   r:   r;   s   @r!   r   r      s        FF-246  d # WS"&[1           r"   r   c            	       d     e Zd ZdZddi ddededej        eej        f         f fdZ	d	 Z
 xZS )
NormConvTranspose1dzEWrapper around ConvTranspose1d and normalization applied to this convFr]   r   rh   rc   rr   c                    t                                                       t          t          j        |i ||          | _        t          | j        ||fi || _        || _        d S r   )	r   r   rg   r'   ConvTranspose1dconvtrrs   rc   r   r   s         r!   r   zNormConvTranspose1d.__init__   sc    01CT1TV1T1TVZ[[#DKMMMM	r"   c                 Z    |                      |          }|                     |          }|S r   )r   rc   r/   s     r!   r1   zNormConvTranspose1d.forward   s%    KKNNIIaLLr"   r   r;   s   @r!   r   r      s        OO-246  d # WS"&[1           r"   r   c                       e Zd ZdZd Zdedej        deej                 fdZ	dedej        dej        fdZ
dej        fd	Zddee         deej                 fdZd
S ) VibeVoiceTokenizerStreamingCachezACache for streaming convolution, similar to KV cache in attentionc                     i | _         d S r   )cacherS   s    r!   r   z)VibeVoiceTokenizerStreamingCache.__init__   s    


r"   layer_idsample_indicesrQ   c                 f   g }d}|                                 D ]O}||f}|| j        vr dS | j        |         }|                    |           t          ||j        d                   }Pt          |          dk    r|d                                         dk    rg }|D ]h}|j        d         |k     r@||j        d         z
  }	t          j        ||	dfdd          }
|                    |
           S|                    |           it          j
        |d          S t          j
        |d          S )z4Get cached states for given layer and sample indicesr   NrH   r%   constantr   r   r?   )tolistr   appendr   r\   lenr?   r   r   r8   stack)r   r   r   states
max_lengthidxkeystatepadded_statespad_sizepadded_states              r!   getz$VibeVoiceTokenizerStreamingCache.get   sK   
 "((** 	: 	:CS/C$*$$ttJsOEMM%   ZR99JJ v;;??vay}}!33M 0 0;r?Z//)EKO;H#$51JVW#X#X#XL!((6666!((////;}!4444;v1----r"   r   c                     t          |                                          D ]+\  }}||f}||                                         | j        |<   ,dS )z4Set cached states for given layer and sample indicesN)	enumerater   detachr   )r   r   r   r   ir   r   s          r!   setz$VibeVoiceTokenizerStreamingCache.set   sY     5 5 7 788 	1 	1FAsS/C$Qi..00DJsOO	1 	1r"   c                     t          | j                                                  D ]F}|\  }}||                                v r)| j        |         }t	          j        |          | j        |<   GdS )z6Set all cached states to zero for given sample indicesN)listr   keysr   r8   
zeros_like)r   r   r   r   
sample_idxcached_tensors         r!   set_to_zeroz,VibeVoiceTokenizerStreamingCache.set_to_zero   sv    
))** 	B 	BC#& Hj^224444 $
3"'"2="A"A
3	B 	Br"   Nc                 2   || j                                          dS 6|4fd| j                                         D             }|D ]
}| j         |= dS 8|8|                                D ]%}|f}| j                             |d           "dS dS dS )z4Clear cache for specific layer/samples or everythingNc                 ,    g | ]}|d          k    |S r    ).0kr   s     r!   
<listcomp>z:VibeVoiceTokenizerStreamingCache.clear.<locals>.<listcomp>   s'    OOOAadh>N>Na>N>N>Nr"   )r   clearr   r   pop)r   r   r   keys_to_remover   r   r   s    `     r!   r   z&VibeVoiceTokenizerStreamingCache.clear   s     6J!n&<OOOO):):OOON# " "JqMM" "!n&@%,,.. * *o
sD))))	 "!&@&@* *r"   )NN)r2   r3   r4   r5   r   rV   r8   Tensorr   r   r   r   r   r   r"   r!   r   r      s        KK  .C . .(5<BX . . . .<1C 1 1u| 1 1 1 1B%, B B B B* *hsm *HU\DZ * * * * * *r"   r   c                   d    e Zd ZdZddddddi dfdeded	ed
ededededededej        eej	        f         def fdZ
ed             Z	 	 	 	 ddej        dee         deej                 dededej        fdZ	 ddej        dedej        dedej        f
dZddej        dedej        fdZ xZS ) SConv1dzPConv1d with built-in handling of asymmetric or causal padding and normalization.r   TFr]   r   in_channelsrn   rt   ru   dilationgroupsr,   rh   rc   rr   pad_modec                 @   t                                                       t          |||||||||	|

  
        | _        || _        || _        || _        || _        || _        || _	        || _
        |dz
  |z  |dz
  z
  | _        |dz
  |z  |dz
  z
  | _        d | _        d S )N)r   r   r,   rh   rc   rr   r   )r   r   r   rl   rh   r   rt   r   ru   r   rn   context_sizerv   	_layer_id)r   r   rn   rt   ru   r   r   r,   rh   rc   rr   r   r    s               r!   r   zSConv1d.__init__  s    
 	{L+v%-f4PV!%;@ @ @	   ' &(
 )1_8FQJG *Ao9VaZH r"   c                 L    | j         dt          |            | _         | j         S )Nsconv1d_r   idrS   s    r!   r   zSConv1d.layer_id"  s'    >!2422DN~r"   Nr0   r   r   	use_cachedebugrQ   c                     |j         \  }}}|r||                     ||          S | j        s
J d            |
J d            t          |          |k    s
J d            |                     ||||          S )a  
        Forward pass with optional streaming support via cache.
        
        Args:
            x: Input tensor [batch_size, channels, time]
            cache: VibeVoiceTokenizerStreamingCache object for maintaining states
            sample_indices: Indices identifying each sample for cache management
            use_cache: Whether to use cached states for streaming
            debug: Whether to print debug information
            
        Returns:
            Output tensor
        Nr   z8Streaming mode is only supported for causal convolutions2sample_indices must be provided for streaming mode$sample_indices must match batch size)r\   _forward_non_streamingrh   r   _forward_streaming	r   r0   r   r   r   r   BCTs	            r!   r1   zSConv1d.forward(  s    $ '1a  	?EM..q.>>> {VVVVV{))+_)))>""a''')O'''&&q%GGGr"   c                 Z   |j         \  }}}|                    | j        |          }|| j        dk    rJt	          j        ||| j        |j        |j                  }|rt          d|j          d| j                    n4t	          j        ||d|j        |j                  }|rt          d           |j         d         dk    rt	          j	        ||gd          }	n|}	|r't          d	|j          d
|j          d|	j                     | 
                    |	          }
|rt          d|
j                     | j        dk    rk|	j         d         }|| j        k    r|| j        z
  }|	dddd|df         }n|	}|rt          d|j                     |                    | j        ||           |
S )MStreaming forward pass with cache operations kept separate from compiled codeNr   devicedtypez&[DEBUG] Initialized cache with shape: z, context_size=z.[DEBUG] No context needed (kernel_size=stride)r%   r   [DEBUG] Input shape: , Cache shape: , Combined: z[DEBUG] Output shape: [DEBUG] New cache shape: )r\   r   r   r   r8   zerosr   r   printcatrl   r   )r   r0   r   r   r   r   r   r   cached_statesinput_with_contextrP   total_input_lengthnew_cache_start	new_caches                 r!   r   zSConv1d._forward_streamingG  s   
 '1a 		$-@@  1$$ %Aq$2CAH\]\c d d d |z=CVzzgkgxzz{{{ %Aq!AHAG T T T MKLLL q!A%%!&M1+=1!E!E!E!" 	}!'}}-BU}}cuc{}}~~~ -.. 	;96<99::: q  !3!9!!< "T%666"4t7H"H.qqq!!!_5E5E/EF		 /	 EC)/CCDDDIIdm^Y???r"   c                    |j         \  }}}| j        }| j        }| j        }| j        }	t          ||||	          }
|rt          d|j          d|	 d|
            | j        r@| j        dk    rt          ||	|
f| j        d          }n@t          ||	|
f| j                  }n&|	dz  }|	|z
  }t          ||||
z   f| j                  }|rt          d	|j                     | 
                    |          }|rt          d
|j                     |S )'Standard forward pass without streaming#[DEBUG NON-STREAMING] Input shape: z, padding_total=z, extra_padding=r   r   r   )r   r%   z%[DEBUG NON-STREAMING] After padding: z$[DEBUG NON-STREAMING] Output shape: )r\   rt   ru   r   rv   r}   r   rh   r   r   rl   )r   r0   r   r   r   r   rt   ru   r   rv   extra_paddingr   r   rP   s                 r!   r   zSConv1d._forward_non_streaming  sb   '1a&=* 5QV][[ 	AQ^p}  A  A  A; 
	\}
**!m];$-WXYYY!m];$-PPP *Q.M(=8La,(EFT][[[A 	EC!'CCDDD1 	IGGGHHHr"   NNFFF)r2   r3   r4   r5   r7   r   rV   r6   r   r   r   propertyr   r8   r   r   r   r1   r   r   r:   r;   s   @r!   r   r     s       ZZ011d5" )	 C s  *-=@'+<@  24bf1E 	     <   X EI9="'#	H H H @AH ( 6H  H 	H ).	H H H HD ).7 7EL 7!A7*/,7 "&7 38,7 7 7 7r" " "T "el " " " " " " " "r"   r   c                   X    e Zd ZdZddddi dfdeded	ed
ededededej	        eej
        f         def fdZed             Z	 	 	 	 ddej        dee         deej                 dededej        fdZ	 ddej        dedej        dedej        f
dZddej        dedej        fdZ xZS )SConvTranspose1dzYConvTranspose1d with built-in handling of asymmetric or causal padding and normalization.r   Fr]         ?Tr   rn   rt   ru   rh   rc   trim_right_ratiorr   r,   c
           
      r   t                                                       t          ||||||||	          | _        || _        || _        | j        s| j        dk    s
J d            | j        dk    r| j        dk    sJ || _        || _        || _        || _	        ||z
  | _
        |dz
  | _        d | _        d S )N)rh   rc   rr   r,   r  zB`trim_right_ratio` != 1.0 only makes sense for causal convolutionsr   r   )r   r   r   r   rh   r  rt   ru   r   rn   rv   r   r   )r   r   rn   rt   ru   rh   rc   r  rr   r,   r    s             r!   r   zSConvTranspose1d.__init__  s     	)+|[RX/5Dk`df f f 0{ 	Qd3r999P :99$**t/D/J/J/JJ '&( )61 (!O r"   c                 L    | j         dt          |            | _         | j         S )N
sconvtr1d_r   rS   s    r!   r   zSConvTranspose1d.layer_id  s'    >!4"T((44DN~r"   Nr0   r   r   r   r   rQ   c                     |j         \  }}}|r||                     ||          S |
J d            t          |          |k    s
J d            |                     ||||          S )zI
        Forward pass with optional streaming support via cache.
        Nr   r   r   )r\   r   r   r   r   s	            r!   r1   zSConvTranspose1d.forward  s     '1a  	?EM..q.>>> ))+_)))>""a''')O'''&&q%GGGr"   c                    |j         \  }}}|                    | j        |          }|4t          j        ||d|j        |j                  }|rt          d           t          j        ||gd          }	|r't          d|j          d|j          d	|	j                     | 	                    |	          }
|rt          d
|
j                     | j
        r,t          j        | j        | j        z            }| j        |z
  }n| j        dz  }| j        |z
  }||z   dk    rt          |
||f          }
|rt          d|
j                     |j         d         dk    r|
}n0|| j        z  }|
j         d         |k    r|
dddd| df         }n|
}|rt          d|j                     |	j         d         | j        k    r|	dddd| j         df         }n|	}|rt          d|j                     |                    | j        ||           |S )r   Nr   r   z3[DEBUG] Initialized empty cache for transposed convr%   r   r   r   r   z+[DEBUG] Full transposed conv output shape: z[DEBUG] After unpadding: z&[DEBUG] Final streaming output shape: r   )r\   r   r   r8   r   r   r   r   r   r   rh   rx   ry   rv   r  r   ru   r   r   )r   r0   r   r   r   r   r   r   cached_input
full_inputfull_outputr   r   rP   expected_new_outputr   s                   r!   r   z#SConvTranspose1d._forward_streaming  s   
 '1a yy?? ;q!QqxqwOOOL NLMMM Ya0a888
 	vt!'tt,BTttblbrttuuu kk*-- 	US@QSSTTT ; 	> Id&84;P&PQQM-=LL .!3M-=L -'!++!+m/LMMK 	CAk.?AABBB a A%% FF #$dk/  #':::$QQQ,?+?+@+@%@A$ 	KI6<IIJJJ A!222"111aaa$*;);)<)<#<=II"I 	A?io??@@@		$-;;;r"   c                    |rt          d|j                    |                     |          }|rt          d|j                    | j        r,t	          j        | j        | j        z            }| j        |z
  }n| j        dz  }| j        |z
  }||z   dk    rt          |||f          }|rt          d|j                    |S )r   r   z-[DEBUG NON-STREAMING] After transposed conv: r%   r   z*[DEBUG NON-STREAMING] Final output shape: )	r   r\   r   rh   rx   ry   rv   r  r   )r   r0   r   yr   r   s         r!   r   z'SConvTranspose1d._forward_non_streaming'  s     	CAAABBB KKNN 	MK!'KKLLL ; 	> Id&84;P&PQQM-=LL .!3M-=L-'!++L-899A 	JHqwHHIIIr"   r   r   )r2   r3   r4   r5   r7   r   rV   r*   r6   r   r   r   r  r   r8   r   r   r   r1   r   r   r:   r;   s   @r!   r  r    s       cc01%"b46T C s  *-;?6;  WS"&[1 ?C     :   X EI9="'#	H H H @AH ( 6H  H 	H ).	H H H H. ).G GEL G!AG*/,G "&G 38,G G G GR  T el        r"   r  c                   (     e Zd Z	 d fd	Zd Z xZS )FFNFc                     t                                                       || _        t          j        | j        ||          | _        t          d         | _        t          j        || j        |          | _        d S )Nr,   gelu)	r   r   	embed_dimr'   Linearlinear1r   r  linear2)r   r  ffn_dimr,   r    s       r!   r   zFFN.__init__D  se     	"ytDDD6N	y$.tDDDr"   c                     |                      |          }|                     |          }|                     |          }|S r   )r  r  r  r/   s     r!   r1   zFFN.forwardP  s4    LLOOIIaLLLLOOr"   r   r2   r3   r4   r   r1   r:   r;   s   @r!   r  r  C  sW        
 	
E 
E 
E 
E 
E 
E      r"   r  c                   4     e Zd Z	 	 	 	 	 	 	 d fd	Zd Z xZS )	Convlayerr   Tr   r^   c                     t                                                       t          |||||||||	|

  
        | _        d S )N)ru   r   r   r,   r   rc   rh   )r   r   r   rl   )r   r   rn   rt   ru   r   r   r,   r   rc   rh   r    s              r!   r   zConvlayer.__init__X  sO     	K{6\d"(thTZ`b b b			r"   c                 ,    |                      |          S r   )rl   r/   s     r!   r1   zConvlayer.forwardi  s    yy||r"   )r   r   r   Tr   r^   Tr  r;   s   @r!   r  r  W  si         b b b b b b"      r"   r  c                   *     e Zd Z	 	 d fd	Zd Z xZS )Block1D   r   rl   ư>c                    t                                                       |                    dd          dk    rUt          ||                    dd                    | _        t          ||                    dd                    | _        nn|                    dd          dk    rTt          ||                    dd                    | _        t          ||                    dd                    | _        |dk    rt          |||                    dd	          ||                    d
d          |                    dd          |                    dd          |                    dd                    | _        n|dk    rmt          |||||                    d
d          |                    dd          |                    dd          |                    dd                    | _        nt          d|           t          ||                    dd          |z  |                    dd                    | _        |dk    rt          j                    nt          j                            |          | _        |dk    rbt          j        |t%          j        |          z  d          | _        t          j        |t%          j        |          z  d          | _        d S d | _        d | _        d S )N	layernormLNr-   r$  r-   r=   rl   r   r   r   r   rc   r]   rh   Tr,   )r   rt   r   rc   rh   r,   depthwise_convzUnsupported mixer layer: ffn_expansion   Fr  r   r   )requires_grad)r   r   r   r   rc   ffn_normrX   r  mixerro   r  ffnr'   rq   rk   DropPath	drop_pathrB   r8   rC   gamma	ffn_gamma)r   r?   rt   r1  mixer_layerlayer_scale_init_valuer   r    s          r!   r   zBlock1D.__init__m  s   ::k4((D00%cvzz%/F/FGGGDI)#6::eT3J3JKKKDMMZZY//9<<#CVZZt-D-DEEEDI'E41H1HIIIDM&  "3FJJx4K4K,7)/J	)J)J%+ZZ%?%?'-zz(D'A'A%+ZZ%=%=" " "DJJ ,,,"3C,7)/J	)J)J%+ZZ%?%?'-zz(D'A'A%+ZZ%=%=" " "DJJ FFFGGGJJ**S0FE**
 
 

 +4r//rz?R?RS\?]?]!A%%&<uz3?P?P&P`deeeDJ\*@5:sCTCT*TdhiiiDNNNDJ!DNNNr"   c                    |}|                      |          }|                     |          }| j        || j                            d          z  }||                     |          z   }|}|                     |          }|                    ddd          }|                     |          }|                    ddd          }| j        || j                            d          z  }||                     |          z   }|S )NrH   r   r%   r   )	rc   r.  r2  	unsqueezer1  r-  permuter/  r3  )r   r0   residuals      r!   r1   zBlock1D.forward  s    IIaLLJJqMM:!DJ((,,,At~~a((( MM!IIaAHHQKKIIaA>%DN,,R000At~~a(((r"   )r#  r   rl   r$  r  r;   s   @r!   r"  r"  l  sS        EK'+*" *" *" *" *" *"X      r"   r"  c                   2     e Zd ZdZ fdZddZddZ xZS )TokenizerEncoderz
    Encoder component for the VibeVoice tokenizer that converts audio to latent representations.
    
    Args:
        config: Configuration object with model parameters
    c                 h   t                                                       |j        | _        |j        | _        |j        | _        t          t          |j                            | _        |j        | _        t          |dd          | _
        t          j        | j                  | _        |j        | _        t          |dd          }t          |dd          }t          |dd          }t          |di           }t          |d	d
          }t          |dd          }t          |dd          }t          |dd          }	t          |dd          }
t          |dd          }t          |dd          }t          |dd          }t          |dd          }|dk    rt          }n/|dk    rt!          t"          |
          }nt%          d|           t'          j        t+          | j        | j        |||| j        ||                    }t'          j                    | _        | j                            |           t3          t5          | j                            D ]}| j        d|z  z  | j        d|dz   z  z  }t'          j        t+          || j        |         dz  | j        |         | j        |||                    }| j                            |           t!          t6          |||	| j        |||| 	  	        t'          j                    | _        d! t;          j        d|t?          | j                            D             dt3          t5          | j                            D ]n}| j        d|z  z  t'          j        fd"t3          | j        |                   D              }| j                            |           | j        |         z  o|s ||	#          | _         nt'          j!                    | _         t+          | j        || j        |||$          | _"        d S )%Nn_residual_layersr   rt   r#  last_kernel_sizerc   r]   norm_paramsr   r   r,   Tr&  r'  layernorm_epsr$  layernorm_elementwise_affinedrop_path_rater   r4  rl   r5  r   disable_last_normFr=   rA   Unsupported norm type: rc   rr   rh   r   r,   r%   )rt   ru   rh   r   rc   r,   r4  r&  r-   rh   r   rc   r,   r5  c                 6    g | ]}|                                 S r   itemr   r0   s     r!   r   z-TokenizerEncoder.__init__.<locals>.<listcomp>       ZZZAFFHHZZZr"   c                 <    g | ]} |z                       S )r?   r1  r   r   jcurdp_ratesin_ch
layer_types     r!   r   z-TokenizerEncoder.__init__.<locals>.<listcomp>  1    ddd**(372CDDDdddr"   r(  rt   rh   r   rc   r,   )#r   r   channels	dimension	n_filtersr   reversedratiosdepthsgetattrr=  npprod
hop_lengthrh   r   r   rX   ro   r'   
Sequentialr   
ModuleListdownsample_layersr   ranger   r"  stagesr8   linspacesumrc   rq   head)r   configrt   r>  rc   r?  r   r,   r&  r@  rA  rB  r4  r5  rC  r   stemr   out_chdownsample_layerstagerQ  rR  rS  rT  r    s                        @@@@r!   r   zTokenizerEncoder.__init__  sh    ))8FM2233m!(1Da!H!H'$+..m fmQ77"6+=qAAvvv..fmR886:y99vvt,,FK66	>>'.v7UW['\'\$ )93??fmV<<!(1I1!M!M#F,?GG %II)##@\]]]IIByBBCCC }t~{[fosoz  FN  UY  Z  Z  Z  "$%%d+++s4;''(( 	< 	<ANa1f-E^qQU|4F!}v4;q>A3EdkZ[neiep  |D  KO  VZ  [  [  [    "))*:;;;; #;#9

 

 


 mooZZenQDKHXHX&Y&YZZZs4;''(( 	" 	"ANa1f-EMdddddddeTXT_`aTbNcNcdddE Ku%%%4;q>!CC  	&!	%];;;DIIDIE4>?OX\Xcnv  ~B  IM  N  N  N			r"   NFc           	         t          t          | j                            D ]}| j        |         D ]3}t	          |t
                    r ||||||          }( ||          }4| j        |         D ]8}t          |d          rt          |j        d          rt	          |j        j	        t
                    r|}	|
                    |          }|j        	                    |||||          }|j        ||j                            d          z  }|	|z   }|}	|                    |          }|                    ddd          }|                    |          }|                    ddd          }|j        ||j                            d          z  }|	|z   }- ||          }:| 
                    |          S Nr   r   r   r   r.  rl   rH   r   r%   r   )rd  r   r\  rc  rj   r   re  hasattrr.  rl   rc   r2  r7  r-  r8  r/  r3  
r   r0   r   r   r   r   r   layerblockr9  s
             r!   forward_featuresz!TokenizerEncoder.forward_features  s   s4;''(( 	! 	!A/2 ! !eW-- !au^W`hmnnnAAaAA Q ! !5'** !wu{F/K/K !PZ[`[f[kmtPuPu ! H

1A((%bksx(yyA{. 5 5b 9 99 1A  !Hq))A		!Q**A		!A		!Q**A2 9 9" = == 1AAaAA)!, yy||r"   c                 n    |                      |||||          }|                     |||||          }|S Nrp  ru  rh  r   r0   r   r   r   r   s         r!   r1   zTokenizerEncoder.forward*  C    !!!5[dlq!rrIIau^y`eIffr"   r   r2   r3   r4   r5   r   ru  r1   r:   r;   s   @r!   r;  r;    sv         PN PN PN PN PNd       D       r"   r;  c                   2     e Zd ZdZ fdZddZddZ xZS )TokenizerDecoderz
    Decoder component for the VibeVoice tokenizer that converts latent representations back to audio.
    
    Args:
        config: Configuration object with model parameters
    c                 	   t                                                       |j        | _        |j        | _        |j        | _        |j        | _        |j        | _        t          |dd          | _        t          j
        | j                  | _        |j        | _        t          |dd          }t          |dd          }t          |dd          }t          |di           }t          |d	d
          }t          |dd          }t          |dd          }t          |dd          }	t          |dd          }
t          |dd          }t          |dd          }t          |dd          }t          |dd          }t          |dd          }|dk    rt          }n/|dk    rt          t          |          }nt!          d|           t#          j        t'          | j        | j        dt)          | j                  dz
  z  z  |||| j        ||                     }t#          j                    | _        | j                            |           t1          t)          | j                            D ]}| j        dt)          | j                  dz
  |z
  z  z  | j        dt)          | j                  dz
  |z
  dz
  z  z  }t#          j        t3          || j        |         dz  | j        |         |||| j        |
!	  	                  }| j                            |           t          t4          |||	| j        ||||"	  	        t#          j                    | _        d# t9          j        d|t=          | j                            D             dt1          t)          | j                            D ]}| j        dt)          | j                  dz
  |z
  z  z  t#          j        fd$t1          | j        |                   D              }| j                            |           | j        |         z  |s ||	%          | _        nt#          j                     | _        t'          | j        || j        |||&          | _!        d S )'Nr=  r   rt   r#  r>  rc   r]   r?  r   r   r,   Tr&  r'  r@  r$  r  r  rA  rB  r   r4  rl   r5  r   rC  Fr=   rD  rE  r%   rF  )rt   ru   rc   rr   r,   rh   r  rG  c                 6    g | ]}|                                 S r   rI  rK  s     r!   r   z-TokenizerDecoder.__init__.<locals>.<listcomp>  rL  r"   c                 <    g | ]} |z                       S rN  r   rO  s     r!   r   z-TokenizerDecoder.__init__.<locals>.<listcomp>  rU  r"   r(  rV  )"r   r   rX  rW  rY  r[  r\  r]  r=  r^  r_  r`  rh   r   r   rX   ro   r'   ra  r   r   rb  upsample_layersr   rd  r  r"  re  r8   rf  rg  rc   rq   rh  )r   ri  rt   r>  rc   r?  r   r,   r&  r@  r  rA  rB  r4  r5  rC  r   rj  r   rk  upsample_layerrm  rQ  rR  rS  rT  r    s                         @@@@r!   r   zTokenizerDecoder.__init__7  s     ))m m!(1Da!H!H'$+..m fmQ77"6+=qAAvvv..fmR886:y99vvt,,FK66	>>"6+=sCC'.v7UW['\'\$ )93??fmV<<!(1I1!M!M#F,?GG %II)##@\]]]IIByBBCCC }s4;?O?ORS?S9T(TVahl$/h]ac c c 
  "}##D)))s4;''(( 		8 		8ANaC,<,<q,@1,D&EFE^qS-=-=-AA-E-I'JKF] ,0KNQ,>t{ST~%){'+{EUW W W N  ''7777 #;#9

 

 


 mooZZenQDKHXHX&Y&YZZZ s4;''(( 	" 	"ANaC,<,<q,@1,D&EFEMdddddddeTXT_`aTbNcNcdddE Ku%%%4;q>!CC  	&!	%];;;DIIDIE4=>NW[Wbmu  }A  HL  M  M  M			r"   NFc           	         t          t          | j                            D ]}| j        |         D ]:}t	          |t
          t          f          r ||||||          }/ ||          };| j        |         D ]8}t          |d          rt          |j	        d          rt	          |j	        j
        t
                    r|}	|                    |          }|j	        
                    |||||          }|j        ||j                            d          z  }|	|z   }|}	|                    |          }|                    ddd          }|                    |          }|                    ddd          }|j        ||j                            d          z  }|	|z   }- ||          }:|                     |          S ro  )rd  r   r\  r  rj   r   r  re  rq  r.  rl   rc   r2  r7  r-  r8  r/  r3  rr  s
             r!   ru  z!TokenizerDecoder.forward_features  s   s4;''(( 	! 	!A-a0 ! !eg/?%@AA !au^W`hmnnnAAaAA Q ! !5'** !wu{F/K/K !PZ[`[f[kmtPuPu ! H

1A((%bksx(yyA{. 5 5b 9 99 1A  !Hq))A		!Q**A		!A		!Q**A2 9 9" = == 1AAaAA)!, yy||r"   c                 n    |                      |||||          }|                     |||||          }|S rw  rx  ry  s         r!   r1   zTokenizerDecoder.forward  rz  r"   r   r{  r;   s   @r!   r}  r}  0  sv         YM YM YM YM YMv       D       r"   r}  c                   p    e Zd ZU dZej        ed<   dZee	e
ej        f                  ed<   d	dZd Zd ZdS )
VibeVoiceTokenizerEncoderOutputa  
    Output of VibeVoice tokenizer encoder, representing a Gaussian distribution with fixed variance.
    
    Args:
        mean (`torch.FloatTensor`): The mean parameters of the distribution.
        std (`float` or `torch.FloatTensor`): Fixed standard deviation value.
    rL   Nstdfixc                 t   |dk    r2| j         | j        t          j        | j                   z  z   }|| j        fS |dk    r| j                             d          }| j        dz  }t          j        || j         j        | j         j                  |z  }|                                | j                                         k     rD|	                    d          }|                                | j                                         k     D| j         |t          j        | j                   z  z   }||fS | j         | j        fS )aK  
        Sample from the distribution.
        
        Args:
            dist_type (`str`): Sampling method, either 'fix' or 'gaussian'.
                
        Returns:
            `torch.FloatTensor`: Sampled values.
            `torch.FloatTensor` (optional): Standard deviation used (only when dist_type='gaussian').
        r  gaussianr   g?r   rH   )
rL   r  r8   
randn_likesizerandnr   r   r?   r7  )r   	dist_typer0   
batch_sizer   r  s         r!   samplez&VibeVoiceTokenizerEncoderOutput.sample  s    	DHu'7	'B'BBBAdh;*$$**JHsNE+j1AYYY\aaC''))dimmoo--mmB'' ''))dimmoo-- 	C%"249"="===Ac6M9dh&&r"   c                 l    t          j        | j                  }t          j        | j        |d          S )zFCompute KL divergence between this distribution and a standard normal.r]   )	reduction)r8   r   rL   r   mse_loss)r   targets     r!   klz"VibeVoiceTokenizerEncoderOutput.kl  s-    !$),,z$)Vv>>>>r"   c                     | j         S )z>Return the distribution mode (which is the mean for Gaussian).rL   rS   s    r!   r   z$VibeVoiceTokenizerEncoderOutput.mode  s
    yr"   )r  )r2   r3   r4   r5   r8   r   __annotations__r  r   r
   r*   r  r  r   r   r"   r!   r  r    s           ,04C%u|+,	-444' ' ' '6? ? ?
    r"   r  c                        e Zd ZdZeZdZdZdZddgZ	 fdZ
d Z ej                    dd
            Z ej                    dd            Z ej                    dd            ZddZ xZS )VibeVoiceAcousticTokenizerModelzRVibeVoice speech tokenizer model combining encoder and decoder for acoustic tokensvibevoice_acoustic_tokenizerTr;  r}  c                 $   t                                          |           |                     dt          j        |j                  d           t          |dd          | _        t          |j	        t                    r%d |j	                            d          D             }n|j	        }|j        ?t          |j        t                    r%d |j                            d          D             }nt          t          |                    }t          j        |          }|j        |_        |j        |_        |j        |_        ||_        |j        |_        |j        |_        |j        |_        |j        |_        |j        |_        |j        |_        |j         |_         |j!        |_!        t          j        |          }|j        |_        |j"        |_        |j#        |_        ||_        |j        |_        |j        |_        |j        |_        |j        |_        |j        |_        |j        |_        |j         |_         |j!        |_!        tI          |          | _%        tM          |          | _'        | (                    | j)                   d S )	Nfix_stdF)
persistentstd_dist_typer  c                 ,    g | ]}t          |          S r   r7   r   ds     r!   r   z<VibeVoiceAcousticTokenizerModel.__init__.<locals>.<listcomp>      OOOc!ffOOOr"   -c                 ,    g | ]}t          |          S r   r  r  s     r!   r   z<VibeVoiceAcousticTokenizerModel.__init__.<locals>.<listcomp>  r  r"   )*r   r   register_bufferr8   tensorr  r]  r  rj   encoder_depthsrV   splitdecoder_depthsr   rZ  copydeepcopyvae_dimrX  encoder_n_filtersrY  encoder_ratiosr[  r\  	conv_normrc   r   	conv_biasr,   r@  rA  r4  r5  rC  decoder_n_filtersdecoder_ratiosr;  encoderr}  decoderapply_init_weights)r   ri  r  r  encoder_configdecoder_configr    s         r!   r   z(VibeVoiceAcousticTokenizerModel.__init__  sT      YV^(D(DQVWWW$V_eDD f+S11 	3OOf.C.I.I#.N.NOOONN#2N  ,F<QSV1W1W,OOf.C.I.I#.N.NOOONN "(>":":;;N v..#)> #)#;  & 5 .$."(/$.'-';$6<6Y3%+%7"060M-+1+C( v..#)> #)#;  & 5 .$."(/$.'-';$6<6Y3%+%7"060M-+1+C( (77'77 	

4%&&&&&r"   c                    t          |t          j                  r_t          j                            |j        | j        j                   |j        &t          j        	                    |j                   dS dS t          |t          j
                  rJt          j                            |j                   t          j        	                    |j                   dS t          |t          j                  r]t          j                            |j        | j        j                   |j        (t          j        	                    |j                   dS dS dS z Initialize weights for the model)r  Nrj   r'   r  initnormal_r+   ri  weight_init_valuer,   zeros_	LayerNormones_r   r   rb   s     r!   r  z-VibeVoiceAcousticTokenizerModel._init_weights+     fbi(( 
	,GOOFMt{/LOMMM{&v{+++++ '&-- 	,GMM&-(((GNN6;'''''	** 	,GOOFMt{/LOMMM{&v{+++++	, 	,&&r"   NFc                     |                      |||||          }t          |                    ddd          | j                  S )'Convert audio to latent representationsrp  r   r%   r   )rL   r  )r  r  r8  r  r   audior   r   r   r   latentss          r!   encodez&VibeVoiceAcousticTokenizerModel.encode9  sH     ,,uE.\emr,ss.GOOAq!4L4LRVR^____r"   c                     |p| j         }|dk    r|                    d          S |dk    r|                    d          S t          d| d          )+Sample from the encoder output distributionr  r  r  zUnsupported dist_type: z, expected 'fix' or 'gaussian')r  r  ro   r   encoder_outputr  s      r!   samplingz(VibeVoiceAcousticTokenizerModel.sampling?  so     3!3	!((5(999*$$!((:(>>>`y```aaar"   c                     |j         d         | j        j        k    rn|                    ddd          }|                     |||||          }|S )z,Convert latent representations back to audior   r   r%   rp  )r\   ri  r  r8  r  )r   r  r   r   r   r   r  s          r!   decodez&VibeVoiceAcousticTokenizerModel.decodeK  sU     =t{222ooaA..GWE.\emrssr"   c                     |                      |||||          }|                     |          \  }}|                     |||||          }	|	|fS )EFull forward pass: encode audio to latents, then decode back to audiorp  )r  r  r  )
r   r  r   r   r   r   r  sampled_latents_reconstructeds
             r!   r1   z'VibeVoiceAcousticTokenizerModel.forwardV  sh    U%bksxyy!]]>::O5Q_kt  }B  C  Co--r"   r   r   )r2   r3   r4   r5   r   config_classbase_model_prefix_supports_flash_attn_2_supports_sdpa_no_split_modulesr   r  r8   no_gradr  r  r  r1   r:   r;   s   @r!   r  r    s       \\3L6!N+-?@6' 6' 6' 6' 6'p, , , U]__` ` ` _`
 U]__	b 	b 	b _	b U]__   _. . . . . . . .r"   r  c                        e Zd ZdZeZdZdZdZdgZ	 fdZ
d Z ej                    dd	            Z ej                    dd
            ZddZ xZS )VibeVoiceSemanticTokenizerModelzFVibeVoice speech tokenizer model with only encoder for semantic tokensvibevoice_semantic_tokenizerTr;  c                 p   t                                          |           t          |j        t                    r%d |j                            d          D             }n|j        }t          j        |          }|j        |_	        |j
        |_        |j        |_        ||_        |j        |_        |j        |_        |j        |_        |j        |_        |j        |_        |j        |_        |j        |_        |j        |_        t3          |          | _        |                     | j                   d S )Nc                 ,    g | ]}t          |          S r   r  r  s     r!   r   z<VibeVoiceSemanticTokenizerModel.__init__.<locals>.<listcomp>l  r  r"   r  )r   r   rj   r  rV   r  r  r  r  rX  r  rY  r  r[  r\  r  rc   r   r  r,   r@  rA  r4  r5  rC  r;  r  r  r  )r   ri  r  r  r    s       r!   r   z(VibeVoiceSemanticTokenizerModel.__init__g  s       f+S11 	3OOf.C.I.I#.N.NOOONN#2N v..#)> #)#;  & 5 .$."(/$.'-';$6<6Y3%+%7"060M-+1+C( (77 	

4%&&&&&r"   c                    t          |t          j                  r_t          j                            |j        | j        j                   |j        &t          j        	                    |j                   dS dS t          |t          j
                  rJt          j                            |j                   t          j        	                    |j                   dS t          |t          j                  r]t          j                            |j        | j        j                   |j        (t          j        	                    |j                   dS dS dS r  r  r  s     r!   r  z-VibeVoiceSemanticTokenizerModel._init_weights  r  r"   NFc                     |                      |||||          }t          |                    ddd                    S )r  rp  r   r%   r   r  )r  r  r8  r  s          r!   r  z&VibeVoiceSemanticTokenizerModel.encode  sB     ,,uE.\emr,ss.GOOAq!4L4LMMMMr"   c                 .    |                     d          S )r  r]   r  )r  r  s      r!   r  z(VibeVoiceSemanticTokenizerModel.sampling  s     $$v$666r"   c                 r    |                      |||||          }|                     |d          \  }}d|fS )r  rp  r]   r  N)r  r  )	r   r  r   r   r   r   r  r  r  s	            r!   r1   z'VibeVoiceSemanticTokenizerModel.forward  sD    U%bksxyy!]]>V]LL_$$r"   r   r   )r2   r3   r4   r5   r   r  r  r  r  r  r   r  r8   r  r  r  r1   r:   r;   s   @r!   r  r  ^  s        PP3L6!N+,' ' ' ' '<, , , U]__N N N _N
 U]__7 7 7 _7% % % % % % % %r"   r  )r   r  r  )r]   )Fr]   r   )r~   r   )Nrx   typingr6   	functoolsr   dataclassesr   r   r   r   r   r	   r
   r  numpyr^  r8   torch.nnr'   torch.nn.functionalr(   r   transformers.models.autor    transformers.configuration_utilsr   transformers.utilsr   transformers.modeling_utilsr   transformers.activationsr   configuration_vibevoicer   r   
get_loggerr2   loggeros#apex.normalization.fused_layer_normr   r[   infor7   getenvwarningImportErrorr  r   Moduler=   rX   	frozensetre   rV   rg   r   rs   r   r}   r*   r   r   r   r   r   r   r  r  r  r"  r;  r}  r  r  r  register__all__r   r"   r!   <module>r      s              ( ( ( ( ( ( ( ( 5 5 5 5 5 5 5 5 5 5 5 5 5 5                      . . . . . . = = = = = = & & & & & & 7 7 7 7 7 7 + + + + + + g g g g g g g g		H	%	% 					SIIIIIIN
KKRSSS
s929)3//00A55cddd S S SN
NNQRRRRRS    BL   _ _ _ _ _bi _ _ _0    '   "  i !T !T !T U U 	 	ry 	 	 	 	 	 	 BI t 3 cecl    $ 67! !EL !s !C !/2!;>! ! ! !/ /U\ /RXc3h%7 /s /TY / / / /$%u| %rxS'9 % % % %           ")   ?* ?* ?* ?* ?* ?* ?* ?*B` ` ` ` `bi ` ` `F[ [ [ [ [ry [ [ [|    ")   (    	   *@ @ @ @ @bi @ @ @F~ ~ ~ ~ ~ry ~ ~ ~BG G G G Gry G G GT - - - - - - - -^q. q. q. q. q.o q. q. q.hD% D% D% D% D%o D% D% D%L 	 35T U U U 	 35T U U U  s   ;AC C-,C-