o
    1i*                     @  s   d Z ddlmZ ddlZddlZddlZddlZddlmZ ddl	m
Z
 ddlZddlZddlmZ ddlm  mZ ddlZddlmZ ddlmZ eeZg dZeed	 ZdddZdddZdddZ eG dd dZ!dS )u  FastBiCodec: optimized BiCodec wrapper with lossless speedups.

Optimizations applied (identical tokens to original):
  Tier 1: Layer truncation — wav2vec2-xlsr-53 24→17 layers (only 11,14,16 read)
  Tier 2: TF32 for matmuls
  Tier 3: Proper tensor-in, tensor-out encode (no file path, no numpy roundtrip)
  Tier 4: SDPA monkey-patch for wav2vec2 attention layers

FP16 autocast removed: truncation alone gives 100% token match. FP16 caused
1.26% semantic token drift at VQ boundaries — not acceptable at scale.
    )annotationsN)Path)Optional)
TokenBatch)register_codec)            model	nn.ModulereturnNonec                 C  sB   | j }t|j}|jdt |_t| j_d| j_td|t dS )zBTruncate wav2vec2 encoder to 17 layers (only 11, 14, 16 are read).NTu4   Truncated wav2vec2-xlsr-53 encoder: %d → %d layers)	encoderlenlayersTRUNCATE_TOconfignum_hidden_layersoutput_hidden_statesloggerinfo)r   r   
original_n r   ;/home/ubuntu/bench-codecs/codecbench/codecs/bicodec_fast.py_apply_wav2vec2_truncation%   s   
r   c                 C  sH   d}|   D ]\}}t|j}|dkrt| |d7 }qtd| dS )zMMonkey-patch wav2vec2 attention layers to use F.scaled_dot_product_attention.r   Wav2Vec2Attentionr
   z8SDPA monkey-patch applied to %d Wav2Vec2Attention layersN)named_modulestype__name___patch_wav2vec2_attentionr   r   )r   patchednamemodulecls_namer   r   r   _apply_wav2vec2_sdpa/   s   
r%   attn_modulec                 C  s*   | j | _		ddd	d
}t|| | _ dS )uH   Replace manual matmul→softmax→matmul with SDPA in Wav2Vec2Attention.NFhidden_statestorch.Tensorattention_maskOptional[torch.Tensor]output_attentionsboolc                 [  s   |r| j |f||d|S | \}}}| |}| |}	| |}
|||| j| jdd}|	||| j| jdd}	|
||| j| jdd}
d }|d urV|}t	j
||	|
|dd}|dd||| j}| |}|d fS )N)r)   r+   r
      g        )	attn_mask	dropout_p)_original_forwardsizeq_projk_projv_projview	num_headshead_dim	transposeFscaled_dot_product_attentionreshape	embed_dimout_proj)selfr'   r)   r+   kwargsBS_querykeyvalue	attn_biasoutr   r   r   sdpa_forward>   s2   




z/_patch_wav2vec2_attention.<locals>.sdpa_forward)NF)r'   r(   r)   r*   r+   r,   )forwardr0   types
MethodType)r&   rH   r   r   r   r    :   s
   #r    c                   @  s   e Zd ZU dZdZded< dZded< d1d2ddZdej	fd3ddZ
d4d5ddZd6dd Zd7d#d$Zd8d&d'Ze d9d)d*Ze d:d,d-Zd;d/d0ZdS )<FastBiCodecCodecu   BiCodec with lossless optimizations: layer truncation, FP16, SDPA.

    Drop-in replacement for BiCodecCodec. Accepts tensor input directly
    instead of file paths — no numpy CPU roundtrip.
    bicodec_faststrr"   >  int	native_srN	model_dir
str | Nonec                 C  s0   || _ d | _d | _d | _d | _d| _tj| _d S )Ncpu)	
_model_dir_bicodec
_w2v_model_w2v_processor_config_devicetorchfloat32_dtype)r>   rR   r   r   r   __init__o   s   zFastBiCodecCodec.__init__cudadevicedtypetorch.dtyper   r   c                 C  sp  || _ || _| jd u rtdt| j}|d  r$tjdt	| n|j
d d  r9tjdt	|j
d  ddlm} ddlm} ddlm}m} || j d| _|| j d	|| _|| j d
| _|| j d
|| _d| jj_| j  t| j dtjjj _!dtjj"_!t#| j t$| jd | jd  | jd  | jd  | _%t&'d| d S )NzCFastBiCodec requires model_dir pointing to Spark-TTS-0.5B checkout.sparkttsr   z	Spark-TTS)load_config)BiCodec)Wav2Vec2FeatureExtractorWav2Vec2Modelz/config.yamlz/BiCodecz/wav2vec2-large-xlsr-53Tsample_rateref_segment_durationlatent_hop_lengthz/FastBiCodec loaded on %s with all optimizations)(rZ   r]   rU   RuntimeErrorr   existssyspathinsertrN   parentsparktts.utils.filerd   sparktts.models.bicodecre   transformersrf   rg   rY   load_from_checkpointtorV   from_pretrainedrX   rW   r   r   evalr   r[   backendsr_   matmul
allow_tf32cudnnr%   rP   _ref_segment_lengthr   r   )r>   r`   ra   
spark_rootrd   re   rf   rg   r   r   r   loadx   sR   









zFastBiCodecCodec.load      @r
   batch_secondsfloat
batch_sizec              	   C  s   | j d us	J dt|| j }tj|d|| jd}tdD ]"}t  | || j}| 	|}W d    n1 s;w   Y  qtj
  td d S )NzCall load() firstr
   r`      zFastBiCodec warmup complete)rV   rP   rQ   r[   randnrZ   rangeinference_modeencodedecoder_   synchronizer   r   )r>   r   r   	n_samplesdummyrB   tbr   r   r   warmup   s   

zFastBiCodecCodec.warmupwavr(   src                 C  s    || j krtj||| j }|S N)rQ   
torchaudio
functionalresample)r>   r   r   r   r   r   _resample_if_needed   s   
z$FastBiCodecCodec._resample_if_neededwav_1d
np.ndarrayc                 C  sR   | j }t|}||krt||| d }|d| }t|d | j	S )zDExtract reference clip for speaker encoder (same logic as original).r
   Nr   )
r|   r   nptiler[   
from_numpy	unsqueezer   ru   rZ   )r>   r   ref_lenwav_lenref_npr   r   r   _get_ref_clip   s   zFastBiCodecCodec._get_ref_clipwav_npc                 C  sN   | j |dddddj}| || j}|jd |jd  |jd  d }|S )	zCExtract wav2vec2 features with truncated model (fp32, no autocast).rO   ptT)sampling_ratereturn_tensorspaddingr   r   r   r	      )rX   input_valuesrW   ru   rZ   r'   )r>   r   inputsfeat	feats_mixr   r   r   _extract_features   s   z"FastBiCodecCodec._extract_featuresr   c                 C  sP  |  ||| j}|jd }g }g }t|D ]B}||    }| |}| 	|}	t
|d | j||	| jd}
| j|
\}}||  ||  qtdd |D }t
j||t
j| jd}t|D ]\}}| ||d|jd f< qrt
jdd	 |D dd
}t| j| j||ddd	 |D |jd ddS )z=Encode [B, 1, T] -> TokenBatch with semantic + global tokens.r   )r   ref_wavr   c                 s      | ]}|j d  V  qdS Nshape.0sr   r   r   	<genexpr>       z*FastBiCodecCodec.encode.<locals>.<genexpr>)ra   r`   Nr   c                 S  s   g | ]}|  qS r   )flatten)r   gr   r   r   
<listcomp>   s    z+FastBiCodecCodec.encode.<locals>.<listcomp>dim)semanticglobalc                 S  s   g | ]}|j d  qS )r   r   r   r   r   r   r      s    )semantic_lengthsglobal_token_dim)
codec_namerh   tokensaux)r   ru   rZ   r   r   squeezerT   numpyr   r   r[   r   r   r   rV   tokenizeappendmaxzeroslong	enumerater   stackr   r"   rQ   )r>   r   r   r@   semantic_listglobal_listi	wav_1d_npr   r   batch
sem_tokensglob_tokensmax_sem_len
padded_semr   global_tokensr   r   r   r      s<   



zFastBiCodecCodec.encoder   c                 C  sL  |j d | j}|j d | j}|jd }g }t|D ]Y}|jd|jd g| }||d|f d}|| d}	| j	||	d}
t
|
tjrUt|
 }
|
jdkrc|
dd}
n
|
jdkrm|
d}
||
| j qtd	d
 |D }tj|d|| jd}t|D ]\}}|d||ddd|jd f< q|S )z-Decode semantic + global tokens -> [B, 1, T].r   r   r   r   r   Nr
   r-   c                 s  r   r   r   )r   ar   r   r   r     r   z*FastBiCodecCodec.decode.<locals>.<genexpr>r   )r   ru   rZ   r   r   r   getr   rV   
detokenize
isinstancer   ndarrayr[   r   r   ndimr   r   r   r   r   )r>   r   r   
global_tokr@   
audio_listr   sem_lensem_iglob_iwav_recmax_lenresultr   r   r   r   r     s*   



$zFastBiCodecCodec.decodetorch.LongTensorc                 C  sB   |j d }|j d }|  d }|| }tj||gdd S )z7Concatenate semantic + global tokens with vocab offset.r   r   r
   r   r   )r   r   itemr[   catr   )r>   r   r   r   offsetglobal_offsetr   r   r   flatten_for_lm  s
   

zFastBiCodecCodec.flatten_for_lmr   )rR   rS   )r`   rN   ra   rb   r   r   )r   r
   )r   r   r   rP   r   r   )r   r(   r   rP   r   r(   )r   r   r   r(   )r   r   r   r(   )r   r(   r   rP   r   r   )r   r   r   r(   )r   r   r   r   )r   
__module____qualname____doc__r"   __annotations__rQ   r^   r[   r\   r~   r   r   r   r   r   r   r   r   r   r   r   r   rL   d   s   
 	4


	*rL   )r   r   r   r   )r&   r   r   r   )"r   
__future__r   loggingrm   rJ   mathpathlibr   typingr   r   r   r[   torch.nnnntorch.nn.functionalr   r9   r   codecbench.codecs.baser   codecbench.codecsr   	getLoggerr   r   REQUIRED_HIDDEN_LAYERSr   r   r   r%   r    rL   r   r   r   r   <module>   s.    




*