o
    1i>                     @  s   d Z ddlmZ ddlZddlZddlZddlmZ ddlm	Z	 ddl
ZddlZddlmZ ddlm  mZ ddlZddlmZ ddlmZ eeZdZG d	d
 d
ejZdd ZdddZdddZeG dd dZ dS )u  FastXCodec2: optimized XCodec2 wrapper with all safe (lossless) speedups.

Optimizations applied (all produce identical VQ codes to original):
  Tier 1: Layer truncation — only run 16/24 encoder layers (hidden_states[16] = last_hidden_state)
  Tier 2: GPU mel extraction — replaces CPU numpy SeamlessM4TFeatureExtractor
  Tier 3: Batched encode — full batch through all stages (original breaks at wav[0,:])
  Tier 4: TF32 for matmuls
  Tier 5: Drop attention_mask for uniform-length inputs
  Tier 6: SDPA monkey-patch for attention (F.scaled_dot_product_attention)
  Tier 7: torch.compile with reduce-overhead mode

FP16 autocast removed: it caused 0.15% additional token drift at VQ boundaries
beyond the mel extraction floor. Not worth the 7% speed gain at scale.
    )annotationsNOrderedDict)Optional)
TokenBatch)register_codec   c                      s6   e Zd ZdZdd fddZe dd
dZ  ZS )GPUMelExtractoru  GPU reimplementation of SeamlessM4TFeatureExtractor.

    Matches the exact Kaldi-style pipeline: scale → frame → DC removal →
    preemphasis → windowing → FFT → power → mel → log → per-mel-bin normalize
    → stride-2 reshape.  All on GPU, batched.
    cudadevicestrc                   s   t    d| _d| _d| _|j| _|j| _d| _d| _d| _	t
|j  }| d| t
|j  }| d| d S )	Ni     i   g
ףp=
?gi  >i   mel_filterswindow)super__init__frame_length
hop_length
fft_lengthnum_mel_binsstridepreemphasis	mel_floorkaldi_scaletorch
from_numpyr   copyfloatregister_bufferr   )selfhf_extractorr   r   r   	__class__ ;/home/ubuntu/bench-codecs/codecbench/codecs/xcodec2_fast.pyr   .   s*   
zGPUMelExtractor.__init__	wav_batchtorch.Tensorreturnc                 C  st  |j d }| | j }|jd| j| jd}||jddd }tj|dddf dd	d
}|| j	|  }|d d| j	  |d< || j
 }t|d| j| j f}tjj|| jd}|j |j  }t|| j}	tj|	| jd}	t|	}
|
jddd}|
jdddd}|
| t|d  }|j d }||| j  }|ddd|ddf }|||| j | j| j }|S )u  Extract mel features on GPU, matching HF extractor exactly.

        Args:
            wav_batch: [B, T] waveform tensor on GPU, already padded with (160, 160).

        Returns:
            features: [B, T_frames, 160] — stride-2 reshaped, normalized log-mel.
        r   )	dimensionsizestepT)dimkeepdim.N)   r           )value).r   g      ?)n)minr.   )r,   r-   
correctiongHz>)shaper   r   unfoldr   r   meanFpadr   r   r   r   fftrfftrealsquareimagmatmulr   clampr   logvarsqrtr   reshaper   )r   r%   Bwavframesshiftedpaddedspectrumpowermellog_melr7   rB   
normalizedT_framesfeaturesr#   r#   r$   forwardC   s,   




zGPUMelExtractor.forward)r
   )r   r   )r%   r&   r'   r&   )	__name__
__module____qualname____doc__r   r   no_gradrQ   __classcell__r#   r#   r!   r$   r	   &   s
    r	   c                   s    t   			dd fd
d}|S )u   Build SDPA replacement forward for Wav2Vec2BertSelfAttention.

    Replaces manual matmul→softmax→matmul with F.scaled_dot_product_attention,
    passing the relative_key position bias as attn_mask. Mathematically identical.
    NFhidden_statesr&   attention_maskOptional[torch.Tensor]relative_position_embeddingsoutput_attentionsboolc                   s  |r
|  ||||S | \}}}|}| jdkr&|d u r td| ||}| |||| j| j	dd}	| 
|||| j| j	dd}
| |||| j| j	dd}d }| jdkrp|d u rhtd|  ||||S | jdkr|t|jf}ttdottjd	otj  }|r| v r |}| |< nItj||jd
dd}tj||jd
dd}|| }t|| j | j}| || j }|r| |< t tkr jdd\}}~t tks|j|	jd}td|	|t| j }|}|d ur|d ur|| }n|}tj j!"tj j!j#j$tj j!j#j%g t&j'|	|
||dd}W d    n	1 s4w   Y  |	dd(||| j| j }| )|}|d fS )Nrotaryz0relative_position_embeddings required for rotaryr.   r   relativez2relative_position_embeddings required for relativerelative_keycompileris_compilingr   r(   F)last)dtypezbhld,lrd->bhlrr/   )	attn_mask	dropout_p)*_original_forwardr*   position_embeddings_type
ValueError_apply_rotary_embeddinglinear_qview	num_heads	head_size	transposelinear_klinear_vr   r   hasattrr   ra   rb   poparanger@   left_max_position_embeddingsright_max_position_embeddingsdistance_embeddinglenREL_POS_CACHE_MAX_ENTRIESpopitemtore   einsummathrC   nn	attentionsdpa_kernel
SDPBackendEFFICIENT_ATTENTIONMATHr8   scaled_dot_product_attentionrD   
linear_out)r   rX   rY   r[   r\   rE   S_query_key_statesquerykeyr0   	attn_bias	cache_key	use_cache	pos_embedposition_ids_lposition_ids_rdistanceoldrel_biasout_cached_pos_embedr#   r$   sdpa_forward   s   
"""












z(_make_sdpa_forward.<locals>.sdpa_forward)NNF)rX   r&   rY   rZ   r[   rZ   r\   r]   r   )original_self_attnr   r#   r   r$   _make_sdpa_forward|   s   Zr   model	nn.Moduler'   Nonec                 C  sP   |   D ]\}}t|j}|dkr |j|_t|}t|||_qt	d dS )z?Monkey-patch all Wav2Vec2BertSelfAttention modules to use SDPA.Wav2Vec2BertSelfAttentionz=SDPA monkey-patch applied to Wav2Vec2BertSelfAttention layersN)
named_modulestyperR   rQ   rh   r   types
MethodTypeloggerinfo)r   namemodulecls_namepatchedr#   r#   r$   _apply_sdpa_patch   s   
r   semantic_modelc                 C  s@   | j }t|j}|jdd |_d| j_d| j_td| dS )u   Truncate wav2vec2-bert encoder to 16 layers.

    hidden_states[16] from the 24-layer model = last_hidden_state of the
    16-layer model. Zero quality loss — layers 17-23 are never read by XCodec2.
    N   Fu1   Truncated wav2vec2-bert encoder: %d → 16 layers)encoderry   layersconfignum_hidden_layersoutput_hidden_statesr   r   )r   r   
original_nr#   r#   r$   _apply_layer_truncation   s   
r   c                   @  s   e Zd ZU dZdZded< dZded< d-d.d
dZdej	fd/ddZ
d0d1ddZd2ddZd3d!d"Ze d4d$d%Ze d5d'd(Zd6d*d+Zd,S )7FastXCodec2CodeczXCodec2 with all safe (lossless) optimizations applied.

    Drop-in replacement for XCodec2Codec with identical encode/decode outputs.
    xcodec2_fastr   r   i>  int	native_srHKUSTAudio/xcodec2model_idc                 C  s*   || _ d | _d| _tj| _d | _d| _d S )NcpuF)	_model_id_model_devicer   float32_dtype_mel_extractor_use_compile)r   r   r#   r#   r$   r     s   
zFastXCodec2Codec.__init__r
   r   re   torch.dtyper'   r   c                 C  s   ddl m} || _|| _|| j}| | t|j	 t
|j|d| _| j| dtjjj_dtjj_t|j	 || _td| d S )Nr   )XCodec2Modelrc   Tz/FastXCodec2 loaded on %s with all optimizations)xcodec2.modeling_xcodec2r   r   r   from_pretrainedr   evalr|   r   r   r	   feature_extractorr   r   backendsr
   r?   
allow_tf32cudnnr   r   r   r   )r   r   re   r   r   r#   r#   r$   load  s   


zFastXCodec2Codec.load      @r.   batch_secondsr   
batch_sizec              
   C  sJ  | j d us	J dt|| j }tj|d|| jd}tdD ]"}t  | || j}| 	|}W d    n1 s;w   Y  qtj
  | jsz<tj| j jdd| j _d| _td tdD ]}t  | || j W d    n1 syw   Y  qatj
  W n ty } ztd	| W Y d }~nd }~ww td
 d S )NzCall load() firstr.   rc      zreduce-overhead)modeTz9torch.compile applied to semantic model (reduce-overhead)z/torch.compile failed, continuing without it: %szFastXCodec2 warmup complete)r   r   r   r   randnr   rangeinference_modeencodedecoder
   synchronizer   compiler   r   r   	Exceptionwarning)r   r   r   	n_samplesdummyr   tber#   r#   r$   warmup&  s8   




zFastXCodec2Codec.warmuprF   r&   src                 C  s    || j krtj||| j }|S )N)r   
torchaudio
functionalresample)r   rF   r   r#   r#   r$   _resample_if_neededB  s   
z$FastXCodec2Codec._resample_if_neededwav_2dc                 C  s   |j d }d|j d d  }t|d|f}t|d}| |}| jj|dd}|j}|dd}| j|}| j	|
d}	|	dd}	tj||	gdd}
| j|
dddd}
| jj|
d	d
\}}}|S )zOptimized encode path: GPU mel + truncated semantic + full batch.

        Args:
            wav_2d: [B, T] waveform on device.

        Returns:
            vq_code: [B, T_tok] integer token indices.
        r   i@  r.   )r   r   N)rY   r   )r,   T)vq)r5   r8   r9   r   r   r   last_hidden_staterp   SemanticEncoder_moduleCodecEnc	unsqueezer   catfc_prior	generator)r   r   rE   pad_for_wav
wav_paddedinput_featuressemantic_outputsemantic_hiddensemantic_encodedvq_emb
concat_embr   vq_coder#   r#   r$   _fast_encodeG  s"   
	
zFastXCodec2Codec._fast_encoder   c                 C  sN   |  ||| j}|d}| |}|jdkr|d}t| j| j|dS )u7   Encode [B, 1, T] → TokenBatch with tokens [B, T_tok].r.      )
codec_namesample_ratetokens)	r   r|   r   squeezer   ndimr   r   r   )r   rF   r   r   codesr#   r#   r$   r   r  s   



zFastXCodec2Codec.encoder   c                 C  sF   |j | j}|jdkr|d}| j|}|jdkr!|d}|S )u   Decode tokens → [B, 1, T].r   r.   )r   r|   r   r   r   r   decode_code)r   r   r   audior#   r#   r$   r     s   



zFastXCodec2Codec.decodetorch.LongTensorc                 C  s"   |j }|jdkr|d}| S )z"Already single-stream: [B, T_tok].r   r.   )r   r   r   long)r   r   tr#   r#   r$   flatten_for_lm  s   

zFastXCodec2Codec.flatten_for_lmN)r   )r   r   )r   r   re   r   r'   r   )r   r.   )r   r   r   r   r'   r   )rF   r&   r   r   r'   r&   )r   r&   r'   r&   )rF   r&   r   r   r'   r   )r   r   r'   r&   )r   r   r'   r   )rR   rS   rT   rU   r   __annotations__r   r   r   r   r   r   r   r   r   r   r   r  r#   r#   r#   r$   r      s   
 

+
r   )r   r   r'   r   )r   r   r'   r   )!rU   
__future__r   loggingr~   r   collectionsr   typingr   numpynpr   torch.nnr   torch.nn.functionalr   r8   r   codecbench.codecs.baser   codecbench.codecsr   	getLoggerrR   r   rz   Moduler	   r   r   r   r   r#   r#   r#   r$   <module>   s,    
V
e
