"""FastBiCodec: optimized BiCodec wrapper with lossless speedups.

Optimizations applied (identical tokens to original):
  Tier 1: Layer truncation — wav2vec2-xlsr-53 24→17 layers (only 11,14,16 read)
  Tier 2: TF32 for matmuls
  Tier 3: Proper tensor-in, tensor-out encode (no file path, no numpy roundtrip)
  Tier 4: SDPA monkey-patch for wav2vec2 attention layers

FP16 autocast removed: truncation alone gives 100% token match. FP16 caused
1.26% semantic token drift at VQ boundaries — not acceptable at scale.
"""

from __future__ import annotations

import logging
import sys
import types
import math
from pathlib import Path
from typing import Optional

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio

from codecbench.codecs.base import TokenBatch
from codecbench.codecs import register_codec

logger = logging.getLogger(__name__)

# Hidden-state indices consumed downstream (averaged in
# FastBiCodecCodec._extract_features); everything past the deepest one
# can be truncated from the wav2vec2 encoder without changing outputs.
REQUIRED_HIDDEN_LAYERS = [11, 14, 16]
TRUNCATE_TO = max(REQUIRED_HIDDEN_LAYERS) + 1  # 17


def _apply_wav2vec2_truncation(model: nn.Module) -> None:
    """Drop wav2vec2 encoder layers beyond index TRUNCATE_TO - 1.

    Only hidden states 11, 14 and 16 are read downstream, so layers at
    index TRUNCATE_TO (17) and above never influence the result; removing
    them is a lossless speedup. Also forces hidden-state outputs on, since
    the intermediate layers are what the caller consumes.
    """
    enc = model.encoder
    n_before = len(enc.layers)
    # ModuleList slicing yields a new ModuleList holding the kept layers.
    enc.layers = enc.layers[:TRUNCATE_TO]
    model.config.num_hidden_layers = TRUNCATE_TO
    model.config.output_hidden_states = True
    logger.info(
        "Truncated wav2vec2-xlsr-53 encoder: %d → %d layers",
        n_before,
        TRUNCATE_TO,
    )


def _apply_wav2vec2_sdpa(model: nn.Module) -> None:
    """Monkey-patch all Wav2Vec2Attention layers in *model* to use SDPA.

    Matches modules by class name so no transformers internals need to be
    imported; each match is rewired by _patch_wav2vec2_attention.

    Args:
        model: the (possibly truncated) Wav2Vec2Model to patch in place.
    """
    patched = 0
    # modules() instead of named_modules(): the module name was never used.
    for module in model.modules():
        if type(module).__name__ == "Wav2Vec2Attention":
            _patch_wav2vec2_attention(module)
            patched += 1
    logger.info("SDPA monkey-patch applied to %d Wav2Vec2Attention layers", patched)


def _patch_wav2vec2_attention(attn_module: nn.Module) -> None:
    """Replace manual matmul→softmax→matmul with SDPA in Wav2Vec2Attention.

    The patched forward computes the same self-attention via
    F.scaled_dot_product_attention (fused kernels, no materialized
    attention-probability matrix). The original forward is kept so callers
    that request attention weights still get them.
    """
    # Keep the original *bound* method for the output_attentions fallback.
    attn_module._original_forward = attn_module.forward

    def sdpa_forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        **kwargs,
    ):
        # SDPA cannot return per-head attention weights; delegate to the
        # stock implementation when they are requested.
        if output_attentions:
            return self._original_forward(
                hidden_states, attention_mask=attention_mask,
                output_attentions=output_attentions, **kwargs
            )

        # NOTE(review): any extra kwargs (e.g. layer_head_mask) are
        # accepted but ignored on this fast path — confirm no caller
        # relies on them.
        B, S, _ = hidden_states.size()

        # Self-attention projections; reshape [B, S, E] → [B, H, S, D]
        # as expected by scaled_dot_product_attention.
        query = self.q_proj(hidden_states)
        key = self.k_proj(hidden_states)
        value = self.v_proj(hidden_states)

        query = query.view(B, S, self.num_heads, self.head_dim).transpose(1, 2)
        key = key.view(B, S, self.num_heads, self.head_dim).transpose(1, 2)
        value = value.view(B, S, self.num_heads, self.head_dim).transpose(1, 2)

        # attention_mask is forwarded verbatim as attn_mask — assumes it
        # is an additive float bias broadcastable to [B, H, S, S];
        # TODO confirm against transformers' wav2vec2 mask convention.
        attn_bias = None
        if attention_mask is not None:
            attn_bias = attention_mask

        # SDPA applies the 1/sqrt(head_dim) scaling internally —
        # presumably equivalent to the self.scaling factor the stock
        # forward multiplies into the query; verify against transformers.
        # dropout_p=0.0: this wrapper targets inference only.
        out = F.scaled_dot_product_attention(
            query, key, value, attn_mask=attn_bias, dropout_p=0.0
        )

        out = out.transpose(1, 2).reshape(B, S, self.embed_dim)
        out = self.out_proj(out)
        # Attention weights were never computed, hence None in slot 2.
        return out, None

    # Bind as a method so `self` resolves to this specific module.
    attn_module.forward = types.MethodType(sdpa_forward, attn_module)


@register_codec
class FastBiCodecCodec:
    """BiCodec with lossless optimizations: layer truncation, TF32, SDPA.

    Drop-in replacement for BiCodecCodec. Accepts tensor input directly
    instead of file paths — no numpy CPU roundtrip. FP16 autocast is
    deliberately NOT applied (see module docstring: it caused semantic
    token drift at VQ boundaries).
    """

    name: str = "bicodec_fast"
    native_sr: int = 16_000  # BiCodec operates on 16 kHz audio

    def __init__(self, model_dir: str | None = None):
        """Store configuration only; model loading is deferred to load().

        Args:
            model_dir: path to a Spark-TTS-0.5B checkout containing
                config.yaml, BiCodec/ and wav2vec2-large-xlsr-53/.
        """
        self._model_dir = model_dir
        self._bicodec = None        # BiCodec module (set in load())
        self._w2v_model = None      # truncated Wav2Vec2Model
        self._w2v_processor = None  # Wav2Vec2FeatureExtractor
        self._config = None         # parsed config.yaml
        self._device = "cpu"
        self._dtype = torch.float32

    def load(self, device: str = "cuda", dtype: torch.dtype = torch.float32) -> None:
        """Load BiCodec + wav2vec2 and apply all speed optimizations.

        Args:
            device: torch device string for both models.
            dtype: stored for reference; inference runs in fp32 (see
                module docstring on why FP16 is not used).

        Raises:
            RuntimeError: if model_dir was not provided at construction.
        """
        self._device = device
        self._dtype = dtype

        if self._model_dir is None:
            raise RuntimeError(
                "FastBiCodec requires model_dir pointing to Spark-TTS-0.5B checkout."
            )

        # Make the sparktts package importable whether model_dir is the
        # repo root itself or sits next to a Spark-TTS checkout.
        spark_root = Path(self._model_dir)
        if (spark_root / "sparktts").exists():
            sys.path.insert(0, str(spark_root))
        elif (spark_root.parent / "Spark-TTS" / "sparktts").exists():
            sys.path.insert(0, str(spark_root.parent / "Spark-TTS"))

        from sparktts.utils.file import load_config
        from sparktts.models.bicodec import BiCodec
        from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model

        self._config = load_config(f"{self._model_dir}/config.yaml")

        self._bicodec = BiCodec.load_from_checkpoint(
            f"{self._model_dir}/BiCodec"
        ).to(device)

        self._w2v_processor = Wav2Vec2FeatureExtractor.from_pretrained(
            f"{self._model_dir}/wav2vec2-large-xlsr-53"
        )
        self._w2v_model = Wav2Vec2Model.from_pretrained(
            f"{self._model_dir}/wav2vec2-large-xlsr-53"
        ).to(device)
        # Intermediate hidden states are what _extract_features consumes.
        self._w2v_model.config.output_hidden_states = True
        self._w2v_model.eval()

        # --- Tier 1: Layer truncation (lossless: layers ≥ 17 unused) ---
        _apply_wav2vec2_truncation(self._w2v_model)

        # --- Tier 2: TF32 (note: process-wide backend flags) ---
        torch.backends.cuda.matmul.allow_tf32 = True
        torch.backends.cudnn.allow_tf32 = True

        # --- Tier 4: SDPA attention ---
        _apply_wav2vec2_sdpa(self._w2v_model)

        # Reference-clip length in samples, rounded down to a whole
        # number of latent hops so the speaker encoder sees aligned frames.
        self._ref_segment_length = (
            int(self._config["sample_rate"] * self._config["ref_segment_duration"])
            // self._config["latent_hop_length"]
            * self._config["latent_hop_length"]
        )

        logger.info("FastBiCodec loaded on %s with all optimizations", device)

    def warmup(self, batch_seconds: float = 6.0, batch_size: int = 1) -> None:
        """Run a few encode/decode round trips to prime kernels and caches.

        Args:
            batch_seconds: duration of the dummy waveform per item.
            batch_size: number of dummy items per round trip.
        """
        assert self._bicodec is not None, "Call load() first"
        n_samples = int(batch_seconds * self.native_sr)
        dummy = torch.randn(batch_size, 1, n_samples, device=self._device)
        for _ in range(5):
            with torch.inference_mode():
                tb = self.encode(dummy, self.native_sr)
                _ = self.decode(tb)
        # Fix: only synchronize when actually running on CUDA — the
        # previous unconditional torch.cuda.synchronize() raised on
        # CPU-only machines.
        if "cuda" in str(self._device) and torch.cuda.is_available():
            torch.cuda.synchronize()
        logger.info("FastBiCodec warmup complete")

    def _resample_if_needed(self, wav: torch.Tensor, sr: int) -> torch.Tensor:
        """Resample *wav* from *sr* to native_sr (16 kHz) if they differ."""
        if sr != self.native_sr:
            wav = torchaudio.functional.resample(wav, sr, self.native_sr)
        return wav

    def _get_ref_clip(self, wav_1d: np.ndarray) -> torch.Tensor:
        """Extract reference clip for speaker encoder (same logic as original).

        Tiles the waveform when shorter than the reference length, then
        takes the leading ref_len samples. Returns a [1, ref_len] float
        tensor on the model device.
        """
        ref_len = self._ref_segment_length
        wav_len = len(wav_1d)
        if ref_len > wav_len:
            wav_1d = np.tile(wav_1d, ref_len // wav_len + 1)
        ref_np = wav_1d[:ref_len]
        return torch.from_numpy(ref_np).unsqueeze(0).float().to(self._device)

    def _extract_features(self, wav_np: np.ndarray) -> torch.Tensor:
        """Extract wav2vec2 features with the truncated model (fp32).

        Returns the mean of hidden states 11, 14 and 16 — identical to
        the 24-layer model because truncation only removes deeper layers.
        """
        # Fix: output_hidden_states is a *model* config flag (enabled in
        # load() and in the truncation helper), not a feature-extractor
        # argument — the misleading kwarg is no longer passed here.
        inputs = self._w2v_processor(
            wav_np, sampling_rate=16000, return_tensors="pt", padding=True,
        ).input_values

        feat = self._w2v_model(inputs.to(self._device))

        # After truncation to 17 layers: hidden_states[0] = embedding output,
        # hidden_states[1..17] = encoder layer outputs. Indices 11, 14, 16
        # remain valid and identical to the 24-layer model.
        feats_mix = (
            feat.hidden_states[11]
            + feat.hidden_states[14]
            + feat.hidden_states[16]
        ) / 3
        return feats_mix

    @torch.inference_mode()
    def encode(self, wav: torch.Tensor, sr: int) -> TokenBatch:
        """Encode [B, 1, T] -> TokenBatch with semantic + global tokens.

        Items are tokenized one at a time (BiCodec's tokenize path is
        per-utterance); semantic tokens are zero-padded to the batch max
        with true lengths recorded in aux["semantic_lengths"].
        """
        wav = self._resample_if_needed(wav, sr).to(self._device)
        B = wav.shape[0]

        semantic_list = []
        global_list = []

        for i in range(B):
            # BiCodec's preprocessing (ref clip, feature extractor) wants
            # a 1-D numpy waveform, hence the CPU hop here.
            wav_1d_np = wav[i].squeeze().cpu().numpy()

            ref_wav = self._get_ref_clip(wav_1d_np)
            feat = self._extract_features(wav_1d_np)

            batch = {
                "wav": torch.from_numpy(wav_1d_np).unsqueeze(0).float().to(self._device),
                "ref_wav": ref_wav,
                "feat": feat.to(self._device),
            }
            sem_tokens, glob_tokens = self._bicodec.tokenize(batch)
            semantic_list.append(sem_tokens.squeeze())
            global_list.append(glob_tokens.squeeze())

        # Zero-pad semantic tokens to a rectangular [B, max_len] tensor.
        # NOTE(review): 0 is presumably also a valid codebook index, so
        # consumers must rely on aux["semantic_lengths"], not on padding.
        max_sem_len = max(s.shape[-1] for s in semantic_list)
        padded_sem = torch.zeros(B, max_sem_len, dtype=torch.long, device=self._device)
        for i, s in enumerate(semantic_list):
            padded_sem[i, :s.shape[-1]] = s.flatten()

        global_tokens = torch.stack(
            [g.flatten() for g in global_list], dim=0
        )

        return TokenBatch(
            codec_name=self.name,
            sample_rate=self.native_sr,
            tokens={"semantic": padded_sem, "global": global_tokens},
            aux={
                "semantic_lengths": [s.shape[-1] for s in semantic_list],
                "global_token_dim": global_tokens.shape[-1],
            },
        )

    @torch.inference_mode()
    def decode(self, tb: TokenBatch) -> torch.Tensor:
        """Decode semantic + global tokens -> [B, 1, T] zero-padded audio."""
        semantic = tb.tokens["semantic"].to(self._device)
        global_tok = tb.tokens["global"].to(self._device)
        B = semantic.shape[0]

        # Fix: the old default [semantic.shape[-1]] was a length-1 list,
        # which raised IndexError for B > 1 when aux lacked the key.
        lengths = tb.aux.get("semantic_lengths", [semantic.shape[-1]] * B)

        audio_list = []
        for i in range(B):
            sem_i = semantic[i, :lengths[i]].unsqueeze(0)
            glob_i = global_tok[i].unsqueeze(0)
            wav_rec = self._bicodec.detokenize(sem_i, glob_i.unsqueeze(1))
            # Normalize whatever detokenize returns to a [1, 1, T] tensor.
            if isinstance(wav_rec, np.ndarray):
                wav_rec = torch.from_numpy(wav_rec).float()
            if wav_rec.ndim == 1:
                wav_rec = wav_rec.unsqueeze(0).unsqueeze(0)
            elif wav_rec.ndim == 2:
                wav_rec = wav_rec.unsqueeze(1)
            audio_list.append(wav_rec.to(self._device))

        max_len = max(a.shape[-1] for a in audio_list)
        result = torch.zeros(B, 1, max_len, device=self._device)
        for i, a in enumerate(audio_list):
            result[i, :, :a.shape[-1]] = a.squeeze(0)
        return result

    def flatten_for_lm(self, tb: TokenBatch, offset: int | None = None) -> torch.LongTensor:
        """Concatenate semantic + global tokens with a vocab offset.

        Args:
            tb: token batch produced by encode().
            offset: offset added to global tokens so the two vocabularies
                don't collide. Defaults to ``semantic.max() + 1`` (the
                original behavior). NOTE(review): that default is
                data-dependent — different batches can yield different
                offsets for the same codec; pass the semantic codebook
                size explicitly for a stable LM vocabulary.
        """
        semantic = tb.tokens["semantic"]
        global_tok = tb.tokens["global"]
        if offset is None:
            offset = semantic.max().item() + 1
        global_offset = global_tok + offset
        return torch.cat([semantic, global_offset], dim=-1).long()