"""
Issue: cache_control on system prompt is ignored when
user message contains audio (input_audio). Explicit caching never triggers
cache_write_tokens or cached_tokens for multimodal/audio requests, despite
working correctly for text-only requests.

REPRO: Set OPENROUTER_API_KEY env var and run:
    python openrouter_cache_bug_repro.py

Tested models: google/gemini-2.5-flash, google/gemini-3-flash-preview
Result: text-only caching works (cache_write > 0 on req1, cached > 0 on req2+).
        audio caching always returns cache_write=0, cached=0.
"""
import base64
import json
import os
import time
from pathlib import Path

import httpx
from dotenv import load_dotenv

# Pull OPENROUTER_API_KEY (and anything else) from a local .env file, if present.
load_dotenv()

# May be None when the env var is unset; main() validates this before any request.
API_KEY = os.getenv("OPENROUTER_API_KEY")
API_URL = "https://openrouter.ai/api/v1/chat/completions"

# ~1037 tokens — exceeds the 1028 minimum for Gemini 2.5 Flash caching
SYSTEM_PROMPT = """# Role

You are a verbatim speech-to-text transcription system. You are NOT a conversational assistant. Your output must precisely match the audio content. Output ONLY the JSON.

# Critical Rules

1. NEVER TRANSLATE. This is transcription, not translation. Write what you HEAR in the script it was spoken in.
2. VERBATIM FIDELITY. Every repetition, filler, stammer, false start, hesitation — exactly as spoken. Do not clean up speech.
3. NO CORRECTION. Do not fix grammar, pronunciation, dialect, or word choice.
4. NO HALLUCINATION. Never add words not in the audio. If audio cuts off mid-sentence, STOP where the audio stops.
5. UNCERTAINTY. If a word is unclear, write [UNK]. Use [INAUDIBLE] for unintelligible speech.
6. BOUNDARY HANDLING. Audio is VAD-cut and may start or end mid-speech. Transcribe everything you can confidently hear. Do not guess what came before or after.
7. LANGUAGE MISMATCH. Trust what you hear. The expected language hint is just a hint. If audio is clearly in a different language, transcribe in that language's script and set detected_language accordingly.

# Code-Mixed Transcription

Audio may contain multiple languages. Each language stays in its native script. Do NOT transliterate.
- Indic words: write in their native script (Devanagari, Telugu, Tamil, etc.)
- English words spoken in an Indic sentence: keep in Latin script
- Hindi words in a Telugu sentence: keep in Devanagari
- Preserve Sandhi and combined forms as spoken. Do not over-split words.

# Punctuation

Insert punctuation from audible prosodic cues only. No pause heard = no punctuation.
- Only: comma, period, ? and !
- Do not add punctuation for grammatical correctness

# No Speech

If the audio contains no speech (only silence, noise, or music), set transcription to [NO_SPEECH].

# Field Rules

- "transcription": the PRIMARY authoritative field. Verbatim, code-mixed, native script.
- "tagged": identical to transcription, with event tags inserted at their audio positions. Do NOT re-interpret the audio for this field — copy transcription and insert tags.
- "speaker": emotion, speaking_style, pace, accent — derived from audio prosody only.
- "detected_language": ISO 639-1 code of the dominant language actually spoken.

# Event Tags

Insert ONLY if clearly and prominently audible. Do not guess.
- [laugh] — audible laughter
- [cough] — actual cough sound
- [sigh] — audible exhale/sigh
- [breath] — heavy or prominent breathing
- [singing] — speaker is singing, not speaking
- [noise] — environmental noise disrupting speech
- [music] — background music audible during speech or if humming
- [applause] — clapping from audience or speaker
- [snort] — nasal snort sound
- [cry] — audible crying or sobbing

# Reference Examples

## Example: Code-mixed (Telugu + English)
Input context: Telugu podcast, speaker casually mixing English
transcription: "నాకు ఈ phone చాలా బాగుంది, like really good quality అన్నమాట"
tagged: "నాకు ఈ phone చాలా బాగుంది, like really good quality అన్నమాట"
detected_language: "te"

## Example: Code-mixed (Hindi + English)
Input context: Hindi interview with English technical terms
transcription: "तो basically हमने machine learning model को train किया और results काफ़ी अच्छे आए"
tagged: "तो basically हमने machine learning model को train किया और results काफ़ी अच्छे आए"
detected_language: "hi"

## Example: No speech
Input context: Segment contains only background noise
transcription: "[NO_SPEECH]"
tagged: "[NO_SPEECH]"
detected_language: (same as expected hint)

## Example: Abrupt cutoff
Input context: Audio ends mid-word due to VAD boundary
transcription: "అప్పుడు వాళ్ళు వచ్చి చెప్పారు కదా, ఆ తర్వాత మన"
tagged: "అప్పుడు వాళ్ళు వచ్చి చెప్పారు కదా, ఆ తర్వాత మన"
Note: audio cuts mid-word — transcribe only what is heard, do not complete the word.

## Example: Event tags
Input context: Speaker laughs while talking
transcription: "అది చాలా funny moment"
tagged: "అది చాలా [laugh] funny moment"
detected_language: "te"

## Example: Language mismatch
Input context: Expected Hindi but speaker is actually speaking English
transcription: "so the main thing about this product is the packaging"
tagged: "so the main thing about this product is the packaging"
detected_language: "en"
"""

# Static headers for every request: bearer auth + JSON body.
# If API_KEY is None this builds "Bearer None", but main() refuses to run
# without the key, so no request is ever sent with a bogus token.
HEADERS = {
    "Authorization": f"Bearer {API_KEY}",
    "Content-Type": "application/json",
}


def fmt(data: dict) -> str:
    """Summarize the usage block of an OpenRouter response as one line.

    Reports prompt, cached, cache-write, and completion token counts,
    defaulting each to 0 when absent.

    Robustness: some responses carry an explicit ``"usage": null`` or
    ``"prompt_tokens_details": null`` rather than omitting the key, in
    which case ``.get(key, {})`` returns None and a plain ``.get`` chain
    would raise AttributeError. ``or {}`` normalizes both cases.
    """
    usage = data.get("usage") or {}
    details = usage.get("prompt_tokens_details") or {}
    return (
        f"prompt={usage.get('prompt_tokens', 0)}, "
        f"cached={details.get('cached_tokens', 0)}, "
        f"cache_write={details.get('cache_write_tokens', 0)}, "
        f"output={usage.get('completion_tokens', 0)}"
    )


def send(body: dict, label: str) -> dict:
    """POST *body* to the OpenRouter endpoint and print a one-line result.

    On HTTP 200, prints the usage summary plus latency and request id, and
    returns the decoded JSON payload. On any other status, prints the code
    and the first 300 characters of the response text, and returns
    ``{"error": <status_code>}``.
    """
    t0 = time.monotonic()
    response = httpx.post(API_URL, json=body, headers=HEADERS, timeout=60)
    ms = (time.monotonic() - t0) * 1000

    if response.status_code == 200:
        payload = response.json()
        print(f"  {label}: {fmt(payload)} | {ms:.0f}ms | id={payload.get('id', '?')}")
        return payload

    print(f"  {label}: HTTP {response.status_code} ({ms:.0f}ms)")
    print(f"    {response.text[:300]}")
    return {"error": response.status_code}


def find_audio_file() -> Path | None:
    """Look for any FLAC file in common test locations."""
    candidates = [
        Path("preflight/canary_data/en/nM2KMwb86IU/nM2KMwb86IU/segments"),
        Path("test_audio"),
        Path("."),
    ]
    for d in candidates:
        if d.exists():
            flacs = sorted(d.glob("*.flac"))
            if flacs:
                return flacs[0]
    return None


def build_system_msg():
    """Build the system message carrying an explicit cache_control breakpoint.

    The single text part is marked ``cache_control: {"type": "ephemeral"}``,
    which requests an explicit prompt-cache write/read at this boundary.
    """
    prompt_part = {
        "type": "text",
        "text": SYSTEM_PROMPT,
        "cache_control": {"type": "ephemeral"},
    }
    return {"role": "system", "content": [prompt_part]}


def _audio_user_msg(audio_b64: str) -> dict:
    """User message pairing one base64 FLAC attachment with a transcription request."""
    return {
        "role": "user",
        "content": [
            {
                "type": "input_audio",
                "input_audio": {"data": audio_b64, "format": "flac"},
            },
            {"type": "text", "text": "Transcribe this audio."},
        ],
    }


def _send_repeated(body: dict, label_prefix: str, n: int = 3) -> None:
    """Send the same request body *n* times, sleeping 2s after each send."""
    for i in range(n):
        send(body, f"  {label_prefix}{i + 1}")
        time.sleep(2)


def main():
    """Run the text-vs-audio cache_control comparison for each model.

    For every model: TEST 1 sends three identical text-only requests
    (control — caching works), TEST 2 sends three identical audio requests
    pinned to Google AI Studio, TEST 3 repeats the audio requests with
    default routing. Prints per-request usage stats and a summary.

    Raises SystemExit when OPENROUTER_API_KEY is not set.
    """
    # Explicit check instead of `assert`: asserts are stripped under `python -O`.
    if not API_KEY:
        raise SystemExit("Set OPENROUTER_API_KEY")

    for model in ["google/gemini-2.5-flash", "google/gemini-3-flash-preview"]:
        print(f"\n{'='*72}")
        print(f"MODEL: {model}")
        print(f"{'='*72}")

        # ── TEST 1: TEXT-ONLY (control) ──────────────────────────────────
        print(f"\n  ── TEST 1: Text-only + cache_control (3 identical requests) ──")
        print(f"  Expected: cache_write>0 on req1, cached>0 on req2+")
        text_body = {
            "model": model,
            "messages": [
                build_system_msg(),
                {"role": "user", "content": "Say hello in 5 words."},
            ],
            "temperature": 0,
            "provider": {"order": ["Google AI Studio"], "allow_fallbacks": False},
        }
        _send_repeated(text_body, "  text-req")

        # ── TEST 2: AUDIO — SAME FILE 3x ────────────────────────────────
        audio_path = find_audio_file()
        if not audio_path:
            print("\n  ── SKIPPING audio tests: no .flac file found ──")
            print("  Place a FLAC file in ./preflight/canary_data/en/*/segments/")
            continue

        audio_b64 = base64.b64encode(audio_path.read_bytes()).decode()
        print(f"\n  ── TEST 2: Same audio 3x + cache_control (pinned AI Studio) ──")
        print(f"  Audio: {audio_path.name} ({len(audio_b64)//1024}KB)")
        print(f"  Expected: cache_write>0 on req1, cached>0 on req2+")
        print(f"  ACTUAL BUG: cache_write=0 and cached=0 on ALL requests")
        pinned_audio_body = {
            "model": model,
            "messages": [build_system_msg(), _audio_user_msg(audio_b64)],
            "temperature": 0,
            "provider": {"order": ["Google AI Studio"], "allow_fallbacks": False},
        }
        _send_repeated(pinned_audio_body, "  audio-req")

        # ── TEST 3: AUDIO — NO PROVIDER PIN ──────────────────────────────
        print(f"\n  ── TEST 3: Same audio 3x + cache_control (default routing) ──")
        default_audio_body = {
            "model": model,
            "messages": [build_system_msg(), _audio_user_msg(audio_b64)],
            "temperature": 0,
        }
        _send_repeated(default_audio_body, "  audio-req")

    print(f"\n{'='*72}")
    print("SUMMARY")
    print("="*72)
    print("""
Text-only requests: cache_control works as expected.
  - cache_write_tokens > 0 on first request (cache is written)
  - cached_tokens > 0 on subsequent requests (cache is read)
  - Latency drops on cache hits

Audio requests: cache_control is completely ignored.
  - cache_write_tokens = 0 on EVERY request (cache never written)
  - cached_tokens = 0 on EVERY request (cache never read)
  - Tested with: same audio repeated, different audio, pinned provider,
    default routing, gemini-2.5-flash, gemini-3-flash-preview

Root cause evidence (from debug/upstream inspection):
  - Text-only upstream body includes: cachedContent="cachedContents/..."
    (no systemInstruction — prompt served from cache)
  - Audio upstream body includes: systemInstruction with full prompt +
    inlineData audio. No cachedContent field. cache_control is dropped.

The OpenRouter gateway is not taking the explicit cache write/read path
when the request contains audio (input_audio). The cache_control marker
on the system message is silently dropped during the OpenAI→Gemini
format translation for multimodal requests.
""")


if __name__ == "__main__":
    main()
