#!/usr/bin/env python3
"""
Detailed Word-Level Missing Content Analysis

This script will:
1. Generate audio with very long text
2. Transcribe with OpenAI Whisper (with word timestamps)
3. Show EXACTLY which words are missing
4. Show WHERE they should appear (timestamps)
5. Allow user to verify if it's ASR error or TTS chunking error

Output format makes it easy to:
- Listen to audio at specific timestamps
- Verify if word was actually spoken (ASR error)
- Or if chunk was skipped (TTS error)
"""

import os
import sys
import requests
import subprocess
from openai import OpenAI
import re
from difflib import SequenceMatcher

# Initialize OpenAI
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
if not OPENAI_API_KEY:
    print("❌ OPENAI_API_KEY not set")
    sys.exit(1)

client = OpenAI(api_key=OPENAI_API_KEY)

# Very long test text (3500+ characters, mixed English + Hindi, with emotions)
TEST_TEXT = """<excited> In the heart of an enchanted kingdom, where magic flowed like rivers and mythical beasts roamed freely, there lived a legendary warrior princess named Aisha. </excited> She was known throughout the land for her incredible courage, her mastery of ancient combat arts, and her ability to communicate with all living creatures.

<whisper> But few knew of the prophecy that surrounded her birth. </whisper> The ancient seers had foretold that she would one day face the greatest darkness the world had ever seen, and only she possessed the power to defeat it.

Aisha spent her childhood training in the sacred mountains with the wise monks. <curious> They taught her not just the physical arts of combat, but also the spiritual disciplines that would strengthen her mind and soul. </curious> Every dawn, she would practice her sword techniques as the sun rose over the peaks. Every evening, she would meditate under the stars, learning to harness the cosmic energies that flowed through all things.

<angry> One fateful day, dark clouds gathered over the kingdom! </angry> The evil demon lord Ravaksh had awakened from his thousand-year slumber. His army of shadow warriors began to sweep across the land, corrupting everything they touched. Villages were destroyed, forests withered, and despair spread like wildfire.

<sigh> The kingdom's armies fought valiantly, but they were no match for Ravaksh's dark magic. </sigh> One by one, the greatest warriors fell. The king's advisors urged him to flee, to abandon the kingdom and save what few lives they could.

<excited> But Aisha refused to give up! </excited> She knew this was the moment the prophecy had spoken of. She gathered the remaining defenders and devised a daring plan. They would launch a surprise attack on Ravaksh's fortress while he was still consolidating his power.

एक बार की बात है, ऐशा ने अपने सबसे भरोसेमंद योद्धाओं को इकट्ठा किया। उन्होंने एक खतरनाक योजना बनाई। वे रावक्ष के किले पर हमला करेंगे। यह असंभव लग रहा था, लेकिन उनके पास कोई विकल्प नहीं था। राज्य का भविष्य उनके हाथों में था।

<whisper> Under the cover of darkness, they approached the fortress. </whisper> The shadow guards patrolled the walls, their red eyes glowing in the night. <laugh> But Aisha had learned from the wind spirits how to move unseen! </laugh> She led her team through secret passages known only to the ancient monks.

The infiltration was successful. They reached the inner sanctum where Ravaksh was performing a dark ritual. <angry> He was trying to summon an even greater evil from the void! </angry> There was no time to waste.

<excited> Aisha drew her legendary sword, which began to glow with divine light! </excited> The blade had been forged by the celestial smiths themselves, imbued with the power to vanquish any darkness. As she charged forward, her companions engaged the shadow warriors, giving her a clear path to Ravaksh.

The battle that followed was epic. <curious> Ravaksh wielded powers that defied comprehension - he could bend reality itself, create illusions that seemed more real than reality, and summon storms of pure destruction. </curious> But Aisha had trained for this her entire life.

She dodged his dark bolts, deflected his curse spells, and pressed forward relentlessly. <whisper> She remembered the words of her master: "True strength comes not from power, but from purpose." </whisper> Her purpose was clear - protect her people, save her kingdom, restore the light.

With a mighty cry, she leaped high into the air, her sword blazing like a star. <excited> The blade struck true, piercing through Ravaksh's dark armor and into his corrupted heart! </excited> The demon lord let out a terrible scream as the divine light consumed him.

<laugh> The shadow army dissolved like smoke in the wind! </laugh> The dark clouds parted, and sunlight flooded the land once more. The corrupted forests began to heal, flowers bloomed, and the rivers ran clear again.

<excited> Aisha returned to her kingdom as a hero beyond measure! </excited> The people celebrated for seven days and seven nights. Songs were written about her bravery, statues were erected in her honor, and her legend would be told for a thousand generations.

<whisper> But Aisha remained humble. </whisper> She knew that true victory was not in defeating enemies, but in protecting those she loved. <sigh> She had seen too much loss, too much suffering. </sigh> From that day forward, she dedicated herself to teaching the next generation, ensuring they would be ready when darkness threatened again.

<excited> And so the kingdom prospered under her watchful protection, and peace reigned for many years to come! </excited>"""

# Clean reference text (remove emotion tags for WER calculation)
# Pass 1 strips the <excited>/<whisper>/... markup tags; pass 2 collapses all
# whitespace runs (including blank lines between paragraphs) to single spaces
# so the reference is one continuous string comparable to the ASR transcript.
REFERENCE_TEXT = re.sub(r'<[^>]+>', '', TEST_TEXT).strip()
REFERENCE_TEXT = re.sub(r'\s+', ' ', REFERENCE_TEXT)


def get_word_positions(text):
    """Return every word in *text* (lower-cased) with its character span.

    Each entry is a dict with keys:
      'word'     -- the matched word, lower-cased
      'position' -- start offset of the word in the original text
      'end'      -- offset one past the word's last character
    """
    return [
        {'word': m.group(), 'position': m.start(), 'end': m.end()}
        for m in re.finditer(r'\b\w+\b', text.lower())
    ]


def find_missing_words_with_context(reference, transcribed, reference_segments=None):
    """
    Find reference words absent from the transcription, with context.

    Returns a list of dicts, one per missing word, containing:
    - 'word': the missing word (lower-cased)
    - 'position': character offset of the word in the reference text
    - 'word_index': index of the word in the reference word list
    - 'context_before' / 'context_after': up to 5 surrounding words each
    - 'full_context': the word bracketed inside its context

    The reference_segments parameter is accepted for interface compatibility
    but is currently unused.
    """
    # Tokenize both texts identically. Comparing whole tokens (a set) fixes
    # the substring bug where e.g. reference word "cat" was counted as
    # present merely because the transcript contained "concatenate".
    ref_words = [
        {'word': m.group(), 'position': m.start()}
        for m in re.finditer(r'\b\w+\b', reference.lower())
    ]
    transcribed_words = {
        m.group() for m in re.finditer(r'\b\w+\b', transcribed.lower())
    }

    missing_words = []
    # enumerate gives the word's index directly (positions are strictly
    # increasing), replacing the previous O(n^2) per-word recount.
    for word_idx, word_info in enumerate(ref_words):
        word = word_info['word']
        if word in transcribed_words:
            continue

        # Context window: up to 5 words on each side.
        context_before = ' '.join(
            ref_words[i]['word'] for i in range(max(0, word_idx - 5), word_idx)
        )
        context_after = ' '.join(
            ref_words[i]['word']
            for i in range(word_idx + 1, min(len(ref_words), word_idx + 6))
        )

        missing_words.append({
            'word': word,
            'position': word_info['position'],
            'word_index': word_idx,
            'context_before': context_before,
            'context_after': context_after,
            'full_context': f"{context_before} [{word}] {context_after}"
        })

    return missing_words


def estimate_timestamp(word_index, total_words, total_duration):
    """Estimate the time (seconds) at which a word should appear in the audio.

    Assumes a uniform speaking rate, so the estimate is the word's fractional
    position in the word list scaled by the total audio duration.

    Returns 0.0 when total_words is zero (or negative) instead of raising
    ZeroDivisionError for an empty reference.
    """
    if total_words <= 0:
        return 0.0
    return (word_index / total_words) * total_duration


def main():
    """Run the full pipeline: generate TTS audio, transcribe it with Whisper,
    then report missing/repeated words with estimated timestamps so a human
    can distinguish ASR errors from TTS chunking errors."""
    # NOTE(security): hard-coded API key committed to source — rotate this key
    # and load it from an environment variable instead of embedding it here.
    API_KEY = "vn3_cdd6d45f2045d03d5adac56eda6af9a9b781211038972807f35d52dfb6400144"
    API_URL = "http://localhost:8000/v1/tts/generate"
    OUTPUT_FILE = "comprehensive_word_analysis.wav"
    
    print("=" * 80)
    print("🔬 COMPREHENSIVE WORD-LEVEL ANALYSIS")
    print("=" * 80)
    print()
    
    print(f"📝 Test Configuration:")
    print(f"   Text length: {len(TEST_TEXT)} chars")
    print(f"   Reference words: {len(REFERENCE_TEXT.split())}")
    print(f"   Emotion tags: {len(re.findall(r'<[^>]+>', TEST_TEXT))}")
    print()
    
    # Preview
    print(f"Text preview (first 300 chars):")
    print(TEST_TEXT[:300])
    print("...\n")
    
    # --- Step 1: generate audio via the local TTS service -------------------
    print("🎵 Generating audio (this will take 50-70s with chunking)...")
    print()
    
    response = requests.post(
        API_URL,
        headers={"Content-Type": "application/json", "X-API-Key": API_KEY},
        json={
            "text": TEST_TEXT,
            "speaker": "Nilay",
            "stream": False,
            "seed": 42  # fixed seed so runs are reproducible
        }
    )
    
    if response.status_code != 200:
        print(f"❌ Generation failed: {response.status_code}")
        print(response.text)
        sys.exit(1)
    
    with open(OUTPUT_FILE, 'wb') as f:
        f.write(response.content)
    
    # Diagnostic headers emitted by the TTS service.
    chunked = response.headers.get('X-Text-Chunked', 'unknown')
    audio_bytes = int(response.headers.get('X-Audio-Bytes', 0))
    audio_secs = float(response.headers.get('X-Audio-Seconds', 0))
    
    print(f"✅ Audio generated!")
    print(f"   Chunked: {chunked}")
    print(f"   Size: {audio_bytes:,} bytes ({audio_bytes/1024/1024:.1f} MB)")
    print(f"   Reported duration: {audio_secs:.1f}s ({audio_secs/60:.1f} min)")
    print()
    
    # Measure the real duration with ffprobe (the reported header may differ).
    # NOTE(review): assumes ffprobe is on PATH; a missing binary or empty
    # stdout will raise here.
    result = subprocess.run(
        ['ffprobe', '-v', 'error', '-show_entries', 'format=duration',
         '-of', 'default=noprint_wrappers=1:nokey=1', OUTPUT_FILE],
        capture_output=True, text=True
    )
    actual_duration = float(result.stdout.strip())
    print(f"   Actual duration: {actual_duration:.1f}s ({actual_duration/60:.1f} min)")
    print()
    
    # --- Step 2: transcribe with word-level timestamps ----------------------
    print("🎙️  Transcribing with OpenAI Whisper (word-level timestamps)...")
    print("   This may take 60-120 seconds for very long audio...")
    print()
    
    # Create client with longer timeout for large files
    import httpx
    client_with_timeout = OpenAI(
        api_key=OPENAI_API_KEY,
        timeout=httpx.Timeout(300.0, read=300.0, write=300.0, connect=30.0)  # 5 min timeout
    )
    
    with open(OUTPUT_FILE, 'rb') as f:
        transcript = client_with_timeout.audio.transcriptions.create(
            model='whisper-1',
            file=f,
            response_format='verbose_json',
            timestamp_granularities=['word', 'segment']
        )
    
    print(f"✅ Transcription complete")
    print()
    
    # --- Step 3: analyze transcript vs. reference ---------------------------
    transcribed_text = transcript.text
    segments = transcript.segments
    words = transcript.words if hasattr(transcript, 'words') else []
    
    # Speech vs. silence breakdown from segment timestamps.
    speech_time = sum(seg.end - seg.start for seg in segments) if segments else actual_duration
    speech_ratio = (speech_time / actual_duration) * 100
    
    print(f"📊 Speech Analysis:")
    print(f"   Duration: {actual_duration:.1f}s")
    print(f"   Speech: {speech_time:.1f}s ({speech_ratio:.1f}%)")
    print(f"   Silence: {actual_duration - speech_time:.1f}s ({100-speech_ratio:.1f}%)")
    print(f"   Segments: {len(segments)}")
    print()
    
    # Gap analysis: long inter-segment silences may indicate dropped chunks.
    if segments:
        gaps = []
        for i in range(len(segments) - 1):
            gap = segments[i+1].start - segments[i].end
            if gap > 0.5:
                gaps.append((i+1, gap, segments[i].text[-30:], segments[i+1].text[:30]))
        
        if gaps:
            print(f"⚠️  Large gaps detected: {len(gaps)}")
            for idx, gap_sec, before, after in gaps[:5]:
                print(f"   Gap #{idx}: {gap_sec:.1f}s between chunks")
                print(f"      Before: ...{before}")
                print(f"      After: {after}...")
        else:
            print(f"✅ No large gaps (>500ms)")
    print()
    
    # Find missing words with context
    print("🔍 Analyzing missing words...")
    missing_words = find_missing_words_with_context(REFERENCE_TEXT, transcribed_text)
    
    ref_word_list = [w['word'] for w in get_word_positions(REFERENCE_TEXT)]
    
    print(f"\n📊 Word Statistics:")
    print(f"   Reference words: {len(ref_word_list)}")
    print(f"   Transcribed words: {len(transcribed_text.split())}")
    print(f"   Missing words: {len(missing_words)}")
    print(f"   Capture rate: {(len(ref_word_list) - len(missing_words)) / len(ref_word_list) * 100:.1f}%")
    print()
    
    # Show EACH missing word with context and an estimated timestamp so the
    # user can listen at that point and decide: ASR error vs. TTS error.
    if missing_words:
        print("=" * 80)
        print(f"🔍 MISSING WORDS - DETAILED ANALYSIS ({len(missing_words)} words)")
        print("=" * 80)
        print()
        
        for i, missing in enumerate(missing_words, 1):
            # Estimate timestamp assuming a uniform speaking rate.
            estimated_time = estimate_timestamp(
                missing['word_index'], 
                len(ref_word_list), 
                actual_duration
            )
            
            print(f"Missing Word #{i}: '{missing['word']}'")
            print(f"   Position in text: Character {missing['position']}")
            print(f"   Position in word list: Word #{missing['word_index']}/{len(ref_word_list)}")
            print(f"   ⏰ ESTIMATED TIMESTAMP: ~{estimated_time:.1f}s (±10s)")
            print(f"   Context: ...{missing['full_context']}...")
            print()
            print(f"   👂 TO VERIFY: Listen to audio at ~{estimated_time:.0f}s")
            print(f"      If you hear '{missing['word']}' → ASR error (Whisper missed it)")
            print(f"      If you DON'T hear it → TTS error (chunk was skipped/incomplete)")
            print()
            print("-" * 80)
            print()
    else:
        print("✅ NO MISSING WORDS! All content captured!")
        print()
    
    # Also check for repeated words (might indicate chunking overlap issues)
    trans_words_lower = [w.lower() for w in transcribed_text.split()]
    ref_words_lower = [w.lower() for w in REFERENCE_TEXT.split()]
    
    # Find words that appear more times in transcription than reference
    from collections import Counter
    trans_counts = Counter(trans_words_lower)
    ref_counts = Counter(ref_words_lower)
    
    repeated_words = []
    for word, trans_count in trans_counts.items():
        ref_count = ref_counts.get(word, 0)
        if trans_count > ref_count and ref_count > 0:
            repeated_words.append((word, ref_count, trans_count, trans_count - ref_count))
    
    if repeated_words:
        print("🔍 REPEATED WORDS (may indicate chunk overlap):")
        print("=" * 80)
        for word, ref_count, trans_count, extra in sorted(repeated_words, key=lambda x: x[3], reverse=True)[:10]:
            print(f"   '{word}': Expected {ref_count}x, got {trans_count}x (+{extra} extra)")
        print()
    
    # --- Step 4: write the detailed plain-text report -----------------------
    report_file = "word_analysis_report.txt"
    with open(report_file, 'w') as f:
        f.write("=" * 80 + "\n")
        f.write("COMPREHENSIVE WORD-LEVEL ANALYSIS REPORT\n")
        f.write("=" * 80 + "\n\n")
        
        f.write("TEST CONFIGURATION:\n")
        f.write(f"Text length: {len(TEST_TEXT)} chars\n")
        f.write(f"Reference words: {len(ref_word_list)}\n")
        f.write(f"Chunked: {chunked}\n")
        f.write(f"Duration: {actual_duration:.1f}s\n\n")
        
        f.write("RESULTS:\n")
        f.write(f"Speech ratio: {speech_ratio:.1f}%\n")
        f.write(f"Missing words: {len(missing_words)}\n")
        f.write(f"Capture rate: {(len(ref_word_list) - len(missing_words)) / len(ref_word_list) * 100:.1f}%\n\n")
        
        f.write("=" * 80 + "\n")
        f.write(f"MISSING WORDS WITH TIMESTAMPS ({len(missing_words)} words)\n")
        f.write("=" * 80 + "\n\n")
        
        for i, missing in enumerate(missing_words, 1):
            estimated_time = estimate_timestamp(missing['word_index'], len(ref_word_list), actual_duration)
            f.write(f"#{i}: '{missing['word']}'\n")
            f.write(f"   Timestamp: ~{estimated_time:.1f}s\n")
            f.write(f"   Context: ...{missing['full_context']}...\n")
            f.write(f"   Verification: Listen at {estimated_time:.0f}s to check if word was spoken\n\n")
        
        if repeated_words:
            f.write("=" * 80 + "\n")
            f.write("REPEATED WORDS\n")
            f.write("=" * 80 + "\n\n")
            for word, ref_count, trans_count, extra in repeated_words:
                f.write(f"'{word}': Expected {ref_count}x, got {trans_count}x (+{extra} extra)\n")
            f.write("\n")
        
        f.write("=" * 80 + "\n")
        f.write("REFERENCE TEXT (Original)\n")
        f.write("=" * 80 + "\n")
        f.write(REFERENCE_TEXT + "\n\n")
        
        f.write("=" * 80 + "\n")
        f.write("TRANSCRIBED TEXT (From ASR)\n")
        f.write("=" * 80 + "\n")
        f.write(transcribed_text + "\n\n")
        
        if segments:
            f.write("=" * 80 + "\n")
            f.write("SEGMENTS WITH TIMESTAMPS\n")
            f.write("=" * 80 + "\n\n")
            for i, seg in enumerate(segments):
                f.write(f"[{seg.start:.1f}s - {seg.end:.1f}s] Segment {i+1}:\n")
                f.write(f"{seg.text}\n\n")
    
    print(f"💾 Detailed report saved: {report_file}")
    print()
    
    # --- Step 5: console summary with verification instructions -------------
    print("=" * 80)
    print("🎯 VERIFICATION INSTRUCTIONS")
    print("=" * 80)
    print()
    print(f"Audio file: {OUTPUT_FILE}")
    print(f"Duration: {actual_duration:.1f}s ({actual_duration/60:.1f} minutes)")
    print()
    
    if missing_words:
        print(f"🔍 Found {len(missing_words)} missing words to verify:")
        print()
        for i, missing in enumerate(missing_words[:10], 1):  # Show first 10
            timestamp = estimate_timestamp(missing['word_index'], len(ref_word_list), actual_duration)
            print(f"   {i}. '{missing['word']}' at ~{timestamp:.0f}s")
            print(f"      Context: ...{missing['context_before']} [?] {missing['context_after']}...")
        
        if len(missing_words) > 10:
            print(f"   ... and {len(missing_words) - 10} more (see report)")
        
        print()
        print("🎧 HOW TO VERIFY:")
        print("   1. Play audio file at each timestamp")
        print("   2. Listen for the missing word in context")
        print("   3. If you HEAR it → ASR error (Whisper missed it)")
        print("   4. If you DON'T hear it → TTS error (chunk skipped)")
        print()
        print(f"   Example: ffplay -ss {estimate_timestamp(missing_words[0]['word_index'], len(ref_word_list), actual_duration):.0f} {OUTPUT_FILE}")
    else:
        print("✅ NO MISSING WORDS!")
        # Bug fix: this string was missing the f-prefix, so the literal
        # "{len(ref_word_list)}" was printed instead of the count.
        print(f"   All {len(ref_word_list)} words were captured by ASR!")
        print("   This means:")
        print("   - All chunks were processed ✅")
        print("   - No content was skipped ✅")
        print("   - ASR captured everything ✅")
    
    print()
    print(f"📄 Full report: {report_file}")
    print()
    
    # Final verdict based on the fraction of reference words captured.
    capture_rate = (len(ref_word_list) - len(missing_words)) / len(ref_word_list) * 100
    
    print("=" * 80)
    print("🏁 FINAL VERDICT")
    print("=" * 80)
    
    if capture_rate >= 95:
        print(f"✅ EXCELLENT: {capture_rate:.1f}% word capture rate")
        print(f"   Missing: {len(missing_words)} words (likely ASR errors, not chunking errors)")
    elif capture_rate >= 90:
        print(f"✅ GOOD: {capture_rate:.1f}% word capture rate")
        print(f"   Missing: {len(missing_words)} words - verify manually")
    elif capture_rate >= 80:
        print(f"⚠️  ACCEPTABLE: {capture_rate:.1f}% word capture rate")
        print(f"   Missing: {len(missing_words)} words - some may be skipped chunks")
    else:
        print(f"❌ POOR: {capture_rate:.1f}% word capture rate")
        print(f"   Missing: {len(missing_words)} words - likely chunking issues")
    
    print()


if __name__ == '__main__':
    # Run the full generate → transcribe → analyze pipeline when executed
    # as a script (no-op on import).
    main()

