#!/usr/bin/env python3
"""
Deep analysis of the 86 missing words
Find patterns: Are they emotion tags? Specific sections? Language issues?
"""

import os
import sys
import re
from openai import OpenAI
from collections import Counter

# Pull the API key from the environment. When it is absent (or empty) the
# script reports the problem and exits with status 1 before any client is
# constructed; otherwise the shared OpenAI client is built from the key.
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
if OPENAI_API_KEY:
    client = OpenAI(api_key=OPENAI_API_KEY)
else:
    print("❌ OPENAI_API_KEY not set!")
    sys.exit(1)

# Reference text in the correct tag format: opening emotion tags only (e.g. <excited>), no closing tags.
REFERENCE_TEXT = """<excited> In the heart of an enchanted kingdom, where magic flowed like rivers and mythical beasts roamed freely, there lived a legendary warrior princess named Aisha. She was known throughout the land for her incredible courage, her mastery of ancient combat arts, and her ability to communicate with all living creatures.

<whisper> But few knew of the prophecy that surrounded her birth. The ancient seers had foretold that she would one day face the greatest darkness the world had ever seen, and only she possessed the power to defeat it.

Aisha spent her childhood training in the sacred mountains with the wise monks. <curious> They taught her not just the physical arts of combat, but also the spiritual disciplines that would strengthen her mind and soul. Every dawn, she would practice her sword techniques as the sun rose over the peaks. Every evening, she would meditate under the stars, learning to harness the cosmic energies that flowed through all things.

<angry> One fateful day, dark clouds gathered over the kingdom! The evil demon lord Ravaksh had awakened from his thousand-year slumber. His army of shadow warriors began to sweep across the land, corrupting everything they touched. Villages were destroyed, forests withered, and despair spread like wildfire.

<sigh> The kingdom's armies fought valiantly, but they were no match for Ravaksh's dark magic. One by one, the greatest warriors fell. The king's advisors urged him to flee, to abandon the kingdom and save what few lives they could.

<excited> But Aisha refused to give up! She knew this was the moment the prophecy had spoken of. She gathered the remaining defenders and devised a daring plan. They would launch a surprise attack on Ravaksh's fortress while he was still consolidating his power.

एक बार की बात है, ऐशा ने अपने सबसे भरोसेमंद योद्धाओं को इकट्ठा किया। उन्होंने एक खतरनाक योजना बनाई। वे रावक्ष के किले पर हमला करेंगे। यह असंभव लग रहा था, लेकिन उनके पास कोई विकल्प नहीं था। राज्य का भविष्य उनके हाथों में था।

<whisper> Under the cover of darkness, they approached the fortress. The shadow guards patrolled the walls, their red eyes glowing in the night. <laugh> But Aisha had learned from the wind spirits how to move unseen! She led her team through secret passages known only to the ancient monks.

The infiltration was successful. They reached the inner sanctum where Ravaksh was performing a dark ritual. <angry> He was trying to summon an even greater evil from the void! There was no time to waste.

<excited> Aisha drew her legendary sword, which began to glow with divine light! The blade had been forged by the celestial smiths themselves, imbued with the power to vanquish any darkness. As she charged forward, her companions engaged the shadow warriors, giving her a clear path to Ravaksh.

The battle that followed was epic. <curious> Ravaksh wielded powers that defied comprehension - he could bend reality itself, create illusions that seemed more real than reality, and summon storms of pure destruction. But Aisha had trained for this her entire life.

She dodged his dark bolts, deflected his curse spells, and pressed forward relentlessly. <whisper> She remembered the words of her master: "True strength comes not from power, but from purpose." Her purpose was clear - protect her people, save her kingdom, restore the light.

With a mighty cry, she leaped high into the air, her sword blazing like a star. <excited> The blade struck true, piercing through Ravaksh's dark armor and into his corrupted heart! The demon lord let out a terrible scream as the divine light consumed him.

<laugh> The shadow army dissolved like smoke in the wind! The dark clouds parted, and sunlight flooded the land once more. The corrupted forests began to heal, flowers bloomed, and the rivers ran clear again.

<excited> Aisha returned to her kingdom as a hero beyond measure! The people celebrated for seven days and seven nights. Songs were written about her bravery, statues were erected in her honor, and her legend would be told for a thousand generations.

<whisper> But Aisha remained humble. She knew that true victory was not in defeating enemies, but in protecting those she loved. <sigh> She had seen too much loss, too much suffering. From that day forward, she dedicated herself to teaching the next generation, ensuring they would be ready when darkness threatened again.

<excited> And so the kingdom prospered under her watchful protection, and peace reigned for many years to come!"""

# Audio file that is opened in binary mode and sent for transcription further
# down; the path is relative to the current working directory.
# NOTE(review): presumably the synthesized speech for REFERENCE_TEXT — confirm.
AUDIO_FILE = "correct_format_test.wav"

def clean_text(text):
    """Return *text* with every ``<...>`` tag removed and outer whitespace trimmed.

    Only the tags themselves are deleted; whitespace that surrounded a tag in
    the middle of the text is left in place.
    """
    tag_pattern = re.compile(r'<[^>]+>')
    without_tags = tag_pattern.sub('', text)
    return without_tags.strip()

def get_reference_words():
    """Tokenize the tag-free reference text into word records.

    Returns:
        A list of dicts in text order, one per regex match, each holding
        ``word`` (the lowercased token), ``position`` (character offset of the
        token within the cleaned, lowercased text) and ``index`` (0-based
        ordinal of the token).
    """
    lowered = clean_text(REFERENCE_TEXT).lower()
    # NOTE(review): Python's ``\w`` does not match combining marks, so
    # Devanagari vowel signs split Hindi words into several tokens here. The
    # transcript is tokenized with an equivalent pattern, so both sides would
    # have to change together.
    return [
        {'word': hit.group(), 'position': hit.start(), 'index': ordinal}
        for ordinal, hit in enumerate(re.finditer(r'\b[\w]+\b', lowered))
    ]

def check_emotion_words():
    """Count each emotion word both as a tag and as ordinary narrative text.

    Returns:
        A dict keyed by emotion word (in the fixed order listed below). Each
        value is ``{'in_tags': int, 'in_text': int}``: how many ``<word>``
        tags occur in the raw reference, and how many whole-word occurrences
        remain once the tags are stripped and the text is lowercased.
    """
    emotion_words = ['excited', 'whisper', 'curious', 'angry', 'sigh', 'laugh']
    spoken_text = clean_text(REFERENCE_TEXT).lower()

    def tally(name):
        # Tags are counted in the raw text, plain words in the cleaned text.
        tag_hits = re.findall(f'<{name}>', REFERENCE_TEXT)
        word_hits = re.findall(rf'\b{name}\b', spoken_text)
        return {'in_tags': len(tag_hits), 'in_text': len(word_hits)}

    return {name: tally(name) for name in emotion_words}

# Console banner.
for banner_line in ("="*80, "🔍 DEEP ANALYSIS OF 86 MISSING WORDS", "="*80):
    print(banner_line)
print()

# Tokenize the reference once and keep three views of it for the later
# comparisons: the full records, the bare word sequence, and word frequencies.
ref_words = get_reference_words()
ref_word_list = [record['word'] for record in ref_words]
ref_word_counter = Counter(ref_word_list)

print(
    f"📝 Reference Analysis:",
    f"   Total words: {len(ref_words)}",
    f"   Unique words: {len(ref_word_counter)}",
    sep="\n",
)
print()

# How often each emotion word occurs as a tag versus as narrative text.
emotion_analysis = check_emotion_words()
print("📊 Emotion Words in Reference Text:")
for tag_name, tally in emotion_analysis.items():
    tag_count, text_count = tally['in_tags'], tally['in_text']
    print(f"   {tag_name}: {tag_count} tag(s), {text_count} in text")
print()

# Transcribe the audio file. The "text" response format is requested and the
# result is treated as a plain string from here on.
print("🎧 Transcribing...")
whisper_options = {'model': "whisper-1", 'response_format': "text"}
with open(AUDIO_FILE, 'rb') as audio_handle:
    transcription = client.audio.transcriptions.create(
        file=audio_handle,
        **whisper_options
    )

# Lowercase and tokenize the transcript, then build its word-frequency table.
transcript_lower = transcription.lower()
word_pattern = re.compile(r'\b\w+\b')
trans_words = word_pattern.findall(transcript_lower)
trans_word_counter = Counter(trans_words)

print(
    f"✅ Done!",
    f"   Transcribed words: {len(trans_words)}",
    f"   Unique words: {len(trans_word_counter)}",
    sep="\n",
)
print()

# Decide, per emotion word, whether the tag itself was read aloud: the word
# showing up in the transcript more often than it legitimately occurs in the
# tag-free reference text is taken as evidence of that.
print("🔍 Checking if Emotion Tags are Being SPOKEN:")
emotion_spoken = {}
for tag_name, ref_counts in emotion_analysis.items():
    baseline = ref_counts['in_text']
    heard = trans_word_counter.get(tag_name, 0)
    surplus = heard - baseline  # may be negative when the word was dropped
    voiced = surplus > 0

    emotion_spoken[tag_name] = {
        'expected': baseline,
        'actual': heard,
        'being_spoken': voiced,
        'extra': surplus
    }

    if voiced:
        print(f"   {tag_name}: ❌ SPOKEN (expected {baseline}, found {heard})")
        print(f"      → Tag spoken {surplus} extra time(s)")
    else:
        print(f"   {tag_name}: ✅ Not spoken (expected {baseline}, found {heard})")

print()

# Only positive surpluses count towards the total of wrongly spoken tag words.
total_emotion_words_spoken = sum(
    max(info['extra'], 0) for info in emotion_spoken.values()
)
print(f"📊 Total emotion tag words being SPOKEN: {total_emotion_words_spoken}")
print()

# Find actually missing words (not emotion tags).
#
# This is a bag-of-words comparison: each reference token consumes one
# occurrence of the same word from the transcript, and a token is reported as
# missing only once that supply is used up. The number of entries per word is
# therefore exactly its deficit (reference count minus transcript count),
# rather than every occurrence of any word that is short by at least one.
# No alignment is performed, so the deficit is attributed to the LATER
# occurrences of a word; position and context are approximate for repeats.
unmatched_budget = Counter(trans_word_counter)
total_ref_words = len(ref_word_list)
missing_words = []
for i, word in enumerate(ref_word_list):
    if unmatched_budget[word] > 0:
        # Still covered by a transcript occurrence: consume it and move on.
        unmatched_budget[word] -= 1
        continue

    # Up to three words of context on each side (slices clamp at the ends).
    context_before = ' '.join(ref_word_list[max(0, i-3):i])
    context_after = ' '.join(ref_word_list[i+1:i+4])

    missing_words.append({
        'word': word,
        'index': i,
        # Relative position of the token within the reference, in [0, 1).
        'position_pct': i / total_ref_words,
        'context': f"...{context_before} [{word}] {context_after}...",
        # A token counts as Hindi when it contains any Devanagari code point.
        'is_hindi': bool(re.search(r'[\u0900-\u097F]', word)),
    })

print(f"📉 Actually Missing Words: {len(missing_words)}")
print()

# Analyze patterns: split the missing tokens by script.
hindi_missing = [w for w in missing_words if w['is_hindi']]
english_missing = [w for w in missing_words if not w['is_hindi']]

print(f"   Hindi words: {len(hindi_missing)}")
print(f"   English words: {len(english_missing)}")
print()

# Position analysis: bucket every missing word by where it sits in the
# reference (first third, middle third, final third by relative index).
start_missing, middle_missing, end_missing = [], [], []
for entry in missing_words:
    relative_pos = entry['position_pct']
    if relative_pos < 0.33:
        start_missing.append(entry)
    elif relative_pos < 0.66:
        middle_missing.append(entry)
    else:
        end_missing.append(entry)

print(f"   Start (0-33%): {len(start_missing)} words")
print(f"   Middle (33-66%): {len(middle_missing)} words")
print(f"   End (66-100%): {len(end_missing)} words")
print()

# Word frequency analysis: which words were flagged more than once.
missing_word_freq = Counter(entry['word'] for entry in missing_words)
repeated_missing = {}
for token, times in missing_word_freq.items():
    if times > 1:
        repeated_missing[token] = times

if repeated_missing:
    print(f"🔄 Words Missing Multiple Times:")
    # Most frequently flagged first; ties keep their first-seen order.
    worst_first = sorted(
        repeated_missing.items(), key=lambda pair: pair[1], reverse=True
    )
    for token, times in worst_first[:10]:
        in_reference = ref_word_counter[token]
        in_transcript = trans_word_counter.get(token, 0)
        print(f"   '{token}': expected {in_reference}x, found {in_transcript}x, missing {times}x")
    print()

# Show examples of missing words: up to ten from each third of the text.
print("="*80)
print("📋 EXAMPLES OF MISSING WORDS")
print("="*80)
print()

sample_sections = (
    ("From START (first 10):", start_missing),
    ("From MIDDLE (first 10):", middle_missing),
    ("From END (first 10):", end_missing),
)
for heading, section in sample_sections:
    print(heading)
    for entry in section[:10]:
        print(f"   Word {entry['index']}: {entry['word']}")
        print(f"      Context: {entry['context']}")
    print()

# Hindi section analysis
print("="*80)
print("🇮🇳 HINDI SECTION ANALYSIS")
print("="*80)
print()

# Does either side contain any Devanagari characters at all?
devanagari_run = re.compile(r'[\u0900-\u097F]+')
hindi_text_in_ref = devanagari_run.search(REFERENCE_TEXT) is not None
hindi_text_in_trans = devanagari_run.search(transcription) is not None

print(f"Hindi in reference: {hindi_text_in_ref}")
print(f"Hindi in transcript: {hindi_text_in_trans}")
print()

# Look for romanized Hindi keywords in the lowercased transcript.
# NOTE(review): this is a substring test, not a whole-word test, and "aisha"
# also occurs in the English narrative, so a hit does not prove the Hindi
# section was transcribed.
hindi_keywords = ['ek', 'baar', 'aisha', 'kiya', 'kile', 'hamla']
hindi_found = len([kw for kw in hindi_keywords if kw in transcript_lower])
print(f"Hindi keywords found: {hindi_found}/{len(hindi_keywords)}")
print()

# Summary
print("="*80)
print("💡 PATTERN SUMMARY")
print("="*80)
print()

# NOTE(review): the headline figure of 86 is hard-coded; it is not derived
# from missing_words, so the breakdown below need not add up to it.
print(f"Total missing: 86 words")
breakdown = (
    ("Emotion tags being spoken", total_emotion_words_spoken),
    ("Hindi/language issues", len(hindi_missing)),
    ("ASR errors (English)", len(english_missing)),
)
for label, amount in breakdown:
    print(f"  - {label}: ~{amount} words")
print()

# Ratio of transcript word count (minus spoken tag words) to reference word
# count, as a percentage. This compares counts only, not word identity.
net_transcribed = len(trans_words) - total_emotion_words_spoken
accuracy_without_emotion_tags = net_transcribed / len(ref_words) * 100
print(f"Accuracy if emotion tags weren't spoken: {accuracy_without_emotion_tags:.1f}%")
print()

# Save detailed report.
# The missing words and their contexts come from the reference text, which
# includes Devanagari, so the encoding is pinned to UTF-8. Relying on the
# platform default can raise UnicodeEncodeError where that default is not
# UTF-8.
with open("missing_words_analysis.txt", 'w', encoding="utf-8") as f:
    f.write("MISSING WORDS DEEP ANALYSIS\n")
    f.write("="*80 + "\n\n")

    f.write(f"Total missing: {len(missing_words)}\n\n")

    # Only emotion words heard more often than the reference text justifies.
    f.write("EMOTION TAGS BEING SPOKEN:\n")
    for emotion, data in emotion_spoken.items():
        if data['being_spoken']:
            f.write(f"  {emotion}: spoken {data['extra']} extra times\n")
    f.write(f"\nTotal emotion words spoken: {total_emotion_words_spoken}\n\n")

    f.write("MISSING WORDS BY POSITION:\n")
    f.write(f"  Start: {len(start_missing)}\n")
    f.write(f"  Middle: {len(middle_missing)}\n")
    f.write(f"  End: {len(end_missing)}\n\n")

    # One two-line entry per missing word: index, word, relative position,
    # then its surrounding context.
    f.write("ALL MISSING WORDS:\n")
    f.write("-"*80 + "\n")
    for w in missing_words:
        f.write(f"\nWord {w['index']}: {w['word']} ({w['position_pct']*100:.0f}%)\n")
        f.write(f"  Context: {w['context']}\n")

print("📄 Detailed report: missing_words_analysis.txt")
print()

