#!/usr/bin/env python3
"""
ASR Validation using OpenAI Whisper API

Tests:
1. Transcribe audio with timestamps
2. Detect silence gaps between segments
3. Compare transcription with original text
4. Verify all chunks were processed
"""

import os
import sys
from pathlib import Path
from openai import OpenAI

# Initialize OpenAI client
# NOTE: this runs at import time — the script fails fast (exit code 1)
# if the API key is not present in the environment.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
if not OPENAI_API_KEY:
    print("❌ OPENAI_API_KEY not set")
    sys.exit(1)

# Module-level client shared by all functions below.
client = OpenAI(api_key=OPENAI_API_KEY)

# Original text
ORIGINAL_TEXT = """Once upon a time, in a small village nestled in the mountains, there lived a young girl named Maya. She had a special gift - she could understand the language of birds. Every morning, she would wake up to the melodious songs of the sparrows outside her window. They would tell her stories about their travels and adventures. One day, a majestic eagle landed on her windowsill. It brought news of a distant land where all the birds were disappearing. एक बार की बात है, पहाड़ों में बसे एक छोटे से गाँव में माया नाम की एक युवा लड़की रहती थी। उसके पास एक विशेष उपहार था - वह पक्षियों की भाषा समझ सकती थी। हर सुबह, वह अपनी खिड़की के बाहर गौरैयों के मधुर गीतों के साथ उठती थी। वे उसे अपनी यात्राओं और रोमांचों की कहानियां सुनाते थे। एक दिन, एक राजसी चील उसकी खिड़की पर उतरी। यह एक दूर देश की खबर लाई जहाँ सभी पक्षी गायब हो रहे थे। Maya knew she had to help. She packed her bag with essentials and set off on a long journey. The path was treacherous, winding through dense forests and across rushing rivers. But Maya was determined. She asked every bird she met for directions. The crows guided her through the forest. The geese helped her cross the rivers. The owls kept watch over her at night, ensuring her safety. माया जानती थी कि उसे मदद करनी है। उसने अपना बैग आवश्यक चीजों से भर लिया और एक लंबी यात्रा पर निकल पड़ी। रास्ता कठिन था, घने जंगलों और तेज़ नदियों के पार घूमता हुआ। लेकिन माया दृढ़ थी। उसने हर पक्षी से दिशा-निर्देश पूछे। कौवों ने उसे जंगल के माध्यम से मार्गदर्शन किया। हंसों ने उसे नदियाँ पार करने में मदद की। उल्लुओं ने रात में उस पर नज़र रखी, उसकी सुरक्षा सुनिश्चित करते हुए। After many days of travel, Maya finally reached the land the eagle had spoken of. It was a barren wasteland, with no trees or vegetation. The sky was eerily silent. She discovered that a powerful sorcerer had cast a spell, trapping all the birds in cages of light. Using her gift, Maya spoke to the imprisoned birds, learning the spell weakness. 
With courage and determination, she confronted the sorcerer and broke the spell. The birds were freed, and the land came alive with their joyous songs once more. कई दिनों की यात्रा के बाद, माया आखिरकार उस भूमि पर पहुंची जिसके बारे में चील ने बात की थी। यह एक बंजर भूमि थी, जिसमें न पेड़ थे और न ही वनस्पति। आकाश भयानक रूप से शांत था। उसने पाया कि एक शक्तिशाली जादूगर ने एक मंत्र डाला था, सभी पक्षियों को प्रकाश के पिंजरों में फंसाया था। अपने उपहार का उपयोग करते हुए, माया ने कैद पक्षियों से बात की। साहस और दृढ़ संकल्प के साथ, उसने जादूगर का सामना किया और मंत्र तोड़ दिया। पक्षियों को मुक्त कर दिया गया। Maya returned home as a hero. The birds celebrated her bravery with a grand festival. From that day forward, she was known as the Guardian of Birds."""

def analyze_silence_gaps(segments, threshold=0.5):
    """Report pauses between consecutive transcript segments.

    Args:
        segments: sequence of objects exposing ``start``, ``end`` and
            ``text`` attributes (Whisper ``verbose_json`` segments).
        threshold: minimum pause length in seconds to count as a gap.
            Defaults to 0.5 (500 ms), preserving the original behavior.

    Returns:
        List of dicts, one per detected gap, with the 1-based index of the
        segment the gap follows, its length, and snippets of the
        surrounding text for context.
    """
    print("\n🔍 ANALYZING SILENCE GAPS")
    print("=" * 80)

    gaps = []
    # Walk consecutive segment pairs; the gap is the pause between one
    # segment's end and the next segment's start.
    for i, (cur, nxt) in enumerate(zip(segments, segments[1:])):
        gap = nxt.start - cur.end
        if gap > threshold:
            gaps.append({
                'after_segment': i + 1,
                'gap_seconds': gap,
                'segment_end_text': cur.text[-50:],
                'next_segment_text': nxt.text[:50],
            })

    if gaps:
        print(f"⚠️  Found {len(gaps)} significant gaps (>500ms):")
        for g in gaps:
            print(f"   Gap #{g['after_segment']}: {g['gap_seconds']:.2f}s")
            print(f"     Before: ...{g['segment_end_text']}")
            print(f"     After: {g['next_segment_text']}...")
            print()
    else:
        print("✅ No significant gaps detected")

    return gaps

def calculate_speech_ratio(segments, total_duration):
    """Print and return the percentage of the audio that is speech.

    Args:
        segments: sequence of objects with ``start``/``end`` attributes
            (seconds) covering the spoken portions of the audio.
        total_duration: total audio length in seconds.

    Returns:
        Speech percentage in [0, 100]; 0.0 when ``total_duration`` is not
        positive (previously this raised ``ZeroDivisionError``).
    """
    print("\n📊 SPEECH vs SILENCE ANALYSIS")
    print("=" * 80)

    # Guard against an empty/corrupt file reporting zero duration.
    if total_duration <= 0:
        print("⚠️  Invalid total duration - cannot compute speech ratio")
        return 0.0

    total_speech = sum(seg.end - seg.start for seg in segments)
    speech_ratio = (total_speech / total_duration) * 100
    silence = total_duration - total_speech

    print(f"Total duration: {total_duration:.2f}s")
    print(f"Total speech: {total_speech:.2f}s ({speech_ratio:.1f}%)")
    print(f"Total silence: {silence:.2f}s ({100-speech_ratio:.1f}%)")

    if speech_ratio < 70:
        print(f"⚠️  WARNING: Only {speech_ratio:.1f}% speech - excessive silence!")
    else:
        print(f"✅ Good speech ratio: {speech_ratio:.1f}%")

    return speech_ratio

def compare_texts(original, transcribed):
    """Word-level comparison of the reference text against the ASR output.

    Both strings are lowercased, stripped of punctuation, and split into
    sets of distinct words. Coverage is the percentage of distinct
    reference words that also appear in the transcription.

    Returns:
        Coverage percentage (0 when the reference normalizes to nothing).
    """
    import re

    def _clean(raw):
        # Lowercase, drop punctuation, collapse whitespace runs.
        lowered = raw.lower()
        no_punct = re.sub(r'[^\w\s]', '', lowered)
        return re.sub(r'\s+', ' ', no_punct).strip()

    reference_words = set(_clean(original).split())
    hypothesis_words = set(_clean(transcribed).split())

    shared = reference_words & hypothesis_words
    dropped = reference_words - hypothesis_words
    inserted = hypothesis_words - reference_words

    coverage = len(shared) / len(reference_words) * 100 if reference_words else 0

    print("\n📝 TEXT COMPARISON")
    print("=" * 80)
    print(f"Original words: {len(reference_words)}")
    print(f"Transcribed words: {len(hypothesis_words)}")
    print(f"Common words: {len(shared)} ({coverage:.1f}% coverage)")
    print(f"Missing words: {len(dropped)}")
    print(f"Extra words: {len(inserted)}")

    if dropped and len(dropped) <= 20:
        print(f"\nMissing words: {', '.join(list(dropped)[:20])}")

    if coverage < 50:
        print(f"❌ FAIL: Only {coverage:.1f}% coverage - chunks may be missing!")
    elif coverage < 70:
        print(f"⚠️  WARNING: {coverage:.1f}% coverage - some content may be missing")
    else:
        print(f"✅ PASS: {coverage:.1f}% coverage")

    return coverage

def main():
    """Run the full ASR validation pipeline on the test WAV file.

    Steps: probe the audio duration with ffprobe, transcribe via the
    OpenAI Whisper API, analyze silence gaps and speech ratio, compare
    the transcription against ORIGINAL_TEXT, write a report file, and
    print a final pass/fail verdict.

    Exits with status 1 if the audio file is missing or its duration
    cannot be determined.
    """
    audio_file = "test4_fixed_long_story.wav"

    if not os.path.exists(audio_file):
        print(f"❌ Audio file not found: {audio_file}")
        sys.exit(1)

    print("=" * 80)
    print("🎙️  ASR VALIDATION WITH OPENAI WHISPER")
    print("=" * 80)
    print(f"Audio file: {audio_file}")
    print(f"Original text: {len(ORIGINAL_TEXT)} chars")
    print()

    # Probe container duration with ffprobe (argument list, no shell).
    import subprocess
    try:
        result = subprocess.run(
            ['ffprobe', '-v', 'error', '-show_entries', 'format=duration',
             '-of', 'default=noprint_wrappers=1:nokey=1', audio_file],
            capture_output=True, text=True
        )
        total_duration = float(result.stdout.strip())
    except (FileNotFoundError, ValueError):
        # FileNotFoundError: ffprobe binary not installed.
        # ValueError: ffprobe failed and stdout is empty/junk.
        # Previously this crashed with an unhandled traceback.
        print("❌ Could not determine audio duration (is ffprobe installed?)")
        sys.exit(1)
    print(f"Audio duration: {total_duration:.2f}s ({total_duration/60:.1f} minutes)")
    print()

    # Transcribe with OpenAI Whisper API
    print("🔄 Transcribing with OpenAI Whisper API...")
    print("   (This may take 30-60 seconds for 4 minute audio)")

    with open(audio_file, 'rb') as f:
        transcript = client.audio.transcriptions.create(
            model="whisper-1",
            file=f,
            response_format="verbose_json",
            timestamp_granularities=["segment"]
        )

    print(f"✅ Transcription complete")
    print()

    # Extract segments (verbose_json responses carry per-segment timing).
    segments = transcript.segments if hasattr(transcript, 'segments') else []
    full_text = transcript.text

    print(f"📊 Transcription stats:")
    print(f"   Total segments: {len(segments)}")
    print(f"   Transcribed text: {len(full_text)} chars")
    print()

    # Show first 500 chars
    print("📝 Transcription preview (first 500 chars):")
    print("-" * 80)
    print(full_text[:500])
    print("...")
    print()

    # Analyze gaps and speech ratio (only meaningful with segment timing).
    if segments:
        analyze_silence_gaps(segments)
        speech_ratio = calculate_speech_ratio(segments, total_duration)

    # Compare texts
    coverage = compare_texts(ORIGINAL_TEXT, full_text)

    # Save full transcription. encoding='utf-8' is required: the report
    # contains Devanagari text, which breaks on platforms whose default
    # encoding (e.g. cp1252) cannot represent it.
    output_file = audio_file.replace('.wav', '_transcription.txt')
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("=" * 80 + "\n")
        f.write("ORIGINAL TEXT\n")
        f.write("=" * 80 + "\n")
        f.write(ORIGINAL_TEXT + "\n\n")
        f.write("=" * 80 + "\n")
        f.write("TRANSCRIBED TEXT\n")
        f.write("=" * 80 + "\n")
        f.write(full_text + "\n\n")
        f.write("=" * 80 + "\n")
        f.write("SEGMENTS WITH TIMESTAMPS\n")
        f.write("=" * 80 + "\n")
        if segments:
            for i, seg in enumerate(segments):
                f.write(f"\nSegment {i+1}: [{seg.start:.2f}s - {seg.end:.2f}s]\n")
                f.write(f"{seg.text}\n")

    print(f"\n💾 Full transcription saved to: {output_file}")

    # Final verdict
    print("\n" + "=" * 80)
    print("🏁 FINAL VERDICT")
    print("=" * 80)

    if segments:
        if speech_ratio >= 70 and coverage >= 70:
            print("✅ PASS: Good speech ratio and text coverage")
        elif speech_ratio < 70:
            print("⚠️  ISSUE: Excessive silence detected")
            print("   Possible causes:")
            print("   - Crossfade creating gaps instead of overlap")
            print("   - Model generating silence between chunks")
            print("   - Padding in audio chunks")
        elif coverage < 70:
            print("⚠️  ISSUE: Low text coverage")
            print("   Possible causes:")
            print("   - Some chunks failed to generate")
            print("   - Hindi text not transcribed (expected)")
            print("   - Audio quality issues")

    print()

if __name__ == '__main__':
    main()

