#!/usr/bin/env python3
"""
Comprehensive Validation for Telugu & Hindi Text Normalization

Tests that Indic language text is preserved correctly and not mangled.
"""

import sys
import os
from pathlib import Path
from datetime import datetime

# Ensure repo root is on sys.path so `veena3modal` can be imported when running from scripts/.
repo_root = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(repo_root))

from veena3modal.processing.text_normalizer import normalize_text


# Comprehensive Telugu test cases.
#
# Layout: {category title: [(input_text, expected_output, description), ...]}.
# run_validation() passes each input_text to normalize_text() and compares the
# result to expected_output with plain `==`, so every character in these
# literals (including spaces and any invisible code points) is significant.
TELUGU_TESTS = {
    # Pure Telugu input: expected output is identical to the input.
    "Basic Telugu Words": [
        ("నమస్కారం", "నమస్కారం", "Should keep Telugu greeting intact"),
        ("ధన్యవాదాలు", "ధన్యవాదాలు", "Should keep Telugu thank you intact"),
        ("మీరు ఎలా ఉన్నారు", "మీరు ఎలా ఉన్నారు", "Should keep Telugu question intact"),
    ],
    
    # Digits are expected to be spelled out in English words.
    "Telugu with Numbers": [
        ("నాకు 5 పుస్తకాలు ఉన్నాయి", "నాకు five పుస్తకాలు ఉన్నాయి", "Number should expand, Telugu preserved"),
        ("ధర 100 రూపాయలు", "ధర one hundred రూపాయలు", "Number should expand, Telugu preserved"),
        ("నా వయస్సు 25 సంవత్సరాలు", "నా వయస్సు twenty five సంవత్సరాలు", "Number should expand, Telugu preserved"),
    ],
    
    "Telugu with Temperature": [
        ("ఉష్ణోగ్రత 32°C", "ఉష్ణోగ్రత thirty two degrees celsius", "Temperature expansion, Telugu preserved"),
        ("నేడు 25°C ఉంది", "నేడు twenty five degrees celsius ఉంది", "Temperature with Telugu"),
    ],
    
    # The first amount uses Indian digit grouping (1,23,456).
    "Telugu with Currency": [
        ("ధర ₹1,23,456", "ధర one hundred twenty three thousand four hundred fifty six rupees", "Currency expansion, Telugu preserved"),
        ("జీతం ₹50,000 నెలకు", "జీతం fifty thousand rupees నెలకు", "Salary with Telugu"),
    ],
    
    # NOTE(review): in the first case the Telugu word before the URL appears to
    # contain a zero-width non-joiner in the input that is absent from the
    # expected string - confirm with a code-point dump before editing.
    "Telugu with URLs": [
        ("వెబ్‌సైట్ https://example.com చూడండి", "వెబ్సైట్ example dot com చూడండి", "URL expansion, Telugu preserved"),
        ("సందర్శించండి www.google.com", "సందర్శించండి google dot com", "URL with Telugu"),
    ],
    
    "Telugu with Emails": [
        ("నన్ను test@mail.com వద్ద సంప్రదించండి", "నన్ను test at mail dot com వద్ద సంప్రదించండి", "Email expansion, Telugu preserved"),
        ("ఇమెయిల్ user@company.com పంపండి", "ఇమెయిల్ user at company dot com పంపండి", "Email with Telugu"),
    ],
    
    # Dates are written day/month/year in the inputs below.
    "Telugu with Dates": [
        ("సమావేశం 13/11/2025 న", "సమావేశం thirteen november two thousand twenty five న", "Date expansion, Telugu preserved"),
        ("పుట్టినరోజు 15/08/1990", "పుట్టినరోజు fifteen august nineteen ninety", "Birthday with Telugu"),
    ],
    
    "Telugu with Times": [
        ("సమావేశం 3:45 PM కు", "సమావేశం three forty five p m కు", "Time expansion, Telugu preserved"),
        ("రండి 10:30 AM కి", "రండి ten thirty a m కి", "Time with Telugu"),
    ],
    
    # Phone numbers are expected digit by digit, with commas between groups.
    "Telugu with Phone Numbers": [
        ("ఫోన్ +91 98765 43210", "ఫోన్ plus nine one, nine eight seven six five, four three two one zero", "Phone expansion, Telugu preserved"),
        ("కాల్ చేయండి (555) 123-4567", "కాల్ చేయండి five five five, one two three, four five six seven", "US phone with Telugu"),
    ],
    
    # Note the second case: a run of "!!!" is expected to become a single ",".
    "Telugu with Punctuation": [
        ("నువ్వు వస్తావా?", "నువ్వు వస్తావా?", "Question mark preserved"),
        ("అద్భుతం!!!", "అద్భుతం,", "Exclamations converted"),
        ("వెళ్దాం, వద్దా?", "వెళ్దాం, వద్దా?", "Comma and question preserved"),
    ],
    
    # In these expectations a sentence-ending "." is replaced by ",".
    # NOTE(review): the first input appears to contain zero-width non-joiners
    # that the expected string lacks - confirm with a code-point dump.
    "Telugu Complex Sentences": [
        ("నువ్వు ఈరోజు క్లాస్‌కి వెళ్తావా? వెళ్తే మనం కలిసే వెళ్దాం. టైమ్‌ ఏమంటావు?", 
         "నువ్వు ఈరోజు క్లాస్కి వెళ్తావా? వెళ్తే మనం కలిసే వెళ్దాం, టైమ్ ఏమంటావు?",
         "Complex sentence with multiple punctuations"),
        
        # This date is in ISO (year-month-day) form, unlike the date category above.
        ("సమావేశం 2025-11-13 న 3:45 PM కు. ధర ₹5,000. ఇమెయిల్ contact@company.com",
         "సమావేశం thirteen november two thousand twenty five న three forty five p m కు, ధర five thousand rupees, ఇమెయిల్ contact at company dot com",
         "Mixed date, time, currency, email"),
    ],
    
    # English words are expected to pass through with their casing unchanged.
    "Telugu with Mixed English": [
        ("Python programming చాలా బాగుంది", "Python programming చాలా బాగుంది", "English word with Telugu"),
        ("Machine Learning 101 కోర్సు", "Machine Learning one hundred one కోర్సు", "English with number and Telugu"),
    ],
}


# Comprehensive Hindi test cases.
#
# Layout: {category title: [(input_text, expected_output, description), ...]}.
# run_validation() passes each input_text to normalize_text() and compares the
# result to expected_output with plain `==`, so every character in these
# literals is significant.
HINDI_TESTS = {
    # Pure Hindi input: expected output is identical to the input.
    "Basic Hindi Words": [
        ("नमस्ते", "नमस्ते", "Should keep Hindi greeting intact"),
        ("धन्यवाद", "धन्यवाद", "Should keep Hindi thank you intact"),
        ("आप कैसे हैं", "आप कैसे हैं", "Should keep Hindi question intact"),
    ],
    
    # Digits are expected to be spelled out in English words.
    "Hindi with Numbers": [
        ("मेरे पास 5 किताबें हैं", "मेरे पास five किताबें हैं", "Number should expand, Hindi preserved"),
        ("कीमत 100 रुपये", "कीमत one hundred रुपये", "Number should expand, Hindi preserved"),
        ("मेरी उम्र 25 साल", "मेरी उम्र twenty five साल", "Number should expand, Hindi preserved"),
    ],
    
    "Hindi with Temperature": [
        ("तापमान 32°C है", "तापमान thirty two degrees celsius है", "Temperature expansion, Hindi preserved"),
        ("आज 25°C है", "आज twenty five degrees celsius है", "Temperature with Hindi"),
    ],
    
    # The first amount uses Indian digit grouping (1,23,456).
    "Hindi with Currency": [
        ("कीमत ₹1,23,456 है", "कीमत one hundred twenty three thousand four hundred fifty six rupees है", "Currency expansion, Hindi preserved"),
        ("वेतन ₹50,000 प्रति माह", "वेतन fifty thousand rupees प्रति माह", "Salary with Hindi"),
    ],
    
    "Hindi with URLs": [
        ("वेबसाइट https://example.com देखें", "वेबसाइट example dot com देखें", "URL expansion, Hindi preserved"),
        ("विजिट करें www.google.com", "विजिट करें google dot com", "URL with Hindi"),
    ],
    
    "Hindi with Emails": [
        ("मुझे test@mail.com पर संपर्क करें", "मुझे test at mail dot com पर संपर्क करें", "Email expansion, Hindi preserved"),
        ("ईमेल user@company.com भेजें", "ईमेल user at company dot com भेजें", "Email with Hindi"),
    ],
    
    # Dates are written day/month/year in the inputs below.
    "Hindi with Dates": [
        ("बैठक 13/11/2025 को है", "बैठक thirteen november two thousand twenty five को है", "Date expansion, Hindi preserved"),
        ("जन्मदिन 15/08/1990", "जन्मदिन fifteen august nineteen ninety", "Birthday with Hindi"),
    ],
    
    "Hindi with Times": [
        ("मीटिंग 3:45 PM पर", "मीटिंग three forty five p m पर", "Time expansion, Hindi preserved"),
        ("आइए 10:30 AM को", "आइए ten thirty a m को", "Time with Hindi"),
    ],
    
    # Phone numbers are expected digit by digit, with commas between groups.
    "Hindi with Phone Numbers": [
        ("फोन +91 98765 43210", "फोन plus nine one, nine eight seven six five, four three two one zero", "Phone expansion, Hindi preserved"),
        ("कॉल करें (555) 123-4567", "कॉल करें five five five, one two three, four five six seven", "US phone with Hindi"),
    ],
    
    # NOTE(review): the descriptions say "danda", but the expected strings
    # appear to use the ASCII vertical bar "|" rather than the Devanagari
    # danda U+0964 that the inputs contain - confirm this is what the
    # normalizer is meant to emit.
    "Hindi with Punctuation": [
        ("आप आओगे?", "आप आओगे?", "Question mark preserved"),
        ("अद्भुत!!!", "अद्भुत|", "Exclamations converted to danda"),
        ("चलें, ठीक है?", "चलें, ठीक है?", "Comma and question preserved"),
        ("यह है। वह है।", "यह है| वह है|", "Period converted to danda"),
    ],
    
    "Hindi Complex Sentences": [
        # This date is in ISO (year-month-day) form, unlike the date category above.
        ("बैठक 2025-11-13 को 3:45 PM है। कीमत ₹5,000। ईमेल contact@company.com",
         "बैठक thirteen november two thousand twenty five को three forty five p m है| कीमत five thousand rupees| ईमेल contact at company dot com",
         "Mixed date, time, currency, email"),
        
        # Note the last unit: "60km/h" is expected as "sixty kilometers h".
        ("तापमान 32°C, दूरी 5km, गति 60km/h",
         "तापमान thirty two degrees celsius, दूरी five kilometers, गति sixty kilometers h",
         "Multiple units"),
    ],
    
    # English words are expected to pass through with their casing unchanged.
    "Hindi with Mixed English": [
        ("Python programming बहुत अच्छा है", "Python programming बहुत अच्छा है", "English word with Hindi"),
        ("Machine Learning 101 कोर्स", "Machine Learning one hundred one कोर्स", "English with number and Hindi"),
    ],
}


def _run_language_suite(language, suites):
    """Run every test case of one language and print a per-case report.

    Args:
        language: Label stored in each result record (e.g. ``'Telugu'``).
        suites: Mapping of category title to a list of
            ``(original, expected, description)`` tuples.

    Returns:
        list[dict]: One record per test case with the keys ``language``,
        ``category``, ``description``, ``original``, ``expected``,
        ``normalized`` and ``status`` (``'pass'`` or ``'fail'``).
    """
    records = []

    for category, tests in suites.items():
        print(f"\n{category}")
        print("-" * 80)

        for original, expected, description in tests:
            normalized = normalize_text(original)

            # A case passes only on exact string equality with the expectation.
            matches = normalized == expected
            status = "✅ PASS" if matches else "❌ FAIL"

            print(f"\n{description}")
            print(f"  Original:   {original}")
            print(f"  Expected:   {expected}")
            print(f"  Normalized: {normalized}")
            print(f"  Status: {status}")

            if not matches:
                print("  ❌ MISMATCH!")

            records.append({
                'language': language,
                'category': category,
                'description': description,
                'original': original,
                'expected': expected,
                'normalized': normalized,
                'status': 'pass' if matches else 'fail'
            })

    return records


def _percent(part, whole):
    """Return ``part`` as a percentage of ``whole``; 0.0 when ``whole`` is 0."""
    return part / whole * 100 if whole else 0.0


def run_validation():
    """Run the Telugu and Hindi validation suites and print a report.

    Every case in ``TELUGU_TESTS`` and ``HINDI_TESTS`` is normalized with
    ``normalize_text`` and compared to its expected string. Per-case details
    and a final summary are printed to stdout.

    Returns:
        tuple: ``(all_results, passed, failed, total_tests)`` where
        ``all_results`` is a list of per-case result dicts (Telugu cases
        first, then Hindi) and the remaining items are integer counts.
    """

    print("=" * 100)
    print("COMPREHENSIVE TELUGU & HINDI NORMALIZATION VALIDATION")
    print("=" * 100)
    print()

    all_results = []

    # Both languages share one code path; only the label and the table differ.
    language_suites = (("Telugu", TELUGU_TESTS), ("Hindi", HINDI_TESTS))
    for index, (language, suites) in enumerate(language_suites):
        # Sections after the first are separated from the previous output
        # by two extra newlines before the banner.
        spacer = "\n\n" if index else ""
        print(spacer + "=" * 100)
        print(f"{language.upper()} TESTS")
        print("=" * 100)
        print()

        all_results.extend(_run_language_suite(language, suites))

    total_tests = len(all_results)
    passed = sum(1 for record in all_results if record['status'] == 'pass')
    failed = total_tests - passed

    # Summary
    print("\n\n" + "=" * 100)
    print("SUMMARY")
    print("=" * 100)
    print(f"\nTotal tests: {total_tests}")
    # _percent guards against ZeroDivisionError when no test case was run.
    print(f"Passed: {passed} ({_percent(passed, total_tests):.1f}%)")
    print(f"Failed: {failed} ({_percent(failed, total_tests):.1f}%)")

    if failed > 0:
        print(f"\n⚠️  {failed} test(s) failed. Review results above.")
    else:
        print("\n✅ All tests passed!")

    return all_results, passed, failed, total_tests


def main():
    """Run the validation and save the results as a Markdown report.

    The report is written to ``INDIC_NORMALIZATION_VALIDATION.md`` in the
    parent of the directory that contains this script, grouped by language
    and then by category.

    Returns:
        int: ``0`` when no test failed, ``1`` otherwise. The value is passed
        to ``sys.exit`` when the file is run as a script.
    """
    results, passed, failed, total = run_validation()

    # Resolve before walking up: with a relative __file__ (e.g. a bare
    # "script.py"), .parent.parent collapses to "." and the report would land
    # in the current directory instead of the repository root. This mirrors
    # how the module computes repo_root for sys.path.
    output_path = Path(__file__).resolve().parent.parent / "INDIC_NORMALIZATION_VALIDATION.md"

    # Avoid ZeroDivisionError if the run produced no test cases.
    success_rate = passed / total * 100 if total else 0.0

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write("# Telugu & Hindi Text Normalization Validation\n\n")
        # Two trailing spaces before "\n" force a Markdown line break.
        f.write(f"**Generated**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}  \n")
        f.write(f"**Total Tests**: {total}  \n")
        f.write(f"**Passed**: {passed}  \n")
        f.write(f"**Failed**: {failed}  \n")
        f.write(f"**Success Rate**: {success_rate:.1f}%\n\n")
        f.write("---\n\n")

        # Group by language
        for language in ['Telugu', 'Hindi']:
            f.write(f"## {language} Tests\n\n")

            # Group by category, keeping first-seen category order.
            categories = {}
            for record in results:
                if record['language'] == language:
                    categories.setdefault(record['category'], []).append(record)

            for category, tests in categories.items():
                f.write(f"### {category}\n\n")

                for i, test in enumerate(tests, 1):
                    f.write(f"#### Test {i}: {test['description']}\n\n")
                    f.write(f"**Original:**\n```\n{test['original']}\n```\n\n")
                    f.write(f"**Expected:**\n```\n{test['expected']}\n```\n\n")
                    f.write(f"**Normalized:**\n```\n{test['normalized']}\n```\n\n")

                    if test['status'] == 'pass':
                        f.write("**Status:** ✅ PASS\n\n")
                    else:
                        f.write("**Status:** ❌ FAIL - MISMATCH\n\n")

                f.write("---\n\n")

    print(f"\n📝 Report saved to: {output_path}")

    return 0 if failed == 0 else 1


if __name__ == "__main__":
    sys.exit(main())

