#!/usr/bin/env python3
"""
Comprehensive Text Normalization Validation Script

Tests the text normalizer with extensive real-world sentences across:
- English, Hindi, Telugu
- URLs, emails, phone numbers
- Numbers, currencies, dates, times
- Special characters, symbols, emojis
- Edge cases and complex scenarios

Outputs results to a markdown file for human review.
"""

import sys
import os
from pathlib import Path
from datetime import datetime

# Make the repository root importable so `veena3modal` resolves when this file is
# run directly from scripts/. The root is taken to be the parent of the directory
# that contains this file; it is inserted at the front of sys.path so it is
# searched before any other entry. This must happen before the import below.
repo_root = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(repo_root))

from veena3modal.processing.text_normalizer import normalize_text


# Test corpus, organized by category.
#
# Maps a human-readable category name to a list of input sentences.
# run_validation() passes every sentence to normalize_text() and records the
# output. No expected outputs are stored here: a case is only marked as failed
# when normalize_text() raises, and the normalized text itself is meant to be
# checked by a human in the generated report.
#
# Categories are processed in dict insertion order, which is therefore also the
# order of the sections in the report.
TEST_CASES = {
    "URLs & Web Content": [
        "Visit https://www.example.com for more info",
        "Check out github.com/user/repo for the code",
        "Go to https://ai.google.com/research/pubs/pub12345",
        "Our website is www.company.co.in",
        "API docs at https://api.example.com/v1/docs",
    ],
    
    "Email Addresses": [
        "Contact me at john.doe@example.com",
        "Support email: support@company.co.uk",
        "Send to first.last@my-company.org",
        "Admin: admin+test@example.io",
    ],
    
    "Social Media": [
        "Follow @elonmusk on Twitter",
        "Use #Python for coding",
        "Check out #MakeInIndia initiative",
        "@OpenAI released GPT-4",
        "Trending: #AI #MachineLearning #DeepLearning",
    ],
    
    "Phone Numbers": [
        "Call me at (415) 555-1234",
        "Phone: +91 98765 43210",
        "Contact: +1 800 555 0199",
        "Toll-free: (800) 123-4567",
    ],
    
    # Mixes Western digit grouping ($1,000,000) with Indian-style grouping
    # (₹1,23,456 / ₹15,00,000).
    "Currency & Money": [
        "The price is $50.99",
        "Budget: ₹1,23,456",
        "Cost: €100.50",
        "Total: £75.25",
        "Investment: $1,000,000",
        "Salary: ₹15,00,000 per annum",
    ],
    
    "Simple Numbers": [
        "I have 5 apples",
        "There are 100 people",
        "The year 2025",
        "Room number 42",
        "Chapter 7 of the book",
        "He scored 95 marks",
    ],
    
    "Large Numbers": [
        "Population: 1,234,567",
        "Revenue: 10,000,000",
        "Distance: 384,400 km to the moon",
        "National debt: 31,400,000,000,000",
    ],
    
    "Decimal Numbers": [
        "Pi is approximately 3.14159",
        "Temperature: 98.6 degrees",
        "Success rate: 99.9%",
        "Score: 8.5 out of 10",
    ],
    
    "Negative Numbers": [
        "Temperature dropped to -5 degrees",
        "Balance: -$250",
        "Altitude: -50 meters below sea level",
    ],
    
    "Ordinal Numbers": [
        "He came 1st in the race",
        "This is the 21st century",
        "My 3rd attempt was successful",
        "The 100th anniversary celebration",
    ],
    
    # Hyphenated numeric ranges; the hyphen here is a range separator, unlike
    # the leading minus sign used in "Negative Numbers".
    "Ranges": [
        "Age group: 18-25 years",
        "Temperature range: 20-30 degrees",
        "Pages 45-67 cover this topic",
        "Success rate: 85-95%",
    ],
    
    "Percentages": [
        "Battery at 85%",
        "Growth of 45.5%",
        "Pass rate: 92%",
        "Discount: 25% off",
    ],
    
    # The unit categories mix abbreviations glued to the number ("5km") with
    # spelled-out units separated by a space ("100 kilometers").
    "Units - Distance": [
        "Run 5km daily",
        "Height: 180cm",
        "Distance: 2.5km from here",
        "Drive 100 kilometers",
    ],
    
    "Units - Temperature": [
        "Weather is 32°C today",
        "Fever at 101°F",
        "Freezing point: 0°C",
        "Bake at 350°F for 30 minutes",
    ],
    
    "Units - Weight": [
        "Package weighs 2.5kg",
        "Add 500g of flour",
        "Lost 10kg in 3 months",
        "Baby weighs 3.2kg",
    ],
    
    "Units - Volume": [
        "Drink 2l of water daily",
        "Add 250ml of milk",
        "Tank capacity: 50 liters",
    ],
    
    "Units - Speed": [
        "Speed limit: 60km/h",
        "Traveling at 100mph",
        "Internet speed: 100mbps",
    ],
    
    # Year-month-day with hyphens.
    "Dates - ISO Format": [
        "Meeting on 2025-11-13",
        "Born on 1990-05-15",
        "Deadline: 2024-12-31",
        "Event date: 2025-01-01",
    ],
    
    # Slash-separated dates; every sample has a first field greater than 12,
    # so it can only be read as day/month/year.
    "Dates - Numeric Format": [
        "Conference on 15/08/2025",
        "Birthday: 25/12/1995",
        "Due date: 31/03/2024",
    ],
    
    "Times - 12 Hour": [
        "Meeting at 3:45 PM",
        "Wake up at 6:30 AM",
        "Lunch at 12:00 PM",
        "Dinner at 8:15 PM",
    ],
    
    "Times - 24 Hour": [
        "Train departs at 14:30",
        "Flight at 23:45",
        "Office hours: 09:00 to 18:00",
    ],
    
    "Math Symbols": [
        "Result: 5 + 3 = 8",
        "Subtract: 10 - 4 = 6",
        "Multiply: 6 × 7 = 42",
        "Divide: 20 ÷ 4 = 5",
        "Temperature is < 10 degrees",
        "Score is ≥ 90%",
    ],
    
    "Abbreviations - Titles": [
        "Dr. Smith is the expert",
        "Meet Mr. and Mrs. Johnson",
        "Prof. Kumar teaches here",
        "Ms. Patel will present",
    ],
    
    "Abbreviations - Common": [
        "FYI, the meeting is tomorrow",
        "Send it ASAP",
        "This is i.e. the main point",
        "For e.g. machine learning",
        "It's OK to proceed",
        "DIY projects are fun",
    ],
    
    "Abbreviations - Time": [
        "Meeting at 10 a.m. sharp",
        "Deadline: 5 p.m. today",
        "Working hours: 9am to 6pm",
    ],
    
    # Unicode emoji plus ASCII emoticons (":-)", "<3").
    "Emojis & Emoticons": [
        "Great work! 😊",
        "I love this ❤️ project",
        "Celebration time 🎉🎊",
        "Happy birthday 🎂",
        "Thumbs up 👍",
        "Smiling face :-)",
        "Heart <3",
    ],
    
    "Punctuation & Symbols": [
        "Wait!!! Are you sure???",
        "This is amazing...",
        "Really? That's great!",
        "Question: what happened?",
        "Note: this is important",
    ],
    
    # Several of the above entity types combined in a single sentence.
    "Mixed Content - English": [
        "Visit https://example.com or call (415) 555-1234 for help. Costs $50.",
        "Meeting on 2025-11-13 at 3:45 PM. Budget: ₹1,23,456.",
        "Email support@company.com for the 25% discount code.",
        "Follow @username on Twitter! #AI is trending 🚀",
        "Temperature: 32°C, Distance: 5km, Speed: 60km/h",
    ],
    
    "Complex Sentences - English": [
        "Dr. Smith (age 45) earned $1,000,000 in 2024 at company.com",
        "The 21st century saw 50% growth in tech. Visit ai.google.com for more.",
        "Call +91 98765 43210 or email info@startup.in. Funding: ₹10,00,000",
        "Temperature dropped from 35°C to -5°C in 24 hours!",
        "Pi (π ≈ 3.14159) is used in calculations. Learn more at math.org",
    ],
    
    # Devanagari-script sentences with embedded Latin-script emails, dates,
    # times, currency amounts, units and URLs.
    "Hindi Text": [
        "मुझे test@mail.com पर संपर्क करें",
        "बैठक 13/11/2025 को 3:45 PM है",
        "कीमत ₹1,23,456 है।",
        "तापमान 32°C है",
        "वेबसाइट https://example.com देखें",
    ],
    
    # Telugu-script counterparts of the Hindi samples above.
    "Telugu Text": [
        "నన్ను test@mail.com వద్ద సంప్రదించండి",
        "సమావేశం 13/11/2025 న 3:45 PM కు",
        "ధర ₹1,23,456",
        "ఉష్ణోగ్రత 32°C",
        "వెబ్‌సైట్ https://example.com చూడండి",
    ],
    
    # Empty string, spaces only, newlines only, and padded / doubled spaces.
    "Edge Cases - Empty & Whitespace": [
        "",
        "   ",
        "\n\n",
        "  hello  world  ",
    ],
    
    # Malformed or repeated symbols and irregular whitespace (runs of spaces,
    # literal tab characters).
    "Edge Cases - Special Characters": [
        "Price: $$$100",
        "Email: user@@example.com",
        "Multiple spaces    between    words",
        "Tab\tseparated\tvalues",
    ],
    
    "Edge Cases - Very Long Numbers": [
        "The number is 123456789012345",
        "Scientific: 3e8 m/s",
        "Large decimal: 123.456789012",
    ],
    
    "Edge Cases - Ambiguous Dates": [
        "Date: 01/02/2025",  # Could be Jan 2 or Feb 1 depending on day/month order
        "Date: 12/12/2025",  # Unambiguous: day and month are the same value
        "Date: 31/12/2025",  # Must be Dec 31: 31 cannot be a month
    ],
    
    # The "Real World" categories are multi-entity sentences styled after
    # messages from a given domain.
    "Real World - Business": [
        "Q4 revenue was $2.5M, up 35% YoY. Email: cfo@company.com",
        "Conference call on 15/12/2024 at 10:30 AM. Dial: +1 800 555 0199",
        "Product launch @ https://launch.company.com on 1st Jan 2025!",
    ],
    
    "Real World - Education": [
        "Assignment due on 25/11/2024 @ 11:59 PM. Email submissions to prof@university.edu",
        "Exam scores: Math 95%, Science 88%, English 92%",
        "Class timings: 9:00 AM to 3:30 PM, Room #405",
    ],
    
    "Real World - Healthcare": [
        "Appointment on 20/12/2024 at 2:30 PM. Call (555) 123-4567",
        "Blood pressure: 120/80, Temperature: 98.6°F, Weight: 70kg",
        "Medication: 500mg twice daily. Dr. Kumar: dr.kumar@hospital.com",
    ],
    
    "Real World - Travel": [
        "Flight AI-101 departs at 14:45 from Gate 23B. Check-in @ https://airline.com",
        "Hotel booking: ₹5,500/night for 3 nights. Ref: #BK-12345",
        "Distance: 350km, ETA: 4:30 PM, Speed: 80km/h",
    ],
    
    "Real World - E-commerce": [
        "Order #12345: Total ₹2,499 (15% off). Track @ https://shop.com/track",
        "Flash sale! 50% off on electronics. Valid till 31/12/2024 @ 11:59 PM",
        "COD available. Delivery: 2-3 days. Support: +91 1800 123 4567",
    ],
    
    "Real World - News Headlines": [
        "Stock market up 2.5% today. Sensex @ 75,000. #Markets",
        "Temperature to drop to 5°C tonight. Heavy rain expected.",
        "PM announces ₹10,000 crore package. Read more @ news.com",
    ],
}


def run_validation():
    """Normalize every sentence in TEST_CASES and collect the outcomes.

    A case counts as passed when ``normalize_text`` returns without raising;
    the returned text is recorded as-is and is not compared against any
    expected value. When an exception is raised the case counts as failed
    and the error message is stored in place of the normalized text.

    Progress and a final tally are printed to stdout.

    Returns:
        A 4-tuple ``(results, passed, failed, total_tests)`` where ``results``
        is a list with one ``{'category', 'results'}`` dict per category, in
        TEST_CASES order, and each inner result is a dict with the keys
        ``'original'``, ``'normalized'`` and ``'status'`` (``'pass'`` or
        ``'fail'``).
    """
    print("🔍 Running comprehensive text normalization validation...")
    print(f"Total categories: {len(TEST_CASES)}")
    total_tests = sum(map(len, TEST_CASES.values()))
    print(f"Total test cases: {total_tests}\n")

    tally = {'pass': 0, 'fail': 0}
    results = []

    for category, sentences in TEST_CASES.items():
        print(f"Testing: {category} ({len(sentences)} cases)")
        entries = []

        for text in sentences:
            try:
                output = normalize_text(text)
            except Exception as exc:
                # Any exception marks the case as failed; the message takes
                # the place of the normalized text in the report.
                outcome = 'fail'
                output = f"ERROR: {str(exc)}"
                print(f"  ❌ FAILED: {text[:50]}... - {str(exc)}")
            else:
                outcome = 'pass'

            entries.append({
                'original': text,
                'normalized': output,
                'status': outcome,
            })
            tally[outcome] += 1

        results.append({'category': category, 'results': entries})

    passed = tally['pass']
    failed = tally['fail']

    print("\n✅ Validation complete!")
    print(f"Passed: {passed}/{total_tests}")
    print(f"Failed: {failed}/{total_tests}")

    return results, passed, failed, total_tests


def _percent_of(part, whole):
    """Return ``part`` as a percentage of ``whole``; 0.0 when ``whole`` is zero."""
    return (part / whole * 100) if whole else 0.0


def _heading_anchor(title):
    """Build the fragment id for a markdown heading using GitHub's slug rules.

    The title is lower-cased, every character other than letters, digits,
    spaces, hyphens and underscores is dropped, and each space becomes a
    hyphen. Runs of hyphens are deliberately NOT collapsed, so
    "Units - Distance" yields "units---distance".
    """
    kept = ''.join(
        ch for ch in title.strip().lower()
        if ch.isalnum() or ch in ' -_'
    )
    return kept.replace(' ', '-')


def _fenced(text):
    """Wrap ``text`` in a fenced code block that the text itself cannot close.

    The fence is at least three backticks and always one longer than the
    longest run of backticks inside ``text``.
    """
    text = str(text)
    longest = current = 0
    for ch in text:
        current = current + 1 if ch == '`' else 0
        longest = max(longest, current)
    fence = '`' * max(3, longest + 1)
    return f"{fence}\n{text}\n{fence}\n"


def generate_markdown_report(results, passed, failed, total_tests):
    """Render the validation results as a markdown document.

    Args:
        results: List of ``{'category': str, 'results': [...]}`` dicts, where
            each inner result has the keys ``'original'``, ``'normalized'``
            and ``'status'`` (``'fail'`` marks a failure; anything else is
            rendered as a pass).
        passed: Number of cases that passed.
        failed: Number of cases that failed.
        total_tests: Total number of cases; may be 0, in which case all
            percentages are reported as 0.0%.

    Returns:
        The complete report as a single string: a header with the totals, a
        table of contents linking to each category, one section per category
        listing every case, and a closing summary.
    """
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    pass_pct = _percent_of(passed, total_tests)
    fail_pct = _percent_of(failed, total_tests)

    # Each metadata line ends with two spaces AND a newline: the pair is what
    # makes markdown emit a hard line break. The blank line before '---' keeps
    # the rule from being parsed as a setext heading underline for the
    # paragraph above it.
    md_lines = [
        "# Text Normalization Validation Report\n",
        "\n",
        f"**Generated**: {timestamp}  \n",
        f"**Total Tests**: {total_tests}  \n",
        f"**Passed**: {passed}  \n",
        f"**Failed**: {failed}  \n",
        f"**Success Rate**: {pass_pct:.1f}%\n",
        "\n---\n",
    ]

    # Table of contents
    md_lines.append("## Table of Contents\n")
    for i, result_group in enumerate(results, 1):
        category = result_group['category']
        md_lines.append(f"{i}. [{category}](#{_heading_anchor(category)})\n")
    md_lines.append("\n---\n")

    # Detailed results
    for result_group in results:
        category = result_group['category']
        category_results = result_group['results']

        md_lines.append(f"\n## {category}\n")
        md_lines.append(f"**Test cases**: {len(category_results)}\n")

        for i, result in enumerate(category_results, 1):
            # Fenced code blocks show the text verbatim, so whitespace-only
            # inputs and markdown-significant characters stay visible.
            md_lines.append(f"\n### {i}. Test Case\n")
            md_lines.append("**Original:**\n")
            md_lines.append(_fenced(result['original']))
            md_lines.append("**Normalized:**\n")
            md_lines.append(_fenced(result['normalized']))

            if result['status'] == 'fail':
                md_lines.append("**Status:** ❌ FAILED\n")
            else:
                md_lines.append("**Status:** ✅ PASS\n")

        md_lines.append("\n---\n")

    # Summary at the end
    md_lines.append("\n## Summary\n")
    md_lines.append(f"- **Total test cases**: {total_tests}\n")
    md_lines.append(f"- **Passed**: {passed} ({pass_pct:.1f}%)\n")
    md_lines.append(f"- **Failed**: {failed} ({fail_pct:.1f}%)\n")

    if failed == 0:
        md_lines.append("\n✅ **All tests passed!**\n")
    else:
        md_lines.append(f"\n⚠️ **{failed} test(s) failed. Review the results above.**\n")

    md_lines.append("\n---\n")
    md_lines.append(f"\n*Report generated by `scripts/validate_text_normalization.py` on {timestamp}*\n")

    return ''.join(md_lines)


def main():
    """Run the validation, write the markdown report, and return an exit code.

    The report is written as UTF-8 to ``NORMALIZATION_VALIDATION.md`` in the
    parent of the directory that contains this file.

    Returns:
        0 when no test case failed, 1 otherwise.
    """
    # Run validation
    results, passed, failed, total_tests = run_validation()

    # Generate markdown report
    print("\n📝 Generating markdown report...")
    report = generate_markdown_report(results, passed, failed, total_tests)

    # Resolve __file__ before walking up: when it is a bare relative name
    # (script started from inside its own directory on interpreters that do
    # not make it absolute), Path('.').parent is still '.', and the report
    # would land next to the script instead of one directory above it.
    output_path = Path(__file__).resolve().parent.parent / "NORMALIZATION_VALIDATION.md"
    output_path.write_text(report, encoding='utf-8')

    print(f"✅ Report saved to: {output_path}")
    print("\nYou can review the report to check all normalization results.")
    print("File: NORMALIZATION_VALIDATION.md")

    return 0 if failed == 0 else 1


# Script entry point: main() returns 0 when no case failed and 1 otherwise, and
# that value becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())

