"""
Simple Transcription Validator
==============================

Two checks:
1. Character validation - catch garbage/alien characters
2. Audio-text match - verify words match what's spoken

Usage:
    from src.validators.simple_validator import validate_transcription
    
    result = validate_transcription("audio.flac", "transcription", language="te")
    print(result.status)  # accept / review / reject
"""
import re
from dataclasses import dataclass
from typing import List, Dict, Optional, Set


# Unicode blocks of the supported Indic scripts, keyed by language code.
# Every entry has the same shape: a display name, a list of inclusive
# (first, last) code-point ranges, and two permission flags.
SCRIPT_RANGES = {
    code: {
        "name": label,
        "ranges": [(first_cp, last_cp)],
        "allow_ascii_punct": True,
        "allow_digits": True,
    }
    for code, label, first_cp, last_cp in (
        ("te", "Telugu", 0x0C00, 0x0C7F),
        ("hi", "Hindi", 0x0900, 0x097F),  # Devanagari block
        ("ta", "Tamil", 0x0B80, 0x0BFF),
        ("kn", "Kannada", 0x0C80, 0x0CFF),
        ("ml", "Malayalam", 0x0D00, 0x0D7F),
        ("bn", "Bengali", 0x0980, 0x09FF),
    )
}

# Characters accepted for every language: whitespace, basic punctuation
# (including en/em dashes) and ASCII digits.
ALLOWED_COMMON = set(" \t\n" + ".,!?;:'\"()-–—" + "0123456789")

# ASCII letters a-z and A-Z (for code-mixed text).
ENGLISH_CHARS = {
    chr(cp)
    for base in (ord("a"), ord("A"))
    for cp in range(base, base + 26)
}


def get_valid_chars(language: str, allow_english: bool = True) -> Set[int]:
    """Build the set of Unicode code points accepted for ``language``.

    The result always contains the code points of ``ALLOWED_COMMON`` and
    every code point in the script ranges configured for ``language``.
    A language code missing from ``SCRIPT_RANGES`` uses the Telugu entry.

    Args:
        language: Language code used to look up ``SCRIPT_RANGES``.
        allow_english: When true, the ASCII letters in ``ENGLISH_CHARS``
            are accepted as well (code-mixed text).

    Returns:
        A new set of integer code points.
    """
    script = SCRIPT_RANGES.get(language, SCRIPT_RANGES["te"])

    # Shared whitespace / punctuation / digits first.
    accepted = {ord(ch) for ch in ALLOWED_COMMON}

    # Script ranges are inclusive on both ends, hence the ``+ 1``.
    for first, last in script["ranges"]:
        accepted.update(range(first, last + 1))

    if allow_english:
        accepted.update(map(ord, ENGLISH_CHARS))

    return accepted


def check_characters(
    text: str,
    language: str = "te",
    allow_english: bool = False  # Changed: native = no English
) -> Dict:
    """
    Check if text contains only valid characters for the language.

    For native transcription: allow_english=False (default)

    Args:
        text: Text to inspect. ``None``, empty and whitespace-only input
            is reported as invalid with ``"reason": "empty"``.
        language: Language code used to look up ``SCRIPT_RANGES``; an
            unknown code uses the Telugu entry.
        allow_english: Whether ASCII letters count as valid characters.

    Returns:
        {
            "valid": True/False,
            "invalid_chars": [...],   # first 10 offenders, each a dict
                                      # with "char", "codepoint", "position"
            "invalid_count": 0,       # total number of offenders
            "script_ratio": 0.8,      # native chars / (native + English
                                      # letters); 0 when there are neither
        }
        Blank input additionally carries "reason": "empty".
    """
    if not text or not text.strip():
        # Same keys as the normal result so callers can read
        # "invalid_count" without special-casing blank input.
        return {
            "valid": False,
            "invalid_chars": [],
            "invalid_count": 0,
            "script_ratio": 0,
            "reason": "empty",
        }

    valid_chars = get_valid_chars(language, allow_english)
    # Unknown language codes fall back to Telugu, same as get_valid_chars().
    script_ranges = SCRIPT_RANGES.get(language, SCRIPT_RANGES["te"])["ranges"]

    invalid_chars = []
    script_count = 0
    total_alpha = 0

    for i, char in enumerate(text):
        cp = ord(char)

        # Native-script characters are always valid and always counted.
        if any(start <= cp <= end for start, end in script_ranges):
            script_count += 1
            total_alpha += 1
            continue

        # English letters belong in the ratio's denominator whether or not
        # they are allowed; counting them only when they were invalid made
        # script_ratio read 1.0 for code-mixed text with allow_english=True.
        if char in ENGLISH_CHARS:
            total_alpha += 1

        # Anything outside the accepted set is reported (this includes
        # English letters when allow_english is False).
        if cp not in valid_chars:
            invalid_chars.append({
                "char": char,
                "codepoint": cp,
                "position": i
            })

    return {
        "valid": len(invalid_chars) == 0,
        "invalid_chars": invalid_chars[:10],  # First 10 only
        "invalid_count": len(invalid_chars),
        "script_ratio": script_count / total_alpha if total_alpha > 0 else 0,
    }


@dataclass
class ValidationResult:
    """Result of transcription validation.

    Only ``status`` is required. The list-valued fields default to ``None``
    (not an empty list) and are therefore annotated ``Optional``;
    ``to_dict()`` converts a missing list to ``[]``.
    """
    status: str  # "accept", "review", "reject"

    # Character check
    char_valid: bool = True
    invalid_chars: Optional[List[Dict]] = None
    script_ratio: float = 0.0

    # Audio match check
    alignment_score: float = 0.0
    low_confidence_words: Optional[List[str]] = None
    low_confidence_ratio: float = 0.0

    # Reasons
    reasons: Optional[List[str]] = None

    def to_dict(self) -> Dict:
        """Return the result as a plain dict.

        Ratios are rounded to 3 decimal places and the alignment score to
        4; list fields that are ``None`` (or empty) come back as ``[]``.
        """
        return {
            "status": self.status,
            "char_valid": self.char_valid,
            "invalid_chars": self.invalid_chars or [],
            "script_ratio": round(self.script_ratio, 3),
            "alignment_score": round(self.alignment_score, 4),
            "low_confidence_words": self.low_confidence_words or [],
            "low_confidence_ratio": round(self.low_confidence_ratio, 3),
            "reasons": self.reasons or [],
        }


# Global CTC aligner (lazy loaded) and the language it was created for.
_ctc_aligner = None
_ctc_aligner_language = None


def _get_ctc_aligner(language: str):
    """Return the shared CTC aligner for ``language``, creating it on demand.

    A single aligner is cached at module level. The cache remembers which
    language it was built for; a request for a different language releases
    the cached aligner through its ``cleanup()`` method and builds a new
    one, so a caller never receives an aligner constructed for another
    language.

    Args:
        language: Language code passed to ``CTCForcedAligner``.

    Returns:
        The cached ``CTCForcedAligner`` instance for ``language``.
    """
    global _ctc_aligner, _ctc_aligner_language

    if _ctc_aligner is not None and _ctc_aligner_language != language:
        # Stale cache entry for another language: release it before replacing.
        _ctc_aligner.cleanup()
        _ctc_aligner = None

    if _ctc_aligner is None:
        # Imported lazily so character-only validation never loads the aligner.
        from .ctc_forced_aligner import CTCForcedAligner
        _ctc_aligner = CTCForcedAligner(language=language)
        _ctc_aligner_language = language

    return _ctc_aligner


def validate_transcription(
    audio_path: str,
    transcription: str,
    language: str = "te",
    check_audio: bool = True,
    low_conf_threshold: float = 0.3,
    low_conf_ratio_threshold: float = 0.8,  # Raised from 0.5 - code-mixed is normal
    min_alignment_score: float = 0.5  # Below this = likely wrong transcription
) -> ValidationResult:
    """
    Validate a transcription.

    Status rules:
        reject - blank text, invalid characters, or alignment score below
                 ``min_alignment_score``
        review - any other recorded reason (low native-script ratio, too
                 many low-confidence words, audio check raised)
        accept - no reasons recorded

    Args:
        audio_path: Path to audio file
        transcription: Text to validate
        language: Language code (te, hi, ta, etc.)
        check_audio: Whether to check audio-text match (slower)
        low_conf_threshold: Per-word confidence threshold. Kept for
            interface compatibility; this function does not read it.
            NOTE(review): confirm whether it should be forwarded to the
            aligner.
        low_conf_ratio_threshold: Max ratio of low-conf words before review
        min_alignment_score: Alignment score below which the transcription
            is rejected

    Returns:
        ValidationResult with status and details
    """
    reasons = []

    # === STEP 1: Character validation (instant) ===
    # Native transcription = no English allowed
    char_check = check_characters(transcription, language, allow_english=False)

    if not char_check["valid"]:
        # check_characters marks blank input with reason "empty"; report
        # that case accurately instead of blaming alien characters.
        if char_check.get("reason") == "empty":
            reject_reason = "Empty transcription"
        else:
            reject_reason = "Invalid/alien characters found"
        # Only fields ValidationResult actually declares are passed here;
        # the previous english_ratio argument raised on every rejection.
        return ValidationResult(
            status="reject",
            char_valid=False,
            invalid_chars=char_check["invalid_chars"],
            script_ratio=char_check["script_ratio"],
            reasons=[reject_reason],
        )

    # For native transcription, should be mostly native script
    if char_check["script_ratio"] < 0.5:
        reasons.append(f"Too few native chars ({char_check['script_ratio']:.0%})")

    # === STEP 2: Audio-text match ===
    alignment_score = 0.0
    low_conf_words = []
    low_conf_ratio = 0.0
    poor_alignment = False  # explicit flag instead of matching reason text

    if check_audio:
        try:
            aligner = _get_ctc_aligner(language)
            result = aligner.align(audio_path, transcription)

            alignment_score = result.alignment_score
            low_conf_words = result.low_confidence_words
            low_conf_ratio = result.low_confidence_ratio

            # REJECT if alignment is very poor (likely wrong transcription)
            if alignment_score < min_alignment_score:
                poor_alignment = True
                reasons.append(f"Poor alignment ({alignment_score:.2f} < {min_alignment_score})")

            # REVIEW if many low-confidence words (but could be code-mixed)
            elif low_conf_ratio > low_conf_ratio_threshold:
                reasons.append(f"Many low-confidence words ({low_conf_ratio:.0%})")

        except Exception as e:
            # Best effort: a failing audio check sends the item to review
            # rather than aborting validation.
            reasons.append(f"Audio check failed: {str(e)[:50]}")

    # === Determine status ===
    # Invalid characters were already rejected above, so only the audio
    # outcome and the collected reasons matter here.
    if poor_alignment:
        status = "reject"
    elif reasons:
        status = "review"
    else:
        status = "accept"

    return ValidationResult(
        status=status,
        char_valid=char_check["valid"],
        invalid_chars=char_check.get("invalid_chars"),
        script_ratio=char_check["script_ratio"],
        alignment_score=alignment_score,
        low_confidence_words=low_conf_words,
        low_confidence_ratio=low_conf_ratio,
        reasons=reasons if reasons else None
    )


def cleanup():
    """Release the cached CTC aligner, if one has been created.

    Calls the aligner's own ``cleanup()`` and then clears the module-level
    cache. Does nothing when no aligner is cached.
    """
    global _ctc_aligner
    aligner = _ctc_aligner
    if aligner is None:
        return
    aligner.cleanup()
    _ctc_aligner = None


# === Quick validation (no audio check) ===
def quick_validate(transcription: str, language: str = "te") -> Dict:
    """
    Character-only validation for NATIVE transcription (no audio involved).

    Fails when the text is blank, when it contains any character outside
    the target script / common punctuation (English letters included), or
    when the native script ratio is below 0.5.

    Args:
        transcription: Native script text to validate
        language: Language code (te, hi, ta, etc.)

    Returns:
        On failure: {"valid": False, "reason": "..."}
        On success: {"valid": True, "script_ratio": <float>}
    """
    if not (transcription and transcription.strip()):
        return {"valid": False, "reason": "Empty transcription"}

    report = check_characters(transcription, language, allow_english=False)

    if not report["valid"]:
        # Show at most five offending characters in the message.
        invalid = [entry['char'] for entry in report['invalid_chars'][:5]]
        return {"valid": False, "reason": f"Invalid chars: {invalid}"}

    ratio = report["script_ratio"]
    if ratio < 0.5:
        return {"valid": False, "reason": f"Too few native chars ({ratio:.0%})"}

    return {"valid": True, "script_ratio": ratio}


# Command-line entry point: full validation (audio + text) or, with
# --quick, the character-only check.
if __name__ == "__main__":
    import sys

    # Both modes need at least two arguments after the script name.
    if len(sys.argv) < 3:
        print("""
Simple Validator
================

Usage:
    # Full validation (with audio check)
    python simple_validator.py <audio_path> <transcription>
    
    # Quick validation (character check only)
    python simple_validator.py --quick <transcription>

Examples:
    python simple_validator.py audio.flac "నాకు కొన్ని యాడ్స్ గుర్తుంటాయి"
    python simple_validator.py --quick "నాకు కొన్ని యాడ్స్"
""")
        sys.exit(1)

    if sys.argv[1] == "--quick":
        # Remaining arguments are re-joined so an unquoted sentence works.
        text = " ".join(sys.argv[2:])
        result = quick_validate(text)
        print(f"Valid: {result['valid']}")
        if not result['valid']:
            print(f"Reason: {result['reason']}")
    else:
        audio_path = sys.argv[1]
        transcription = " ".join(sys.argv[2:])

        print("Validating...")
        try:
            result = validate_transcription(audio_path, transcription)
        finally:
            # Release the aligner even if validation raised.
            cleanup()

        # Only attributes that ValidationResult defines are printed; it has
        # no english_ratio field, so that line used to raise AttributeError.
        print(f"\nStatus: {result.status.upper()}")
        print(f"Character valid: {result.char_valid}")
        print(f"Script ratio: {result.script_ratio:.1%}")
        print(f"Alignment score: {result.alignment_score:.4f}")
        print(f"Low conf words: {result.low_confidence_ratio:.1%}")

        if result.reasons:
            print("\nReasons for review:")
            for r in result.reasons:
                print(f"  - {r}")

        if result.low_confidence_words:
            print(f"\nLow confidence words: {result.low_confidence_words[:5]}")
