#!/usr/bin/env python3
"""
Configuration for Fast Pipeline v6.0 - Adaptive Compute-Aware

Key improvements:
- Auto-detects system resources (nproc, vCPUs, GPU vRAM)
- Adapts worker counts and batch sizes to available compute
- Optimized for micro-level speaker detection (0.4s events)
"""

import os
from dataclasses import dataclass, field
from typing import Optional


@dataclass
class Config:
    """
    Pipeline configuration with adaptive compute settings.
    
    Flow (per instructions.md):
    1. Download (max parallel workers)
    2. Quick VAD (parallel, get speech outline)
    3. Chunk at silence boundaries (VAD-aware)
    4. OSD (mark overlaps as unusable FIRST)
    5. Frame-level segmentation (17ms resolution for micro-changes)
    6. Embeddings + Conservative clustering
    7. Output metadata JSON only
    """
    
    # === AUTHENTICATION ===
    # NOTE: Must be provided via environment/secrets (Azure Key Vault, RunPod secrets, etc).
    # Never hard-code tokens in repo (HF gated model access + security).
    # Read via default_factory so the environment is consulted at Config()
    # construction time, not once at module import time (a plain default of
    # os.environ.get(...) would be frozen when this module is first imported).
    hf_token: str = field(default_factory=lambda: os.environ.get('HF_TOKEN', ''))
    
    # === AUDIO PROCESSING ===
    sample_rate: int = 16000
    intro_skip_seconds: float = 0.0
    outro_skip_seconds: float = 0.0
    
    # === INTRO SKIP (v6.8 - Dynamic) ===
    # auto_intro_skip: Automatically skip intro based on video duration
    # When True and no chapter-based intro detected:
    #   - Video ≤ 30 min: skip 180s (3 min)
    #   - Video > 30 min: skip 300s (5 min)
    # Set to False to disable automatic intro skipping
    # Manual intro_skip_seconds > 0 always takes priority
    auto_intro_skip: bool = True
    
    # === HIGH QUALITY AUDIO PRESERVATION (v6.8+) ===
    # preserve_original_audio: Download at highest quality, process at 16kHz
    # When True:
    #   - Downloads original audio format (bestaudio - typically 48kHz)
    #   - Creates 16kHz copy for processing (VAD, diarization, embeddings)
    #   - After processing, deletes 16kHz and keeps only original
    # Final export cuts from original high-quality audio using timestamps
    preserve_original_audio: bool = True
    original_audio_sample_rate: int = 0  # 0 = keep original (highest), or specify (e.g. 48000)
    
    # === PARALLELISM (Auto-detected, these are fallbacks) ===
    # Actual values computed by COMPUTE.get_optimal_config()
    vad_workers: int = 32          # Will be auto-tuned
    max_workers: int = 32          # Will be auto-tuned
    download_workers: int = 4      # Parallel downloads for batch
    chunk_workers: int = 4         # GPU diarization (VRAM limited)
    
    # === CHUNKING (for diarization input) ===
    chunk_duration: float = 300.0  # 5 min max chunks
    min_chunk_duration: float = 30.0
    
    # === VAD (Silero) - Tuned for 0.4s detection ===
    # Per instructions.md: min_duration_on = 0.2s to catch short events
    vad_threshold: float = 0.5
    vad_min_speech_ms: int = 200   # CHANGED: 250 -> 200 for 0.4s events
    vad_min_silence_ms: int = 200  # CHANGED: 100 -> 200 per instructions
    vad_window_size_samples: int = 512
    vad_speech_pad_ms: int = 30
    vad_chunk_size: float = 60.0   # Parallel VAD chunk size
    
    # === MODEL SELECTION ===
    # Per user request: community-1 first, 3.1 fallback
    use_community_model: bool = True  # Try community-1 first
    
    # === SEGMENTATION (Frame-level for micro-changes) ===
    # Per instructions: 17ms frame resolution, NOT 1.5s chunks
    segmentation_step: float = 2.5     # Sliding window step (seconds)
    segmentation_duration: float = 5.0  # Window duration (seconds)
    min_segment_duration: float = 0.2   # Min segment (catch 0.4s events)
    
    # === OVERLAP DETECTION ===
    detect_overlap: bool = True
    overlap_threshold: float = 0.35     # Probability for active speaker
    overlap_min_duration: float = 0.1  # Min overlap to mark
    overlap_padding_ms: int = 100      # Pad overlap boundaries (±100ms)
    
    # === OVERLAP DENSITY DETECTION (v6.9) ===
    # Mark short segments sandwiched between overlaps as unusable
    # These are "islands" of supposedly clean audio in high-overlap regions
    overlap_density_filter: bool = True
    overlap_density_max_gap: float = 1.0      # Max gap to adjacent overlap (seconds)
    overlap_density_max_duration: float = 3.0  # Max segment duration to mark as unusable
    
    # === QUALITY FILTERING (v6.1) ===
    filter_by_quality: bool = True     # Enable SNR/quality filtering
    min_snr_db: float = 15.0           # Minimum SNR for TTS usability
    min_quality_score: float = 0.3     # Minimum quality score (0-1)
    
    # === SEGMENT DURATION FILTERING (v6.1) ===
    # Drop speaker segments shorter than this from the FINAL output.
    #
    # IMPORTANT: we do adjacent same-speaker merging before applying this filter,
    # so short fragments (e.g. 0.4-0.9s) can be absorbed into longer continuous
    # turns when safe to do so.
    min_tts_duration: float = 1.0      # Minimum segment duration for TTS
    
    # === DIARIZATION ===
    min_speakers: int = 1
    max_speakers: int = 10
    
    # === EMBEDDINGS (OOM-protected) ===
    embedding_batch_size: int = 8      # Will be auto-tuned
    max_embedding_length: int = 16000 * 10  # Max 10s per segment
    
    # === CLUSTERING (Conservative - no poison merges) ===
    # Per instructions: 0.75-0.80 to prevent merging different speakers
    cluster_merge_threshold: float = 0.80
    min_segments_for_merge: int = 2
    
    # === CHUNK REASSIGNMENT (v6.3 - Precision Surgical Splitter) ===
    # Detects within-segment speaker changes and surgically splits/reassigns
    enable_chunk_reassignment: bool = True  # Enable by default for TTS quality
    chunk_reassignment_threshold: float = 0.40  # Normal threshold (look-ahead)
    chunk_reassignment_severe: float = 0.25  # Severe threshold (circuit-breaker)
    chunk_reassignment_min_speech: float = 0.6  # Min speech ratio for eligibility
    chunk_reassignment_min_portion: float = 1.5  # Min split portion in seconds
    
    # === MUSIC DETECTION (v6.7 - PANNs CNN14) ===
    # Detects music/instruments to mark segments as clean/needs_demucs/heavy_music
    # Uses PANNs (Pretrained Audio Neural Networks) trained on AudioSet
    enable_music_detection: bool = True  # Enable music detection
    music_chunk_duration: float = 1.5  # Same as embedding chunks
    music_batch_size: int = 64  # Batch size for PANNs inference
    
    # === STRICT DETECTION THRESHOLDS FOR TTS QUALITY (v7.1) ===
    # Zero tolerance: ANY contamination → needs_demucs or unusable
    
    # Per-chunk thresholds
    music_prob_threshold: float = 0.20  # Chunk "has music" if prob > this (was 0.3)
    noise_prob_threshold: float = 0.25  # Chunk "has noise" if prob > this (NEW)
    
    # Per-segment decision thresholds (STRICT but realistic)
    # Based on empirical testing: baseline neural net output is ~0.04-0.06
    music_ratio_clean: float = 0.0      # ANY chunk with music detection → needs_demucs
    music_ratio_demucs: float = 0.15    # 0-15% → demucs, >15% → unusable
    music_mean_clean: float = 0.10      # ADJUSTED: 0.10 accounts for neural net baseline (was 0.05)
    music_mean_demucs: float = 0.25     # Threshold for heavy_contamination
    
    # Noise thresholds (NEW)
    noise_ratio_clean: float = 0.0      # ANY chunk with noise detection → needs processing
    noise_ratio_demucs: float = 0.20    # 0-20% → demucs, >20% → unusable
    noise_mean_clean: float = 0.10      # ADJUSTED: 0.10 accounts for neural net baseline (was 0.05)
    noise_mean_demucs: float = 0.30     
    
    # Strict TTS mode: Zero tolerance for contamination
    strict_tts_mode: bool = True  # When True: ANY detection → needs_demucs
    
    # === v7.0: Detection Early-Exit ===
    # Sample 10% of chunks first. If all clean, skip full analysis.
    music_early_exit: bool = True
    music_early_exit_sample_ratio: float = 0.10  # Sample 10% of chunks
    music_early_exit_threshold: float = 0.05    # STRICT: Was 0.10. Only skip if truly clean
    
    # === OUTPUT ===
    output_dir: str = "data/fast_output_v6"
    generate_sample_clips: bool = False  # CHANGED: metadata-only by default
    clips_per_speaker: int = 3
    
    # === POST-PROCESSING (Adjacent merge) ===
    # max_silence_gap:
    # Post-processing "adjacent merge" threshold for SAME-speaker segments.
    # If two consecutive segments are labeled the same speaker and the time gap
    # between them is <= this value, we merge them into one continuous clip.
    #
    # NOTE: merging *includes* the gap (usually silence/breath) inside the clip.
    # We intentionally allow short natural pauses (breathing/thinking) so a
    # continuous single-speaker monologue doesn't get fragmented into many clips.
    #
    # Keep this <~1s so we don't pull long silences (or missed micro-interjections)
    # into a "speech" segment.
    max_silence_gap: float = 0.5
    
    # === GPU OPTIMIZATION ===
    clear_cache_every_n_chunks: int = 3
    
    # === COMPUTE MONITORING ===
    monitor_compute: bool = True       # Track CPU/GPU utilization
    log_compute_stats: bool = True     # Log stats after each stage
    
    # === RESOURCE MANAGEMENT (v6.9) ===
    # Auto-apply optimal settings from ComputeMonitor
    auto_tune_resources: bool = True  # Auto-tune on init based on system
    max_utilization: float = 0.80     # Target 80% max utilization
    
    def apply_adaptive_settings(self, compute_config: dict) -> None:
        """
        Apply auto-detected optimal settings from ComputeMonitor.
        
        Called after COMPUTE.get_optimal_config(). Only keys that match an
        existing attribute are applied; unknown keys are silently ignored so
        a newer ComputeMonitor can't inject arbitrary attributes.
        """
        for key, value in compute_config.items():
            if hasattr(self, key):
                setattr(self, key, value)
    
    def __post_init__(self):
        """Validate configuration and auto-tune resources."""
        # Ensure output dir exists
        os.makedirs(self.output_dir, exist_ok=True)
        
        # === AUTO-TUNE RESOURCES (v6.9) ===
        # Apply optimal settings based on detected system resources
        if self.auto_tune_resources:
            try:
                from src.compute import COMPUTE
                # Set utilization cap first
                if self.max_utilization != COMPUTE.resources.max_utilization:
                    COMPUTE.set_max_utilization(self.max_utilization)
                # Apply optimal settings
                COMPUTE.apply_to_config(self)
            except ImportError:
                pass  # COMPUTE not available, use defaults
        
        self._validate()
    
    def _validate(self) -> None:
        """Check threshold sanity; raise ValueError on bad values.
        
        Uses explicit raises instead of `assert` so validation still runs
        when Python is started with -O (asserts are stripped there).
        """
        if not 0 < self.cluster_merge_threshold <= 1.0:
            raise ValueError("merge threshold must be (0, 1]")
        if not 0 < self.vad_threshold <= 1.0:
            raise ValueError("VAD threshold must be (0, 1]")
        if self.min_segment_duration < 0.1:
            raise ValueError("min_segment too small")