#!/usr/bin/env python3
"""Audio download and preprocessing with robust error handling.

=== v7.0 OPTIMIZATION: Download-Time Resample ===
Uses ffmpeg directly to create both 16kHz processing audio AND original
quality audio in a single pass, eliminating Python resampling overhead.

Previous: yt-dlp → temp.wav → torchaudio load → Python resample → save both
Now:      yt-dlp → ffmpeg pipe → dual output (16kHz + original) in one pass

Saves ~10-15s per video by avoiding torchaudio resampling.
"""

import time
import logging
import hashlib
import subprocess
from pathlib import Path
from typing import Tuple, Dict, Any, Optional
import torchaudio
import yt_dlp

logger = logging.getLogger("FastPipelineV6.Download")


# === EDGE CASE HANDLING: Video Validation ===
class VideoValidationError(Exception):
    """Raised when a video fails the pre-download validation checks."""


class DownloadError(Exception):
    """Raised when a download still fails after all retry attempts."""


def validate_video(video_url: str, min_duration: float = 30.0, max_duration: float = 14400.0) -> Tuple[bool, str, Dict]:
    """
    Validate YouTube video before processing.
    
    Checks:
    - Video exists and is accessible
    - Video duration is within acceptable range
    - Video has audio track
    - Video is not age-restricted or private
    
    Args:
        video_url: YouTube URL to validate
        min_duration: Minimum video duration in seconds (default: 30s)
        max_duration: Maximum video duration in seconds (default: 4 hours)
    
    Returns:
        (is_valid, message, info_dict)
    """
    logger.info(f"🔍 Validating video: {video_url}")
    
    ydl_opts = {
        'quiet': True,
        'no_warnings': True,
        'extract_flat': False,
        'skip_download': True,
    }
    
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(video_url, download=False)
            
            if info is None:
                return False, "Video info extraction returned None", {}
            
            # Check if video has audio
            if info.get('acodec') == 'none' and not info.get('formats'):
                return False, "Video has no audio track", info
            
            # Check duration.
            # FIX: yt-dlp can report 'duration' as an explicit None (e.g.
            # premieres, some live content); coerce falsy values to 0 so the
            # comparisons below never raise TypeError.
            duration = info.get('duration') or 0
            if duration < min_duration:
                return False, f"Video too short: {duration}s (min: {min_duration}s)", info
            
            if duration > max_duration:
                return False, f"Video too long: {duration}s (max: {max_duration}s)", info
            
            # Check availability
            if info.get('availability') == 'private':
                return False, "Video is private", info
            
            # 'age_limit' may also be None — coerce before comparing.
            if (info.get('age_limit') or 0) > 0:
                logger.warning(f"⚠️ Video is age-restricted (age_limit={info.get('age_limit')})")
            
            # Check for live stream (not fully supported)
            if info.get('is_live'):
                return False, "Live streams are not supported", info
            
            title = info.get('title', 'Unknown')
            logger.info(f"✅ Video validated: '{title}' ({duration}s)")
            
            return True, "OK", info
            
    except yt_dlp.utils.DownloadError as e:
        # Map common yt-dlp failure messages to concise, user-facing reasons.
        error_msg = str(e)
        if "Video unavailable" in error_msg:
            return False, "Video unavailable or deleted", {}
        elif "Private video" in error_msg:
            return False, "Video is private", {}
        elif "Sign in" in error_msg:
            return False, "Video requires sign-in (age-restricted or private)", {}
        else:
            return False, f"Download error: {error_msg}", {}
    except Exception as e:
        return False, f"Validation failed: {str(e)}", {}


def get_dynamic_intro_skip(total_duration: float) -> float:
    """
    Pick a fallback intro-skip length from the video's total duration.

    Only used when no chapter information is available — chapter-based
    detection always takes priority.

    - Video of 30 minutes (1800s) or less: skip 180s (3 min), ~10% intro ratio.
    - Longer video: skip 300s (5 min) — a smaller fraction of longer content.

    Args:
        total_duration: Total video duration in seconds.

    Returns:
        Intro skip duration in seconds.
    """
    half_hour = 30 * 60  # 1800s boundary between "short" and "long" videos
    return 180.0 if total_duration <= half_hour else 300.0


def detect_intro_outro_from_chapters(chapters, total_duration, config):
    """
    Derive intro/outro trim amounts from YouTube chapter metadata.

    Priority order (v6.8):
    1. Chapter-based detection (highest priority):
       - Short first chapter (<5 min) is always skipped, keyword or not —
         creators who bother adding chapters usually put the intro first.
       - Long first chapter is only skipped (capped at 5 min) when its title
         matches an intro keyword.
    2. Dynamic duration-based skip: applied ONLY when the video has no
       chapters at all and no manual override is set.
    3. Manual override (config.intro_skip_seconds) is applied by the caller.

    Returns:
        (intro_seconds, outro_seconds): Trim amounts in seconds.
    """
    INTRO_WORDS = ['intro', 'opening', 'advertisement', 'sponsor', 'ad', 'promo']
    OUTRO_WORDS = ['outro', 'ending', 'credits', 'sponsor', 'ad', 'promo', 'endcard']

    intro_skip = 0.0
    outro_skip = 0.0
    has_chapters = bool(chapters)

    # === CHAPTER-BASED DETECTION (Priority 1) ===
    if has_chapters:
        first = chapters[0]
        first_title = first.get('title', '').lower()
        first_start = first.get('start_time', 0)
        first_end = first.get('end_time', 0)
        first_length = first_end - first_start
        keyword_hit = any(word in first_title for word in INTRO_WORDS)

        if first_length < 300:
            # Short enough (<5 min) to treat as intro/sponsor regardless of title.
            intro_skip = first_end
            variant = 'keyword match' if keyword_hit else 'first chapter'
            logger.info(f"   📖 Chapter intro ({variant}): '{first.get('title')}' ({first_end:.1f}s)")
        elif keyword_hit:
            # Long first chapter, but explicitly labeled as intro: cap at 5 min.
            intro_skip = min(first_end, 300)
            logger.info(f"   📖 Chapter intro (keyword, capped): '{first.get('title')}' ({intro_skip:.1f}s)")
        else:
            logger.info(f"   ⚠️ First chapter too long ({first_length:.0f}s), not skipping")

        # Trim a short (<5 min) final chapter whose title matches an outro keyword.
        if len(chapters) > 1:
            last = chapters[-1]
            last_title = last.get('title', '').lower()
            last_start = last.get('start_time', 0)
            last_end = last.get('end_time', total_duration)

            if any(word in last_title for word in OUTRO_WORDS) and last_end - last_start < 300:
                outro_skip = total_duration - last_start
                logger.info(f"   📖 Chapter-based outro: '{last.get('title')}' ({outro_skip:.1f}s)")

    # === DYNAMIC DURATION-BASED SKIP (Priority 2 — no chapters at all) ===
    # If chapters exist we TRUST them, even when no intro was found.
    # Dynamic skip also requires no manual override and auto_intro_skip enabled.
    if (not has_chapters
            and getattr(config, 'auto_intro_skip', True)
            and config.intro_skip_seconds == 0.0):
        intro_skip = get_dynamic_intro_skip(total_duration)
        duration_threshold = '≤30min' if total_duration <= 1800 else '>30min'
        logger.info(f"   ⏱️ Dynamic intro skip (no chapters): {intro_skip:.0f}s (video {duration_threshold})")

    return intro_skip, outro_skip


def extract_video_id(video_url: str) -> str:
    """Extract the video ID from common YouTube URL formats.

    Handles watch, youtu.be, and shorts URLs; any other URL maps to a
    stable 11-character MD5-derived pseudo-ID.
    """
    # (marker, character that terminates the ID after the marker)
    patterns = (('watch?v=', '&'), ('youtu.be/', '?'), ('/shorts/', '?'))
    for marker, terminator in patterns:
        if marker in video_url:
            return video_url.split(marker)[1].split(terminator)[0]
    return hashlib.md5(video_url.encode()).hexdigest()[:11]


def download_video_for_visualization(url: str, output_dir: Path, video_id: str, max_retries: int = 3) -> Optional[Path]:
    """Download a 720p MP4 copy of the video for visualization.

    Reuses a cached file larger than ~1MB when present. Returns the path to
    the MP4 on success, or None after all retry attempts fail.
    """
    logger.info(f"🎥 Downloading video for visualization: {url}")
    start = time.time()
    output_file = output_dir / f"{video_id}.mp4"

    # Cache hit: accept any previously downloaded file of plausible size.
    if output_file.exists() and output_file.stat().st_size > 1_000_000:
        logger.info(f"✅ Using cached video: {output_file}")
        return output_file

    ydl_opts = {
        'format': 'bestvideo[height<=720]+bestaudio/best[height<=720]',
        'outtmpl': str(output_dir / f"{video_id}.%(ext)s"),
        'merge_output_format': 'mp4',
        'quiet': True,
        'no_warnings': True,
        'retries': 3,
        'fragment_retries': 3,
    }

    attempt = 0
    while attempt < max_retries:
        try:
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                ydl.extract_info(url, download=True)
            if output_file.exists():
                logger.info(f"✅ Video download: {time.time()-start:.1f}s | Size: {output_file.stat().st_size/1e6:.1f}MB")
                return output_file
        except Exception as e:
            logger.warning(f"Video download attempt {attempt+1} failed: {e}")
            # Exponential backoff only between failed attempts, not after the last.
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)
        attempt += 1

    logger.error(f"❌ Video download failed after {max_retries} attempts")
    return None


def download_audio(
    video_url: str,
    config,
    validate: bool = True,
    max_retries: int = 3
) -> Tuple[str, Dict[str, Any]]:
    """
    Download audio from YouTube URL and prepare for processing.
    
    === v7.0 OPTIMIZATION: Download-Time Resample ===
    Uses ffmpeg directly for dual-output in single pass:
    - 16kHz mono WAV for processing pipeline
    - Original quality mono WAV for high-quality export
    
    Previous: yt-dlp → temp.wav → torchaudio load → Python resample → save both
    Now:      yt-dlp → single ffmpeg → dual output (NO Python resampling!)
    
    Saves ~10-15s per video.
    
    Args:
        video_url: YouTube URL
        config: Pipeline configuration
        validate: Whether to validate video before download (default: True)
        max_retries: Number of download retries (default: 3)
    
    Returns:
        (audio_path, metadata): Path to processed audio and video metadata
        
    Raises:
        VideoValidationError: If video fails validation
        DownloadError: If download fails after retries
    """
    logger.info(f"📥 Downloading: {video_url}")
    start = time.time()
    
    # Extract video ID
    video_id = extract_video_id(video_url)
    
    output_dir = Path(config.output_dir) / video_id
    output_dir.mkdir(parents=True, exist_ok=True)
    
    # === EDGE CASE: Video Validation ===
    info = {}
    if validate:
        is_valid, message, info = validate_video(video_url)
        if not is_valid:
            raise VideoValidationError(f"Video validation failed: {message}")
    
    # Fetch metadata if not already fetched during validation
    if not info:
        logger.info("📋 Fetching video metadata...")
        ydl_info_opts = {
            'quiet': True,
            'no_warnings': True,
            'extract_flat': False,
        }
        
        try:
            with yt_dlp.YoutubeDL(ydl_info_opts) as ydl:
                info = ydl.extract_info(video_url, download=False)
        except Exception as e:
            # Metadata is best-effort: fall back to placeholders rather than abort.
            logger.warning(f"Could not fetch metadata: {e}")
            info = {'title': 'Unknown', 'chapters': []}
    
    # Output file paths
    trimmed_file = output_dir / f"{video_id}_trimmed.wav"
    original_file = output_dir / f"{video_id}_original.wav"
    
    # Check cache - both files must exist for full cache hit
    preserve_original = getattr(config, 'preserve_original_audio', False)
    
    if trimmed_file.exists():
        cache_valid = True
        if preserve_original and not original_file.exists():
            cache_valid = False
            logger.info("⚠️ Cached trimmed file exists but original missing, re-downloading")
        
        if cache_valid:
            logger.info(f"✅ Using cached audio: {trimmed_file}")
            try:
                waveform, sr = torchaudio.load(str(trimmed_file))
                
                # === EDGE CASE: Validate cached file isn't corrupted ===
                if waveform.shape[1] < sr * 10:  # Less than 10 seconds
                    logger.warning(f"⚠️ Cached file seems corrupted or too short, re-downloading")
                    trimmed_file.unlink()
                    if original_file.exists():
                        original_file.unlink()
                else:
                    # Get original sample rate from original file if it exists
                    orig_sr = sr
                    if original_file.exists():
                        try:
                            orig_info = torchaudio.info(str(original_file))
                            orig_sr = orig_info.sample_rate
                        except Exception:
                            # Header probe is best-effort; keep the processing rate.
                            pass
                    
                    return str(trimmed_file), {
                        'video_id': video_id,
                        'youtube_url': f'https://www.youtube.com/watch?v={video_id}',
                        'title': info.get('title', 'Cached'),
                        'original_duration': waveform.shape[1] / sr,
                        'intro_skipped': 0.0,
                        'processed_duration': waveform.shape[1] / sr,
                        'output_dir': str(output_dir),
                        'chapters': info.get('chapters', []),
                        'sample_rate': sr,
                        'original_sample_rate': orig_sr,
                        'original_audio_path': str(original_file) if original_file.exists() else None,
                        'original_audio_preserved': original_file.exists(),
                    }
            except Exception as e:
                logger.warning(f"⚠️ Failed to load cached file: {e}, re-downloading")
                try:
                    trimmed_file.unlink()
                    if original_file.exists():
                        original_file.unlink()
                except OSError:
                    # Stale cache files will be overwritten by the fresh download.
                    pass
    
    # Get video duration for intro/outro detection.
    # FIX: 'duration' may be an explicit None in yt-dlp metadata — coerce to 0
    # so downstream comparisons/arithmetic never raise TypeError.
    original_duration = info.get('duration') or 0
    chapters = info.get('chapters', []) if info else []
    intro_skip, outro_skip = detect_intro_outro_from_chapters(chapters, original_duration, config)
    
    # === MANUAL OVERRIDE (Priority 3) ===
    if config.intro_skip_seconds > 0 and intro_skip != config.intro_skip_seconds:
        logger.info(f"⚙️ Manual intro skip override: {config.intro_skip_seconds:.1f}s (was: {intro_skip:.1f}s)")
        intro_skip = config.intro_skip_seconds
    
    # === v7.0 OPTIMIZATION: Single-pass ffmpeg dual output ===
    # Download audio stream and process with ffmpeg in one command
    if preserve_original:
        result = _download_with_ffmpeg_dual_output(
            video_url=video_url,
            video_id=video_id,
            output_dir=output_dir,
            trimmed_file=trimmed_file,
            original_file=original_file,
            target_sr=config.sample_rate,
            intro_skip=intro_skip,
            outro_skip=outro_skip,
            original_duration=original_duration,
            max_retries=max_retries,
            info=info
        )
    else:
        # Standard download without original preservation
        result = _download_standard(
            video_url=video_url,
            video_id=video_id,
            output_dir=output_dir,
            trimmed_file=trimmed_file,
            config=config,
            intro_skip=intro_skip,
            outro_skip=outro_skip,
            original_duration=original_duration,
            max_retries=max_retries,
            info=info
        )
    
    elapsed = time.time() - start
    logger.info(f"✅ Download: {elapsed:.1f}s | Duration: {result['processed_duration']:.0f}s")
    
    return str(trimmed_file), result


def _download_with_ffmpeg_dual_output(
    video_url: str,
    video_id: str,
    output_dir: Path,
    trimmed_file: Path,
    original_file: Path,
    target_sr: int,
    intro_skip: float,
    outro_skip: float,
    original_duration: float,
    max_retries: int,
    info: dict
) -> Dict[str, Any]:
    """
    === v7.0 OPTIMIZATION: Download-Time Dual Output ===
    
    Single ffmpeg command creates both outputs:
    1. 16kHz mono WAV (for processing) - with intro/outro trim
    2. Original quality mono WAV (for export) - with intro/outro trim
    
    This eliminates Python resampling overhead (~10-15s per video).
    
    Key insight: ffmpeg can output to multiple files from single input stream.
    
    Raises:
        DownloadError: if the download or the ffmpeg pass fails.
    """
    logger.info("⚡ Using optimized dual-output download (v7.0)")
    
    # Temp file for raw download
    raw_file = output_dir / f"{video_id}_raw.wav"
    
    # Step 1: Download audio at original quality using yt-dlp
    # NO resampling in yt-dlp - we'll do it in ffmpeg
    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': str(output_dir / f"{video_id}_raw.%(ext)s"),
        'quiet': True,
        'no_warnings': True,
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',
            'preferredquality': '0',  # Best quality
        }],
        # Only convert to mono, preserve original sample rate
        'postprocessor_args': ['-ac', '1'],
        'retries': 3,
        'fragment_retries': 3,
    }
    
    last_error = None
    for attempt in range(max_retries):
        try:
            logger.info(f"   Download attempt {attempt + 1}/{max_retries}")
            
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                ydl.extract_info(video_url, download=True)
            
            # Find the downloaded file (yt-dlp may leave a non-wav extension)
            if not raw_file.exists():
                for ext_file in output_dir.glob(f"{video_id}_raw.*"):
                    if ext_file.suffix in ['.wav', '.mp3', '.m4a', '.opus', '.webm']:
                        if ext_file.suffix != '.wav':
                            # Convert to wav first
                            subprocess.run([
                                'ffmpeg', '-i', str(ext_file), '-ac', '1',
                                str(raw_file), '-y'
                            ], capture_output=True, check=True)
                            ext_file.unlink()
                        else:
                            ext_file.rename(raw_file)
                        break
            
            if raw_file.exists():
                break
            else:
                raise DownloadError("Downloaded file not found")
                
        except Exception as e:
            last_error = e
            logger.warning(f"   Download attempt {attempt + 1} failed: {e}")
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)
            continue
    
    if not raw_file.exists():
        raise DownloadError(f"Download failed after {max_retries} attempts: {last_error}")
    
    # Get actual audio info
    try:
        audio_info = torchaudio.info(str(raw_file))
        original_sr = audio_info.sample_rate
        actual_duration = audio_info.num_frames / original_sr
    except Exception as e:
        logger.warning(f"Could not get audio info: {e}, using metadata duration")
        original_sr = 48000  # Common default
        actual_duration = original_duration
    
    logger.info(f"   📊 Raw audio: {original_sr}Hz, {actual_duration:.1f}s")
    
    # Calculate trim parameters
    start_time = intro_skip if intro_skip > 0 else 0
    end_time = actual_duration - outro_skip if outro_skip > 0 else actual_duration
    trim_duration = end_time - start_time
    
    # === EDGE CASE: degenerate trim window ===
    # If the computed skips consume the whole file (e.g. metadata duration was
    # wrong), fall back to the full audio rather than emitting an empty WAV.
    if trim_duration <= 0:
        logger.warning("   ⚠️ Trim window is empty, falling back to full duration")
        start_time = 0
        end_time = actual_duration
        trim_duration = actual_duration
    
    # Step 2: Single ffmpeg command for dual output with trim.
    # BUGFIX: in ffmpeg, `-ss`/`-t` placed after `-i` are per-OUTPUT options
    # and only apply to the next output file. The previous command specified
    # them once, so the second output (original quality) was written
    # UNTRIMMED, breaking timestamp alignment with the 16kHz file. The trim
    # options must be repeated for each output.
    ffmpeg_cmd = ['ffmpeg', '-y', '-i', str(raw_file)]
    
    # Output 1: 16kHz mono for processing (trimmed)
    ffmpeg_cmd.extend([
        '-ss', str(start_time),
        '-t', str(trim_duration),
        '-map', '0:a',
        '-ar', str(target_sr),
        '-ac', '1',
        str(trimmed_file)
    ])
    
    # Output 2: Original quality mono (same trim window, no resample)
    ffmpeg_cmd.extend([
        '-ss', str(start_time),
        '-t', str(trim_duration),
        '-map', '0:a',
        '-ac', '1',
        str(original_file)
    ])
    
    logger.info(f"   ⚡ FFmpeg dual output: 16kHz + {original_sr}Hz (trim: {start_time:.1f}s-{end_time:.1f}s)")
    
    try:
        subprocess.run(ffmpeg_cmd, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as e:
        logger.error(f"FFmpeg failed: {e.stderr}")
        raise DownloadError(f"FFmpeg dual output failed: {e}")
    
    # Verify outputs
    if not trimmed_file.exists():
        raise DownloadError(f"FFmpeg did not create trimmed file: {trimmed_file}")
    if not original_file.exists():
        raise DownloadError(f"FFmpeg did not create original file: {original_file}")
    
    # Get actual processed duration
    try:
        proc_info = torchaudio.info(str(trimmed_file))
        processed_duration = proc_info.num_frames / proc_info.sample_rate
    except Exception:
        # Best-effort: fall back to the requested trim length.
        processed_duration = trim_duration
    
    # Cleanup raw file and any other temp files (best-effort)
    for temp in [raw_file, *output_dir.glob(f"{video_id}_raw.*")]:
        try:
            temp.unlink()
        except OSError:
            pass
    
    logger.info(f"   ✅ Dual output complete: {trimmed_file.name} ({target_sr}Hz), {original_file.name} ({original_sr}Hz)")
    
    return {
        'video_id': video_id,
        'youtube_url': f'https://www.youtube.com/watch?v={video_id}',
        'title': info.get('title', 'Unknown') if info else 'Unknown',
        'original_duration': actual_duration,
        'intro_skipped': intro_skip,
        'processed_duration': processed_duration,
        'output_dir': str(output_dir),
        'chapters': info.get('chapters', []) if info else [],
        'sample_rate': target_sr,
        'original_sample_rate': original_sr,
        'original_audio_path': str(original_file),
        'original_audio_preserved': True,
    }


def _download_standard(
    video_url: str,
    video_id: str,
    output_dir: Path,
    trimmed_file: Path,
    config,
    intro_skip: float,
    outro_skip: float,
    original_duration: float,
    max_retries: int,
    info: dict
) -> Dict[str, Any]:
    """Standard download without original preservation (16kHz only).
    
    Downloads via yt-dlp, loads with torchaudio, applies intro/outro trim and
    mono downmix in Python, and saves the 16kHz processing WAV.
    
    Note: intro_skip/outro_skip are recomputed here from the ACTUAL decoded
    audio duration (the caller computed them from metadata duration, which can
    differ); the incoming parameter values are intentionally superseded.
    
    Raises:
        DownloadError: if the download fails or the audio file is corrupted.
    """
    
    temp_file = output_dir / f"{video_id}_temp.%(ext)s"
    downloaded_file = output_dir / f"{video_id}_temp.wav"
    original_file = output_dir / f"{video_id}_original.wav"  # v6.8: High-quality preservation
    
    # === v6.8: HIGH-QUALITY AUDIO PRESERVATION ===
    # If preserve_original_audio is True, we download at original quality first,
    # then create a 16kHz copy for processing. This allows cutting from original later.
    preserve_original = getattr(config, 'preserve_original_audio', False)
    original_sr_target = getattr(config, 'original_audio_sample_rate', 0)  # 0 = keep original
    
    if preserve_original:
        # Download WITHOUT resampling to preserve original quality
        ydl_opts = {
            'format': 'bestaudio/best',
            'outtmpl': str(temp_file),
            'quiet': True,
            'no_warnings': True,
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'wav',
                'preferredquality': '0',
            }],
            # Only mono conversion, NO sample rate change
            'postprocessor_args': ['-ac', '1'] if original_sr_target == 0 else ['-ar', str(original_sr_target), '-ac', '1'],
            'retries': 3,
            'fragment_retries': 3,
        }
    else:
        # Standard: Download at 16kHz for processing
        ydl_opts = {
            'format': 'bestaudio/best',
            'outtmpl': str(temp_file),
            'quiet': True,
            'no_warnings': True,
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'wav',
                'preferredquality': '0',
            }],
            'postprocessor_args': ['-ar', str(config.sample_rate), '-ac', '1'],
            'retries': 3,
            'fragment_retries': 3,
        }
    
    last_error = None
    for attempt in range(max_retries):
        try:
            logger.info(f"   Download attempt {attempt + 1}/{max_retries}")
            
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                ydl.extract_info(video_url, download=True)
            
            # yt-dlp may leave a non-wav extension; convert or rename as needed.
            if not downloaded_file.exists():
                for ext_file in output_dir.glob(f"{video_id}_temp.*"):
                    if ext_file.suffix in ['.wav', '.mp3', '.m4a', '.opus', '.webm']:
                        if ext_file.suffix != '.wav':
                            subprocess.run([
                                'ffmpeg', '-i', str(ext_file), '-ar', str(config.sample_rate),
                                '-ac', '1', str(downloaded_file), '-y'
                            ], capture_output=True, check=True)
                            ext_file.unlink()
                        else:
                            ext_file.rename(downloaded_file)
                        break
            
            if downloaded_file.exists():
                break
                
        except Exception as e:
            last_error = e
            logger.warning(f"   Download attempt {attempt + 1} failed: {e}")
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)
            continue
    
    if not downloaded_file.exists():
        raise DownloadError(f"Download failed after {max_retries} attempts: {last_error}")
    
    # Load and apply trim
    try:
        waveform, sr = torchaudio.load(str(downloaded_file))
    except Exception as e:
        if downloaded_file.exists():
            downloaded_file.unlink()
        raise DownloadError(f"Audio file corrupted: {e}")
    
    original_sr = sr  # Keep track of original sample rate
    original_duration = waveform.shape[1] / sr  # Actual decoded duration supersedes metadata
    actual_duration = original_duration  # For backward compatibility
    
    # === v6.8: HIGH-QUALITY AUDIO PRESERVATION ===
    # If preserving original audio, we need to:
    # 1. Save the original quality audio (done after trim, below)
    # 2. Create a 16kHz resampled version for processing
    if preserve_original and sr != config.sample_rate:
        logger.info(f"   🎵 Preserving original audio: {sr}Hz → {original_file.name}")
        
        # Resample to 16kHz for processing pipeline
        import torchaudio.transforms as T
        resampler = T.Resample(orig_freq=sr, new_freq=config.sample_rate)
        waveform = resampler(waveform)
        sr = config.sample_rate
        logger.info(f"   📊 Created processing copy: {sr}Hz")
    
    # === EDGE CASE: Very short audio after download ===
    if original_duration < 30:
        logger.warning(f"⚠️ Audio very short: {original_duration:.1f}s")
    
    # Detect intro/outro using YouTube chapters (smart detection!)
    # Priority: 1) Chapter-based, 2) Dynamic (duration-based), 3) Manual config
    chapters = info.get('chapters', []) if info else []
    intro_skip, outro_skip = detect_intro_outro_from_chapters(chapters, original_duration, config)
    
    # === MANUAL OVERRIDE (Priority 3) ===
    # If user explicitly set intro_skip_seconds > 0, it overrides everything
    if config.intro_skip_seconds > 0 and intro_skip != config.intro_skip_seconds:
        logger.info(f"⚙️ Manual intro skip override: {config.intro_skip_seconds:.1f}s (was: {intro_skip:.1f}s)")
        intro_skip = config.intro_skip_seconds
    
    # Apply intro/outro trimming
    intro_skipped = intro_skip  # Track actual skip for original audio trim
    intro_samples = int(intro_skip * sr)
    outro_samples = int(outro_skip * sr) if outro_skip > 0 else 0
    end_sample = waveform.shape[1] - outro_samples if outro_skip > 0 else waveform.shape[1]
    
    if intro_samples > 0 or outro_samples > 0:
        waveform = waveform[:, intro_samples:end_sample]
    
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    
    # === EDGE CASE: Check for silent/empty audio ===
    rms = (waveform ** 2).mean().sqrt().item()
    if rms < 0.0001:
        logger.warning(f"⚠️ Audio appears to be silent (RMS={rms:.6f})")
    
    # Save processed audio (16kHz for pipeline processing)
    torchaudio.save(str(trimmed_file), waveform, sr)
    
    # === v6.8: Save original quality audio with same trim ===
    original_audio_saved = None
    if preserve_original and downloaded_file.exists():
        try:
            # Load original again and apply same trim
            orig_waveform, orig_sr = torchaudio.load(str(downloaded_file))
            
            # Ensure mono
            if orig_waveform.shape[0] > 1:
                orig_waveform = orig_waveform.mean(dim=0, keepdim=True)
            
            # Apply same intro/outro trim (in original sample rate)
            orig_intro_samples = int(intro_skipped * orig_sr)
            orig_outro_samples = int(outro_skip * orig_sr) if outro_skip > 0 else 0
            orig_end_sample = orig_waveform.shape[1] - orig_outro_samples if outro_skip > 0 else orig_waveform.shape[1]
            orig_waveform = orig_waveform[:, orig_intro_samples:orig_end_sample]
            
            # Save original quality
            torchaudio.save(str(original_file), orig_waveform, orig_sr)
            original_audio_saved = str(original_file)
            logger.info(f"   ✅ Saved original quality: {original_file.name} ({orig_sr}Hz, {orig_waveform.shape[1]/orig_sr:.1f}s)")
        except Exception as e:
            logger.warning(f"   ⚠️ Failed to save original audio: {e}")
    
    # Cleanup temp files (best-effort; never fail the pipeline over cleanup)
    if downloaded_file.exists():
        downloaded_file.unlink()
    for temp in output_dir.glob(f"{video_id}_temp.*"):
        try:
            temp.unlink()
        except OSError:
            pass
    
    return {
        'video_id': video_id,
        'youtube_url': f'https://www.youtube.com/watch?v={video_id}',
        'title': info.get('title', 'Unknown') if info else 'Unknown',
        'original_duration': actual_duration,
        'intro_skipped': intro_skip,
        'processed_duration': waveform.shape[1] / sr,
        'output_dir': str(output_dir),
        'chapters': chapters,
        # === v6.8: High-quality audio info ===
        'sample_rate': sr,  # Processing sample rate (16kHz)
        'original_sample_rate': original_sr if preserve_original else sr,  # Original audio sample rate
        'original_audio_path': original_audio_saved,  # Path to original quality audio (if preserved)
        'original_audio_preserved': original_audio_saved is not None,
    }

