o
    3NPi                     @   s  d Z ddlZddlZddlZddlZddlmZ ddlmZm	Z	m
Z
mZ ddlZddlZedZG dd deZG dd	 d	eZd/dedededeeee	f fddZdedefddZdd ZdedefddZd0dededededee f
ddZ	 	d1ded!ededeee	ee
f f fd"d#Zdededed$ed%ed&ed'ed(ed)eded*ede	ee
f fd+d,Zdededed$ed'ed(ed)eded*ede	ee
f fd-d.ZdS )2u  Audio download and preprocessing with robust error handling.

=== v7.0 OPTIMIZATION: Download-Time Resample ===
Uses ffmpeg directly to create both 16kHz processing audio AND original
quality audio in a single pass, eliminating Python resampling overhead.

Previous: yt-dlp → temp.wav → torchaudio load → Python resample → save both
Now:      yt-dlp → ffmpeg pipe → dual output (16kHz + original) in one pass

Saves ~10-15s per video by avoiding torchaudio resampling.
    N)Path)TupleDictAnyOptionalzFastPipelineV6.Downloadc                   @      e Zd ZdZdS )VideoValidationErrorz*Raised when video fails validation checks.N__name__
__module____qualname____doc__ r   r   R/home/ubuntu/.cursor/worktrees/maya3data__SSH__216.81.248.184_/zxg/src/download.pyr          r   c                   @   r   )DownloadErrorz)Raised when download fails after retries.Nr	   r   r   r   r   r       r   r         >@      @	video_urlmin_durationmax_durationreturnc           
   
   C   s  t d|   ddddd}zt|}|j| dd}|du r.ddi fW  d   W S |dd	krG|d
sGdd|fW  d   W S |dd}||k redd| d| d|fW  d   W S ||kr}dd| d| d|fW  d   W S |ddkrdd|fW  d   W S |dddkrt d|d d |drdd|fW  d   W S |dd}t d| d| d dd|fW  d   W S 1 sw   Y  W dS  tjjy0 } zAt	|}	d |	v rdd!i fW  Y d}~S d"|	v rddi fW  Y d}~S d#|	v rdd$i fW  Y d}~S dd%|	 i fW  Y d}~S d}~w t
yL } zdd&t	| i fW  Y d}~S d}~ww )'a  
    Validate YouTube video before processing.
    
    Checks:
    - Video exists and is accessible
    - Video duration is within acceptable range
    - Video has audio track
    - Video is not age-restricted or private
    
    Args:
        video_url: YouTube URL to validate
        min_duration: Minimum video duration in seconds (default: 30s)
        max_duration: Maximum video duration in seconds (default: 4 hours)
    
    Returns:
        (is_valid, message, info_dict)
    u   🔍 Validating video: TF)quietno_warningsextract_flatskip_downloaddownloadNz#Video info extraction returned NoneacodecnoneformatszVideo has no audio trackdurationr   zVideo too short: zs (min: s)zVideo too long: zs (max: availabilityprivatezVideo is private	age_limitu*   ⚠️ Video is age-restricted (age_limit=)is_livezLive streams are not supportedtitleUnknownu   ✅ Video validated: '' (OKzVideo unavailablezVideo unavailable or deletedzPrivate videozSign inz2Video requires sign-in (age-restricted or private)zDownload error: zValidation failed: )loggerinfoyt_dlp	YoutubeDLextract_infogetwarningutilsr   str	Exception)
r   r   r   ydl_optsydlr-   r!   r(   e	error_msgr   r   r   validate_video%   s`   
("

 r:   total_durationc                 C   s   d}d}d}| |kr|S |S )u  
    Calculate dynamic intro skip based on video duration.
    
    Strategy (when no chapter info available):
    - Video ≤ 30 min (1800s): Skip 180s (3 min) - ~10% intro ratio
    - Video > 30 min: Skip 300s (5 min) - smaller % for longer content
    
    This is ONLY used when no intro chapter is detected.
    Chapter-based detection always takes priority.
    
    Args:
        total_duration: Total video duration in seconds
        
    Returns:
        Intro skip duration in seconds
      g     f@g     r@r   )r;   THRESHOLD_30MIN
SKIP_SHORT	SKIP_LONGr   r   r   get_dynamic_intro_skipq   s   r@   c                    s  g d}g d}d}d}| rt | dkr| d }|dd  |dd}|dd}	|	| }
t fd	d
|D }|
dk re|	}|rTtd|d d|	dd n3td|d d|	dd n"|r}t|	d}td|d d|dd n
td|
dd t | dkr| d }|dd  |dd}|d|}	t fdd
|D r|	| dk r|| }td|d d|dd | ot | dk}|st|ddr|jdkrt	|}|dkrdnd}td|dd| d  ||fS )!u  
    Detect intro/outro sections from YouTube chapters.
    
    === PRIORITY (v6.8) ===
    1. Chapter-based detection (HIGHEST priority)
       - If intro keyword found → skip that chapter
       - If chapters exist but NO intro keyword → STILL skip first chapter
    2. Dynamic duration-based skip (fallback when NO chapters at all)
    3. config.intro_skip_seconds (manual override from CLI)
    
    Strategy:
    - Intro: ALWAYS skip first chapter if chapters exist (creator usually puts intro there)
    - Outro: Last chapter with keywords like "outro", "ending", "credits", "sponsor"
    - Fallback: Dynamic skip based on video duration (180s for ≤30min, 300s for >30min)
    
    Returns:
        (intro_seconds, outro_seconds): Trim amounts
    )introopeningadvertisementsponsoradpromo)outroendingcreditsrD   rE   rF   endcard        r   r(    
start_timeend_timec                 3       | ]}| v V  qd S Nr   .0keywordchapter_titler   r   	<genexpr>       z3detect_intro_outro_from_chapters.<locals>.<genexpr>i,  u(      📖 Chapter intro (keyword match): 'r*   .1fr"   u(      📖 Chapter intro (first chapter): 'u*      📖 Chapter intro (keyword, capped): 'u"      ⚠️ First chapter too long (.0fzs), not skipping   c                 3   rO   rP   r   rQ   rT   r   r   rV      rW   u      📖 Chapter-based outro: 'auto_intro_skipTr<   u   ≤30minz>30minu,      ⏱️ Dynamic intro skip (no chapters): z	s (video r&   )
lenr1   loweranyr,   r-   mingetattrintro_skip_secondsr@   )chaptersr;   configintro_keywordsoutro_keywords
intro_skip
outro_skipfirst_chapterchapter_startchapter_endchapter_durationhas_intro_keywordlast_chapterhas_chaptersduration_thresholdr   rT   r    detect_intro_outro_from_chapters   sH   ""
" 	
rq   c                 C   sz   d| v r|  dd  dd S d| v r |  dd  dd S d| v r0|  dd  dd S t|   dd	 S )
z2Extract video ID from various YouTube URL formats.zwatch?v=rZ   &r   z	youtu.be/?z/shorts/N   )splithashlibmd5encode	hexdigest)r   r   r   r   extract_video_id   s   rz      url
output_dirvideo_idmax_retriesc           
      C   sj  t d|   t }|| d }| r(| jdkr(t d|  |S dt|| d dddd	d	d
}t|D ]l}z>t	|}|j
| dd W d   n1 sWw   Y  | r|t dt | dd| jd dd |W   S W q= ty }	 z t d|d  d|	  ||d k rtd|  W Y d}	~	q=d}	~	ww t d| d dS )z,Download video for visualization (720p MP4).u*   🎥 Downloading video for visualization: z.mp4i@B u   ✅ Using cached video: z2bestvideo[height<=720]+bestaudio/best[height<=720]z.%(ext)smp4Tr{   )formatouttmplmerge_output_formatr   r   retriesfragment_retriesr   Nu   ✅ Video download: rX   z
s | Size: g    .AMBzVideo download attempt rZ   	 failed:    u    ❌ Video download failed after z	 attempts)r,   r-   timeexistsstatst_sizer4   ranger.   r/   r0   r5   r2   sleeperror)
r|   r}   r~   r   startoutput_filer6   attemptr7   r8   r   r   r    download_video_for_visualization   s>   
.
r   Tvalidatec                 C   s  t d|   t }t| }t|j| }|jddd i }|r2t| \}}	}|s2td|	 |s}t d dddd}
zt	
|
}|j| dd}W d	   n1 sWw   Y  W n ty| } zt d
|  dg d}W Y d	}~nd	}~ww || d }|| d }t|dd}| rTd}|r| sd}t d |rTt d|  zstt|\}}|jd |d k rt d |  | r|  nM|}| rztt|}|j}W n   Y t||d| |dd|jd | d|jd | t||dg ||| rt|nd	| dfW S W n0 tyS } z#t d| d z|  | rC|  W n   Y W Y d	}~nd	}~ww |dd}|rc|dg ng }t|||\}}|jdkr||jkrt d|jd d!|d d" |j}|rt| |||||j|||||d#}nt| |||||||||d$
}t | }t d%|d d&|d' d(d) t||fS )*u  
    Download audio from YouTube URL and prepare for processing.
    
    === v7.0 OPTIMIZATION: Download-Time Resample ===
    Uses ffmpeg directly for dual-output in single pass:
    - 16kHz mono WAV for processing pipeline
    - Original quality mono WAV for high-quality export
    
    Previous: yt-dlp → temp.wav → torchaudio load → Python resample → save both
    Now:      yt-dlp → single ffmpeg → dual output (NO Python resampling!)
    
    Saves ~10-15s per video.
    
    Args:
        video_url: YouTube URL
        config: Pipeline configuration
        validate: Whether to validate video before download (default: True)
        max_retries: Number of download retries (default: 3)
    
    Returns:
        (audio_path, metadata): Path to processed audio and video metadata
        
    Raises:
        VideoValidationError: If video fails validation
        DownloadError: If download fails after retries
    u   📥 Downloading: T)parentsexist_okzVideo validation failed: u   📋 Fetching video metadata...F)r   r   r   r   NzCould not fetch metadata: r)   )r(   rc   z_trimmed.wav_original.wavpreserve_original_audiouF   ⚠️ Cached trimmed file exists but original missing, re-downloadingu   ✅ Using cached audio: rZ   
   u?   ⚠️ Cached file seems corrupted or too short, re-downloading https://www.youtube.com/watch?v=r(   CachedrK   rc   r~   youtube_urlr(   original_durationintro_skippedprocessed_durationr}   rc   sample_rateoriginal_sample_rateoriginal_audio_pathoriginal_audio_preservedu#   ⚠️ Failed to load cached file: z, re-downloadingr!   r   #   ⚙️ Manual intro skip override: rX   s (was: r"   )r   r~   r}   trimmed_fileoriginal_file	target_srrg   rh   r   r   r-   )
r   r~   r}   r   rd   rg   rh   r   r   r-   u   ✅ Download: zs | Duration: r   rY   s)r,   r-   r   rz   r   r}   mkdirr:   r   r.   r/   r0   r5   r2   ra   r   
torchaudioloadr4   shapeunlinkr   r1   rq   rb   !_download_with_ffmpeg_dual_output_download_standard)r   rd   r   r   r   r~   r}   r-   is_validmessageydl_info_optsr7   r8   r   r   preserve_originalcache_validwaveformsrorig_sr	orig_infor   rc   rg   rh   resultelapsedr   r   r   download_audio  s    









 r   r   r   r   rg   rh   r   r-   c                 C   sD  t d || d }dt|| d dddddd	gd
dgddd}d}t|	D ]}zlt d|d  d|	  t|}|j| dd}W d   n1 sRw   Y  | s|| dD ]+}|j	dv r|j	dkrt
jddt|d
dt|dgddd |  n||  nqc| rW  n2td ty } z"|}t d|d  d|  ||	d k rtd|  W Y d}~q+d}~ww | std|	 d| ztt|}|j}|j| }W n  ty } zt d| d  d!}|}W Y d}~nd}~ww t d"| d#|d$d% |d&kr|nd&}|d&kr'|| n|}|| }ddt|d't|d(t|g}|d)d*d+t|d
ddt|g |d)d*d
ddt|g t d,| d-|d$d.|d$d/ zt
j|dddd0}W n t
jy } zt d1|j  td2| d}~ww | std3| | std4| ztt|}|j|j }W n   |}Y z|  W n   Y || dD ]}z|  W q   Y qt d5|j d6| d7|j d6| d8	 |d9| |
r|
d:d;nd;|||t||
r|
d<g ng ||t|dd=S )>a  
    === v7.0 OPTIMIZATION: Download-Time Dual Output ===
    
    Single ffmpeg command creates both outputs:
    1. 16kHz mono WAV (for processing) - with intro/outro trim
    2. Original quality mono WAV (for export) - with intro/outro trim
    
    This eliminates Python resampling overhead (~10-15s per video).
    
    Key insight: ffmpeg can output to multiple files from single input stream.
    u/   ⚡ Using optimized dual-output download (v7.0)z_raw.wavbestaudio/bestz_raw.%(ext)sTFFmpegExtractAudiowav0keypreferredcodecpreferredquality-ac1r{   r   r   r   r   postprocessorspostprocessor_argsr   r   N   Download attempt rZ   /r   z_raw.*.wavz.mp3z.m4az.opusz.webmr   ffmpeg-i-ycapture_outputcheckzDownloaded file not foundr   r   Download failed after  attempts: zCould not get audio info: z, using metadata durationi  u      📊 Raw audio: Hz, rX   r   r   z-ssz-tz-mapz0:a-aru#      ⚡ FFmpeg dual output: 16kHz + z
Hz (trim: zs-r"   )r   textr   zFFmpeg failed: zFFmpeg dual output failed: z$FFmpeg did not create trimmed file: z%FFmpeg did not create original file: u      ✅ Dual output complete:  (zHz), zHz)r   r(   r)   rc   r   )r,   r-   r4   r   r.   r/   r0   r   globsuffix
subprocessrunr   renamer   r5   r2   r   r   r   r   
num_framesextendCalledProcessErrorr   stderrnamer1   )r   r~   r}   r   r   r   rg   rh   r   r   r-   raw_filer6   
last_errorr   r7   dl_infoext_filer8   
audio_infooriginal_sractual_durationrM   rN   trim_duration
ffmpeg_cmdr   	proc_infor   tempr   r   r   r     s   





"

(r   c
           )      C   s  || d }
|| d }|| d }t |dd}t |dd}|rDdt|
d	d	d
dddg|dkr7ddgndt|ddgddd}ndt|
d	d	d
dddgdt|jddgddd}d}t|D ]}zotd|d  d|  t|}|j| d	d W d   n1 sw   Y  |	 s|
| dD ]0}|jdv r|jdkrtjddt|dt|jddt|dg	d	d	d |  n||  nq|	 rW  n1W qc ty } z"|}td|d  d|  ||d k rtd |  W Y d}~qcd}~ww |	 std!| d"| ztt|\}}W n ty; } z|	 r0|  td#| d}~ww |}|jd | }|}d}|r||jkrtd$| d%|j  t|}ddlm} |j||jd&}||}|j}td'| d( |d)k rtd*|d+d, |	r|	d-g ng }t|||\}}|jdkr||jkrtd.|jd+d/|d+d0 |j}|}t|| }|dkrt|| nd}|dkr|jd | n|jd } |dks|dkr|dd|| f }|jd dkr	|j dd	d1}|d    ! " }!|!d2k r"td3|!d4d5 t#t||| d}"|r|	 rzktt|\}#}$|#jd dkrN|#j dd	d1}#t||$ }%|dkr_t||$ nd}&|dkrm|#jd |& n|#jd }'|#dd|%|'f }#t#t||#|$ t|}"td6|j d7|$ d8|#jd |$ d+d0 W n ty } ztd9|  W Y d}~nd}~ww |	 r|  |
| dD ]}(z|(  W q   Y q|d:| |	r|	d;d<nd<|||jd | t||||r|n||"|"dud=S )>z=Standard download without original preservation (16kHz only).z_temp.%(ext)sz	_temp.wavr   r   Foriginal_audio_sample_rater   r   Tr   r   r   r   r   r   r   r{   r   Nr   rZ   r   r   z_temp.*r   r   r   r   r   r   r   r   r   r   zAudio file corrupted: u#      🎵 Preserving original audio: u   Hz → )	orig_freqnew_frequ!      📊 Created processing copy: Hz   u   ⚠️ Audio very short: rX   r   rc   r   r   r"   )dimkeepdimg-C6?u'   ⚠️ Audio appears to be silent (RMS=z.6fr&   u      ✅ Saved original quality: r   r   u)      ⚠️ Failed to save original audio: r   r(   r)   r   )$ra   r4   r   r   r,   r-   r.   r/   r0   r   r   r   r   r   r   r   r5   r2   r   r   r   r   r   r   r   torchaudio.transforms
transformsResampler1   rq   rb   intmeansqrtitemsave))r   r~   r}   r   rd   rg   rh   r   r   r-   	temp_filedownloaded_filer   r   original_sr_targetr6   r   r   r7   r   r8   r   r   r   r   r   T	resamplerrc   r   intro_samplesoutro_samples
end_samplermsoriginal_audio_savedorig_waveformr   orig_intro_samplesorig_outro_samplesorig_end_sampler   r   r   r   r   h  s  






"
"0
r   )r   r   )r{   )Tr{   ) r   r   loggingrv   r   pathlibr   typingr   r   r   r   r   r.   	getLoggerr,   r5   r   r   r4   floatboolr:   r@   rq   rz   r   r   r   dictr   r   r   r   r   r   <module>   s   
&LS$'
 )	


 2	

