o
    q^iz	                    @   s  d Z ddlZddlZddlZddlZddlZddlmZ ddlm	Z	m
Z
mZmZmZmZ ddlZddlZedZdee fddZd	ede
eef fd
dZG dd deZG dd deZdLdedede
eef fddZdMdededede	eee
f fddZdedefddZdd  Zdedefd!d"Zda dNd$d%Z!dee fd&d'Z"dee fd(d)Z#dede	ee
eef f fd*d+Z$d,edefd-d.Z%d/edefd0d1Z&	2	dOd3eded4ed5e'd6e
eef de	ee
eef f fd7d8Z(dPd,ed4eded5e'dee f
d9d:Z)	;	2	dQded<ed5e'd6e
eef de	ee
eef f f
d=d>Z*		dRdeded4ed?ed@edAe'dBedCedDed5e'dEe+dFedGede
eef fdHdIZ,deded4ed?edBedCedDed5e'dEe+de
eef fdJdKZ-dS )Su  Audio download and preprocessing with robust error handling.

=== v7.0 OPTIMIZATION: Download-Time Resample ===
Uses ffmpeg directly to create both 16kHz processing audio AND original
quality audio in a single pass, eliminating Python resampling overhead.

Previous: yt-dlp → temp.wav → torchaudio load → Python resample → save both
Now:      yt-dlp → ffmpeg pipe → dual output (16kHz + original) in one pass

Saves ~10-15s per video by avoiding torchaudio resampling.
    N)Path)TupleDictAnyOptionalListIterablezFastPipelineV6.Downloadreturnc                   C   s   t jdS )a	  
    Get proxy URL for yt-dlp from environment variable.

    Set YTDLP_PROXY environment variable with your proxy URL, e.g.:
        export YTDLP_PROXY="http://user:pass@proxy.example.com:7777"

    Returns:
        Proxy URL string or None if not configured.
    YTDLP_PROXY)osenvironget r   r   N/home/ubuntu/.cursor/worktrees/maya3__SSH__216.81.248.184_/nmo/src/download.pyget_ytdlp_proxy   s   
r   	file_pathc              
   C   s   z;t tdr%ztt| }|j|j|j|j dW W S  ty$   Y nw tt| \}}|jd }|||| dW S  tyQ } z
t	d|  d| d}~ww )aL  
    Get audio file info (sample_rate, num_frames, duration).
    
    Compatible with all torchaudio versions by using torchaudio.load 
    and inferring metadata from the returned tensor.
    
    Args:
        file_path: Path to audio file
        
    Returns:
        Dict with 'sample_rate', 'num_frames', 'duration' keys
    info)sample_rate
num_framesduration   zCould not get audio info for : N)
hasattr
torchaudior   strr   r   	ExceptionloadshapeRuntimeError)r   r   waveformr   r   er   r   r   get_audio_info)   s*   



r!   c                   @      e Zd ZdZdS )VideoValidationErrorz*Raised when video fails validation checks.N__name__
__module____qualname____doc__r   r   r   r   r#   Q       r#   c                   @   r"   )DownloadErrorz)Raised when download fails after retries.Nr$   r   r   r   r   r*   V   r)   r*         $@video_idtimeoutc              
   C   s   d|  }dddd|d}t  }|r||d< zJt|:}|j|dd}|rM|dd|  |d	d
|dp9g |dd|ddW  d   W S W d   W i S 1 sYw   Y  W i S  ty~ } ztd|  d|  W Y d}~i S d}~ww )a  
    Fetch YouTube metadata (chapters, title, duration) without downloading.

    === v3: Proxy support for rate limit avoidance ===
    Set YTDLP_PROXY env var for high-volume metadata fetching.

    Args:
        video_id: YouTube video ID (11 chars)
        timeout: Max seconds to wait for metadata

    Returns:
        Dict with chapters, title, duration, etc. Empty dict on failure.
     https://www.youtube.com/watch?v=TF)quietno_warningsextract_flatskip_downloadsocket_timeoutproxydownloadtitlezVideo r   r   chaptersuploaderUnknownupload_date)r7   r   r8   r9   r;   Nz%Could not fetch YouTube metadata for r   )r   yt_dlp	YoutubeDLextract_infor   r   loggerdebug)r,   r-   youtube_urlydl_optsr4   ydlr   r    r   r   r   fetch_youtube_metadata[   s>   
	

 rD         >@      @	video_urlmin_durationmax_durationc              
   C   s  t d|   ddddd}t }|r||d< zt|}|j| dd}|du r7ddi fW  d   W S |d	d
krP|dsPdd|fW  d   W S |dd}||k rndd| d| d|fW  d   W S ||krdd| d| d|fW  d   W S |ddkrdd|fW  d   W S |dddkrt d|d d |drdd|fW  d   W S |dd}t d| d| d dd |fW  d   W S 1 sw   Y  W dS  tjj	y: }	 zBt
|	}
d!|
v rdd"i fW  Y d}	~	S d#|
v rddi fW  Y d}	~	S d$|
v r(dd%i fW  Y d}	~	S dd&|
 i fW  Y d}	~	S d}	~	w tyV }	 zdd't
|	 i fW  Y d}	~	S d}	~	ww )(a  
    Validate YouTube video before processing.
    
    Checks:
    - Video exists and is accessible
    - Video duration is within acceptable range
    - Video has audio track
    - Video is not age-restricted or private
    
    Args:
        video_url: YouTube URL to validate
        min_duration: Minimum video duration in seconds (default: 30s)
        max_duration: Maximum video duration in seconds (default: 4 hours)
    
    Returns:
        (is_valid, message, info_dict)
    u   🔍 Validating video: TF)r/   r0   r1   r2   r4   r5   Nz#Video info extraction returned NoneacodecnoneformatszVideo has no audio trackr   r   zVideo too short: zs (min: s)zVideo too long: zs (max: availabilityprivatezVideo is private	age_limitu*   ⚠️ Video is age-restricted (age_limit=)is_livezLive streams are not supportedr7   r:   u   ✅ Video validated: '' (OKzVideo unavailablezVideo unavailable or deletedzPrivate videozSign inz2Video requires sign-in (age-restricted or private)zDownload error: zValidation failed: )r?   r   r   r<   r=   r>   r   warningutilsr*   r   r   )rG   rH   rI   rB   r4   rC   r   r   r7   r    	error_msgr   r   r   validate_video   sf   
("


 rX   total_durationc                 C   s   d}d}d}| |kr|S |S )u  
    Calculate dynamic intro skip based on video duration.
    
    Strategy (when no chapter info available):
    - Video ≤ 30 min (1800s): Skip 180s (3 min) - ~10% intro ratio
    - Video > 30 min: Skip 300s (5 min) - smaller % for longer content
    
    This is ONLY used when no intro chapter is detected.
    Chapter-based detection always takes priority.
    
    Args:
        total_duration: Total video duration in seconds
        
    Returns:
        Intro skip duration in seconds
      g     f@g     r@r   )rY   THRESHOLD_30MIN
SKIP_SHORT	SKIP_LONGr   r   r   get_dynamic_intro_skip   s   r^   c              	      s  g d}g d}d}d}d}d}| rt | dkr| d }	|	dd}
|	dd}|}d	}|
}td
|
 d|dd t | dkr| d }|dd}| }|dd}d}|D ]
}||v rc|} nqY|r|}d| }|
 d| }td| d| d|dd t | dkr| d }|dd  |dd}|d|}t fdd|D r|| dk r|| }td|d d|dd n+t|ddr|jdkrt|}|dkrdnd}d| }d}td |d!d"| d# ||||fS )$u  
    Detect intro/outro sections from YouTube chapters.

    === v3: SIMPLIFIED INTRO DETECTION ===

    === LOGIC ===
    1. If chapters available:
       - Always skip 1st chapter (assumed to be intro)
       - If 2nd chapter name matches intro keywords, skip it too
    2. If no chapters:
       - Video ≤30 min: skip 3 min (180s)
       - Video >30 min: skip 5 min (300s)

    Returns:
        (intro_seconds, outro_seconds, skip_reason, matched_title): Trim amounts + metadata
    )Uintrointroductionopeningadvertisementsponsoradpromo	sponsoredsponsorshipzbrought to youpromotional   परिचयu   भूमिकाu   शुरुआत   प्रस्तावनाu   विज्ञापन   प्रायोजकparichaybhumikashuruaat
prastavanavigyapanprayojaku   పరిచయంu   ప్రారంభంu   ప్రకటనu   స్పాన్సర్
parichayam
prarambham	prakatanau   ಪರಿಚಯu   ಆರಂಭu   ಜಾಹೀರಾತುu   ಪ್ರಾಯೋಜಕ	parichayaarambhajahiratuu   அறிமுகம்u   தொடக்கம்u   விளம்பரம்u   ஸ்பான்சர்arimukam	thodakkam
vilambaramu   പരിചയംu   ആമുഖംu   തുടക്കംu   പരസ്യംu   സ്പോൺസർrr   aamukham	thudakkamparasyamu   પરિચયu   આરંભu   જાહેરાતu   સ્પોન્સરrl   aarambhjaheratu   পরিচয়u   ভূমিকাu   শুরুu   বিজ্ঞাপনu   স্পন্সরporichoyrm   shurubiggaponri   rj   u   सुरुवातu   जाहिरातrk   rl   ro   suruvatjahiratu   ਜਾਣ-ਪਛਾਣu   ਸ਼ੁਰੂਆਤu   ਇਸ਼ਤਿਹਾਰu   ਸਪਾਂਸਰzjaan-pehchaanrn   ishtihar)outroendingcreditsrc   rd   re   endcard
conclusionclosing	subscribezlike and subscribeu   समापनu	   अंतu   सब्सक्राइब   ముగింపుu!   సబ్స్క్రైబ్r   u   முடிவுu   അവസാനംu   સમાપનu	   শেষ        Nr   r7    end_timefirst_chapteru      📖 Skipping 1st chapter: 'z' (0s - .1frM   r   zfirst_two_chapters:z + u,      📖 Also skipping 2nd chapter (keyword 'z'): 'rS   
start_timec                 3   s    | ]}| v V  qd S )Nr   ).0keywordchapter_titler   r   	<genexpr>`  s    z3detect_intro_outro_from_chapters.<locals>.<genexpr>,  u      📖 Chapter-based outro: 'auto_intro_skipTrZ   u   ≤30minz>30minzdynamic:u,      ⏱️ Dynamic intro skip (no chapters): .0fz	s (video rQ   )	lenr   r?   r   loweranygetattrintro_skip_secondsr^   )r8   rY   configintro_keywordsoutro_keywords
intro_skip
outro_skipskip_reasonmatched_titler   first_chapter_titlefirst_chapter_endsecond_chaptersecond_chapter_titlesecond_chapter_title_lowersecond_chapter_endmatched_keywordr   last_chapterchapter_startchapter_endduration_thresholdr   r   r    detect_intro_outro_from_chapters   s^   !
  
r   c                 C   s   t | dkrtdd | D r| S d| v r!| dd dd S d| v r1| dd d	d S d
| v rA| d
d d	d S d| v sId| v ri| d	d }|dd }d|v re|ddd }|S |}|S t|   dd S )z=Extract video ID from various URL formats (YouTube, R2, etc).   c                 s   s     | ]}|  p|d v V  qdS )z-_N)isalnum)r   cr   r   r   r   u  s    z#extract_video_id.<locals>.<genexpr>zwatch?v=r   &r   z	youtu.be/?z/shorts/r2.cloudflarestorage.comzs3.amazonaws.com/r   .N)r   allsplitrsplithashlibmd5encode	hexdigest)rG   pathfilenamer,   r   r   r   extract_video_idq  s    r   R2Clientc                 C   sZ   t du r+ddlm}m}m} | }tjdd}||j|j	|j
|j|d}||dda t S )	zNLazy-init the R2 client used for reading source videos (bucket_type='source').Nr   )r   get_r2_config_from_envR2ConfigR2_SOURCE_PREFIXr   )endpoint_urlaccess_key_idsecret_access_keybucketprefixsource)r   bucket_type)_R2_SOURCE_CLIENTsrc.r2_clientr   r   r   r   r   r   r   r   r   r   )r   r   r   r   basesource_prefixcfgr   r   r   _get_r2_source_client  s   r   c                 C   s   g }t jdd }|r|dd |dD  t| ddp!d}|r)|| |g d g }t }|D ]$}| }|sBd}n|	drI|n|d }||vr[|
| || q7|S )	z`
    Build a small set of candidate prefixes for source objects without listing the bucket.
    R2_SOURCE_PREFIXESr   c                 S      g | ]
}|  r|  qS r   strip)r   pr   r   r   
<listcomp>      z,_iter_r2_source_prefixes.<locals>.<listcomp>,r2_source_prefix)r   podcasts	podcasts/videosvideos/r   )r   r   r   r   extendr   r   appendsetendswithadd)r   prefixesenv_prefixes
cfg_prefix
normalizedseenr   p_normr   r   r   _iter_r2_source_prefixes  s(   


r   c                 C   s   g }t jdd }|r|dd |dD  t| dd}|rFt|tr5|dd |dD  nt|t	t
frF|d	d |D  |g d
 g }t }|D ]$}| }|s_d}n|drf|nd| }||vrx|| || qT|S )zOReturn candidate extensions to try for source objects (small list, no listing).R2_SOURCE_EXTENSIONSr   c                 S   r   r   r   r   r    r   r   r   r     r   z._iter_r2_source_extensions.<locals>.<listcomp>r   r2_source_extensionsNc                 S   r   r   r   r   r   r   r   r     r   c                 S   s$   g | ]}t | rt | qS r   )r   r   r   r   r   r   r     s   $ )r   .webm.mp4.mkv.m4a.mp3.opusr   )r   r   r   r   r   r   r   
isinstancer   listtupler   
startswithr   r   )r   extsenv_extscfg_extsr   r   r    e_normr   r   r   _iter_r2_source_extensions  s.   


r  c                 C   s   t |}dtt dtt dtt fdd}|t|g d}|t|g d}d}|D ] }|D ]}| |  | }	|	}||	}
|
rJ|	|
f    S q/q+td	|j d
|pU|  )a  
    Resolve a video_id to an object key in the R2 source bucket.

    We avoid bucket listing for performance/cost reasons. Instead we try a small
    set of likely prefixes/extensions via HEAD requests only.

    Returns:
        (remote_key, head_object_response)
    items	preferredr	   c                 S   sb   g }t  }|D ]}|| v r||vr|| || q| D ]}||vr.|| || q|S )z?Return items with preferred values moved to the front (stable).)r   r   r   )r  r  outr   r   itr   r   r   _prioritize  s   



z._resolve_r2_source_object.<locals>._prioritize)r   r   r   )r   r   r   r   r   r   r   NzR2 source not found: s3://r   )r   r   r   r   r  head_objectr*   r   )r,   r   clientr  r   r   last_keyr   extkeyheadr   r   r   _resolve_r2_source_object  s.   
"
r  urlc                 C   s   d| v pd| v od| v S )z'Check if URL is an R2/S3 presigned URL.r   zs3.zamazonaws.comr   )r  r   r   r   	is_r2_url  s   r  presigned_urlc                 C   s6   zddl m} || }|jd}|W S    |  Y S )z
    Extract clean R2 bucket/key path from a presigned URL.
    
    Example:
        Input: https://xxx.r2.cloudflarestorage.com/test/podcasts/VIDEO.webm?X-Amz-...
        Output: test/podcasts/VIDEO.webm
    r   )urlparser   )urllib.parser  r   lstrip)r  r  parsedr   r   r   r   extract_r2_path#  s   r     r2_url
output_dirmax_retriessupabase_metadatac           9      C   s	  t d|  t }|| d }|| d }|| d }	| r| rt d|  z<tt|\}
}tt|\}}t||d| d| |jd | d	|
jd | t|g ||t|d
dddfW S  ty } zt 	d| d W Y d}~nd}~ww d}t
|D ]}zGt d|d  d|  ddd| dddddt|	g
}tj|d
d
dd}|jdkrtd|jdd  |	 r|	 jd krW  nKtd! tjy   td"}t 	d#|d   Y q ty } z#|}t 	d|d  d$|  ||d k rtd%|  W Y d}~qd}~ww |	 s/td&| d'| ztt|	\}}|jd | }~W nw ty } zjt 	d(| d) z;d*d+d,d-d.d/d0t|	g}tj|d
d
d1}ddl}||j}t|d2i d3d}t|d4i gd d5d6}W n ty } zt 	d7|  d6}d}W Y d}~nd}~ww W Y d}~nd}~ww t d8| d9|d:d; d	}d	}d<}d}d| }|r|d=p|}|d>pg } |d?d@}!|dAr|dAddB n|}"| rf|!rft dCt|  dD g }#t| D ]:\}$}%|%dE|%dFd}&|$d t| k r>| |$d  dE| |$d  dF|"}'n|"}'|#|%d=dG|&|'dH qt|#|"|\}}}}t dI|d:dJ|  nJt dK |dkrt|}dL}t dM|dNd; n/t dO |dkrt|dPd
rt|dQd	d	krt t||dR }dS}t dM|dNd; t|dQd	dkr||j!krt dT|j!d:dU|d:dV |j!}dW}|}(t"|| |(dX })|)|( }*|dBk s|*dYk rd}(|dkr|ndZ}*d}d[}|j#}+dddt|	d\t|(d]t|*d^d_d`t|+ddt|d\t|(d]t|*d^d_dddaddbdct|g},t dd| de|dNdV ztj|,d
d
d
df}W n tj$y^ } ztdg|jdd  d}~ww | shtdh| sqtdizHt%t|}-t%t|}.|-d3 }/|.d3 }0t&|/|0 }1|1dRkrt 'dj|/dkdl|0dkd; tdm|1dkdn|/}2t do|/dkdp|1dqdV W n% ty     ty } zt 	dr|  |*}2W Y d}~nd}~ww z|	(  W n   Y d}3i }4zRddsl)m*}5 tt|\}}|5|+d, |}3t dt|3j- du|3j.dNdv ||3j/t0|3j1dt0|3j.d|3j2|3j-t0|3j3d%|3j4|3j5t0|3j6dwt0|3j7dwdx}4~W n ty[ } zt 	dy|  W Y d}~nd}~ww t | }6t dz|6d:d{|2dNd| |r{|d>pzg ng }7t8| }8t|i d}|d~d| d=|d|d|d|d|d|2dt|d>|7d5|+d|dt|d|8dd
dddd|4||4dd@ dfS )a  
    Download audio directly from R2 presigned URL using ffmpeg.
    
    === v2: R2 DIRECT DOWNLOAD ===
    For pre-cached videos in R2, we bypass yt-dlp entirely and use ffmpeg
    to download and convert in a single pass.
    
    Creates:
    - {video_id}_trimmed.wav (16kHz mono for processing)
    - {video_id}_original.flac (original quality mono for export)
    
    Returns:
        (audio_path, metadata): Path to processed audio and basic metadata
    u   ⚡ R2 direct download: _trimmed.wav_original.flac_raw.wav   ✅ Using cached audio: r.   zR2 Source: r   r   Tflacr2)r,   rA   r7   original_durationintro_skippedprocessed_durationr  r8   r   original_sample_rateoriginal_audio_pathoriginal_audio_preservedaudio_formatsource_typezCache validation failed: , re-downloadingNz   R2 download attempt r   ffmpeg-y-iz-vn-ac1z-fwavr   )capture_outputtextr-   r   zffmpeg failed: i  i'  z$Downloaded file too small or missingzDownload timed outz    R2 download timeout, attempt 	 failed:    zR2 download failed after  attempts: z)Could not get audio info via torchaudio: z, trying ffprobeffprobez-verrorz-show_entriesz"format=duration:stream=sample_ratez-ofjson)r1  r2  formatr   streamsr   逻  zffprobe also failed:       📊 Raw audio: Hz, r   srK   r7   r8   has_chaptersFsource_duration_min<   u!      📖 Using Supabase chapters (z
 chapters)secondsr   r   )r7   r   r   u"      📖 Chapter-based skip: intro=z
s, reason=u3      📊 No chapters in Supabase, using dynamic skipzdynamic:no_supabase_chaptersu      ⏱️ Dynamic skip: r   u2      ⚠️ No Supabase metadata, using dynamic skipr   r   皙?zdynamic:no_metadata#   ⚙️ Manual intro skip override: s (was: rM   manual_override
      i?B znone:too_short_or_unknown-ss-t-map0:a-ar-c:a-compression_level5'      ⚡ FFmpeg dual output: 16kHz WAV + zHz FLAC (skip: r1  r2  checkFFmpeg dual output failed: z"FFmpeg did not create trimmed filez#FFmpeg did not create original file"      ❌ DURATION MISMATCH! trimmed=.2fs, original=#Duration mismatch between outputs: s differenceu      ✅ Duration verified: s (diff=.3f&      ⚠️ Could not verify durations: analyze_spectral_quality      🔬 Spectral: z
 (rolloff=Hz)   claimed_sample_rateeffective_sample_ratedetected_nyquistrolloff_frequencyis_upsampledoriginal_format
confidenceis_suitable_for_ttsquality_issueshigh_freq_energy_ratiospectral_flatness   Spectral analysis skipped: u      ✅ R2 download complete: zs, zs audior,   rA   r"  r#  intro_skip_reasonintro_skip_matched_titler$  r  r%  r&  
source_urlr'  r(  r)  rg  )spectral_qualityaudio_native_sample_rateis_native_quality)9r?   r   timeexistsr   r   r   r   r   rU   range
subprocessrun
returncoder*   stderrstatst_sizeTimeoutExpiredsleepr8  loadsstdoutfloatr   intr   	enumerater   r   r^   r   minr   maxr   CalledProcessErrorr!   absr7  unlinksrc.spectral_analysisr^  squeezenumpyoriginal_format_guessrf  rd  roundre  rg  ri  rj  rk  rl  rm  r  )9r  r,   r  r   r  r  starttrimmed_fileoriginal_fileraw_fileproc_waveformproc_srorig_waveformorig_srr    
last_errorattemptffmpeg_downloadresultr   original_sractual_duration	probe_cmdprobe_resultr8  
probe_datae2r   r   r   r   video_titler8   r?  source_durationnormalized_chaptersich	start_secend_secr   r   trim_duration	target_srffmpeg_dualtrimmed_infooriginal_infotrimmed_durationoriginal_duration_checkduration_diffr$  rr  spectral_infor^  elapsedsb_chaptersr2_pathr   r   r   _download_from_r25  s  
	

"
 &



*


 





 
	

r  c                 C   s|  t d|   t }|| d }| r(| jdkr(t d|  |S dt|| d dddd	d	d
}t }|rB||d< t|D ]l}z>t	
|}	|	j| dd W d   n1 s`w   Y  | rt dt | dd| jd dd |W   S W qF ty }
 z t d|d  d|
  ||d k rtd|  W Y d}
~
qFd}
~
ww t d| d dS )z,Download video for visualization (720p MP4).u*   🎥 Downloading video for visualization: r   i@B u   ✅ Using cached video: z2bestvideo[height<=720]+bestaudio/best[height<=720]z.%(ext)smp4Tr  )r9  outtmplmerge_output_formatr/   r0   retriesfragment_retriesr4   r5   Nu   ✅ Video download: r   z
s | Size: g    .AMBzVideo download attempt r   r3  r4  u    ❌ Video download failed after z	 attempts)r?   r   ru  rv  r|  r}  r   r   rw  r<   r=   r>   r   rU   r  r7  )r  r  r,   r  r  output_filerB   r4   r  rC   r    r   r   r    download_video_for_visualization  sD   .
r  Tvalidatec           (      C   sp  t d| dd  d t }t| }t|j| }|jddd t|dd}|d	krt| st	|}	t
||\}
}tt|d
d}|	j|
|d}t d|	j d|
 d|ddd dd t||||||d\}}t | }||d< |	j|d< |
|d< |d|d< t d|dd|dddd ||fS t| rt d t| |||||d\}}t | }||d< t d |dd|dddd ||fS i }|rt| \}}}|std!| |s5t d" ddd#d$}zt|}|j| d#d%}W d   n	1 sw   Y  W n  ty4 } zt d&|  d'g d(}W Y d}~nd}~ww || d) }|| d* }t|d+d#}|| d, }| r;d}| r^|n| re|nd}|rt|std#}t d- |r;t d.|  ztt|\}}|jd/ |d0 k rt d1 |  |r| r|  n]|}|r| rztt|} | d2 }W n   Y t||d3| |d4d5|jd/ | d6|jd/ | t||d7g |||rt|nd|du|r t|d8r d9nd:d;fW S W n3 ty: } z&t d<| d= z|  |r*| r*|  W n   Y W Y d}~nd}~ww |d>d}!|rJ|d7g ng }"t|"|!|\}#}$}%}&|j dkrt|#|j krtt d?|j dd@|#ddA |j }#|rt!| |||||j"|#|$|!|||%|&dB}'nt#| |||||#|$|!||dC
}'t | }t dD|dd|'d dd ||'d< t||'fS )Eu;  
    Download audio from YouTube URL or R2 presigned URL and prepare for processing.

    === v7.0 OPTIMIZATION: Download-Time Resample ===
    Uses ffmpeg directly for dual-output in single pass:
    - 16kHz mono WAV for processing pipeline
    - Original quality mono WAV for high-quality export

    Previous: yt-dlp → temp.wav → torchaudio load → Python resample → save both
    Now:      yt-dlp → single ffmpeg → dual output (NO Python resampling!)

    === v2: R2 URL SUPPORT ===
    When video_url is an R2 presigned URL, we use ffmpeg directly instead of yt-dlp.

    === v3: Supabase Metadata ===
    When supabase_metadata is provided (from distributed queue), use it for:
    - Title, chapters, duration (no yt-dlp metadata fetch needed)
    - Chapter-based intro/outro skip

    Args:
        video_url: YouTube URL or R2 presigned URL
        config: Pipeline configuration
        validate: Whether to validate video before download (default: True)
        max_retries: Number of download retries (default: 3)
        supabase_metadata: Pre-fetched metadata from Supabase (chapters, title, etc.)

    Returns:
        (audio_path, metadata): Path to processed audio and video metadata

    Raises:
        VideoValidationError: If video fails validation
        DownloadError: If download fails after retries
    u   📥 Downloading: NP   z...T)parentsexist_okinput_sourceyoutuber!  r2_presign_ttl_seci  )expires_in_secu   📦 R2 source resolved: s3://r    (ContentLengthr   i   r   z MB))r  r,   r  r   r  r  download_wall_time_secr2_source_bucketr2_source_keyr2_source_size_bytesu   ✅ Download (R2): zs | Duration: r$  r   r>  u5   📦 R2 source detected, using ffmpeg direct downloadu   ✅ Download (R2 URL): zVideo validation failed: u   📋 Fetching video metadata...F)r/   r0   r1   r5   zCould not fetch metadata: r:   )r7   r8   r  r  preserve_original_audio_original.wavuF   ⚠️ Cached trimmed file exists but original missing, re-downloadingr  r   rG  u?   ⚠️ Cached file seems corrupted or too short, re-downloadingr   r.   r7   Cachedr   r8   z.flacr   r0  )r,   rA   r7   r"  r#  r$  r  r8   r   r%  r&  r'  r(  u#   ⚠️ Failed to load cached file: r*  r   rD  rE  rM   )rG   r,   r  r  r  r  r   r   r"  r  r   r   r   )
rG   r,   r  r  r   r   r   r"  r  r   u   ✅ Download: )$r?   r   ru  r   r   r  mkdirr   r  r   r  r  generate_presigned_get_urlr   r   r  rX   r#   r<   r=   r>   r   rU   rv  r   r   r   r   r  r!   r   r   r   !_download_with_ffmpeg_dual_outputr   _download_standard)(rG   r   r  r  r  r  r,   r  r  r	  
remote_keyr  ttlr  
audio_pathmetadatar  r   is_validmessageydl_info_optsrC   r    r  r  preserve_originallegacy_original_filecache_validactual_originalr   srr  	orig_infor"  r8   r   r   r   r   r  r   r   r   download_audio  s<  (
	
$

$

 





 r  r  r  r  r   r   r"  r   r   r   c           +      C   s  t d || d }dt|| d dddddd	gd
dgddd}t }|r.||d< d}t|	D ]}zlt d|d  d|	  t|}|j| dd}W d   n1 s[w   Y  | s|	| dD ]+}|j
dv r|j
dkrtjddt|d
dt|dgddd |  n||  nql| rW  n2td ty } z"|}t d|d  d|  ||	d k rtd|  W Y d}~q4d}~ww | std|	 d| ztt|}|d  }|d! }W n  ty } zt d"| d# d$}|}W Y d}~nd}~ww t d%| d&|d'd( d}zBd)d*lm} d)dl}|t|\}}||d) |}t d+|j d,|j d-|jd.d/ |jr`t d0d1|j  W n ty| } zt d2|  W Y d}~nd}~ww |d)kr|nd)}|d)kr|| n|}|| } ddt|d3t|d4t| d5d6d7t|d
ddt|d3t|d4t| d5d6d
dd8d9d:d;dt|g}!t d<| d=|d'd>|d'd? ztj|!dddd@}"W n tj y } zt !dA|j"  tdB| d}~ww | stdC| | stdD| zUtt|}#tt|}$|#d! }%|$d! }&t#|%|& }'|'dEkrWt !dF|%dGdH|&dGdI|'dGd? t !dJ tdK|'dGdL|%}(t dM|%dGdH|&dGdI|'dNd? W n% tyu     ty } zt dO|  | }(W Y d}~nd}~ww z|  W n   Y |	| dD ]})z|)  W q   Y qt dP|j$ dQ| dR|j$ dQ| dS	 i }*|dur||jt%|j&d)t%|jd)|j'|jt%|j(d|j)|jt%|j*dTt%|j+dTdU}*i dV|dWdX| dY|
r|
,dYdZndZd[|d\|d]|(d^t|d_|
r%|
,d_g ng d |d`|dat|db| dcdddd9dedfdgdhdi|*||*,djdk ||dlS )maq  
    === v7.2 OPTIMIZATION: Download-Time Dual Output with FLAC ===
    
    Single ffmpeg command creates both outputs:
    1. 16kHz mono WAV (for processing) - with intro/outro trim
    2. Original quality mono FLAC (for export) - LOSSLESS, ~50% smaller than WAV
    
    FLAC Benefits:
    - 100% lossless (bit-perfect audio quality)
    - ~50% smaller than WAV (saves storage costs)
    - Fast decode (CPU-efficient for training)
    - Compression level 5 (balanced speed vs size)
    
    Key insight: ffmpeg can output to multiple files from single input stream,
    encoding each to different formats in one pass.
    u9   ⚡ Using optimized dual-output download with FLAC (v7.2)r  bestaudio/bestz_raw.%(ext)sTFFmpegExtractAudior0  0r  preferredcodecpreferredqualityr.  r/  r  r9  r  r/   r0   postprocessorspostprocessor_argsr  r  r4   N   Download attempt r   r   r5   z_raw.*.wavr   r   r   r   r  r+  r-  r,  r1  rS  zDownloaded file not foundr3  r4  Download failed after r5  r   r   zCould not get audio info: z, using metadata durationr;  r<  r=  r   r>  r   r]  r_  z (effective=zHz, rolloff=r   r`  u      ⚠️ Quality: z, rn  rI  rJ  rK  rL  rM  rN  r   rO  rP  rQ  zHz FLAC (trim: zs-rM   rR  zFFmpeg failed: rT  z$FFmpeg did not create trimmed file: z%FFmpeg did not create original file: rC  rU  rV  rW  rZ  z>   This indicates FFmpeg trim was not applied to both outputs!rX  rY  u"      ✅ Duration verified: trimmed=r[  r\  u      ✅ Dual output complete: r  z	Hz WAV), zHz FLAC)ra  rb  r,   rA   r.   r7   r:   r"  r#  r$  r  r8   r%  r&  rq  r'  r(  audio_compressionlosslessr)  r  rr  rg  F)rs  rt  ro  rp  )-r?   r   r   r   rw  r<   r=   r>   rv  globsuffixrx  ry  r  renamer*   r   rU   ru  r  r!   r  r^  r   r   r  r  r  rd  rf  rk  joinr  r7  r{  r  namer  re  rg  ri  rj  rl  rm  r   )+rG   r,   r  r  r  r  r   r   r"  r  r   r   r   r  rB   r4   r  r  rC   dl_infoext_filer    
audio_infor  r  rr  r^  r   raw_waveform_r   r   r  
ffmpeg_cmdr  r  r  r  r  r  r$  tempr  r   r   r   r    s  






"


$
((






	
r  c
           *      C   s  || d }
|| d }|| d }t |dd}t |dd}|rDdt|
d	d	d
dddg|dkr7ddgndt|ddgddd}ndt|
d	d	d
dddgdt|jddgddd}d}t|D ]}zotd|d  d|  t|}|j| d	d W d   n1 sw   Y  |	 s|
| dD ]0}|jdv r|jdkrtjddt|dt|jddt|dg	d	d	d |  n||  nq|	 rW  n1W qc ty } z"|}td|d  d|  ||d k rtd |  W Y d}~qcd}~ww |	 std!| d"| ztt|\}}W n ty; } z|	 r0|  td#| d}~ww |}|jd | }|}d}|r||jkrtd$| d%|j  t|}ddlm} |j||jd&}||}|j}td'| d( |d)k rtd*|d+d, |	r|	d-g ng }t|||\}}}}|jdkr||jkrtd.|jd+d/|d+d0 |j}|}t|| }|dkrt|| nd} |dkr|jd |  n|jd }!|dks| dkr|dd||!f }|jd dkr|j dd	d1}|d    ! " }"|"d2k r$td3|"d4d5 t#t||| d}#|r|	 rzktt|\}$}%|$jd dkrP|$j dd	d1}$t||% }&|dkrat||% nd}'|dkro|$jd |' n|$jd }(|$dd|&|(f }$t#t||$|% t|}#td6|j d7|% d8|$jd |% d+d0 W n ty } ztd9|  W Y d}~nd}~ww |	 r|  |
| dD ]})z|)  W q   Y qi d:|d;d<| d=|	r|	d=d>nd>d?|d@|dA|jd | dBt|d-|dC|dD|r|n|dE|#dF|#dudG|	r(|	dGdHndHdI|	r3|	dInddJ|	r?|	dKd nd dL|	rL|	dLdHS dHS )Mz=Standard download without original preservation (16kHz only).z_temp.%(ext)sz	_temp.wavr  r  Foriginal_audio_sample_rater   r  Tr  r0  r  r  r.  r/  rM  r  r  Nr  r   r   r5   z_temp.*r  r  r+  r-  r,  r  r3  r4  r  r5  zAudio file corrupted: u#      🎵 Preserving original audio: u   Hz → )	orig_freqnew_frequ!      📊 Created processing copy: HzrH  u   ⚠️ Audio very short: r   r>  r8   rD  rE  rM   )dimkeepdimg-C6?u'   ⚠️ Audio appears to be silent (RMS=z.6frQ   u      ✅ Saved original quality: r  r=  u)      ⚠️ Failed to save original audio: r,   rA   r.   r7   r:   r"  r#  r$  r  r   r%  r&  r'  rJ   unknownabrchannelsaudio_channelsr9  )$r   r   r   rw  r?   r   r<   r=   r>   rv  r  r  rx  ry  r  r  r   rU   ru  r  r*   r   r   r   r  torchaudio.transforms
transformsResampler   r   r   r  meansqrtitemsave)*rG   r,   r  r  r   r   r   r"  r  r   	temp_filedownloaded_filer  r  original_sr_targetrB   r  r  rC   r  r    r   r  r  r  r&  T	resamplerr8   r  r#  intro_samplesoutro_samples
end_samplermsoriginal_audio_savedr  r  orig_intro_samplesorig_outro_samplesorig_end_sampler  r   r   r   r    sD  






"
"0


r  )r+   )rE   rF   )r	   r   )r  N)r  )Tr  N)NN).r(   ru  r   loggingr   rx  pathlibr   typingr   r   r   r   r   r   r   r<   	getLoggerr?   r   r   r!   r   r#   r*   r  rD   boolrX   r^   r   r   r   r   r   r  r  r  r  r  r  r  r  dictr  r  r   r   r   r   <module>   s    
( &.Q|
!"!3

  $L,

 |	


  	

