o
    ӮiE                     @  s  d Z ddlmZ ddlZddlZddlZddlZddlZddlZddl	m
Z
mZ ddlmZmZ ddlmZ ddlmZ ddlmZmZmZ ddlZdd	lmZ d
ZdZdZdZg dZg dZe
ddG dd dZe
G dd dZ dcddZ!ddddZ"ded d!Z#dfd%d&Z$dgd)d*Z%dhd,d-Z&did2d3Z'djd=d>Z(dkdBdCZ)dldFdGZ*dmdIdJZ+dndQdRZ,dodTdUZ-dpdYdZZ.dqd\d]Z/dqd^d_Z0ddd`daZ1e2dbkre1  dS dS )rz
Fetch classification-oriented YouTube metadata for known video IDs.

Example:
  python scripts/fetch_youtube_video_metadata.py       --input-csv data/transcription_video_ids.csv       --output-csv data/youtube_metadata_sample_10.csv       --limit 10
    )annotationsN)	dataclassfield)datetimetimezone)chain)Path)AnyIteratorSequence)load_dotenvz,https://www.googleapis.com/youtube/v3/videosz#snippet,contentDetails,topicDetailszitems(id,snippet(channelId,channelTitle,title,description,tags,categoryId,defaultLanguage,defaultAudioLanguage),contentDetails(duration,definition),topicDetails(topicCategories))2   )YOUTUBE_API_KEYGOOGLE_API_KEY
GEMINI_KEYGEMINI_PROJECT2GEMINI_PROJECT3GEMINI_PROJECT4)video_id
channel_idchannel_titletitledescriptiontagscategory_iddefault_languagedefault_audio_languagedurationduration_seconds
definitiontopic_categoriescrawl_timestamp_utcfetch_statuserror_detailraw_jsonT)frozenc                   @  s   e Zd ZU ded< ded< dS )ApiKeystrenv_namevalueN)__name__
__module____qualname____annotations__ r.   r.   'scripts/fetch_youtube_video_metadata.pyr&   C   s   
 r&   c                   @  sH   e Zd ZU ded< ded< dZded< eejdZded	< dddZ	dS )ApiKeyStater&   api_keyfloatmin_interval_seconds        next_request_at)default_factoryzasyncio.LocklockreturnNonec              	     s   | j 4 I d H / t }| j| }|dkr"t|I d H  t }t| j|| j | _W d   I d H  d S 1 I d H s=w   Y  d S Nr   )r7   time	monotonicr5   asynciosleepmaxr3   )selfnowwait_secondsr.   r.   r/   wait_for_slotP   s   
.zApiKeyState.wait_for_slotNr8   r9   )
r*   r+   r,   r-   r5   r   r=   Lockr7   rC   r.   r.   r.   r/   r0   I   s   
 r0   r8   argparse.Namespacec                  C  s   t jdd} | jdddd | jddd	d | jd
dg dd | jdtddd | jdtddd | jddg dd | jdtddd | jdtddd | jdtddd | jd td!d"d | jd#d$d%d& | jd'd$d(d& |  S ))Nz-Fetch YouTube metadata for a set of video IDs)r   z--input-csvz data/transcription_video_ids.csvzQCSV containing video IDs (column: video_id/videoID, or first column if no header))defaulthelpz--output-csvzdata/youtube_video_metadata.csvz'Path for normalized metadata CSV outputz
--video-idappendz8Explicit video ID to fetch; can be passed multiple times)actionrG   rH   z--limitr   zDMax number of video IDs to fetch after input ordering (0 = no limit))typerG   rH   z--batch-sizer   z;Number of video IDs per API request (YouTube API max is 50)z--api-key-envzBEnv var name to read an API key from; can be passed multiple timesz--sleep-secondsr4   z8Legacy global sleep knob; prefer --per-key-delay-secondsz--per-key-delay-secondsg?z9Minimum delay between request starts for the same API keyz--max-concurrency    z*Maximum in-flight requests across all keysz--progress-everyd   z(Print progress every N completed batchesz--overwrite
store_truez)Overwrite output CSV if it already exists)rJ   rH   z--no-raw-jsonz6Skip the raw_json column payload to reduce output size)argparseArgumentParseradd_argumentintr2   
parse_args)parserr.   r.   r/   rS   Z   s   rS   r9   c                   C  s   t tt jjd  d S )Nz.env)r   r   __file__resolveparentr.   r.   r.   r/   load_env   s   rX   explicit_env_namesSequence[str]list[ApiKey]c                 C  s~   | rt | nt}g }t }|D ]}t|d }|r||v r q|t||d || q|s=d	|}t
d| |S )N )r(   r)   z, z%No API keys found. Checked env vars: )listDEFAULT_API_KEY_ENV_NAMESsetosgetenvstriprI   r&   addjoin
SystemExit)rY   	env_nameskeysseen_valuesr(   r)   joinedr.   r.   r/   discover_api_keys   s   
rj   	input_csvr   	list[str]c           
   	   C  s   |   std|  g }t }| jdddd]}t|}zt|}W n ty6   | Y W  d    S w t|}|d urB|}nt	|g|}d}|D ] }|t
|krUqL||  }	|	ra|	|v rbqL||	 ||	 qLW d    |S 1 sxw   Y  |S )NzInput CSV not found: rutf-8r\   encodingnewliner   )existsre   r_   opencsvreadernextStopIterationdetect_video_id_columnr   lenrb   rI   rc   )
rk   idsseenhandleru   	first_rowheader_indexrowsrowr   r.   r.   r/   read_video_ids   s<   


r   r   
int | Nonec                 C  s2   dd | D }dD ]}||v r| |  S q	d S )Nc                 S  s   g | ]}|   qS r.   )rb   lower).0r)   r.   r.   r/   
<listcomp>   s    z*detect_video_id_column.<locals>.<listcomp>)r   videoidid)index)r   
normalizednamer.   r.   r/   rx      s   rx   argsc                 C  sN   dd | j D }|r|ntt| j}| jdkr|d | j }|s%td|S )Nc                 S  s   g | ]
}|  r|  qS r.   )rb   )r   r   r.   r.   r/   r      s    z$select_video_ids.<locals>.<listcomp>r   zNo video IDs selected)r   r   r   rk   limitre   )r   explicit_ids
source_idsr.   r.   r/   select_video_ids   s   
r   valuessizerR   Iterator[list[str]]c                 c  s2    t dt| |D ]}t| |||  V  q	d S r:   )rangery   r]   )r   r   startr.   r.   r/   chunked   s   r   clienthttpx.AsyncClient	video_ids
key_statesSequence[ApiKeyState]preferred_key_indexlegacy_sleep_secondsr2   list[dict[str, Any]]c                  s<  t td|d}g }tt|D ]}||| t|  }| I d H  z| jti |d|jj	idI d H }	W n t
jyX }
 z||jj d|
  W Y d }
~
qd }
~
ww |	jdkrv|dkrjt|I d H  |	 }|dg   S ||jj d	|	j d
t|	  |dkrt|I d H  qtdd| )N,)partfieldsr   key)paramsz: transport error:    r   itemsz:  zAll API keys failed for batch: z | )YOUTUBE_PARTSYOUTUBE_FIELDSrd   r   ry   rC   getYOUTUBE_VIDEOS_ENDPOINTr1   r)   httpx	HTTPErrorrI   r(   status_coder=   r>   jsonextract_error_messageRuntimeError)r   r   r   r   r   r   failuresoffset	key_stateresponseexcpayloadr.   r.   r/   fetch_video_batch   s>   	
r   r   httpx.Responser'   c                 C  sp   z|   }W n ty   | jd d  Y S w |di }t|tr-|d}|r-t|S t j|ddd d S )Nr   errormessageTensure_ascii)r   
ValueErrortextr   
isinstancedictr'   dumps)r   r   r   r   r.   r.   r/   r     s   

r   r   r   c                 C  s   t | }d|v r| dS |S )Nz2youtube.api.v3.V3DataVideoService.List are blockedz
The configured Google API keys exist, but YouTube Data API access is blocked for them. Add a key with YouTube Data API v3 enabled and no API restriction blocking `youtube.v3.videos.list`, then rerun.)r'   )r   r   r.   r.   r/   build_fetch_failure_message#  s
   r   r   c                 C  s   | r|  ds	d S d}d}d}| dd  D ]H}|dkrd}q| r'||7 }q|s*qt|}d}|dkr;||d	 7 }q|d
krF||d 7 }q|dkrU|||rPdnd 7 }q|dkr]||7 }q|S )NPr   r\   F   TTDiQ Hi  M<   i ' S)
startswithisdigitrR   )r   total_secondsnumberin_time_sectioncharr)   r.   r.   r/   parse_iso8601_duration_seconds/  s4   r   itemdict[str, Any]r!   include_raw_jsonbooldict[str, str]c                C  s  |  di pi }|  di pi }|  di pi }t| ddp d}t|}i dt|  ddp0ddt| d	dp;dd
t| ddpFddt| ddpQddt| ddp\ddtj| dg phg dddt| ddpuddt| ddpddt| ddpdd|d|d u rdnt|dt| ddpddtj| dg pg ddd|ddddd|rtj| ddd S dS )!NsnippetcontentDetailstopicDetailsr   r\   r   r   r   	channelIdr   channelTitler   r   r   Tr   r   
categoryIdr   defaultLanguager   defaultAudioLanguager   r   r    topicCategoriesr!   r"   okr#   r$   )r   	sort_keys)r   r'   r   r   r   )r   r!   r   r   content_detailstopic_detailsr   r   r.   r.   r/   normalize_video_itemL  sT   	
r   r   c                C  sj   i d| ddddddddddd	dd
dddddddddddd|ddddddS )Nr   r   r\   r   r   r   r   z[]r   r   r   r   r   r   r    r!   r"   	not_foundr#   z)Video not returned by YouTube videos.listr$   r.   )r   r!   r.   r.   r/   build_missing_rowl  sF   	
r   batch_indexvideo_id_batch tuple[int, list[dict[str, str]]]c                   s   t |||| t| |dI d H }dd |D }g }	|D ]}
||
}|d u r1|	t|
|d q|	t|||d q| |	fS )N)r   r   c                 S  s"   i | ]}t |d dpd|qS )r   r\   )r'   r   )r   r   r.   r.   r/   
<dictcomp>  s   " z-fetch_and_normalize_batch.<locals>.<dictcomp>)r!   )r!   r   )r   ry   r   rI   r   r   )r   r   r   r   r!   r   r   r   by_video_idr   r   r   r.   r.   r/   fetch_and_normalize_batch  s.   	

r   tuple[Path, int, int, int]c                   s$  t   t j}| r jstd| dtt jdt	} j|kr1t
d| dt	 d t j} fdd|D t }tt||}t|}ttj |jjd	d	d
 ||j d}| ro|  d}d}	d}
zz|jdddd}tj|td}|  i }d}t !t j"dt#j$dd4 I d H d$ fddfddt%|D }d}t &|D ]t}|I d H \}}|||< |d7 } j'dkr| j' dks|t|krt
d|ddt|dd t|| |dd|dd!	 ||v r5|(|}|D ]}|)| |d7 }|d" d#kr&|	d7 }	q|
d7 }
q|d7 }||v sqW d   I d H  n1 I d H sGw   Y  W d    n	1 sWw   Y  |*| |||	|
fW W | rr|  S S  t+y } ztt,||d }~ww | r|  w w )%NzOutput already exists: z! (pass --overwrite to replace it)r   zAdjusted batch size to z (YouTube API max is )c                   s    g | ]}t |t jd dqS )r4   )r1   r3   )r0   r?   per_key_delay_seconds)r   r1   )r   r.   r/   r     s    z#_run_fetch_impl.<locals>.<listcomp>T)parentsexist_okz.tmpr   wrn   r\   ro   )
fieldnamesg      >@timeoutr   rR   r   rZ   r8   r   c                   sf   4 I d H  t | | j t jddI d H W  d   I d H  S 1 I d H s,w   Y  d S )Nr4   )r   r   )r   no_raw_jsonr?   sleep_seconds)r   r   )r   r   r!   r   	semaphorer.   r/   guarded_fetch  s   

0z&_run_fetch_impl.<locals>.guarded_fetchc                   s    g | ]\}}t  ||qS r.   )r=   create_task)r   r   r   )r   r.   r/   r     s    z
Completed r   /z
 batches (z videos)r"   r   )r   rR   r   rZ   r8   r   )-rX   r   
output_csvrr   	overwritere   minr?   
batch_sizeMAX_BATCH_SIZEprintrj   api_key_envr   r]   r   ry   r   rA   r   utc	isoformatrW   mkdir	with_namer   unlinkrs   rt   
DictWriterOUTPUT_FIELDSwriteheaderr=   	Semaphoremax_concurrencyr   AsyncClient	enumerateas_completedprogress_everypopwriterowreplacer   r   )r   r   r   api_keysselected_video_idsvideo_id_batchesselected_video_counttemp_output_csv	row_countok_countmissing_countr|   writerpending_rowsnext_batch_to_writetaskscompleted_batchestaskr   r   
ready_rowsr   r   r.   )r   r   r!   r   r   r   r/   _run_fetch_impl  s   








*
2



r"  c              
     sB   zt jt| ddI d H W S  t jy  } ztd|d }~ww )Ni  r   zFetch exceeded the 500-second cap before completion. Lower per-key delay, increase concurrency, or reduce output size and rerun.)r=   wait_forr"  TimeoutErrorre   )r   r   r.   r.   r/   	run_fetch  s   r%  c                  C  sR   t tt \} }}}td|dd|   td|d td|d d S )NzWrote r   z	 rows to zResolved videos: zMissing videos: )r=   runr%  rS   r   )r   r  r  r  r.   r.   r/   main  s   r'  __main__)r8   rF   rD   )rY   rZ   r8   r[   )rk   r   r8   rl   )r   rZ   r8   r   )r   rF   r8   rl   )r   rZ   r   rR   r8   r   )r   r   r   rZ   r   r   r   rR   r   r2   r8   r   )r   r   r8   r'   )r   r   r8   r'   )r   r'   r8   r   )r   r   r!   r'   r   r   r8   r   )r   r'   r!   r'   r8   r   )r   rR   r   rZ   r   r   r   r   r!   r'   r   r   r   r2   r8   r   )r   rF   r8   r   )3__doc__
__future__r   r=   rO   rt   r   r`   r;   dataclassesr   r   r   r   	itertoolsr   pathlibr   typingr	   r
   r   r   dotenvr   r   r   r   r   r^   r  r&   r0   rS   rX   rj   r   rx   r   r   r   r   r   r   r   r   r   r"  r%  r'  r*   r.   r.   r.   r/   <module>   s\    	

I







)



 

!
W
	
