o
    i-                     @  sr  d Z ddlmZ ddlZddlZddlZddlZddlmZ ddl	m
Z
mZ ddlZddlmZmZ dZdZi d	d
dddddddddddddddddddddd d!d"d#d$d%d&d'd(d)d*d+d,d-d.d/d0Zd1Zd2Zd3ZdSd8d9ZdTd=d>ZG d?d@ d@eZe ZeG dAdB dBZeG dCdD dDZG dEdF dFZdUdIdJZdVdLdMZdWdQdRZ dS )Xaz  
OpenRouter-backed metadata classifier for TTS dataset filtering.

One video per request. The prompt is structured so the static rubric lives in
the first user message with a cache_control breakpoint, and per-video data is
injected in a second user message. This keeps the cached prefix identical
across all 500k requests so OpenRouter/Gemini implicit + explicit caching works.
    )annotationsN)	dataclass)AnyLiteral)	BaseModelFieldz-https://openrouter.ai/api/v1/chat/completionszgoogle/gemini-3-flash-preview1zFilm & Animation2zAutos & Vehicles10Music15zPets & Animals17Sports18zShort Movies19zTravel & Events20Gaming21Videoblogging22zPeople & Blogs23Comedy24Entertainment25zNews & Politics26zHowto & Style27	Education28zScience & Technology29zNonprofits & ActivismMoviesDocumentaryShortsShowsTrailers)3035424344uH  You rank YouTube videos for TTS training suitability.

TARGET: long-form spoken-word — podcasts, lectures, interviews, oral histories, audiobooks, explainers, clean narration.
REJECT: meetings, webinars, elevator pitches, demo days, noisy events, gaming, music, sports highlights, memes, short-form clips.

Judge ONLY from the metadata payload below. Do not use outside knowledge.

Scoring (0-100 each):
- tts_suitability_score: overall keep-worthiness for TTS data.
- spoken_word_score: speech-dominated vs music/visuals/noise.
- clean_speech_likelihood_score: metadata guess at clean, stable audio.
- single_speaker_likelihood_score: one voice at a time vs crosstalk.
- metadata_confidence_score: how confidently metadata supports the call.

Decision:
- KEEP: strong spoken-word fit, low noise risk.
- REVIEW: mixed signals or ambiguous metadata.
- DROP: clear evidence of unsuitable content.

Hard negatives: meetings, webinars, launchpads, demo days, pitch sessions, ceremonies, Q&A, music videos, songs, remixes, gaming streams, gameplay, highlights, reactions, compilations, memes, shorts, live stage/crowd audio.
Hard positives: lecture series, university talks, archive interviews, structured podcasts, audiobooks, clean explainers, narrated history/stories.

Rules:
- Duration alone is not enough. Category alone is not enough.
- People & Blogs can be any action depending on title/channel/tags.
- Music category is almost always DROP unless metadata clearly shows a spoken interview.
- Meeting-like language (agenda, townhall, launchpad, elevator pitch, Q&A) is a strong negative.
- Scores >= 80 → usually KEEP. 55-79 → usually REVIEW. < 55 → usually DROP.
- If content is clearly a meeting/pitch/webinar/noisy event, set hard_reject=true.

Output: keep JSON compact. Lists max 4 items. short_rationale under 30 words. No info not in the payload.znYou are a strict metadata-based gatekeeper for TTS training data. Think silently. Return only the JSON object.z\Classify this video's metadata for TTS dataset filtering. Return the structured JSON result.	raw_valuestrreturn	list[str]c                 C  sL   | sg S zt | }W n t jy   g  Y S w t|tr$dd |D S g S )Nc                 S  s   g | ]}t |qS  )r+   .0itemr.   r.   4/home/ubuntu/transcripts/src/video_tts_classifier.py
<listcomp>W   s    z#parse_json_list.<locals>.<listcomp>)jsonloadsJSONDecodeError
isinstancelist)r*   parsedr.   r.   r2   parse_json_listP   s   r:   rowdict[str, str]dict[str, Any]c                 C  s   t | ddpd}t| dd}t| dd}| d}|r%t|nd }t | ddp/dt | ddp8dt | ddpAdt | d	dpJdd d
 |d d |t|dt | ddpbd|t | ddpld|dS )Ncategory_id tagstopic_categoriesduration_secondsvideo_idchannel_titletitledescriptioni      Unknowndefault_audio_language
definition)rC   rD   rE   rF   r@   r>   category_labelrI   rB   rJ   rA   )r+   getr:   intYOUTUBE_CATEGORY_LABELS)r;   r>   r@   rA   durrB   r.   r.   r2   normalize_metadata_rowZ   s"   


rP   c                   @  s  e Zd ZU eddZded< eddZded< edd	d
Zded< edd	d
Zded< edd	d
Z	ded< edd	d
Z
ded< edd	d
Zded< eddZded< eedZded< eedZded< eedZded< eddZded< eddZded< dS )VideoTtsClassificationzFinal action for this video.)rF   z!Literal['keep', 'review', 'drop']recommended_actionz_Short label: podcast, lecture, sports talk, meeting, audiobook, explainer, event, unknown, etc.r+   likely_content_typer   d   )gelerM   tts_suitability_scorespoken_word_scoreclean_speech_likelihood_scoresingle_speaker_likelihood_scoremetadata_confidence_scorez0True when metadata strongly indicates exclusion.boolhard_reject)default_factoryr-   hard_reject_reasonspositive_signalsrisk_signalsz%Under 30 words explaining the action.short_rationalez*True if audio-level check is still needed.needs_audio_validationN)__name__
__module____qualname__r   rR   __annotations__rS   rW   rX   rY   rZ   r[   r]   r8   r_   r`   ra   rb   rc   r.   r.   r.   r2   rQ   o   s0   
 rQ   c                   @  sJ   e Zd ZU dZded< dZded< dZded< dZded< dZded	< d
S )ClassificationUsager   rM   prompt_tokenscompletion_tokenscached_tokenstotal_tokensFr\   	cache_hitN)	rd   re   rf   ri   rg   rj   rk   rl   rm   r.   r.   r.   r2   rh      s   
 rh   c                   @  s.   e Zd ZU ded< ded< ded< ded< d	S )
ClassificationResultr+   rC   rQ   classificationrh   usagefloat
latency_msN)rd   re   rf   rg   r.   r.   r.   r2   rn      s
   
 rn   c                   @  s:   e Zd ZdZedddddddZdddZdddZdS )OpenRouterClassifierz>Single-video-per-request classifier with stable cached prefix.g?   low)modeltemperaturemax_retriesreasoning_effortapi_keyr+   rv   rw   rq   rx   rM   ry   c                C  s4   || _ || _|| _|| _|| _d| dd| _d S )NzBearer zapplication/json)AuthorizationzContent-Type)rz   rv   rw   rx   ry   _headers)selfrz   rv   rw   rx   ry   r.   r.   r2   __init__   s   	zOpenRouterClassifier.__init__metadata_rowr<   r,   r=   c                 C  st   t jt|dd}dt ttdi   d }| jdtddt	d	 t
 | dd|dg| jd
ddid}|S )NF)ensure_asciiz5

Return ONLY a JSON object with these exact fields:

propertiesz
Field types: recommended_action=(keep|review|drop), all scores=int 0-100, hard_reject=bool, lists=string[], short_rationale=string, needs_audio_validation=bool.system)rolecontentuserz

iX  typejson_object)rv   messagesrw   
max_tokensresponse_format)r4   dumpsrP   r8   _SCHEMArL   keysrv   SYSTEM_PROMPTSTATIC_TASK_PREFIXCACHED_RUBRICrw   )r}   r   
video_jsonschema_instructionbodyr.   r.   r2   _build_body   s2   z OpenRouterClassifier._build_bodyclienthttpx.AsyncClientrn   c                   s  t |ddp	d}| |}g }t| jd D ]}zt }|jt| j	|dI d H }t | d }	W n8 t
jym }
 z+|d|
  || jk r[tt|I d H  W Y d }
~
qtd| dd	| |
d }
~
ww |jd
ksx|jdkr||j  || jk rtt|I d H  qn|jdkrtd|j d| d|jd d  | }|di gd di dd}|std| t|}t|}t|}t||||	d  S td| dd	| )NrC   r?      )headersr4   i  ztransport: zTransport failed z: z | i  i     zHTTP z for i,  choicesr   messager   zEmpty content for )rC   ro   rp   rr   zAll retries exhausted for )r+   rL   r   rangerx   time	monotonicpostOPENROUTER_API_URLr|   httpx	HTTPErrorappendasynciosleep_retry_delayRuntimeErrorjoinstatus_codetextr4   _parse_jsonrQ   model_validate_extract_usagern   )r}   r   r   rC   r   failuresattemptt0resprr   excpayloadr   r9   ro   rp   r.   r.   r2   classify   sf   

	

 "

zOpenRouterClassifier.classifyN)
rz   r+   rv   r+   rw   rq   rx   rM   ry   r+   )r   r<   r,   r=   )r   r<   r   r   r,   rn   )rd   re   rf   __doc__DEFAULT_OPENROUTER_MODELr~   r   r   r.   r.   r.   r2   rs      s    
!rs   r   r   c                 C  s   t | tr| S t | trddd | D } t | ts$tdt| d|  }|dr?|d}|dr?|d	d  	 }|
d
|d}}|dkrZ||krZ|||d  }dd l}|dd|}t|S )Nr?   c                 s  s2    | ]}t |trt|d |nt|V  qdS )r   N)r7   dictr+   rL   r/   r.   r.   r2   	<genexpr>  s
     
z_parse_json.<locals>.<genexpr>zbad typer   z````r4      {}r   z,\s*([}\]])z\1)r7   r   r8   r   r+   r4   r6   strip
startswithlstripfindrfindresubr5   )r   sijr   r.   r.   r2   r     s&   







r   r   c              
   C  s   |  di pi }t| ddpd}t| ddpd}| di p#i }t| ddp,d}t|||t| d|| p<d|dkdS )	Nrp   ri   r   rj   prompt_tokens_detailsrk   rl   )ri   rj   rk   rl   rm   )rL   rM   rh   )r   rp   ptctdetailscachedr.   r.   r2   r   %  s   r   r   rM   rq   c                 C  s   t d|  dtdd S )N      g        g333333?)minrandomuniform)r   r.   r.   r2   r   4  s   r   )r*   r+   r,   r-   )r;   r<   r,   r=   )r   r   r,   r=   )r   r=   r,   rh   )r   rM   r,   rq   )!r   
__future__r   r   r4   r   r   dataclassesr   typingr   r   r   pydanticr   r   r   r   rN   r   r   r   r:   rP   rQ   model_json_schemar   rh   rn   rs   r   r   r   r.   r.   r.   r2   <module>   s    !



q
