o
    lQiz                     @  s0  d dl mZ d dlZd dlZd dlmZ d dlZedZed Zed Z	ed d d	 d
 
 Zed 
 Zed 
 Zed 
 Zed 
 Zed 
 Ze	d 
 Ze	d 
 Ze	d 
 ZdZdddddZd;dd Zd<d%d&Zd=d+d,Zd>d/d0Zd?d@d3d4ZdAd6d7ZdBd8d9Zed:kre  dS dS )C    )annotationsN)Pathz/home/ubuntu/transcriptsdata
final_dataphase1_incrementalsegment_map_v1z**z	*.parquetzrecover_v2_consolidated.parquetzvideo_queue.csv.gzzyoutube_video_metadata_all.csvz video_tts_classification_all.csvz"video_tts_classification_final.csvzvideo_rollup_final.parquetz&video_rollup_final_kept_subset.parquetz*video_rollup_gemini_refined_strict.parquet)enhitetaknmlgupabnormrasa  language-learning|language learning|language lesson|language lessons|learn english|learn hindi|learn tamil|learn telugu|learn kannada|learn malayalam|learn gujarati|learn punjabi|learn bengali|learn assamese|learn odia|phrasebook|phrases|daily use sentences|spoken english|spoken hindi|spoken tamil|spoken telugu|spoken kannada|spoken malayalam|spoken gujarati|spoken punjabi|spoken bengali|spoken odia|translationz-(^|[^a-z])(pakistan|pakistani|urdu)([^a-z]|$)zzoom meeting|group meeting|team meeting|meeting recording|weekly meeting|committee meeting|board meeting|panel discussion|roundtable|group discussion|discussion session|question answer|q\s*&\s*a|q and a|multi speaker|multilingualzbstartup interview|startup interview podcast|founder interview|co-founder interview|startup podcast)language_learningpakistan_urdumeeting_zoom_groupstartup_interviewpathr   payloaddictreturnNonec                 C  s   |  tj|dddd  d S )N   Tindent	sort_keys
)
write_textjsondumps)r   r    r%   &scripts/build_final_cleaned_dataset.py
write_json+   s   r'   conduckdb.DuckDBPyConnectionquerystrc                 C  s<   |  |}| }|d u ri S dd |jD }tt||S )Nc                 S  s   g | ]}|d  qS )r   r%   ).0dr%   r%   r&   
<listcomp>4   s    z!fetchone_dict.<locals>.<listcomp>)executefetchonedescriptionr   zip)r(   r*   relrowcolsr%   r%   r&   fetchone_dict/   s   
r6   	numerator
int | Nonedenominatorfloatc                 C  s   | r|sdS t d|  | dS )Ng        g      Y@   )round)r7   r9   r%   r%   r&   pct8   s   r=   itemstuple[str, ...]c                 C  s   d dd | D S )Nz, c                 s  s    | ]	}d | d V  qdS )'Nr%   )r,   itemr%   r%   r&   	<genexpr>?   s    zsql_list.<locals>.<genexpr>)join)r>   r%   r%   r&   sql_list>   s   rD    aliasc                 C  s`   | r|  dnd}d| d| d| d| d| d| d	| d
| d| d| d| d| dS )N.rE   z
        CASE
            WHEN z#lid_consensus = false AND COALESCE(z8lid_agree_count, 0) < 2 THEN 'dispose'
            WHEN z@conformer_multi_ctc_normalized IS NOT NULL
                 AND zEconformer_multi_ctc_normalized < 0.3 THEN 'dispose'
            WHEN z:duration_s < 1.0 THEN 'dispose'
            WHEN COALESCE(z/lid_agree_count, 0) >= 3
                 AND (z?conformer_multi_ctc_normalized >= 0.7
                      OR z>conformer_multi_ctc_normalized IS NULL)
                 AND (z5gemini_quality_score >= 0.5
                      OR z2gemini_quality_score = 0
                      OR z3gemini_quality_score IS NULL)
                 AND zHduration_s >= 2.0 THEN 'golden'
            ELSE 'redo'
        END
    r%   )rF   prefixr%   r%   r&   bucket_caseB   s4   	
rI   columnc                 C  s   d|  dS )Nz;trim(regexp_replace(regexp_replace(regexp_replace(coalesce(zW, ''), '\\[[^\\]]+\\]', '', 'g'), '\\[UNK\\]', '', 'gi'), '\\[INAUDIBLE\\]', '', 'gi'))r%   )rJ   r%   r%   r&   clean_text_exprV   s   rK   c                  C  s  t jddd t d } | jddd t }|d |d |d |d|   d t }tt}|d	t	 d
 |dt
 d |dt d
 |dt d
 |dt d
 |dt d |d |dt dt  d |d |d |d |d |d| d| d d}t D ]\}}|d| d| d | d! qd"d#d$ tD }|d%| d& |d'| d( |d) |d* td+}|d,| d-| d.| d/ |d0t d1   d2 |d3t d4   d2 |d5t d6   d2 |d7| d8| d9t d:   d2 |d;t d<   d= |d>t d?   d= t|d@t d!}	t|d@t d!}
t|d@t
 d!}t|dA}t|dB| dC}t|dD}t|dEt d?   d!}t|dF}t|dG| dH}t|dI}tt dJtt | dKttdLddJdMdNtt dddOi |	dPt|	dQ|	dRii |
dPt|
dQ|
dRii |dPt|dQ|dRi|dSi |t|dT|dUt|dV|dUdWi |t|dX|dY|dZdM|d[dM d\i |||dPt|dQ|dRid]d^d_d`dadbdcdd}tt de | ttj |dKddf d S )gNT)parentsexist_ok
duckdb_tmpzSET threads = 8zSET memory_limit = '24GB'z$SET preserve_insertion_order = falsezSET temp_directory = 'r@   z
        CREATE OR REPLACE VIEW selected_videos AS
        SELECT DISTINCT video_id, recommended_action
        FROM read_csv_auto('z', header=true)
    zm
        CREATE OR REPLACE VIEW strict_videos AS
        SELECT DISTINCT video_id
        FROM read_parquet('z')
    z
        CREATE OR REPLACE VIEW queue_videos AS
        SELECT video_id, language AS queue_language
        FROM read_csv_auto('a  
        CREATE OR REPLACE VIEW youtube_meta AS
        SELECT
            video_id,
            regexp_extract(lower(coalesce(default_audio_language, '')), '^([a-z]+)', 1) AS youtube_audio_language,
            regexp_extract(lower(coalesce(default_language, '')), '^([a-z]+)', 1) AS youtube_default_language,
            channel_id,
            channel_title,
            title,
            description,
            tags,
            lower(coalesce(channel_title, '')) AS channel_title_lc,
            lower(coalesce(title, '')) AS title_lc,
            lower(coalesce(description, '')) AS description_lc,
            lower(coalesce(tags, '')) AS tags_lc
        FROM read_csv_auto('a  
        CREATE OR REPLACE VIEW classifier_meta AS
        SELECT
            video_id,
            lower(coalesce(likely_content_type, '')) AS likely_content_type_lc,
            lower(coalesce(risk_signals, '')) AS risk_signals_lc,
            lower(coalesce(hard_reject_reasons, '')) AS hard_reject_reasons_lc,
            lower(coalesce(short_rationale, '')) AS short_rationale_lc
        FROM read_csv_auto('a  
        CREATE OR REPLACE VIEW strict_segments_raw AS
        SELECT
            s.video_id,
            sv.recommended_action,
            coalesce(q.queue_language, s.queue_language) AS queue_language,
            s.segment_file,
            s.parent_segment_file,
            s.is_split_segment,
            s.split_index_from_id,
            s.speaker_id,
            s.original_start_ms,
            s.original_end_ms,
            s.trimmed_start_ms,
            s.trimmed_end_ms,
            s.leading_pad_ms,
            s.trailing_pad_ms,
            s.expected_language_hint,
            s.tx_detected_language,
            coalesce(nullif(s.tx_detected_language, ''), nullif(s.expected_language_hint, ''), coalesce(q.queue_language, s.queue_language)) AS gemini_lang,
            s.lang_mismatch_flag,
            s.transcription,
            s.tagged,
            s.num_unk,
            s.num_inaudible,
            s.num_event_tags,
            s.text_length_per_sec,
            s.tx_quality_score,
            s.asr_eligible,
            s.tts_clean_eligible,
            s.tts_expressive_eligible,
            s.validation_source,
            s.has_validation,
            s.duration_s,
            s.provisional_bucket
        FROM read_parquet('z', hive_partitioning=true, union_by_name=true) s
        JOIN strict_videos st USING (video_id)
        JOIN selected_videos sv USING (video_id)
        LEFT JOIN queue_videos q USING (video_id)
    z
        CREATE OR REPLACE TEMP TABLE strict_missing_segments_v1 AS
        SELECT video_id, segment_file
        FROM strict_segments_raw
        WHERE NOT has_validation
    a  
        CREATE OR REPLACE TEMP TABLE recover_v2_dedup AS
        WITH ranked AS (
            SELECT
                *,
                ROW_NUMBER() OVER (
                    PARTITION BY video_id, segment_file
                    ORDER BY
                        CASE WHEN conformer_multi_ctc_normalized IS NULL THEN 1 ELSE 0 END ASC,
                        conformer_multi_ctc_normalized DESC NULLS LAST,
                        mms_confidence DESC NULLS LAST
                ) AS rn
            FROM read_parquet('zX')
        )
        SELECT
            video_id,
            segment_file,
            zD AS provisional_bucket
        FROM ranked
        WHERE rn = 1
    a  
        CREATE OR REPLACE TEMP TABLE recover_v2_matched AS
        SELECT
            m.video_id,
            m.segment_file,
            v.provisional_bucket
        FROM strict_missing_segments_v1 m
        JOIN recover_v2_dedup v USING (video_id, segment_file)
    a  
        CREATE OR REPLACE VIEW strict_segments_final AS
        SELECT
            s.*,
            CASE
                WHEN s.has_validation THEN s.validation_source
                WHEN r.segment_file IS NOT NULL THEN 'recover_v2'
                ELSE 'missing'
            END AS final_validation_source,
            (s.has_validation OR r.segment_file IS NOT NULL) AS final_has_validation,
            CASE
                WHEN s.has_validation THEN s.provisional_bucket
                ELSE r.provisional_bucket
            END AS final_bucket
        FROM strict_segments_raw s
        LEFT JOIN recover_v2_matched r USING (video_id, segment_file)
    z
        CREATE OR REPLACE VIEW strict_video_lang_counts AS
        SELECT
            video_id,
            gemini_lang,
            count(*) AS segments
        FROM strict_segments_final
        GROUP BY video_id, gemini_lang
    z
        CREATE OR REPLACE VIEW strict_video_lang_ranked AS
        SELECT
            *,
            row_number() OVER (PARTITION BY video_id ORDER BY segments DESC, gemini_lang) AS rn
        FROM strict_video_lang_counts
    a#  
        CREATE OR REPLACE VIEW strict_video_profile AS
        WITH totals AS (
            SELECT
                s.video_id,
                any_value(s.recommended_action) AS recommended_action,
                any_value(s.queue_language) AS queue_language,
                any_value(y.youtube_audio_language) AS youtube_audio_language,
                any_value(y.youtube_default_language) AS youtube_default_language,
                any_value(y.channel_id) AS channel_id,
                any_value(y.channel_title) AS channel_title,
                any_value(y.title) AS title,
                count(*) AS total_segments,
                count(DISTINCT s.gemini_lang) FILTER (WHERE s.gemini_lang <> '') AS distinct_detected_languages,
                count(*) FILTER (WHERE s.gemini_lang NOT IN (a'  ) AND s.gemini_lang <> '') AS foreign_segments
            FROM strict_segments_final s
            LEFT JOIN youtube_meta y USING (video_id)
            GROUP BY s.video_id
        ),
        dominant AS (
            SELECT
                video_id,
                gemini_lang AS dominant_gemini_language,
                segments AS dominant_gemini_segments
            FROM strict_video_lang_ranked
            WHERE rn = 1
        )
        SELECT
            t.*,
            d.dominant_gemini_language,
            d.dominant_gemini_segments,
            round(100.0 * t.foreign_segments / t.total_segments, 6) AS foreign_share_pct,
            round(100.0 * d.dominant_gemini_segments / t.total_segments, 6) AS dominant_share_pct,
            CASE
                WHEN t.youtube_audio_language IN (a  )
                     AND t.youtube_audio_language = d.dominant_gemini_language
                THEN t.youtube_audio_language
                ELSE t.queue_language
            END AS corrected_language
        FROM totals t
        JOIN dominant d USING (video_id)
    a?  coalesce(c.likely_content_type_lc,'') || ' ' || coalesce(c.risk_signals_lc,'') || ' ' || coalesce(c.hard_reject_reasons_lc,'') || ' ' || coalesce(c.short_rationale_lc,'') || ' ' || coalesce(y.channel_title_lc,'') || ' ' || coalesce(y.title_lc,'') || ' ' || coalesce(y.description_lc,'') || ' ' || coalesce(y.tags_lc,'')z)
            CREATE OR REPLACE VIEW flag_z AS
            SELECT DISTINCT s.video_id
            FROM strict_videos s
            LEFT JOIN classifier_meta c USING (video_id)
            LEFT JOIN youtube_meta y USING (video_id)
            WHERE regexp_matches(z, 'z')
        z UNION ALL c                 s  s     | ]}d | d| V  qdS )zSELECT video_id, 'z' AS family FROM flag_Nr%   )r,   familyr%   r%   r&   rB   D  s    
zmain.<locals>.<genexpr>z@
        CREATE OR REPLACE VIEW approved_theme_flags AS
        z
    a/  
        CREATE OR REPLACE VIEW final_video_filters AS
        SELECT
            p.*,
            p.foreign_share_pct >= 5.0 AS drop_foreign_share_ge5,
            p.youtube_audio_language <> ''
                AND p.youtube_audio_language <> 'und'
                AND p.youtube_audio_language NOT IN (a  ) AS drop_audio_non_target,
            p.youtube_audio_language = 'und'
                AND NOT (p.distinct_detected_languages <= 3 AND p.foreign_segments = 0) AS drop_und_unsound,
            EXISTS (SELECT 1 FROM flag_language_learning f WHERE f.video_id = p.video_id) AS drop_language_learning,
            EXISTS (SELECT 1 FROM flag_pakistan_urdu f WHERE f.video_id = p.video_id) AS drop_pakistan_urdu,
            EXISTS (SELECT 1 FROM flag_meeting_zoom_group f WHERE f.video_id = p.video_id) AS drop_meeting_zoom_group,
            EXISTS (SELECT 1 FROM flag_startup_interview f WHERE f.video_id = p.video_id) AS drop_startup_interview
        FROM strict_video_profile p
    a  
        CREATE OR REPLACE VIEW final_video_decisions AS
        SELECT
            *,
            (
                drop_language_learning
                OR drop_pakistan_urdu
                OR drop_meeting_zoom_group
                OR drop_startup_interview
            ) AS drop_approved_theme_family,
            (
                drop_foreign_share_ge5
                OR drop_audio_non_target
                OR drop_und_unsound
                OR drop_language_learning
                OR drop_pakistan_urdu
                OR drop_meeting_zoom_group
                OR drop_startup_interview
            ) AS drop_any
        FROM final_video_filters
    z
        CREATE OR REPLACE VIEW kept_videos_after_video_filters AS
        SELECT *
        FROM final_video_decisions
        WHERE NOT drop_any
    zs.transcriptionz
        CREATE OR REPLACE VIEW cleaned_segment_candidates AS
        SELECT
            s.*,
            k.corrected_language,
            zL AS clean_text,
            CASE
                WHEN s.gemini_lang NOT IN (z) AND s.gemini_lang <> '' THEN 'foreign_segment'
                WHEN s.queue_language = 'ta' AND k.corrected_language = 'te' AND s.gemini_lang = 'ta' THEN 'ta_to_te_residual'
                WHEN (
                    aZ   = ''
                    OR coalesce(s.num_unk, 0) > 0
                    OR coalesce(s.num_inaudible, 0) > 0
                    OR regexp_matches(coalesce(s.transcription, ''), '\\[UNK\\]', 'i')
                    OR regexp_matches(coalesce(s.transcription, ''), '\\[INAUDIBLE\\]', 'i')
                    OR regexp_matches(coalesce(s.transcription, ''), '\\[NO_SPEECH\\]', 'i')
                ) THEN 'blank_unk_or_inaudible'
                ELSE 'kept'
            END AS segment_decision
        FROM strict_segments_final s
        JOIN kept_videos_after_video_filters k USING (video_id)
    a.  
        COPY (
            SELECT
                video_id,
                recommended_action,
                queue_language,
                youtube_audio_language,
                youtube_default_language,
                dominant_gemini_language,
                corrected_language,
                total_segments,
                distinct_detected_languages,
                foreign_segments,
                foreign_share_pct,
                drop_foreign_share_ge5,
                drop_audio_non_target,
                drop_und_unsound,
                drop_language_learning,
                drop_pakistan_urdu,
                drop_meeting_zoom_group,
                drop_startup_interview
            FROM final_video_decisions
            WHERE drop_any
            ORDER BY video_id
        ) TO 'z!final_cleaned_excluded_videos.csvz' (HEADER, DELIMITER ',')
    a  
        COPY (
            SELECT
                video_id,
                recommended_action,
                queue_language,
                youtube_audio_language,
                youtube_default_language,
                dominant_gemini_language,
                corrected_language,
                total_segments,
                distinct_detected_languages,
                foreign_segments,
                foreign_share_pct
            FROM kept_videos_after_video_filters
            ORDER BY video_id
        ) TO 'zfinal_cleaned_keep_videos.csvz
        COPY (
            SELECT
                segment_decision,
                count(*) AS segments
            FROM cleaned_segment_candidates
            GROUP BY segment_decision
            ORDER BY segments DESC, segment_decision
        ) TO 'z)final_cleaned_segment_removal_summary.csva7  
        COPY (
            SELECT
                corrected_language,
                gemini_lang,
                count(*) AS segments,
                count(DISTINCT video_id) AS videos
            FROM cleaned_segment_candidates
            WHERE segment_decision = 'kept'
              AND gemini_lang IN (z+)
              AND corrected_language IN (z)
              AND gemini_lang <> corrected_language
            GROUP BY corrected_language, gemini_lang
            ORDER BY segments DESC, corrected_language, gemini_lang
        ) TO 'z1final_cleaned_remaining_target_mismatch_pairs.csva  
        COPY (
            SELECT
                video_id,
                recommended_action,
                queue_language,
                corrected_language,
                segment_file,
                speaker_id,
                parent_segment_file,
                is_split_segment,
                split_index_from_id,
                original_start_ms,
                original_end_ms,
                trimmed_start_ms,
                trimmed_end_ms,
                leading_pad_ms,
                trailing_pad_ms,
                expected_language_hint,
                tx_detected_language,
                gemini_lang,
                transcription,
                tagged,
                clean_text,
                num_unk,
                num_inaudible,
                num_event_tags,
                text_length_per_sec,
                tx_quality_score,
                asr_eligible,
                tts_clean_eligible,
                tts_expressive_eligible,
                lang_mismatch_flag,
                duration_s,
                final_validation_source,
                final_has_validation,
                final_bucket
            FROM cleaned_segment_candidates
            WHERE segment_decision = 'kept'
            ORDER BY video_id, segment_file
        ) TO 'zfinal_cleaned_segments.parquetz)' (FORMAT PARQUET, COMPRESSION ZSTD)
    a  
        COPY (
            SELECT
                video_id,
                any_value(recommended_action) AS recommended_action,
                any_value(queue_language) AS queue_language,
                any_value(corrected_language) AS corrected_language,
                any_value(gemini_lang) FILTER (WHERE gemini_lang = corrected_language) AS matched_language_example,
                count(*) AS total_segments,
                count(*) FILTER (WHERE final_has_validation) AS final_validated_segments,
                count(*) FILTER (WHERE NOT final_has_validation) AS final_missing_segments,
                count(*) FILTER (WHERE final_bucket = 'golden') AS golden_segments,
                count(*) FILTER (WHERE final_bucket = 'redo') AS redo_segments,
                count(*) FILTER (WHERE final_bucket = 'dispose') AS dispose_segments,
                count(*) FILTER (WHERE gemini_lang <> corrected_language) AS target_language_mismatch_segments,
                count(*) FILTER (WHERE lang_mismatch_flag) AS lang_mismatch_flag_segments
            FROM cleaned_segment_candidates
            WHERE segment_decision = 'kept'
            GROUP BY video_id
            ORDER BY video_id
        ) TO 'z"final_cleaned_video_rollup.parqueta  
        SELECT
            count(*) AS videos,
            sum(total_segments) AS total_segments,
            sum(final_validated_segments) AS final_validated_segments,
            sum(final_missing_segments) AS final_missing_segments
        FROM read_parquet('a5  
        SELECT
            count(*) AS strict_videos,
            count(*) FILTER (WHERE drop_foreign_share_ge5) AS drop_foreign_share_ge5_videos,
            count(*) FILTER (WHERE drop_audio_non_target) AS drop_audio_non_target_videos,
            count(*) FILTER (WHERE drop_und_unsound) AS drop_und_unsound_videos,
            count(*) FILTER (WHERE drop_language_learning) AS drop_language_learning_videos,
            count(*) FILTER (WHERE drop_pakistan_urdu) AS drop_pakistan_urdu_videos,
            count(*) FILTER (WHERE drop_meeting_zoom_group) AS drop_meeting_zoom_group_videos,
            count(*) FILTER (WHERE drop_startup_interview) AS drop_startup_interview_videos,
            count(*) FILTER (WHERE drop_approved_theme_family) AS drop_approved_theme_family_videos,
            count(*) FILTER (WHERE drop_any) AS drop_any_videos,
            count(*) FILTER (WHERE NOT drop_any) AS kept_after_video_filters_videos,
            sum(total_segments) FILTER (WHERE NOT drop_any) AS kept_after_video_filters_segments
        FROM final_video_decisions
        zG
        SELECT
            count(*) FILTER (WHERE gemini_lang NOT IN (z) AND gemini_lang <> '') AS strict_foreign_segments,
            count(*) FILTER (WHERE lang_mismatch_flag) AS strict_lang_mismatch_flag_segments
        FROM strict_segments_final
        a  
        SELECT
            count(*) AS candidate_segments,
            count(*) FILTER (WHERE segment_decision = 'kept') AS kept_segments,
            count(*) FILTER (WHERE segment_decision = 'foreign_segment') AS removed_foreign_segments,
            count(*) FILTER (WHERE segment_decision = 'ta_to_te_residual') AS removed_ta_to_te_residual_segments,
            count(*) FILTER (WHERE segment_decision = 'blank_unk_or_inaudible') AS removed_blank_unk_or_inaudible_segments
        FROM cleaned_segment_candidates
        a^  
        SELECT
            count(*) AS videos,
            sum(total_segments) AS total_segments,
            sum(final_validated_segments) AS final_validated_segments,
            sum(final_missing_segments) AS final_missing_segments,
            sum(golden_segments) AS golden_segments,
            sum(redo_segments) AS redo_segments,
            sum(dispose_segments) AS dispose_segments,
            sum(target_language_mismatch_segments) AS remaining_target_language_mismatch_segments,
            sum(lang_mismatch_flag_segments) AS remaining_lang_mismatch_flag_segments,
            count(*) FILTER (WHERE target_language_mismatch_segments > 0) AS videos_with_remaining_target_language_mismatch_segments,
            count(*) FILTER (WHERE lang_mismatch_flag_segments > 0) AS videos_with_remaining_lang_mismatch_flag_segments
        FROM read_parquet('a  
        SELECT
            count(*) FILTER (WHERE final_validation_source = 'historical') AS historical_segments,
            count(*) FILTER (WHERE final_validation_source = 'recover') AS recover_v1_segments,
            count(*) FILTER (WHERE final_validation_source = 'recover_v2') AS recover_v2_segments,
            count(*) FILTER (WHERE final_validation_source = 'missing') AS missing_segments
        FROM cleaned_segment_candidates
        WHERE segment_decision = 'kept'
        ze
        SELECT
            count(*) FILTER (WHERE segment_decision = 'kept' AND gemini_lang NOT IN (aH  ) AND gemini_lang <> '') AS remaining_foreign_segments,
            count(*) FILTER (WHERE segment_decision = 'kept' AND queue_language = 'ta' AND corrected_language = 'te' AND gemini_lang = 'ta') AS remaining_ta_to_te_residual_segments,
            count(*) FILTER (WHERE segment_decision = 'kept' AND (coalesce(num_unk, 0) > 0 OR regexp_matches(coalesce(transcription, ''), '\\[UNK\\]', 'i'))) AS remaining_unk_segments,
            count(*) FILTER (WHERE segment_decision = 'kept' AND (coalesce(num_inaudible, 0) > 0 OR regexp_matches(coalesce(transcription, ''), '\\[INAUDIBLE\\]', 'i'))) AS remaining_inaudible_segments,
            count(*) FILTER (WHERE segment_decision = 'kept' AND regexp_matches(coalesce(transcription, ''), '\\[NO_SPEECH\\]', 'i')) AS remaining_no_speech_segments
        FROM cleaned_segment_candidates
        a  
        SELECT
            (SELECT count(*) FROM kept_videos_after_video_filters) AS videos_after_video_filters,
            (SELECT count(DISTINCT video_id) FROM cleaned_segment_candidates WHERE segment_decision = 'kept') AS videos_after_segment_filters
           r   g      @r   )max_distinct_detected_languagesrequired_foreign_segments)target_languages foreign_share_drop_threshold_pct,drop_known_non_target_youtube_audio_languageund_keep_ruleapproved_theme_familiesdrop_ta_to_te_residual_segments!drop_blank_unk_inaudible_segmentscoverage_pctfinal_validated_segmentstotal_segments)all_transcribedselected_set
strict_setdrop_any_videosstrict_videoskept_after_video_filters_videos)drop_any_pct_of_strict&kept_after_video_filters_pct_of_strictkept_segmentscandidate_segmentsvideos_after_video_filtersvideos_after_segment_filters)$kept_segment_pct_after_video_filtersvideos_removed_to_zero_segmentsz,final_data/final_cleaned_excluded_videos.csvz(final_data/final_cleaned_keep_videos.csvz4final_data/final_cleaned_segment_removal_summary.csvz<final_data/final_cleaned_remaining_target_mismatch_pairs.csvz-final_data/final_cleaned_video_rollup.parquetz)final_data/final_cleaned_segments.parquet)excluded_videos_csvkeep_videos_csvsegment_removal_summary_csv#remaining_target_mismatch_pairs_csvvideo_rollup_parquetsegments_parquet)generated_at_epoch_s	elapsed_srule_config	baselinesvideo_filter_countssegment_filter_countsfinal_cleanedoutputszfinal_cleaned_summary.jsonr   )!	FINAL_DIRmkdirduckdbconnectr/   as_posixtimerD   TARGET_LANGSSELECTED_VIDEO_SETVIDEO_ROLLUP_STRICTVIDEO_QUEUEYOUTUBE_METACLASSIFICATION_ALLSEGMENT_MAP_GLOB
RECOVER_V2rI   APPROVED_THEME_PATTERNSr>   rC   rK   r6   VIDEO_ROLLUP_ALLVIDEO_ROLLUP_SELECTEDr<   listkeysr=   getr'   printr#   r$   )temp_dirr(   started
target_sql	text_exprrO   pattern
flag_union
clean_textbaseline_allbaseline_selectedbaseline_strictru   strict_foreign_summarycandidate_summaryfinal_rollup_summaryfinal_source_breakdownfinal_sanityempty_video_dropsummaryr%   r%   r&   mainb   s  








#

(


	

	"
+














	



(
*

	





Pr   __main__)r   r   r   r   r   r   )r(   r)   r*   r+   r   r   )r7   r8   r9   r8   r   r:   )r>   r?   r   r+   )rE   )rF   r+   r   r+   )rJ   r+   r   r+   )r   r   )
__future__r   r#   r~   pathlibr   r{   ROOTDATA_DIRry   r}   r   r   r   r   r   r   r   r   r   r   r   r'   r6   r=   rD   rI   rK   r   __name__r%   r%   r%   r&   <module>   sL    


	


     
