from __future__ import annotations

import json
import time
from pathlib import Path

import duckdb


# All paths are rooted at the transcripts workspace on this analysis host.
ROOT = Path("/home/ubuntu/transcripts")
DATA_DIR = ROOT / "data"
OUTPUT_DIR = ROOT / "final_data"

# Input datasets read directly by the DuckDB queries in main().
SEGMENT_MAP_GLOB = (DATA_DIR / "phase1_incremental" / "segment_map_v1" / "**" / "*.parquet").as_posix()
FINAL_VIDEO_SELECTION = (DATA_DIR / "video_tts_classification_final.csv").as_posix()
VIDEO_QUEUE = (DATA_DIR / "video_queue.csv.gz").as_posix()
YOUTUBE_METADATA = (DATA_DIR / "youtube_video_metadata_all.csv").as_posix()

# Languages the selection is supposed to contain; any other detected language
# is flagged as "unexpected" downstream.
TARGET_LANGS = ("en", "hi", "te", "ta", "kn", "ml", "gu", "pa", "bn", "or", "mr", "as")
# Language codes grouped by writing system. Used by script_family_expr() to
# label a detected language, and by expected_script_char_count_expr() to pick
# the Unicode block(s) a transcript in that language should contain.
LATIN_LANGS = (
    "en", "id", "fr", "it", "es", "tl", "ms", "zu", "pt", "de", "pl", "vi", "so", "sw",
    "nl", "sv", "no", "da", "fi", "ca", "eu", "hr", "cs", "ro", "sk", "sl", "hu", "et",
    "is", "af", "ceb", "fil", "mg", "oc", "rm", "br", "ia", "lb",
)
DEVANAGARI_LANGS = ("hi", "mr", "ne", "sa", "kok", "bho", "doi", "mai", "pi", "bh", "anp", "new", "gom")
TELUGU_LANGS = ("te",)
TAMIL_LANGS = ("ta",)
KANNADA_LANGS = ("kn", "tcy")
MALAYALAM_LANGS = ("ml",)
GUJARATI_LANGS = ("gu",)
GURMUKHI_LANGS = ("pa", "pnb")
BENGALI_LANGS = ("bn", "as")
ODIA_LANGS = ("or",)
ARABIC_SCRIPT_LANGS = ("ar", "ur", "fa", "ps", "sd", "ckb", "ks", "bal")
CYRILLIC_LANGS = ("ru", "sr", "mn", "uk", "kk", "ky", "bg")
GREEK_LANGS = ("el",)
HEBREW_LANGS = ("he", "yi")
SINHALA_LANGS = ("si",)
ETHIOPIC_LANGS = ("am", "ti")
THAI_LANGS = ("th",)
KHMER_LANGS = ("km",)
BURMESE_LANGS = ("my",)
GEORGIAN_LANGS = ("ka",)
TIBETAN_LANGS = ("bo", "dz")
HANGUL_LANGS = ("ko",)
HAN_LANGS = ("zh", "yue")
JAPANESE_LANGS = ("ja",)


def write_json(path: Path, payload: dict) -> None:
    """Write *payload* as pretty-printed, key-sorted JSON to *path*.

    A trailing newline is appended so the file ends like a POSIX text file.
    The encoding is pinned to UTF-8 so the output does not depend on the
    process locale (json.dumps escapes non-ASCII by default, but pinning it
    keeps the write safe if that ever changes).
    """
    path.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\n", encoding="utf-8")


def fetchone_dict(con: duckdb.DuckDBPyConnection, query: str) -> dict:
    """Execute *query* and return its first row as a column-name -> value dict.

    Returns an empty dict when the query produces no rows.
    """
    cursor = con.execute(query)
    first_row = cursor.fetchone()
    if first_row is None:
        return {}
    names = (meta[0] for meta in cursor.description)
    return {name: value for name, value in zip(names, first_row)}


def sql_list(items: tuple[str, ...]) -> str:
    """Render *items* as a comma-separated list of single-quoted SQL literals."""
    quoted = [f"'{item}'" for item in items]
    return ", ".join(quoted)


def normalize_lang_expr(column: str) -> str:
    """SQL expression: lowercase *column* (NULL -> '') and keep only the
    leading run of ASCII letters, e.g. a BCP-47 tag like 'en-US' -> 'en'."""
    guarded = f"lower(coalesce({column}, ''))"
    return f"regexp_extract({guarded}, '^([a-z]+)', 1)"


def clean_text_expr(column: str) -> str:
    """SQL expression that strips bracketed tags from *column* (NULL -> '')
    and trims surrounding whitespace.

    Three passes: any '[...]' tag, then '[UNK]' and '[INAUDIBLE]'
    case-insensitively. Backslashes are doubled so the regex survives the SQL
    string-literal layer.
    """
    expr = f"coalesce({column}, '')"
    passes = (
        (r"\\[[^\\]]+\\]", "g"),
        (r"\\[UNK\\]", "gi"),
        (r"\\[INAUDIBLE\\]", "gi"),
    )
    for pattern, flags in passes:
        expr = f"regexp_replace({expr}, '{pattern}', '', '{flags}')"
    return f"trim({expr})"


def script_family_expr(column: str) -> str:
    """Return a SQL CASE expression mapping the language code in *column* to
    the writing system that language is normally written in.

    Empty codes map to 'missing'; codes with no configured script map to
    'other'.
    """
    # (language-code tuple, script family label) in match-priority order.
    families = (
        (LATIN_LANGS, "latin"),
        (DEVANAGARI_LANGS, "devanagari"),
        (TELUGU_LANGS, "telugu"),
        (TAMIL_LANGS, "tamil"),
        (KANNADA_LANGS, "kannada"),
        (MALAYALAM_LANGS, "malayalam"),
        (GUJARATI_LANGS, "gujarati"),
        (GURMUKHI_LANGS, "gurmukhi"),
        (BENGALI_LANGS, "bengali_assamese"),
        (ODIA_LANGS, "odia"),
        (ARABIC_SCRIPT_LANGS, "arabic_script"),
        (CYRILLIC_LANGS, "cyrillic"),
        (GREEK_LANGS, "greek"),
        (HEBREW_LANGS, "hebrew"),
        (SINHALA_LANGS, "sinhala"),
        (ETHIOPIC_LANGS, "ethiopic"),
        (THAI_LANGS, "thai"),
        (KHMER_LANGS, "khmer"),
        (BURMESE_LANGS, "burmese"),
        (GEORGIAN_LANGS, "georgian"),
        (TIBETAN_LANGS, "tibetan"),
        (HANGUL_LANGS, "hangul"),
        (HAN_LANGS, "han"),
        (JAPANESE_LANGS, "japanese"),
    )
    branches = [
        f"            WHEN {column} IN ({sql_list(langs)}) THEN '{label}'"
        for langs, label in families
    ]
    branches.append(f"            WHEN {column} = '' THEN 'missing'")
    branches.append("            ELSE 'other'")
    # Reproduce the layout of a triple-quoted f-string body exactly.
    parts = ["", "        CASE", *branches, "        END", "    "]
    return "\n".join(parts)


def expected_script_char_count_expr(lang_col: str, text_col: str) -> str:
    """Build a SQL CASE expression counting characters of the script expected
    for the language in *lang_col*.

    Each branch removes every character of *text_col* that lies outside the
    Unicode block(s) of that language group's writing system and measures the
    length of what remains, so 0 means the text contains no characters of the
    expected script. Languages with no configured script fall through to 0.

    NOTE(review): the doubled backslashes assume DuckDB passes the escape
    sequence through its plain string literal to the regex engine as a
    unicode code-point range — confirm against DuckDB regex docs before
    editing the ranges.
    """
    return f"""
        CASE
            WHEN {lang_col} IN ({sql_list(LATIN_LANGS)}) THEN length(regexp_replace({text_col}, '[^A-Za-z]', '', 'g'))
            WHEN {lang_col} IN ({sql_list(DEVANAGARI_LANGS)}) THEN length(regexp_replace({text_col}, '[^\\x{{0900}}-\\x{{097F}}]', '', 'g'))
            WHEN {lang_col} IN ({sql_list(TELUGU_LANGS)}) THEN length(regexp_replace({text_col}, '[^\\x{{0C00}}-\\x{{0C7F}}]', '', 'g'))
            WHEN {lang_col} IN ({sql_list(TAMIL_LANGS)}) THEN length(regexp_replace({text_col}, '[^\\x{{0B80}}-\\x{{0BFF}}]', '', 'g'))
            WHEN {lang_col} IN ({sql_list(KANNADA_LANGS)}) THEN length(regexp_replace({text_col}, '[^\\x{{0C80}}-\\x{{0CFF}}]', '', 'g'))
            WHEN {lang_col} IN ({sql_list(MALAYALAM_LANGS)}) THEN length(regexp_replace({text_col}, '[^\\x{{0D00}}-\\x{{0D7F}}]', '', 'g'))
            WHEN {lang_col} IN ({sql_list(GUJARATI_LANGS)}) THEN length(regexp_replace({text_col}, '[^\\x{{0A80}}-\\x{{0AFF}}]', '', 'g'))
            WHEN {lang_col} IN ({sql_list(GURMUKHI_LANGS)}) THEN length(regexp_replace({text_col}, '[^\\x{{0A00}}-\\x{{0A7F}}]', '', 'g'))
            WHEN {lang_col} IN ({sql_list(BENGALI_LANGS)}) THEN length(regexp_replace({text_col}, '[^\\x{{0980}}-\\x{{09FF}}]', '', 'g'))
            WHEN {lang_col} IN ({sql_list(ODIA_LANGS)}) THEN length(regexp_replace({text_col}, '[^\\x{{0B00}}-\\x{{0B7F}}]', '', 'g'))
            WHEN {lang_col} IN ({sql_list(ARABIC_SCRIPT_LANGS)}) THEN length(regexp_replace({text_col}, '[^\\x{{0600}}-\\x{{06FF}}\\x{{0750}}-\\x{{077F}}\\x{{08A0}}-\\x{{08FF}}\\x{{FB50}}-\\x{{FDFF}}\\x{{FE70}}-\\x{{FEFF}}]', '', 'g'))
            WHEN {lang_col} IN ({sql_list(CYRILLIC_LANGS)}) THEN length(regexp_replace({text_col}, '[^\\x{{0400}}-\\x{{04FF}}\\x{{0500}}-\\x{{052F}}]', '', 'g'))
            WHEN {lang_col} IN ({sql_list(GREEK_LANGS)}) THEN length(regexp_replace({text_col}, '[^\\x{{0370}}-\\x{{03FF}}]', '', 'g'))
            WHEN {lang_col} IN ({sql_list(HEBREW_LANGS)}) THEN length(regexp_replace({text_col}, '[^\\x{{0590}}-\\x{{05FF}}]', '', 'g'))
            WHEN {lang_col} IN ({sql_list(SINHALA_LANGS)}) THEN length(regexp_replace({text_col}, '[^\\x{{0D80}}-\\x{{0DFF}}]', '', 'g'))
            WHEN {lang_col} IN ({sql_list(ETHIOPIC_LANGS)}) THEN length(regexp_replace({text_col}, '[^\\x{{1200}}-\\x{{137F}}]', '', 'g'))
            WHEN {lang_col} IN ({sql_list(THAI_LANGS)}) THEN length(regexp_replace({text_col}, '[^\\x{{0E00}}-\\x{{0E7F}}]', '', 'g'))
            WHEN {lang_col} IN ({sql_list(KHMER_LANGS)}) THEN length(regexp_replace({text_col}, '[^\\x{{1780}}-\\x{{17FF}}]', '', 'g'))
            WHEN {lang_col} IN ({sql_list(BURMESE_LANGS)}) THEN length(regexp_replace({text_col}, '[^\\x{{1000}}-\\x{{109F}}\\x{{AA60}}-\\x{{AA7F}}]', '', 'g'))
            WHEN {lang_col} IN ({sql_list(GEORGIAN_LANGS)}) THEN length(regexp_replace({text_col}, '[^\\x{{10A0}}-\\x{{10FF}}\\x{{2D00}}-\\x{{2D2F}}]', '', 'g'))
            WHEN {lang_col} IN ({sql_list(TIBETAN_LANGS)}) THEN length(regexp_replace({text_col}, '[^\\x{{0F00}}-\\x{{0FFF}}]', '', 'g'))
            WHEN {lang_col} IN ({sql_list(HANGUL_LANGS)}) THEN length(regexp_replace({text_col}, '[^\\x{{AC00}}-\\x{{D7AF}}\\x{{1100}}-\\x{{11FF}}\\x{{3130}}-\\x{{318F}}]', '', 'g'))
            WHEN {lang_col} IN ({sql_list(HAN_LANGS)}) THEN length(regexp_replace({text_col}, '[^\\x{{4E00}}-\\x{{9FFF}}\\x{{3400}}-\\x{{4DBF}}]', '', 'g'))
            WHEN {lang_col} IN ({sql_list(JAPANESE_LANGS)}) THEN length(regexp_replace({text_col}, '[^\\x{{3040}}-\\x{{30FF}}\\x{{31F0}}-\\x{{31FF}}\\x{{4E00}}-\\x{{9FFF}}]', '', 'g'))
            ELSE 0
        END
    """


def export_csv(con: duckdb.DuckDBPyConnection, query: str, path: Path) -> None:
    """Materialize *query* to *path* as a comma-delimited CSV with a header row."""
    statement = f"COPY ({query}) TO '{path.as_posix()}' (HEADER, DELIMITER ',')"
    con.execute(statement)


def main() -> None:
    """Audit Gemini transcript language detection for the final video selection.

    Reads the phase-1 segment map plus video-level metadata, derives
    per-segment language/script/abnormality flags in DuckDB views, and writes
    summary CSVs, a per-video parquet profile, anomaly samples, and a JSON
    roll-up into OUTPUT_DIR.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    temp_dir = OUTPUT_DIR / "duckdb_tmp"
    temp_dir.mkdir(parents=True, exist_ok=True)

    # Output paths that are both written and read back later in this run.
    # Deriving them once from OUTPUT_DIR keeps the readers from drifting out
    # of sync with the writers (previously these were re-spelled as absolute
    # path literals).
    video_profile_path = OUTPUT_DIR / "gemini_video_language_profile.parquet"
    unexpected_summary_path = OUTPUT_DIR / "gemini_unexpected_language_summary.csv"

    # Conservative resource limits; spill to a local temp directory if needed.
    con = duckdb.connect()
    con.execute("SET threads = 2")
    con.execute("SET memory_limit = '16GB'")
    con.execute("SET preserve_insertion_order = false")
    con.execute(f"SET temp_directory = '{temp_dir.as_posix()}'")

    started = time.time()

    # Shared SQL fragments. gemini_lang prefers the model-detected language,
    # then the expected-language hint, then the queue label.
    clean_text = clean_text_expr("s.transcription")
    gemini_lang = "coalesce(nullif(s.tx_detected_language, ''), nullif(s.expected_language_hint, ''), s.queue_language)"
    gemini_script_family = script_family_expr(gemini_lang)
    expected_script_char_count = expected_script_char_count_expr(gemini_lang, clean_text)

    # --- Input views -------------------------------------------------------
    con.execute(f"""
        CREATE OR REPLACE VIEW selected_videos AS
        SELECT DISTINCT video_id, recommended_action
        FROM read_csv_auto('{FINAL_VIDEO_SELECTION}', header=true)
    """)
    con.execute(f"""
        CREATE OR REPLACE VIEW queue_videos AS
        SELECT
            video_id,
            language AS queue_language,
            segment_count
        FROM read_csv_auto('{VIDEO_QUEUE}', header=true)
    """)
    con.execute(f"""
        CREATE OR REPLACE VIEW youtube_meta AS
        SELECT
            video_id,
            {normalize_lang_expr('default_audio_language')} AS youtube_audio_language,
            {normalize_lang_expr('default_language')} AS youtube_default_language,
            channel_id,
            channel_title,
            title
        FROM read_csv_auto('{YOUTUBE_METADATA}', header=true)
    """)
    # Per-segment view with all derived language/script/abnormality flags.
    con.execute(f"""
        CREATE OR REPLACE VIEW selected_segments AS
        SELECT
            s.video_id,
            v.recommended_action,
            s.segment_file,
            coalesce(q.queue_language, s.queue_language) AS queue_language,
            {gemini_lang} AS gemini_lang,
            {gemini_script_family} AS gemini_script_family,
            y.youtube_audio_language,
            y.youtube_default_language,
            y.channel_id,
            y.channel_title,
            y.title,
            s.transcription,
            s.tagged,
            {clean_text} AS clean_text,
            {expected_script_char_count} AS expected_script_char_count,
            ({expected_script_char_count}) > 0 AS has_expected_script_char,
            ({gemini_lang}) IN ({sql_list(TARGET_LANGS)}) AS is_target_lang,
            ({gemini_lang}) NOT IN ({sql_list(TARGET_LANGS)}) AND ({gemini_lang}) <> '' AS is_unexpected_lang,
            {gemini_script_family} = 'arabic_script' AS is_arabic_script_lang,
            regexp_matches({clean_text}, '[A-Za-z]') AS has_ascii_letters,
            trim(coalesce(s.transcription, '')) = '' AS is_blank_transcript,
            regexp_matches(coalesce(s.transcription, ''), '\\\\[NO_SPEECH\\\\]', 'i') AS has_no_speech_tag,
            regexp_matches(coalesce(s.transcription, ''), 'https?://|www\\\\.') AS has_url,
            regexp_matches(coalesce(s.transcription, ''), '[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\\\.[A-Za-z]{{2,}}') AS has_email,
            regexp_matches(coalesce(s.transcription, ''), '&[A-Za-z]+;') AS has_html_entity,
            strpos(coalesce(s.transcription, ''), '�') > 0 AS has_replacement_char,
            regexp_matches(coalesce(s.transcription, ''), '[\\\\x00-\\\\x08\\\\x0B\\\\x0C\\\\x0E-\\\\x1F\\\\x7F]') AS has_control_char,
            regexp_matches(coalesce(s.transcription, ''), '(.)\\\\1{{4,}}') AS has_repeated_char_5plus,
            regexp_matches(coalesce(s.transcription, ''), '([!?.,])\\\\1{{3,}}') AS has_repeated_punct_4plus,
            coalesce(s.num_unk, 0) > 0 AS has_unk,
            coalesce(s.num_inaudible, 0) > 0 AS has_inaudible,
            coalesce(s.num_event_tags, 0) > 0 AS has_event_tags,
            coalesce(s.lang_mismatch_flag, false) AS gemini_expected_lang_mismatch_flag,
            coalesce(s.text_length_per_sec, 0.0) < 1.0 AS suspicious_cps_low,
            coalesce(s.text_length_per_sec, 0.0) > 50.0 AS suspicious_cps_high,
            coalesce(s.text_length_per_sec, 0.0) AS text_length_per_sec,
            coalesce(s.tx_quality_score, 0.0) AS tx_quality_score,
            coalesce(s.asr_eligible, false) AS asr_eligible,
            coalesce(s.tts_clean_eligible, false) AS tts_clean_eligible,
            coalesce(s.tts_expressive_eligible, false) AS tts_expressive_eligible,
            coalesce(s.num_unk, 0) AS num_unk,
            coalesce(s.num_inaudible, 0) AS num_inaudible,
            coalesce(s.num_event_tags, 0) AS num_event_tags
        FROM read_parquet('{SEGMENT_MAP_GLOB}', hive_partitioning=true, union_by_name=true) s
        JOIN selected_videos v USING (video_id)
        LEFT JOIN queue_videos q USING (video_id)
        LEFT JOIN youtube_meta y USING (video_id)
    """)

    # --- Segment-level language summaries ----------------------------------
    export_csv(
        con,
        """
        SELECT
            gemini_lang,
            gemini_script_family,
            count(*) AS segments,
            count(DISTINCT video_id) AS videos,
            count(*) FILTER (WHERE has_expected_script_char) AS segments_with_expected_script_char,
            count(*) FILTER (WHERE NOT has_expected_script_char) AS segments_without_expected_script_char,
            count(*) FILTER (WHERE queue_language = gemini_lang) AS queue_language_match_segments,
            count(*) FILTER (WHERE youtube_audio_language = gemini_lang) AS youtube_audio_language_match_segments,
            count(*) FILTER (WHERE youtube_default_language = gemini_lang) AS youtube_default_language_match_segments,
            round(avg(tx_quality_score), 6) AS avg_tx_quality_score,
            round(avg(text_length_per_sec), 6) AS avg_text_length_per_sec
        FROM selected_segments
        GROUP BY gemini_lang, gemini_script_family
        ORDER BY segments DESC, gemini_lang
        """,
        OUTPUT_DIR / "gemini_detected_language_summary.csv",
    )

    export_csv(
        con,
        """
        SELECT
            gemini_lang,
            gemini_script_family,
            count(*) AS segments,
            count(DISTINCT video_id) AS videos,
            count(*) FILTER (WHERE NOT has_expected_script_char) AS segments_without_expected_script_char,
            count(*) FILTER (WHERE queue_language = gemini_lang) AS queue_language_match_segments,
            count(*) FILTER (WHERE youtube_audio_language = gemini_lang) AS youtube_audio_language_match_segments,
            count(*) FILTER (WHERE youtube_default_language = gemini_lang) AS youtube_default_language_match_segments
        FROM selected_segments
        WHERE is_unexpected_lang
        GROUP BY gemini_lang, gemini_script_family
        ORDER BY segments DESC, gemini_lang
        """,
        unexpected_summary_path,
    )

    # Match rates against each external language label. The NULLIF guards keep
    # an empty scope from raising a division error.
    export_csv(
        con,
        """
        SELECT
            'segments' AS scope,
            'queue_language' AS comparator,
            count(*) AS total,
            count(*) FILTER (WHERE gemini_lang = queue_language) AS matches,
            round(100.0 * count(*) FILTER (WHERE gemini_lang = queue_language) / NULLIF(count(*), 0), 6) AS match_pct
        FROM selected_segments

        UNION ALL

        SELECT
            'segments' AS scope,
            'youtube_audio_language' AS comparator,
            count(*) FILTER (WHERE youtube_audio_language <> '') AS total,
            count(*) FILTER (WHERE youtube_audio_language <> '' AND gemini_lang = youtube_audio_language) AS matches,
            round(
                100.0 * count(*) FILTER (WHERE youtube_audio_language <> '' AND gemini_lang = youtube_audio_language)
                / NULLIF(count(*) FILTER (WHERE youtube_audio_language <> ''), 0),
                6
            ) AS match_pct
        FROM selected_segments

        UNION ALL

        SELECT
            'segments' AS scope,
            'youtube_default_language' AS comparator,
            count(*) FILTER (WHERE youtube_default_language <> '') AS total,
            count(*) FILTER (WHERE youtube_default_language <> '' AND gemini_lang = youtube_default_language) AS matches,
            round(
                100.0 * count(*) FILTER (WHERE youtube_default_language <> '' AND gemini_lang = youtube_default_language)
                / NULLIF(count(*) FILTER (WHERE youtube_default_language <> ''), 0),
                6
            ) AS match_pct
        FROM selected_segments
        ORDER BY scope, comparator
        """,
        OUTPUT_DIR / "gemini_segment_language_match_summary.csv",
    )

    # --- Per-video aggregates ----------------------------------------------
    con.execute("""
        CREATE OR REPLACE VIEW video_lang_counts AS
        SELECT
            video_id,
            gemini_lang,
            gemini_script_family,
            count(*) AS segments
        FROM selected_segments
        GROUP BY video_id, gemini_lang, gemini_script_family
    """)
    con.execute("""
        CREATE OR REPLACE VIEW video_lang_ranked AS
        SELECT
            *,
            row_number() OVER (PARTITION BY video_id ORDER BY segments DESC, gemini_lang) AS rn
        FROM video_lang_counts
    """)
    con.execute("""
        CREATE OR REPLACE VIEW video_script_counts AS
        SELECT
            video_id,
            gemini_script_family,
            count(*) AS segments
        FROM selected_segments
        GROUP BY video_id, gemini_script_family
    """)

    # One row per video: dominant language, language/script diversity, and
    # abnormality counts. Written as parquet because it is re-read below.
    con.execute(f"""
        COPY (
            WITH video_base AS (
                SELECT
                    video_id,
                    any_value(recommended_action) AS recommended_action,
                    any_value(queue_language) AS queue_language,
                    any_value(youtube_audio_language) AS youtube_audio_language,
                    any_value(youtube_default_language) AS youtube_default_language,
                    any_value(channel_id) AS channel_id,
                    any_value(channel_title) AS channel_title,
                    any_value(title) AS title,
                    count(*) AS total_segments,
                    count(DISTINCT gemini_lang) AS distinct_detected_languages,
                    count(DISTINCT gemini_script_family) AS distinct_detected_script_families,
                    count(*) FILTER (WHERE is_unexpected_lang) AS unexpected_language_segments,
                    count(DISTINCT gemini_lang) FILTER (WHERE is_unexpected_lang) AS distinct_unexpected_languages,
                    count(*) FILTER (WHERE is_arabic_script_lang) AS arabic_script_language_segments,
                    count(*) FILTER (WHERE NOT has_expected_script_char) AS no_expected_script_char_segments,
                    count(*) FILTER (
                        WHERE has_url OR has_email OR has_html_entity OR has_replacement_char
                           OR has_control_char OR has_repeated_char_5plus OR has_repeated_punct_4plus
                           OR has_unk OR has_inaudible OR suspicious_cps_low OR suspicious_cps_high
                           OR (gemini_lang <> 'en' AND NOT has_expected_script_char)
                    ) AS abnormal_transcript_segments
                FROM selected_segments
                GROUP BY video_id
            ),
            lang_thresholds AS (
                SELECT
                    video_id,
                    count(*) FILTER (WHERE segments >= 2) AS detected_languages_2plus,
                    count(*) FILTER (WHERE segments >= 3) AS detected_languages_3plus,
                    string_agg(gemini_lang || ':' || segments, ', ' ORDER BY segments DESC, gemini_lang)
                        FILTER (WHERE rn <= 5) AS top_detected_languages
                FROM video_lang_ranked
                GROUP BY video_id
            ),
            script_thresholds AS (
                SELECT
                    video_id,
                    count(*) FILTER (WHERE segments >= 2) AS script_families_2plus,
                    count(*) FILTER (WHERE segments >= 3) AS script_families_3plus
                FROM video_script_counts
                GROUP BY video_id
            ),
            dominant_lang AS (
                SELECT
                    video_id,
                    gemini_lang AS dominant_gemini_language,
                    gemini_script_family AS dominant_gemini_script_family,
                    segments AS dominant_gemini_segments
                FROM video_lang_ranked
                WHERE rn = 1
            )
            SELECT
                b.video_id,
                b.recommended_action,
                b.queue_language,
                b.youtube_audio_language,
                b.youtube_default_language,
                d.dominant_gemini_language,
                d.dominant_gemini_script_family,
                d.dominant_gemini_segments,
                round(100.0 * d.dominant_gemini_segments / b.total_segments, 6) AS dominant_gemini_share_pct,
                b.total_segments,
                b.distinct_detected_languages,
                coalesce(l.detected_languages_2plus, 0) AS detected_languages_2plus,
                coalesce(l.detected_languages_3plus, 0) AS detected_languages_3plus,
                b.distinct_detected_script_families,
                coalesce(s.script_families_2plus, 0) AS script_families_2plus,
                coalesce(s.script_families_3plus, 0) AS script_families_3plus,
                b.unexpected_language_segments,
                b.distinct_unexpected_languages,
                b.arabic_script_language_segments,
                b.no_expected_script_char_segments,
                b.abnormal_transcript_segments,
                d.dominant_gemini_language = b.queue_language AS dominant_matches_queue_language,
                d.dominant_gemini_language = b.youtube_audio_language AS dominant_matches_youtube_audio_language,
                d.dominant_gemini_language = b.youtube_default_language AS dominant_matches_youtube_default_language,
                l.top_detected_languages,
                b.channel_id,
                b.channel_title,
                b.title
            FROM video_base b
            LEFT JOIN dominant_lang d USING (video_id)
            LEFT JOIN lang_thresholds l USING (video_id)
            LEFT JOIN script_thresholds s USING (video_id)
            ORDER BY b.video_id
        ) TO '{video_profile_path.as_posix()}' (FORMAT PARQUET, COMPRESSION ZSTD)
    """)

    # Videos worth a manual look: multilingual, multi-script, unexpected
    # language, or script-mismatch segments.
    export_csv(
        con,
        f"""
        SELECT
            *
        FROM read_parquet('{video_profile_path.as_posix()}')
        WHERE distinct_detected_languages >= 2
           OR distinct_detected_script_families >= 2
           OR unexpected_language_segments > 0
           OR no_expected_script_char_segments > 0
        ORDER BY
            unexpected_language_segments DESC,
            no_expected_script_char_segments DESC,
            distinct_detected_languages DESC,
            dominant_gemini_share_pct ASC,
            total_segments DESC
        """,
        OUTPUT_DIR / "gemini_video_language_anomalies.csv",
    )

    # --- Abnormality tallies and samples -----------------------------------
    export_csv(
        con,
        """
        SELECT
            anomaly,
            segments,
            round(100.0 * segments / total_segments, 6) AS pct_of_segments
        FROM (
            SELECT 'is_blank_transcript' AS anomaly, count(*) FILTER (WHERE is_blank_transcript) AS segments, count(*) AS total_segments FROM selected_segments
            UNION ALL SELECT 'has_no_speech_tag', count(*) FILTER (WHERE has_no_speech_tag), count(*) FROM selected_segments
            UNION ALL SELECT 'has_url', count(*) FILTER (WHERE has_url), count(*) FROM selected_segments
            UNION ALL SELECT 'has_email', count(*) FILTER (WHERE has_email), count(*) FROM selected_segments
            UNION ALL SELECT 'has_html_entity', count(*) FILTER (WHERE has_html_entity), count(*) FROM selected_segments
            UNION ALL SELECT 'has_replacement_char', count(*) FILTER (WHERE has_replacement_char), count(*) FROM selected_segments
            UNION ALL SELECT 'has_control_char', count(*) FILTER (WHERE has_control_char), count(*) FROM selected_segments
            UNION ALL SELECT 'has_repeated_char_5plus', count(*) FILTER (WHERE has_repeated_char_5plus), count(*) FROM selected_segments
            UNION ALL SELECT 'has_repeated_punct_4plus', count(*) FILTER (WHERE has_repeated_punct_4plus), count(*) FROM selected_segments
            UNION ALL SELECT 'has_unk', count(*) FILTER (WHERE has_unk), count(*) FROM selected_segments
            UNION ALL SELECT 'has_inaudible', count(*) FILTER (WHERE has_inaudible), count(*) FROM selected_segments
            UNION ALL SELECT 'has_event_tags', count(*) FILTER (WHERE has_event_tags), count(*) FROM selected_segments
            UNION ALL SELECT 'suspicious_cps_low', count(*) FILTER (WHERE suspicious_cps_low), count(*) FROM selected_segments
            UNION ALL SELECT 'suspicious_cps_high', count(*) FILTER (WHERE suspicious_cps_high), count(*) FROM selected_segments
            UNION ALL SELECT 'no_expected_script_char', count(*) FILTER (WHERE NOT has_expected_script_char), count(*) FROM selected_segments
            UNION ALL SELECT 'non_english_no_expected_script_char', count(*) FILTER (WHERE gemini_lang <> 'en' AND NOT has_expected_script_char), count(*) FROM selected_segments
            UNION ALL SELECT 'unexpected_detected_language', count(*) FILTER (WHERE is_unexpected_lang), count(*) FROM selected_segments
            UNION ALL SELECT 'arabic_script_detected_language', count(*) FILTER (WHERE is_arabic_script_lang), count(*) FROM selected_segments
            UNION ALL SELECT 'gemini_expected_lang_mismatch_flag', count(*) FILTER (WHERE gemini_expected_lang_mismatch_flag), count(*) FROM selected_segments
        )
        ORDER BY segments DESC, anomaly
        """,
        OUTPUT_DIR / "gemini_text_abnormality_summary.csv",
    )

    # Worst offending segments, ranked by how many anomaly flags they trip.
    export_csv(
        con,
        """
        WITH candidates AS (
            SELECT
                video_id,
                segment_file,
                recommended_action,
                queue_language,
                youtube_audio_language,
                youtube_default_language,
                gemini_lang,
                gemini_script_family,
                transcription,
                text_length_per_sec,
                tx_quality_score,
                (
                    CASE WHEN has_url THEN 1 ELSE 0 END +
                    CASE WHEN has_email THEN 1 ELSE 0 END +
                    CASE WHEN has_html_entity THEN 1 ELSE 0 END +
                    CASE WHEN has_replacement_char THEN 1 ELSE 0 END +
                    CASE WHEN has_control_char THEN 1 ELSE 0 END +
                    CASE WHEN has_repeated_char_5plus THEN 1 ELSE 0 END +
                    CASE WHEN has_repeated_punct_4plus THEN 1 ELSE 0 END +
                    CASE WHEN has_unk THEN 1 ELSE 0 END +
                    CASE WHEN has_inaudible THEN 1 ELSE 0 END +
                    CASE WHEN suspicious_cps_low OR suspicious_cps_high THEN 1 ELSE 0 END +
                    CASE WHEN NOT has_expected_script_char THEN 1 ELSE 0 END +
                    CASE WHEN is_unexpected_lang THEN 1 ELSE 0 END
                ) AS anomaly_score,
                has_url,
                has_email,
                has_html_entity,
                has_replacement_char,
                has_control_char,
                has_repeated_char_5plus,
                has_repeated_punct_4plus,
                has_unk,
                has_inaudible,
                has_event_tags,
                suspicious_cps_low,
                suspicious_cps_high,
                has_expected_script_char,
                is_unexpected_lang,
                is_arabic_script_lang
            FROM selected_segments
            WHERE has_url OR has_email OR has_html_entity OR has_replacement_char OR has_control_char
               OR has_repeated_char_5plus OR has_repeated_punct_4plus OR has_unk OR has_inaudible
               OR suspicious_cps_low OR suspicious_cps_high OR NOT has_expected_script_char OR is_unexpected_lang
        )
        SELECT
            *
        FROM candidates
        ORDER BY anomaly_score DESC, tx_quality_score ASC, video_id, segment_file
        LIMIT 2000
        """,
        OUTPUT_DIR / "gemini_text_anomaly_samples.csv",
    )

    # --- JSON roll-up -------------------------------------------------------
    selected_summary = fetchone_dict(
        con,
        """
        SELECT
            count(*) AS total_segments,
            count(DISTINCT video_id) AS total_videos,
            count(*) FILTER (WHERE recommended_action = 'keep') AS keep_segments,
            count(*) FILTER (WHERE recommended_action = 'review') AS review_segments,
            count(*) FILTER (WHERE is_unexpected_lang) AS unexpected_language_segments,
            count(DISTINCT CASE WHEN is_unexpected_lang THEN video_id END) AS videos_with_unexpected_language,
            count(*) FILTER (WHERE is_arabic_script_lang) AS arabic_script_language_segments,
            count(DISTINCT CASE WHEN is_arabic_script_lang THEN video_id END) AS videos_with_arabic_script_language,
            count(*) FILTER (WHERE NOT has_expected_script_char) AS no_expected_script_char_segments,
            count(*) FILTER (WHERE gemini_lang <> 'en' AND NOT has_expected_script_char) AS non_english_no_expected_script_char_segments,
            count(*) FILTER (WHERE is_blank_transcript) AS blank_transcript_segments,
            count(*) FILTER (WHERE has_no_speech_tag) AS no_speech_tag_segments,
            count(*) FILTER (WHERE has_url) AS url_segments,
            count(*) FILTER (WHERE has_email) AS email_segments,
            count(*) FILTER (WHERE has_html_entity) AS html_entity_segments,
            count(*) FILTER (WHERE has_replacement_char) AS replacement_char_segments,
            count(*) FILTER (WHERE has_control_char) AS control_char_segments,
            count(*) FILTER (WHERE has_repeated_char_5plus) AS repeated_char_segments,
            count(*) FILTER (WHERE has_repeated_punct_4plus) AS repeated_punct_segments,
            count(*) FILTER (WHERE has_unk) AS unk_segments,
            count(*) FILTER (WHERE has_inaudible) AS inaudible_segments,
            count(*) FILTER (WHERE has_event_tags) AS event_tag_segments,
            count(*) FILTER (WHERE suspicious_cps_low) AS suspicious_cps_low_segments,
            count(*) FILTER (WHERE suspicious_cps_high) AS suspicious_cps_high_segments,
            count(*) FILTER (WHERE gemini_expected_lang_mismatch_flag) AS gemini_expected_lang_mismatch_segments,
            round(avg(tx_quality_score), 6) AS avg_tx_quality_score,
            round(avg(text_length_per_sec), 6) AS avg_text_length_per_sec
        FROM selected_segments
        """,
    )

    video_profile_summary = fetchone_dict(
        con,
        f"""
        SELECT
            count(*) AS total_videos,
            count(*) FILTER (WHERE distinct_detected_languages >= 2) AS videos_with_2plus_detected_languages,
            count(*) FILTER (WHERE distinct_detected_languages >= 3) AS videos_with_3plus_detected_languages,
            count(*) FILTER (WHERE detected_languages_2plus >= 2) AS videos_with_2plus_languages_at_2_segments,
            count(*) FILTER (WHERE detected_languages_3plus >= 3) AS videos_with_3plus_languages_at_3_segments,
            count(*) FILTER (WHERE distinct_detected_script_families >= 2) AS videos_with_2plus_detected_script_families,
            count(*) FILTER (WHERE distinct_detected_script_families >= 3) AS videos_with_3plus_detected_script_families,
            count(*) FILTER (WHERE script_families_2plus >= 2) AS videos_with_2plus_script_families_at_2_segments,
            count(*) FILTER (WHERE unexpected_language_segments > 0) AS videos_with_unexpected_detected_languages,
            count(*) FILTER (WHERE arabic_script_language_segments > 0) AS videos_with_arabic_script_languages,
            count(*) FILTER (WHERE no_expected_script_char_segments > 0) AS videos_with_script_mismatch_segments,
            count(*) FILTER (WHERE dominant_matches_queue_language) AS videos_dominant_gemini_matches_queue_language,
            count(*) FILTER (WHERE youtube_audio_language <> '' AND dominant_matches_youtube_audio_language) AS videos_dominant_gemini_matches_youtube_audio_language,
            count(*) FILTER (WHERE youtube_default_language <> '' AND dominant_matches_youtube_default_language) AS videos_dominant_gemini_matches_youtube_default_language,
            round(avg(dominant_gemini_share_pct), 6) AS avg_dominant_gemini_share_pct
        FROM read_parquet('{video_profile_path.as_posix()}')
        """,
    )

    top_unexpected = con.execute(
        f"""
        SELECT gemini_lang, segments
        FROM read_csv_auto('{unexpected_summary_path.as_posix()}', header=true)
        ORDER BY segments DESC, gemini_lang
        LIMIT 15
        """
    ).fetchall()

    summary = {
        "generated_at_epoch_s": round(time.time(), 3),
        "elapsed_s": round(time.time() - started, 2),
        "scope": {
            "selected_videos_file": "data/video_tts_classification_final.csv",
            # NOTE(review): hard-coded count of videos in the selection file at
            # the time this report was authored — confirm it still matches.
            "selected_video_count": 454854,
            "scope_name": "final_selected_videos_keep_plus_review",
        },
        "selected_segments": selected_summary,
        "video_language_profile_summary": video_profile_summary,
        "top_unexpected_detected_languages": [
            {"gemini_lang": lang, "segments": segments} for lang, segments in top_unexpected
        ],
        "notes": {
            "youtube_language_source": "joined from youtube_video_metadata_all.csv because video_tts_classification_final.csv has no language columns",
            "queue_language_source": "video_queue.csv.gz / segment_map_v1 queue_language",
            "script_mismatch_definition": "Gemini detected language has zero characters from its expected script in the cleaned transcript text",
        },
    }

    write_json(OUTPUT_DIR / "gemini_only_summary.json", summary)

    print(json.dumps(summary, indent=2, sort_keys=True))


# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
