o
    lQig                     @  s  d dl mZ d dlZd dlZd dlmZ d dlZdZdZdZ	dZ
dZd	Zed
Zed Zed Zed d d d  Zed d d  Zed  Zed  Zed  Zd3ddZd4d"d#Zd5d6d&d'Zd7d,d-Zd8d.d/Zd9d0d1Zed2kre  dS dS ):    )annotationsN)Path   gffffff?g      ?g       @g333333?g      ?z/home/ubuntu/transcriptsdata
final_dataphase1_incrementalsegment_map_v1z**z	*.parquetanalytics_v1zvideo_rollup.parquetzrecover_v2_consolidated.parquetz"video_tts_classification_final.csvztranscription_results.parquetpathr   payloaddictreturnNonec                 C  s   |  tj|dddd  d S )N   Tindent	sort_keys
)
write_textjsondumps)r
   r    r   #scripts/build_final_data_summary.py
write_json   s   r   conduckdb.DuckDBPyConnectionquerystrc                 C  s<   |  |}| }|d u ri S dd |jD }tt||S )Nc                 S  s   g | ]}|d  qS )r   r   ).0dr   r   r   
<listcomp>%   s    z!fetchone_dict.<locals>.<listcomp>)executefetchonedescriptionr   zip)r   r   relrowcolsr   r   r   fetchone_dict    s   
r(    aliasc                 C  s   | r|  dnd}d g d| d| d| d| dt d| d	t d
| dt d| dt d| d| dt d| d| d| dt dS )N.r)   z
        CASE
            WHEN z#lid_consensus = false AND COALESCE(z8lid_agree_count, 0) < 2 THEN 'dispose'
            WHEN z@conformer_multi_ctc_normalized IS NOT NULL
                 AND z!conformer_multi_ctc_normalized < z! THEN 'dispose'
            WHEN zduration_s < z* THEN 'dispose'
            WHEN COALESCE(zlid_agree_count, 0) >= z
                 AND (z"conformer_multi_ctc_normalized >= z
                      OR z>conformer_multi_ctc_normalized IS NULL)
                 AND (zgemini_quality_score >= z2gemini_quality_score = 0
                      OR z3gemini_quality_score IS NULL)
                 AND zduration_s >= z7 THEN 'golden'
            ELSE 'redo'
        END
    )joinDISPOSE_CTC_MAXDISPOSE_DURATION_MAXGOLDEN_LID_AGREEGOLDEN_CTC_MINGOLDEN_QUALITY_MINGOLDEN_DURATION_MIN)r*   prefixr   r   r   bucket_case)   sL   
		
r4   	numeratorintdenominatorfloatc                 C  s   |dkrdS t | d | dS )Nr   g        g      Y@   )round)r5   r7   r   r   r   pct=   s   r;   c                 C  s"  |  dtd   d |  dtd   d |  dtd   d |  dtd	   d |  d
td   d |  dtd   d |  dtd   d |  dtd   d |  dtd   d |  dtd   d |  dtd   d d S )Nah  
        COPY (
            SELECT
                'all' AS scope,
                sum(total_segments) AS total_segments,
                count(*) AS total_videos,
                sum(v1_validated_segments) AS v1_validated_segments,
                sum(v1_missing_segments) AS v1_missing_segments,
                sum(final_validated_segments) AS final_validated_segments,
                sum(final_missing_segments) AS final_missing_segments,
                sum(final_golden_segments) AS golden_segments,
                sum(final_redo_segments) AS redo_segments,
                sum(final_dispose_segments) AS dispose_segments,
                count(*) FILTER (WHERE final_missing_segments > 0) AS videos_with_missing_validation
            FROM final_video_rollup

            UNION ALL

            SELECT
                'kept_video_subset' AS scope,
                sum(f.total_segments) AS total_segments,
                count(*) AS total_videos,
                sum(f.v1_validated_segments) AS v1_validated_segments,
                sum(f.v1_missing_segments) AS v1_missing_segments,
                sum(f.final_validated_segments) AS final_validated_segments,
                sum(f.final_missing_segments) AS final_missing_segments,
                sum(f.final_golden_segments) AS golden_segments,
                sum(f.final_redo_segments) AS redo_segments,
                sum(f.final_dispose_segments) AS dispose_segments,
                count(*) FILTER (WHERE f.final_missing_segments > 0) AS videos_with_missing_validation
            FROM final_video_rollup f
            JOIN kept_videos k USING (video_id)
        ) TO 'zscope_rollup.csvz' (HEADER, DELIMITER ',')
    z
        COPY (
            SELECT source, segments
            FROM final_source_rollup_all
            ORDER BY segments DESC, source
        ) TO 'zsource_breakdown_all.csvz
        COPY (
            SELECT source, segments
            FROM final_source_rollup_kept
            ORDER BY segments DESC, source
        ) TO 'z source_breakdown_kept_subset.csvz
        COPY (
            SELECT bucket, segments
            FROM final_bucket_rollup_all
            ORDER BY segments DESC, bucket
        ) TO 'zbucket_breakdown_all.csvz
        COPY (
            SELECT bucket, segments
            FROM final_bucket_rollup_kept
            ORDER BY segments DESC, bucket
        ) TO 'z bucket_breakdown_kept_subset.csva  
        COPY (
            SELECT
                k.recommended_action,
                count(*) AS videos,
                sum(f.total_segments) AS total_segments,
                sum(f.final_validated_segments) AS final_validated_segments,
                sum(f.final_missing_segments) AS final_missing_segments,
                sum(f.final_golden_segments) AS golden_segments,
                sum(f.final_redo_segments) AS redo_segments,
                sum(f.final_dispose_segments) AS dispose_segments
            FROM final_video_rollup f
            JOIN kept_videos k USING (video_id)
            GROUP BY k.recommended_action
            ORDER BY videos DESC
        ) TO 'zaction_rollup.csvzu
        COPY (
            SELECT *
            FROM final_video_rollup
            ORDER BY video_id
        ) TO 'zvideo_rollup_final.parquetz)' (FORMAT PARQUET, COMPRESSION ZSTD)
    z
        COPY (
            SELECT f.*
            FROM final_video_rollup f
            JOIN kept_videos k USING (video_id)
            ORDER BY f.video_id
        ) TO 'z&video_rollup_final_kept_subset.parquetz
        COPY (
            SELECT *
            FROM final_video_rollup
            WHERE final_missing_segments > 0
            ORDER BY final_missing_segments DESC, video_id
        ) TO 'z#videos_missing_final_validation.csvz
        COPY (
            SELECT f.*
            FROM final_video_rollup f
            JOIN kept_videos k USING (video_id)
            WHERE f.final_missing_segments > 0
            ORDER BY f.final_missing_segments DESC, f.video_id
        ) TO 'z/videos_missing_final_validation_kept_subset.csvz
        COPY (
            SELECT k.video_id, k.recommended_action
            FROM kept_videos k
            LEFT JOIN final_video_rollup f USING (video_id)
            WHERE f.video_id IS NULL
            ORDER BY k.video_id
        ) TO 'z&kept_videos_without_transcriptions.csv)r!   
OUTPUT_DIRas_posix)r   r   r   r   export_rollupsC   sB   
 
#

















	
r>   c                  C  sd  t jddd t } | d | d | d t }| dt d | dt d	 | d
t d | d | dt	 dt
  d | d | d | d | dt d d }t| d}t| d}t| d}t| d}t| d}t| d}t| d}	t| d}
t| d}t| d}t| d }t| d!}i d"|d#|d# d$||d#  d%|d% d&|d& d'|d' d(|d( d)|d) d*|d* d+|d+ d,|d, d-|d- d.|d. d/|d/ d0|d0 d1|d1 d2|d3 |d4 |d5 t|d& |d# t|d) |d# d6}i d7|	d7 d8|	d8 d9|	d9 d:|
d: d;|	d7 |
d:  d#|
d# d&|d& d'|d' d(|d( d)|d) d*|d* d+|d+ d,|d, d-|d- d.|d. d/|d/ d0|d0 |d1 |d3 |d4 |d5 t|d& |
d# t|d) |
d# d<}| d=|d, |d- |d. |d* g | d>|d, |d- |d. |d* g | d?|d/ |d0 |d1 |d* g | d@|d/ |d0 |d1 |d* g t|  tt dAtt | dBdCdDdEdFdGdHdIdIddg dJdKdLdMdNdOdPdQdRdSdTdU	dV}tt dW | tt dX | tt dY | tt dZ | ttj||||d[dBdd\ d S )]NT)parentsexist_okzSET threads = 8zSET memory_limit = '24GB'z$SET preserve_insertion_order = falsezW
        CREATE OR REPLACE VIEW video_rollup_v1 AS
        SELECT * FROM read_parquet('z')
    z
        CREATE OR REPLACE VIEW kept_videos AS
        SELECT DISTINCT video_id, recommended_action
        FROM read_csv_auto('z', header=true)
    a  
        CREATE OR REPLACE VIEW segment_map AS
        SELECT
            video_id,
            segment_file,
            has_validation,
            validation_source,
            parent_segment_file,
            is_split_segment,
            split_index_from_id,
            original_start_ms,
            original_end_ms,
            trimmed_start_ms,
            trimmed_end_ms,
            leading_pad_ms,
            trailing_pad_ms
        FROM read_parquet('z3', hive_partitioning=true, union_by_name=true)
    z
        CREATE OR REPLACE TEMP TABLE missing_segments_v1 AS
        SELECT video_id, segment_file
        FROM segment_map
        WHERE NOT has_validation
    a  
        CREATE OR REPLACE TEMP TABLE recover_v2_dedup AS
        WITH ranked AS (
            SELECT
                *,
                ROW_NUMBER() OVER (
                    PARTITION BY video_id, segment_file
                    ORDER BY
                        CASE WHEN conformer_multi_ctc_normalized IS NULL THEN 1 ELSE 0 END ASC,
                        conformer_multi_ctc_normalized DESC NULLS LAST,
                        mms_confidence DESC NULLS LAST
                ) AS rn
            FROM read_parquet('zX')
        )
        SELECT
            video_id,
            segment_file,
            zD AS provisional_bucket
        FROM ranked
        WHERE rn = 1
    a  
        CREATE OR REPLACE TEMP TABLE recover_v2_matched AS
        SELECT
            m.video_id,
            m.segment_file,
            v.provisional_bucket
        FROM missing_segments_v1 m
        JOIN recover_v2_dedup v USING (video_id, segment_file)
    a  
        CREATE OR REPLACE TEMP TABLE recover_v2_by_video AS
        SELECT
            video_id,
            count(*) AS recover_v2_segments,
            count(*) FILTER (WHERE provisional_bucket = 'golden') AS recover_v2_golden_segments,
            count(*) FILTER (WHERE provisional_bucket = 'redo') AS recover_v2_redo_segments,
            count(*) FILTER (WHERE provisional_bucket = 'dispose') AS recover_v2_dispose_segments
        FROM recover_v2_matched
        GROUP BY video_id
    a  
        CREATE OR REPLACE TEMP TABLE final_video_rollup AS
        SELECT
            v.video_id,
            v.queue_language,
            v.total_segments,
            v.validated_segments AS v1_validated_segments,
            v.missing_validation_segments AS v1_missing_segments,
            v.golden_segments AS v1_golden_segments,
            v.redo_segments AS v1_redo_segments,
            v.dispose_segments AS v1_dispose_segments,
            coalesce(r.recover_v2_segments, 0) AS recover_v2_segments,
            v.validated_segments + coalesce(r.recover_v2_segments, 0) AS final_validated_segments,
            v.missing_validation_segments - coalesce(r.recover_v2_segments, 0) AS final_missing_segments,
            v.golden_segments + coalesce(r.recover_v2_golden_segments, 0) AS final_golden_segments,
            v.redo_segments + coalesce(r.recover_v2_redo_segments, 0) AS final_redo_segments,
            v.dispose_segments + coalesce(r.recover_v2_dispose_segments, 0) AS final_dispose_segments,
            (v.missing_validation_segments - coalesce(r.recover_v2_segments, 0) = 0) AS fully_validated_final
        FROM video_rollup_v1 v
        LEFT JOIN recover_v2_by_video r USING (video_id)
    z#SELECT count(*) FROM read_parquet('z')r   z
        SELECT
            sum(total_segments) AS deduped_transcribed_segments,
            count(*) AS transcribed_videos
        FROM video_rollup_v1
    a  
        SELECT
            sum(validated_segments) AS v1_validated_segments,
            sum(missing_validation_segments) AS v1_missing_segments,
            sum(golden_segments) AS v1_golden_segments,
            sum(redo_segments) AS v1_redo_segments,
            sum(dispose_segments) AS v1_dispose_segments,
            count(*) FILTER (WHERE missing_validation_segments > 0) AS v1_videos_with_missing_validation
        FROM video_rollup_v1
    a   
        SELECT
            count(*) FILTER (WHERE validation_source = 'historical') AS historical_segments,
            count(*) FILTER (WHERE validation_source = 'recover') AS recover_v1_segments
        FROM segment_map
        WHERE has_validation
    a  
        SELECT
            count(*) AS recover_v2_segments,
            count(*) FILTER (WHERE provisional_bucket = 'golden') AS recover_v2_golden_segments,
            count(*) FILTER (WHERE provisional_bucket = 'redo') AS recover_v2_redo_segments,
            count(*) FILTER (WHERE provisional_bucket = 'dispose') AS recover_v2_dispose_segments
        FROM recover_v2_matched
    a  
        SELECT
            sum(final_validated_segments) AS final_validated_segments,
            sum(final_missing_segments) AS final_missing_segments,
            sum(final_golden_segments) AS golden_segments,
            sum(final_redo_segments) AS redo_segments,
            sum(final_dispose_segments) AS dispose_segments,
            count(*) FILTER (WHERE final_missing_segments > 0) AS videos_with_missing_final_validation
        FROM final_video_rollup
    a7  
        SELECT
            count(*) AS total_segment_map_rows,
            count(*) FILTER (WHERE parent_segment_file IS NOT NULL AND parent_segment_file <> '') AS parent_segment_file_present,
            count(*) FILTER (WHERE is_split_segment) AS split_segments,
            count(*) FILTER (WHERE split_index_from_id IS NOT NULL) AS split_index_present,
            count(*) FILTER (WHERE original_start_ms IS NOT NULL AND original_start_ms <> '') AS original_start_present,
            count(*) FILTER (WHERE original_end_ms IS NOT NULL AND original_end_ms <> '') AS original_end_present,
            count(*) FILTER (WHERE trimmed_start_ms IS NOT NULL AND trimmed_start_ms <> '') AS trimmed_start_present,
            count(*) FILTER (WHERE trimmed_end_ms IS NOT NULL AND trimmed_end_ms <> '') AS trimmed_end_present,
            count(*) FILTER (WHERE leading_pad_ms IS NOT NULL AND leading_pad_ms <> '') AS leading_pad_present,
            count(*) FILTER (WHERE trailing_pad_ms IS NOT NULL AND trailing_pad_ms <> '') AS trailing_pad_present
        FROM segment_map
    a  
        SELECT
            count(*) AS selected_videos_total,
            count(*) FILTER (WHERE recommended_action = 'keep') AS selected_videos_action_keep,
            count(*) FILTER (WHERE recommended_action = 'review') AS selected_videos_action_review
        FROM kept_videos
    z
        SELECT
            count(*) AS selected_videos_with_transcriptions,
            sum(f.total_segments) AS deduped_transcribed_segments
        FROM final_video_rollup f
        JOIN kept_videos k USING (video_id)
    a  
        SELECT
            sum(f.v1_validated_segments) AS v1_validated_segments,
            sum(f.v1_missing_segments) AS v1_missing_segments,
            sum(f.v1_golden_segments) AS v1_golden_segments,
            sum(f.v1_redo_segments) AS v1_redo_segments,
            sum(f.v1_dispose_segments) AS v1_dispose_segments,
            count(*) FILTER (WHERE f.v1_missing_segments > 0) AS v1_videos_with_missing_validation
        FROM final_video_rollup f
        JOIN kept_videos k USING (video_id)
    a4  
        SELECT
            count(*) FILTER (WHERE s.validation_source = 'historical') AS historical_segments,
            count(*) FILTER (WHERE s.validation_source = 'recover') AS recover_v1_segments
        FROM segment_map s
        JOIN kept_videos k USING (video_id)
        WHERE s.has_validation
    a  
        SELECT
            count(*) AS recover_v2_segments,
            count(*) FILTER (WHERE m.provisional_bucket = 'golden') AS recover_v2_golden_segments,
            count(*) FILTER (WHERE m.provisional_bucket = 'redo') AS recover_v2_redo_segments,
            count(*) FILTER (WHERE m.provisional_bucket = 'dispose') AS recover_v2_dispose_segments
        FROM recover_v2_matched m
        JOIN kept_videos k USING (video_id)
    a  
        SELECT
            sum(f.final_validated_segments) AS final_validated_segments,
            sum(f.final_missing_segments) AS final_missing_segments,
            sum(f.final_golden_segments) AS golden_segments,
            sum(f.final_redo_segments) AS redo_segments,
            sum(f.final_dispose_segments) AS dispose_segments,
            count(*) FILTER (WHERE f.final_missing_segments > 0) AS videos_with_missing_final_validation
        FROM final_video_rollup f
        JOIN kept_videos k USING (video_id)
    raw_transcription_rowsdeduped_transcribed_segmentsduplicate_transcription_rowstranscribed_videosv1_validated_segmentsv1_missing_segments!v1_videos_with_missing_validationfinal_validated_segmentsfinal_missing_segments$videos_with_missing_final_validationhistorical_segmentsrecover_v1_segmentsrecover_v2_segmentsgolden_segmentsredo_segmentsdispose_segmentsv2_golden_segmentsrecover_v2_golden_segmentsrecover_v2_redo_segmentsrecover_v2_dispose_segments)v2_redo_segmentsv2_dispose_segmentsv1_coverage_pctfinal_coverage_pctselected_videos_totalselected_videos_action_keepselected_videos_action_review#selected_videos_with_transcriptions&selected_videos_without_transcriptions)rP   rQ   rU   rV   rW   rX   a  
        CREATE OR REPLACE TEMP TABLE final_source_rollup_all AS
        SELECT * FROM (
            SELECT 'historical' AS source, ?::BIGINT AS segments
            UNION ALL
            SELECT 'recover_v1' AS source, ?::BIGINT AS segments
            UNION ALL
            SELECT 'recover_v2' AS source, ?::BIGINT AS segments
            UNION ALL
            SELECT 'missing' AS source, ?::BIGINT AS segments
        )
    a  
        CREATE OR REPLACE TEMP TABLE final_source_rollup_kept AS
        SELECT * FROM (
            SELECT 'historical' AS source, ?::BIGINT AS segments
            UNION ALL
            SELECT 'recover_v1' AS source, ?::BIGINT AS segments
            UNION ALL
            SELECT 'recover_v2' AS source, ?::BIGINT AS segments
            UNION ALL
            SELECT 'missing' AS source, ?::BIGINT AS segments
        )
    a  
        CREATE OR REPLACE TEMP TABLE final_bucket_rollup_all AS
        SELECT * FROM (
            SELECT 'golden' AS bucket, ?::BIGINT AS segments
            UNION ALL
            SELECT 'redo' AS bucket, ?::BIGINT AS segments
            UNION ALL
            SELECT 'dispose' AS bucket, ?::BIGINT AS segments
            UNION ALL
            SELECT 'missing' AS bucket, ?::BIGINT AS segments
        )
    a  
        CREATE OR REPLACE TEMP TABLE final_bucket_rollup_kept AS
        SELECT * FROM (
            SELECT 'golden' AS bucket, ?::BIGINT AS segments
            UNION ALL
            SELECT 'redo' AS bucket, ?::BIGINT AS segments
            UNION ALL
            SELECT 'dispose' AS bucket, ?::BIGINT AS segments
            UNION ALL
            SELECT 'missing' AS bucket, ?::BIGINT AS segments
        )
    r   r   z'data/phase1_incremental/segment_map_v1/z9data/phase1_incremental/analytics_v1/video_rollup.parquetz$data/recover_v2_consolidated.parquetz'data/video_tts_classification_final.csvz%data/video_tts_dropped_by_channel.csv)deduped_transcript_backbonephase1_video_rollup_v1recover_v2_consolidatedfinal_video_selectionchannel_dropped_videosF)	original_start_msoriginal_end_mstrimmed_start_mstrimmed_end_msleading_pad_mstrailing_pad_msparent_segment_fileis_split_segmentsplit_index_from_id)-transcribed_tars_persist_polished_child_audio'local_replay_ledgers_present_under_data)split_lineage_present_in_segment_backbone0millisecond_cutpoint_columns_exist_but_are_emptycutpoint_fieldsz%final_data/video_rollup_final.parquetz1final_data/video_rollup_final_kept_subset.parquetz.final_data/videos_missing_final_validation.csvz:final_data/videos_missing_final_validation_kept_subset.csvz#final_data/source_breakdown_all.csvz+final_data/source_breakdown_kept_subset.csvz#final_data/bucket_breakdown_all.csvz+final_data/bucket_breakdown_kept_subset.csvzfinal_data/action_rollup.csv)	video_rollup_finalvideo_rollup_final_kept_subsetvideos_missing_final_validation+videos_missing_final_validation_kept_subsetsource_breakdown_allsource_breakdown_kept_subsetbucket_breakdown_allbucket_breakdown_kept_subsetaction_rollup)generated_at_epoch_s	elapsed_ssource_artifactsreality_checksworking_outputszsummary_all.jsonzsummary_kept_subset.jsonzcutpoint_coverage.jsonzdataset_inventory.json)summary_allsummary_kept_subsetcutpoint_coverage	inventoryr   )r<   mkdirduckdbconnectr!   timeVIDEO_ROLLUP_V1FINAL_VIDEO_SELECTIONSEGMENT_MAP_GLOB
RECOVER_V2r4   RAW_TXr"   r(   r;   r>   r:   r   printr   r   )r   startedraw_tx_rowstranscribed_summary_allv1_summary_allsource_rollup_allv2_summary_allfinal_summary_allr   kept_selectiontranscribed_summary_keptv1_summary_keptsource_rollup_keptv2_summary_keptfinal_summary_keptall_summarykept_summaryr   r   r   r   main   st  









	














		
	
(r   __main__)r
   r   r   r   r   r   )r   r   r   r   r   r   )r)   )r*   r   r   r   )r5   r6   r7   r6   r   r8   )r   r   r   r   )r   r   )
__future__r   r   r   pathlibr   r   r/   r0   r1   r2   r-   r.   ROOTDATA_DIRr<   r=   r   r   r   r   r   r   r(   r4   r;   r>   r   __name__r   r   r   r   <module>   s>    

	


|   
