o
    lQi:                     @  s:  d dl mZ d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	m
Z
 d dlmZ d dlmZ d dlZd dlZd dlZd dlmZ ddlmZ eeZedZe
G d	d
 d
ZdMddZdNddZ dOddZ!dPddZ"dQddZ#dRd!d"Z$dSd(d)Z%dTd+d,Z&dUd1d2Z'dVd:d;Z(dWdCdDZ)dXdKdLZ*dS )Y    )annotationsN)	dataclass)Path)Any   )FinalExportConfigz
\[[^\]]+\]c                   @  sn   e Zd ZU ded< ded< ded< ded< ded< ded< ded	< d
ed< d
ed< d
ed< d
ed< d
ed< dS )PackArtifactsr   metadata_pathaudio_tar_pathaudio_index_pathmanifest_pathint	row_countmember_countsum_flac_bytesstrmetadata_sha256audio_sha256audio_index_sha256manifest_sha256segment_id_set_sha256N)__name__
__module____qualname____annotations__ r   r   src/final_export_common.pyr      s   
 r   payloaddict[str, Any]returnr   c                 C  s   t j| ddddd S )NF   T)ensure_asciiindent	sort_keys
)jsondumps)r   r   r   r   stable_json_dumps+   s   r'   pathr   Nonec                 C  s   |  t| d S N)
write_textr'   )r(   r   r   r   r   
write_json/   s   r,   databytesc                 C  s   t |  S r*   )hashlibsha256	hexdigest)r-   r   r   r   sha256_bytes3   s   r2   c                 C  s^   t  }| d}	 |d}|sn|| qW d    | S 1 s&w   Y  | S )NrbTi   )r/   r0   openreadupdater1   )r(   hhandlechunkr   r   r   sha256_file7   s   


r:   values	list[str]c                 C  s"   d t| d}t| S )Nr$   zutf-8)joinsortedencoder/   r0   r1   )r;   materialr   r   r   sha256_string_setB   s   rA   
segment_idc                 C  s   |  dr| S |  dS )Nz.flac)endswithrB   r   r   r   segment_audio_member_nameG   s   rE   original_file	was_splitboolsplit_indexr   c                 C  s   |r	|  d| S | S )N_splitr   )rF   rG   rI   r   r   r   replay_segment_idK   s   rK   textc                 C  s$   t d| pddddd S )N  z[UNK]z[INAUDIBLE])_TAG_REsubreplacestrip)rL   r   r   r   clean_transcript_textQ   s   $rS   audio
np.ndarraysr
transcriptc              
   C  sh  t | dkrd|d d dddddS tj| tjd} tt | t| }t | r4tttt| nd}t | rDttt	| nd}|dkrLd ntdt
| }|dkr[d ntdt
| }d}t | dkrttt	tt| tj}tt|d d}	tt | |	 d}
t | |	kr| d |
|	  n| }t ||	k rtj|gtjd}n||
|	}ttjt|dd	}t|d
 d}t |rtt||knd}t|}|dkrdntt || }dd | D }|dkrdntt || }t|dt||d u rd nt|d|d u rd nt|dt|dt|dt|dt|ddS )Nr   g        g      ?)
duration_ssample_rate_hzrms_dbfs	peak_dbfszero_crossing_ratesilence_fractionchars_per_secwords_per_sec)dtypeg      4@r   g{Gz?)axisg?g-C6?c                 S  s   g | ]}|r|qS r   r   ).0partr   r   r   
<listcomp>y   s    z)compute_audio_metrics.<locals>.<listcomp>   )lennpasarrayfloat32floatsqrtmeansquaremaxabsmathlog10diffsignbitastypeint8r   arrayreshaperS   splitround)rT   rV   rW   rX   rmspeakrZ   r[   zcr	frame_lenframe_counttrimmed	frame_rmsframessilence_thresholdr]   
clean_textr^   wordsr_   r   r   r   compute_audio_metricsU   sR   & ( r   canonical_row
raw_tx_rowdict[str, Any] | Nonevariant_rowvalidation_rowvideo_metadataexport_provenancec                 C  s  |  dd|  ddt|  dd|  d|  d|  d|  d	|  d
|  d|  dd
|p2i |  dd|  dd|p?i  dd|pFi  ddd|pOi |pRi |  dd|  dd|  dd|  dd|  dd|  dd|  ddd|pzi |d}tj|dddS )Nsegment_filerN   parent_segment_fileis_split_segmentFsplit_index_from_idoriginal_start_msoriginal_end_mstrimmed_start_mstrimmed_end_msleading_pad_mstrailing_pad_ms)
r   r   r   r   r   r   r   r   r   r   transcriptiontagged)canonical_transcriptioncanonical_taggedraw_transcription
raw_taggedsegment_languagetx_detected_languagegemini_langcorrected_languagequeue_languageyoutube_audio_languageyoutube_default_language)r   r   r   r   r   r   r   )replay_provenancesource_row_provenancetranscript_provenancevariant_provenancevalidation_provenancelanguage_evidencer   r   T)r!   r#   )getrH   r%   r&   )r   r   r   r   r   r   r   r   r   r   build_meta_information   s>   










	"r   video_idpolished_segmentr   run_id	worker_idexported_atdict[str, dict[str, Any]]c                 C  s  t |dp|dpd}t |dp|dpd}t |dp#d}t |dp,d}	t|j|j|}
t|j}t |dpBd}t|}tt	
|	}|d|d|d	|d
|d|d|d|dd}|d|d|d|d|dd}|d|d|d|d|d|d|d|dd}|d|d|d|d |d!|d"|d#d$}t||||||||d%d&d'}i d(| d)|d*t |d*p|jjd+t|d,|jjd-t|d.p|jjpd/d0t |d0pdd1t |d2p"|d3p"ddt |dp-ddt |dp8dd4|d5|d6|d7|	d8|d9|
d9 d:|
d: d;|
d; |
d< |
d= |
d> |
d? |
d@ |dAt |dpsd|t|j|dB
}| |||j||
d9 dC}||dDS )ENnative_script_textr   rN   romanized_textr   r   r   r   raw_detected_languageraw_quality_scorespeaker_emotionspeaker_stylespeaker_pacespeaker_accent)r   r   detected_languagequality_scorer   r   r   r   input_script_profilevariant_routevariant_validation_errors)r   r   r   processing_routevalidation_errorsfinal_validation_sourcefinal_has_validationfinal_bucketlid_consensuslid_agree_countconsensus_langconformer_multi_ctc_normalizedmms_confidence)r   r   r   r   r   r   r   r   r   r   
channel_idchannel_titletitledescriptiontags)r   r   r   r   r   r   r   flac_sha256)r   r   r   audio_sha256_type)r   r   r   r   r   r   r   rB   r   is_split_partr   rI   r   r   r   video_languager   r   transcription_nativetranscription_romanizedtranscription_mixedtranscription_taggedhas_audio_tagrX   rY   rZ   r[   r\   r]   r^   r_   tx_quality_score)
r[   r\   r]   r^   r_   r   r   r   flac_size_bytesmeta_information)r   rB   tar_member_name
flac_bytesr   audio_duration_s)metadata_row	audio_row)r   r   r   rT   rV   r2   r   rE   rH   rO   searchr   	trim_metarF   rG   r   rI   rf   )r   r   r   r   r   r   r   r   r   r   metricsr   rB   member_namer   r   r   r   r   r   r   r   r   r   r   build_export_segment_payload   s   	
		 $
r   pack_dirmanifest_namemetadata_rowslist[dict[str, Any]]
audio_rowsmanifest_payloadc                 C  s  | j ddd | d }| d }| d }| | }tjtj||dd g }	g }
d}t|d	J}|D ]?}|d
 }|d }tj|d}t	||_
||t| |
| |t	|7 }|	|d |d |t	||d |d d q2W d    n1 s|w   Y  tjtj|	|dd dd |D }t|}t|}t|}i |t	|t	|	t	|
|t||||| j| j| jd}t|| t|}t||||t	|t	|
|||||t|dS )NT)parentsexist_okzmetadata.parquetz	audio.tarzaudio_index.parquetzstd)compressionr   wr   r   )namerB   r   r   r   )rB   r   r   r   r   r   c                 S  s   g | ]}t |d  qS rD   )r   )rb   rowr   r   r   rd   T  s    z(build_pack_artifacts.<locals>.<listcomp>)metadata_row_countaudio_index_row_countaudio_tar_member_countr   r   r   r   audio_tar_sha256metadata_size_bytesaudio_index_size_bytesaudio_tar_size_bytes)r	   r
   r   r   r   r   r   r   r   r   r   r   )mkdirpqwrite_tablepaTablefrom_pylisttarfiler4   TarInforf   sizeaddfileioBytesIOappendr:   rA   statst_sizer,   r   )r   r   r   r   r   r	   r
   r   r   audio_index_rowsmember_namesr   tfitemr   r   infosegment_idsr   r   r   pack_manifestr   r   r   r   build_pack_artifacts+  s   


r  )r   r   r   r   )r(   r   r   r   r   r)   )r-   r.   r   r   )r(   r   r   r   )r;   r<   r   r   )rB   r   r   r   )rF   r   rG   rH   rI   r   r   r   )rL   r   r   r   )rT   rU   rV   r   rW   r   r   r   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   r   r   r   r   )+
__future__r   r/   r  r%   loggingrp   rer   dataclassesr   pathlibr   typingr   duckdbnumpyrg   pyarrowr   pyarrow.parquetparquetr   final_export_configr   	getLoggerr   loggercompilerO   r   r'   r,   r2   r:   rA   rE   rK   rS   r   r   r   r  r   r   r   r   <module>   s@    











3
.u