o
    lQi#                     @  s   d dl mZ d dlZd dlZd dlmZ d dlmZ ddlm	Z	m
Z
 edZed d	 Zed
 d Zed
 d Zed
 d ZdddZdddZdddZeG dd dZdS )     )annotationsN)	dataclass)Path   )	EnvConfigSUPPORTED_LANGUAGESz/home/ubuntu/transcripts
final_datazIfinal_cleaned_segments_with_variants_rerouted_repetition_filtered.parquetdataztranscription_results.parquetzrecover_v2_consolidated.parquetzyoutube_video_metadata_all.csvnamestrdefaultintreturnc                 C  s.   zt t| t|W S  ty   | Y S w N)r   osgetenvr   
ValueError)r
   r    r   src/final_export_config.py_env_int   s
   r   boolc                 C  s&   t | }|d u r|S |  dv S )N>   1onyestrue)r   r   striplowerr
   r   rawr   r   r   	_env_bool   s   
r   	list[str]c                 C  s,   t | d}| s|S dd |dD S )N c                 S  s   g | ]
}|  r|  qS r   )r   ).0itemr   r   r   
<listcomp>&   s    z_env_csv.<locals>.<listcomp>,)r   r   r   splitr   r   r   r   _env_csv"   s   r'   c                   @  s  e Zd ZU ded< ded< ded< ded< ded< ded< ded	< d
ed< d
ed< d
ed< ded< ded< ded< ded< ded< ded< ded< ded< ded< ded< ded< ded< ded< ded< ded < ded!< ded"< ded#< ded$< ded%< d&ed'< d&ed(< d&ed)< d&ed*< ed@d,d-Zed@d.d/Zed@d0d1ZedAd2d3Zed@d4d5Z	ed@d6d7Z
edBd9d:ZdCd;d<ZdCd=d>Zd?S )DFinalExportConfigr   baser   run_idreference_modereference_bucketreference_prefixoutput_bucketoutput_prefixr   local_work_rootcanonical_segments_pathraw_transcripts_pathzPath | Nonevariants_pathvalidation_pathyoutube_meta_pathr    supported_languagesr   require_variantsrequire_validationr   microshard_target_rowsfinal_shard_target_rowspolish_threadsduckdb_threadscompactor_claim_limit
max_videos
max_shardsallow_partial_shardslanguage_filterslanguage_lease_secondsclaim_stale_after_sreference_download_concurrencycanonical_segments_r2_keyraw_transcripts_r2_keyz
str | Nonevalidation_r2_keyyoutube_meta_r2_keyvariants_r2_keyreference_manifest_r2_keyr   c                 C     | j jS r   )r)   	worker_idselfr   r   r   rL   N      zFinalExportConfig.worker_idc                 C  rK   r   )r)   gpu_typerM   r   r   r   rP   R   rO   zFinalExportConfig.gpu_typec                 C  rK   r   )r)   database_urlrM   r   r   r   rQ   V   rO   zFinalExportConfig.database_urlc                 C  rK   r   )r)   	mock_moderM   r   r   r   rR   Z   rO   zFinalExportConfig.mock_modec                 C     | j d dS )N/z/microshardsr/   rstriprM   r   r   r   microshard_prefix^      z#FinalExportConfig.microshard_prefixc                 C  rS   )NrT   z/shardsrU   rM   r   r   r   shard_prefixb   rX   zFinalExportConfig.shard_prefix'FinalExportConfig'c                 C  s  t  }tddtt  }tdd  }td|j}tdd}td|j}td	d
| }tdd }tdtt	}	tdtt
}
tdg }tdtt}| d[i d|d|d|d|d|dd|d|dttdttd d dttdttdttd ttd!|rt|nd d"|	rt|	nd d#|
rt|
nd d$|d%td&d'd(td)d'd*td+d,d-td.d/d0td1d2d3td4d5d6td7d8d9td:d;d<td=d;d>td?d'd@|dAtdBdCdDtdEdFdGtdHd2dItdJ|d dKdLtdM|d dNdOtdP|d dQ p5d dRtdS|d dT pGd dUtdV|d dW pYd dXtdY|d dZ pnd S S )\NFINAL_EXPORT_RUN_IDzfinal-export-FINAL_EXPORT_REFERENCE_MODElocalFINAL_EXPORT_REFERENCE_BUCKETFINAL_EXPORT_REFERENCE_PREFIXzfinal-export-referenceFINAL_EXPORT_OUTPUT_BUCKETFINAL_EXPORT_OUTPUT_PREFIXzfinal-export/FINAL_EXPORT_VARIANTS_PATHr!   FINAL_EXPORT_VALIDATION_PATHFINAL_EXPORT_YOUTUBE_META_PATHFINAL_EXPORT_LANG_FILTERSFINAL_EXPORT_SUPPORTED_LANGSr)   r*   r+   r,   r-   rT   r.   r/   r0   FINAL_EXPORT_LOCAL_WORK_ROOTtmpfinal_exportr1   $FINAL_EXPORT_CANONICAL_SEGMENTS_PATHr2   !FINAL_EXPORT_RAW_TRANSCRIPTS_PATHr3   r4   r5   r6   r7   FINAL_EXPORT_REQUIRE_VARIANTSFr8   FINAL_EXPORT_REQUIRE_VALIDATIONr9   #FINAL_EXPORT_MICROSHARD_TARGET_ROWSi  r:   FINAL_EXPORT_FINAL_SHARD_ROWSi:  r;   FINAL_EXPORT_POLISH_THREADS   r<   FINAL_EXPORT_DUCKDB_THREADS   r=   "FINAL_EXPORT_COMPACTOR_CLAIM_LIMIT    r>   FINAL_EXPORT_MAX_VIDEOSr   r?   FINAL_EXPORT_MAX_SHARDSr@   !FINAL_EXPORT_ALLOW_PARTIAL_SHARDSrA   rB   #FINAL_EXPORT_LANGUAGE_LEASE_SECONDSx   rC    FINAL_EXPORT_CLAIM_STALE_AFTER_Si  rD   +FINAL_EXPORT_REFERENCE_DOWNLOAD_CONCURRENCYrE   &FINAL_EXPORT_CANONICAL_SEGMENTS_R2_KEYz/canonical_segments.parquetrF   #FINAL_EXPORT_RAW_TRANSCRIPTS_R2_KEYz/raw_transcripts.parquetrG   FINAL_EXPORT_VALIDATION_R2_KEYz/validation.parquetrH    FINAL_EXPORT_YOUTUBE_META_R2_KEYz/youtube_meta.csvrI   FINAL_EXPORT_VARIANTS_R2_KEYz/variants.parquetrJ   &FINAL_EXPORT_REFERENCE_MANIFEST_R2_KEYz/manifest.jsonr   )r   r   r   r   timer   r   	r2_bucketr   DEFAULT_VALIDATION_PARQUETDEFAULT_YOUTUBE_METAr'   listr   rV   r   ROOTDEFAULT_CANONICAL_SEGMENTSDEFAULT_RAW_TRANSCRIPTSr   r   )clsr)   r*   r+   r,   r-   r.   r/   variants_rawvalidation_rawyoutube_rawrA   r6   r   r   r   from_envf   s   

	












 !%).388zFinalExportConfig.from_envc                 C  s  g }| j s| js|d | jjs|d | jdvr |d | jdkrg| j s3|d| j  | j sA|d| j  | j	rS| j
d u sN| j
 sS|d | jre| jd u s`| j se|d	 |S | jso|d
 | jsw|d | jr| js|d |S )NDATABASE_URL is requiredR2_ENDPOINT_URL is required>   r2r]   z3FINAL_EXPORT_REFERENCE_MODE must be 'local' or 'r2'r]   z&Canonical segments parquet not found: z#Raw transcripts parquet not found: zBFINAL_EXPORT_REQUIRE_VARIANTS=true but variants parquet is missingzFFINAL_EXPORT_REQUIRE_VALIDATION=true but validation parquet is missingz=FINAL_EXPORT_CANONICAL_SEGMENTS_R2_KEY is required in r2 modez:FINAL_EXPORT_RAW_TRANSCRIPTS_R2_KEY is required in r2 modezRFINAL_EXPORT_VALIDATION_R2_KEY is required in r2 mode when validation is mandatory)rR   rQ   appendr)   r2_endpoint_urlr+   r1   existsr2   r7   r3   r8   r4   rE   rF   rG   rN   errorsr   r   r   validate_for_video_stage   s2   











z*FinalExportConfig.validate_for_video_stagec                 C  s0   g }| j s| js|d | jjs|d |S )Nr   r   )rR   rQ   r   r)   r   r   r   r   r   validate_for_compactor   s   

z(FinalExportConfig.validate_for_compactorN)r   r   )r   r   )r   rZ   )r   r    )__name__
__module____qualname____annotations__propertyrL   rP   rQ   rR   rW   rY   classmethodr   r   r   r   r   r   r   r(   )   sf   
 
Lr(   )r
   r   r   r   r   r   )r
   r   r   r   r   r   )r
   r   r   r    r   r    )
__future__r   r   r   dataclassesr   pathlibr   configr   r   r   r   r   r   r   r   r   r'   r(   r   r   r   r   <module>   s"    



