o
    Тi                     @  s   d Z ddlmZ ddlZddlZddlZddlmZ ddlm	Z	 ddl
mZmZmZ eeZdd	d
dddddddddZedde d ejZedejZedejZedejZeG dd dZ	d0d1d%d&Zd2d.d/ZdS )3z
Tier 1 programmatic validation: instant, free, runs on every segment.
Computes quality_score (0-1) and lane flags.
Optional Tier 1.5 GPU validation is flag-gated.
    )annotationsN)	dataclass)Optional   )LANGUAGE_MAPAUDIO_EVENT_TAGSVALIDATOR_VERSION)i 	  i	  )i   i  )i  i  )i  i  )i   i  )i
  i
  )i 
  i
  )i	  i	  )i   i  )r      )
DevanagariTeluguTamilKannada	MalayalamGujaratiGurmukhiBengaliAssameseOdiaLatinz\[(|z)\]z\[UNK\]z\[INAUDIBLE\]z\[NO_SPEECH\]c                   @  s   e Zd ZU ded< dZded< dZded< dZded	< d
Zded< dZded< dZ	ded< dZ
ded< dZded< dZded< dZded< dZded< dZded< dZded< dZded< dZded< dZded< dZded< eZded< dd  ZdS )!ValidationResultstr
segment_id      ?floatquality_scoreFboolis_emptyis_no_speech        chars_per_secondTlength_ratio_okscript_check_oklang_mismatchtag_consistency_okr   intnum_unknum_inaudiblenum_event_tagsoverlap_suspectedboundary_scoreasr_eligibletts_clean_eligibletts_expressive_eligibleNz	list[str]flagsvalidator_versionc                 C  s   | j d u r
g | _ d S d S N)r.   )self r2   )/home/ubuntu/transcripts/src/validator.py__post_init__=   s   

zValidationResult.__post_init__)__name__
__module____qualname____annotations__r   r   r   r    r!   r"   r#   r$   r&   r'   r(   r)   r*   r+   r,   r-   r.   r   r/   r4   r2   r2   r2   r3   r   '   s*   
 r   r   r   transcription_datadictexpected_languageaudio_duration_sr   	trim_metaOptional[dict]returnc                   s  t | d}|dd}|dd}|dd}|di }	|r#| s:d|_d|_d	|_d	|_d	|_|j	d
 |S t
|rPd|_d|_d	|_d	|_d	|_|S td| }
td|
 }
td|
 }
|dkrt|
| |_|jdk s}|jdkrd	|_|j	d|jd |tv rt| \}}}|tv r|dkrt| \ t fdd|
D }tdd |
D }|| }|dkr|| }|dk r|dkrd	|_|j	d|d |r||krd|_|j	d| d|  td| }| }||kr|dd|ddkrd	|_|j	d tt||_tt||_tt||_t|
 }|dkrP|j|j | }|dkrP|j	d|d |rz|dd	}|d d	}|ri| j d!8  _ |rs| j d!8  _ t!d|j |_ d}|js|d"7 }|js|d!7 }|jr|d7 }|js|d7 }|jd#kr|t"d!|jd$ 7 }|j dk r|d|j  d" 7 }t!dd| |_|jdko|j |_|jd%ko|j d&ko|j o|jdko|jdk|_|jo|jd#ko|jd&k|_|S )'z@Run Tier 1 programmatic checks on a single transcription result.)r   transcription taggeddetected_languagespeakerTr   Fempty_transcriptiong      ?r   r   g      I@zsuspicious_length_ratio:z.1fr   c                 3  s0    | ]}t |  kr krn nd V  qdS )r   N)ord.0chilor2   r3   	<genexpr>t      . z)validate_transcription.<locals>.<genexpr>c                 s  s0    | ]}t |d rt|dk rdV  qdS )L   r   N)unicodedatacategory
startswithrF   rG   r2   r2   r3   rM   u   rN   g?enzlow_expected_script_ratio:z.2fzlang_mismatch:expected=z
,detected= tag_text_mismatchg333333?zhigh_unk_density:abrupt_start
abrupt_endg?g333333?   g?gffffff?g?)#r   getstripr   r   r+   r,   r-   r.   appendNO_SPEECH_PATTERNsearchr   TAG_PATTERNsubUNK_PATTERNINAUDIBLE_PATTERNlenr    r!   r   SCRIPT_RANGESsumr"   r#   replacer$   findallr&   r'   r(   splitr*   maxmin)r   r9   r;   r<   r=   resultr@   rB   detected_langrD   
clean_text_script_namescript_charslatin_charstotal_alphaexpected_ratiostripped_taggedstripped_transcriptiontotal_wordsunk_densityrW   rX   	penaltiesr2   rJ   r3   validate_transcriptionB   s   




ry   	responses
list[dict]audio_durationsdict[str, float]
trim_metasdict[str, dict]list[ValidationResult]c           
   	   C  sV   g }| D ]$}| dd}| di }| |d}| |}	|t|||||	 q|S )z,Validate a batch of transcription responses.r   unknownr9   g      @)rZ   r\   ry   )
rz   r;   r|   r~   resultsrespseg_iddatadurationtrimr2   r2   r3   validate_batch   s   
r   r0   )r   r   r9   r:   r;   r   r<   r   r=   r>   r?   r   )
rz   r{   r;   r   r|   r}   r~   r   r?   r   )__doc__
__future__r   loggingrerQ   dataclassesr   typingr   configr   r   r   	getLoggerr5   loggerrd   compilejoin
IGNORECASEr_   ra   rb   r]   r   ry   r   r2   r2   r2   r3   <module>   s>    
 