o
    5i,                     @   s  d Z ddlZddlZddlmZ ddlmZmZmZm	Z	 ddl
mZ ddlmZ ddlmZ dd	lmZmZmZ dd
lmZmZmZmZmZmZ ddlmZ ddlmZ eG dd dZG dd dZ 				d#dee de!de!de!dee" dee fddZ#e$dkrddl%Z%e&e%j'dkre%j'd Z(e&e%j'dkre%j'd ndZ)edde)dZe  Z*e*+e(eZ,e-d  ddl.Z.e-e.j/e,dd!d" dS dS dS )$z
Gemini transcription module using Google AI Studio endpoints.
Handles audio transcription with structured output and multiple model support.
    N)Path)ListOptionalDictAny)	dataclass)genai)types   )GEMINI_API_KEYGEMINI_MODELSget_model_name)TranscriptionOutputTranscriptionResultSpeakerMetaget_transcription_promptget_user_promptTRANSCRIPTION_JSON_SCHEMA)
AudioChunk)romanize_for_alignmentc                   @   sR   e Zd ZU dZdZeed< dZee ed< dZ	e
ed< dZeed	< d
Zeed< dS )TranscriptionConfigz&Configuration for a transcription job.gemini-3-flash-previewmodellowthinking_levelg        temperatureTelugulanguage<   timeout_secN)__name__
__module____qualname____doc__r   str__annotations__r   r   r   floatr   r   int r(   r(   ?/home/ubuntu/maya3_transcribe/src/backend/gemini_transcriber.pyr      s   
 r   c                   @   s   e Zd ZdZddee fddZdedefdd	Zdedefd
dZ	de
dejfddZdede
deeef fddZdede
defddZ		ddee de
dee dee dee f
ddZdS )GeminiTranscriberzO
    Handles audio transcription using Gemini models via Google AI Studio.
    Napi_keyc                 C   s,   |pt | _| jstdtj| jd| _dS )z~
        Initialize the transcriber.
        
        Args:
            api_key: Gemini API key (defaults to env var)
        zGEMINI_API_KEY not set)r+   N)r   r+   
ValueErrorr   Clientclient)selfr+   r(   r(   r)   __init__.   s   
zGeminiTranscriber.__init__	file_pathreturnc                 C   s,   t |j }ddddddd}||dS )z(Determine MIME type from file extension.z
audio/flacz	audio/wavz
audio/mpegz	audio/oggz	audio/mp4z	audio/aac)z.flacz.wavz.mp3z.oggz.m4az.aac)r   suffixlowerget)r/   r1   ext
mime_typesr(   r(   r)   _get_mime_type;   s   z GeminiTranscriber._get_mime_typec                 C   s6   t |d}| W  d   S 1 sw   Y  dS )zLoad audio file as bytes.rbN)openread)r/   r1   fr(   r(   r)   _load_audio_bytesH   s   $z#GeminiTranscriber._load_audio_bytesconfigc                 C   sz   t |j}t|i }d}|dr#|jr#|dr#tj|j d}tj	|j
dttjjt|jdgd}|r;||_|S )z-Build the generation config for the API call.Nsupports_thinkingzgemini-3)r   zapplication/jsontext)r   response_mime_typeresponse_json_schemasystem_instruction)r   r   r   r5   r   
startswithr	   ThinkingConfigupperGenerateContentConfigr   r   Part	from_textr   r   thinking_config)r/   r>   
model_name
model_inforK   
gen_configr(   r(   r)   _build_configM   s$   

	zGeminiTranscriber._build_config
audio_pathc              
   C   sT  t |j}| |}| |}tjdtjj||dtjjt	 dgdg}| 
|}t }z[| jjj|||d}	t | }
|	jrddl}||	j}|
|d< ||d	< |j|d
< |dd}|rkd|vrkt||d< d|vr|dd}tdd| |d< |W S d|
|dW S  ty } zt | }
t||
|dW  Y d}~S d}~ww )z
        Transcribe a single audio file.
        
        Args:
            audio_path: Path to the audio file
            config: Transcription configuration
            
        Returns:
            Dictionary with transcription results
        user)	mime_typedatar@   )roleparts)r   contentsr>   r   N_processing_time_sec_model_thinking_leveltranscription 	romanizedcode_switchtaggedz
\[[\w_]+\]zEmpty response from model)errorrW   rX   )r   r   r=   r8   r	   ContentrI   
from_bytesrJ   r   rO   timer.   modelsgenerate_contentrA   jsonloadsr   r5   r   resubstrip	Exceptionr$   )r/   rP   r>   rL   audio_bytesrR   rV   rN   
start_timeresponseprocessing_timere   resultnativer^   er(   r(   r)   transcribe_audioh   s^   




z"GeminiTranscriber.transcribe_audiochunkc                 C   s   |  |j|}|d}|r'td| dd| dd| dd| dd}n1|d}t|tr8tdi |nd}t|dd|d	d|d
d|dd||ddd}t|j|j	|j
|j|j||d|j|d|dd	S )a  
        Transcribe an AudioChunk and return structured result.
        
        Args:
            chunk: AudioChunk to transcribe
            config: Transcription configuration
            
        Returns:
            TranscriptionResult with full metadata
        r_   z[ERROR: ])rZ   r]   r\   r^   speakerNrZ   r[   r]   r\   r^   detected_language)rZ   r]   r\   r^   ru   rv   rX   rY   rW   )	
segment_idchunk_indextotal_chunksduration_secr   rZ   
model_usedr   processing_time_secr(   )rr   r1   r5   r   
isinstancedictr   r   original_segmentrx   ry   rz   r   r   )r/   rs   r>   
raw_result	error_msgrZ   speaker_dataru   r(   r(   r)   transcribe_chunk   s<   










	z"GeminiTranscriber.transcribe_chunkchunks
max_chunksprogress_callbackc           
      C   s   |r|d| }g }t |}td| d|j d t|D ]?\}}|r,||d | ntd|d  d| d|j d|jd  d|j d	 | ||}	||	 ||d k r]t	
d
 qtdt | d |S )a  
        Transcribe multiple audio chunks.
        
        Args:
            chunks: List of AudioChunks to transcribe
            config: Transcription configuration
            max_chunks: Maximum chunks to process (for testing)
            progress_callback: Optional callback(current, total) for progress updates
            
        Returns:
            List of TranscriptionResults
        Nz[Transcriber] Processing z chunks with z...r
   /z: z (chunk )g      ?z[Transcriber] Completed z transcriptions)lenprintr   	enumerater   rx   ry   r   appendrb   sleep)
r/   r   r>   r   r   resultstotalirs   ro   r(   r(   r)   transcribe_batch   s(   


z"GeminiTranscriber.transcribe_batch)N)NN)r    r!   r"   r#   r   r$   r0   r8   bytesr=   r   r	   rH   rO   r   r   rr   r   r   r   r   r'   callabler   r(   r(   r(   r)   r*   )   sB    

P
9r*   r   r   highsegmentsr   r   r   max_segmentsr2   c                 C   s$   t |||d}t }|j| ||dS )a  
    Convenience function to transcribe a list of audio segments.
    
    Args:
        segments: List of AudioChunk objects
        language: Primary language of the audio
        model: Gemini model to use
        thinking_level: Thinking level for Gemini 3 models
        max_segments: Max segments to process (for testing)
        
    Returns:
        List of TranscriptionResult objects
    r   r   r   )r   )r   r*   r   )r   r   r   r   r   r>   transcriberr(   r(   r)   transcribe_segments  s   r   __main__   r   r   z
Transcription Result:F)indentensure_ascii)r   r   r   N)0r#   rg   rb   pathlibr   typingr   r   r   r   dataclassesr   googler   google.genair	   r>   r   r   r   transcription_schemar   r   r   r   r   r   audio_processorr   src.romanizationr   r   r*   r$   r'   r   r    sysr   argvrP   r   r   rr   ro   r   re   dumpsr(   r(   r(   r)   <module>   sd      t

