o
    i#J                  
   @   s  d Z ddlZddlZddlZddlmZmZ ddlmZm	Z	m
Z
 ddlmZ ddlmZ ddlmZmZmZmZmZmZmZmZ ddlmZ dd	lmZmZmZ dd
lmZ ddl m!Z! ddl"m#Z#m$Z$ ddl%m&Z& ddl'm(Z( zddl)m*Z+ ddl,m-Z- W n  e.y Z/ ze0de/  e0d e1de/ dZ/[/ww dZ2de3de4de3fddZ5de#de
e3 fddZ6eG dd deZ7G dd  d e!Z8dS )!zGradium's speech-to-text service implementation.

This module provides integration with Gradium's real-time speech-to-text
WebSocket API for streaming audio transcription.
    N)	dataclassfield)AnyAsyncGeneratorOptional)logger)	BaseModel)CancelFrameEndFrameFrameInterimTranscriptionFrame
StartFrameTranscriptionFrameVADUserStartedSpeakingFrameVADUserStoppedSpeakingFrame)FrameDirection)	NOT_GIVENSTTSettings	_NotGiven)GRADIUM_TTFS_P99)WebsocketSTTService)Languageresolve_language)time_now_iso8601)
traced_stt)connect)StatezException: zIIn order to use Gradium, you need to `pip install "pipecat-ai[gradium]"`.zMissing module: g?encodingsample_ratereturnc                 C   sJ   | dkr#| dkr dS  dkr dS dkrdS t d| d	 dS | S )
a  Build Gradium input_format from encoding type and sample rate.

    For PCM encoding, appends the sample rate (e.g., "pcm_16000").
    For other encodings (wav, opus), returns the encoding as-is.

    Args:
        encoding: Base encoding type ("pcm", "wav", or "opus").
        sample_rate: Audio sample rate in Hz.

    Returns:
        The full input_format string for the Gradium API.
    pcmi@  pcm_8000i>  	pcm_16000i]  	pcm_24000z+GradiumSTTService: unsupported sample rate z" for PCM encoding, using pcm_16000)r   warning)r   r    r%   P/home/ubuntu/.local/lib/python3.10/site-packages/pipecat/services/gradium/stt.py_input_format_from_encoding5   s   


r'   languagec              
   C   s0   t jdt jdt jdt jdt jdi}t| |ddS )zConvert a Language enum to Gradium's language code format.

    Args:
        language: The Language enum value to convert.

    Returns:
        The Gradium language code string or None if not supported.
    deenesfrptT)use_base_code)r   DEENESFRPTr   )r(   LANGUAGE_MAPr%   r%   r&   language_to_gradium_languageQ   s   
r5   c                   @   s0   e Zd ZU dZedd dZee eB e	d< dS )GradiumSTTSettingsa^  Settings for GradiumSTTService.

    Parameters:
        delay_in_frames: Delay in audio frames (80ms each) before text is
            generated. Higher delays allow more context but increase latency.
            Allowed values: 7, 8, 10, 12, 14, 16, 20, 24, 36, 48.
            Default is 10 (800ms). Lower values like 7-8 give faster response.
    c                   C   s   t S N)r   r%   r%   r%   r&   <lambda>p   s    zGradiumSTTSettings.<lambda>)default_factorydelay_in_framesN)
__name__
__module____qualname____doc__r   r:   r   intr   __annotations__r%   r%   r%   r&   r6   e   s   
 "	r6   c                       s  e Zd ZU dZeZeed< G dd deZdddddde	dd	e
d
e
de
dee dee dee
 dee dee f fddZdefddZdedee
ef f fddZdef fddZdef fddZdef fddZd d! Zded"ef fd#d$Zd%d& Zd'e de!edf fd(d)Z"e#d*e
d+ed,e$fd-d.Z% fd/d0Z&d1d2 Z' fd3d4Z(d5d6 Z)d7d8 Z*d9d: Z+d;e
fd<d=Z,d>d? Z-d@dA Z.dBdC Z/  Z0S )DGradiumSTTServicezGradium real-time speech-to-text service.

    Provides real-time speech transcription using Gradium's WebSocket API.
    Supports both interim and final transcriptions with configurable parameters
    for audio processing and connection management.
    	_settingsc                   @   s2   e Zd ZU dZdZee ed< dZee	 ed< dS )zGradiumSTTService.InputParamsa  Configuration parameters for Gradium STT API.

        .. deprecated:: 0.0.105
            Use ``settings=GradiumSTTService.Settings(...)`` instead.

        Parameters:
            language: Expected language of the audio (e.g., "en", "es", "fr").
                This helps ground the model to a specific language and improve
                transcription quality.
            delay_in_frames: Delay in audio frames (80ms each) before text is
                generated. Higher delays allow more context but increase latency.
                Allowed values: 7, 8, 10, 12, 14, 16, 20, 24, 36, 48.
                Default is 10 (800ms). Lower values like 7-8 give faster response.
        Nr(   r:   )
r;   r<   r=   r>   r(   r   r   r@   r:   r?   r%   r%   r%   r&   InputParams~   s   
 rC   z&wss://eu.api.gradium.ai/api/speech/asrr    N)api_endpoint_base_urlr   r   paramsjson_configsettingsttfs_p99_latencyapi_keyrD   r   r   rE   rF   rG   rH   c                   s   |durddl }
|
jdtdd | jdddd}|dur0| d |s0|j|_|jdur0|j|_|dur9|| t j	d|||d	|	 || _
|| _|| _d| _|| _d| _d
| _t | _d| _d| _g | _d| _d| _dS )a  Initialize the Gradium STT service.

        Args:
            api_key: Gradium API key for authentication.
            api_endpoint_base_url: WebSocket endpoint URL. Defaults to Gradium's streaming endpoint.
            encoding: Base audio encoding type. One of "pcm", "wav", or "opus".
                For PCM, the sample rate is appended automatically from the
                pipeline's audio_in_sample_rate (e.g., "pcm" becomes "pcm_16000").
                Defaults to "pcm".
            sample_rate: Audio sample rate in Hz. If None, uses the pipeline
                sample rate.
            params: Configuration parameters for language and delay settings.

                .. deprecated:: 0.0.105
                    Use ``settings=GradiumSTTService.Settings(...)`` instead.

            json_config: Optional JSON configuration string for additional model settings.

                .. deprecated:: 0.0.101
                    Use `params` instead for type-safe configuration.

            settings: Runtime-updatable settings. When provided alongside deprecated
                parameters, ``settings`` values take precedence.
            ttfs_p99_latency: P99 latency from speech end to final transcript in seconds.
                Override for your deployment. See https://github.com/pipecat-ai/stt-benchmark
            **kwargs: Additional arguments passed to parent STTService class.
        Nr   zdParameter 'json_config' is deprecated and will be removed in a future version, use 'params' instead.   )
stackleveldefault)modelr(   r:   rE   )r   rH   rG    P   r%   )warningswarnDeprecationWarningSettings"_warn_init_param_moved_to_settingsr(   r:   apply_updatesuper__init___api_key_api_endpoint_base_url	_encoding
_websocket_json_config_receive_task_input_format	bytearray_audio_buffer_chunk_size_ms_chunk_size_bytes_accumulated_text_flush_counter_transcript_aggregation_task)selfrI   rD   r   r   rE   rF   rG   rH   kwargsrP   default_settings	__class__r%   r&   rW      sP   (	



zGradiumSTTService.__init__r   c                 C   s   dS )zzCheck if the service can generate metrics.

        Returns:
            True if metrics generation is supported.
        Tr%   rf   r%   r%   r&   can_generate_metrics   s   z&GradiumSTTService.can_generate_metricsdeltac                    sB   t  |I dH }|s|S | jr|  I dH  |  I dH  |S )zApply a settings delta, sync params, and reconnect.

        Args:
            delta: A :class:`STTSettings` (or ``GradiumSTTService.Settings``) delta.

        Returns:
            Dict mapping changed field names to their previous values.
        N)rV   _update_settingsr[   _disconnect_connect)rf   rm   changedri   r%   r&   rn      s   	z"GradiumSTTService._update_settingsframec                    sP   t  |I dH  t| j| j| _t| j| j d d | _| 	 I dH  dS )zmStart the speech-to-text service.

        Args:
            frame: Start frame to begin processing.
        NrJ   i  )
rV   startr'   rZ   r   r^   r?   ra   rb   rp   rf   rr   ri   r%   r&   rs     s
   zGradiumSTTService.startc                    &   t  |I dH  |  I dH  dS )ziStop the speech-to-text service.

        Args:
            frame: End frame to stop processing.
        N)rV   stopro   rt   ri   r%   r&   rv        zGradiumSTTService.stopc                    ru   )zoCancel the speech-to-text service.

        Args:
            frame: Cancel frame to abort processing.
        N)rV   cancelro   rt   ri   r%   r&   rx   "  rw   zGradiumSTTService.cancelc                    s   |   I dH  dS )zBStart performance metrics collection for transcription processing.N)start_processing_metricsrk   r%   r%   r&   _start_metrics+  s   z GradiumSTTService._start_metrics	directionc                    sR   t  ||I dH  t|tr|  I dH  dS t|tr'|  I dH  dS dS )zProcess incoming frames and handle speech events.

        Args:
            frame: The frame to process.
            direction: Direction of frame flow in the pipeline.
        N)rV   process_frame
isinstancer   rz   r   _send_flush)rf   rr   r{   ri   r%   r&   r|   /  s   

zGradiumSTTService.process_framec              
      s   | j r| j jtjurdS |  jd7  _t| j}d|d}z| j t|I dH  W dS  t	yH } zt
d|  W Y d}~dS d}~ww )a  Send a flush request to process any buffered audio immediately.

        Sends a flush message to tell the server to process buffered audio.
        The server responds with text fragments followed by a "flushed"
        acknowledgment, which triggers finalization.
        N   flush)typeflush_idzFailed to send flush: )r[   stater   OPENrd   strsendjsondumps	Exceptionr   r$   )rf   r   msger%   r%   r&   r~   =  s   

zGradiumSTTService._send_flushaudioc                 C  s   | j | t| j | jkrMt| j d| j }| j | jd | _ t|d}d|d}| jrE| jj	t
ju rE| jt|I dH  t| j | jksdV  dS )zProcess audio data for speech-to-text conversion.

        Args:
            audio: Raw audio bytes to process.

        Yields:
            None (processing handled via WebSocket messages).
        Nzutf-8r   )r   r   )r`   extendlenrb   bytesbase64	b64encodedecoder[   r   r   r   r   r   r   )rf   r   chunkr   r%   r%   r&   run_sttO  s   	

zGradiumSTTService.run_stt
transcriptis_finalr(   c                    s   dS )z'Record transcription event for tracing.Nr%   )rf   r   r   r(   r%   r%   r&   _trace_transcriptiond  s   z&GradiumSTTService._trace_transcriptionc                    sL   t   I d H  |  I d H  | jr"| js$| | | j| _d S d S d S r7   )rV   rp   _connect_websocketr[   r]   create_task_receive_task_handler_report_errorrk   ri   r%   r&   rp   i  s   zGradiumSTTService._connectc              
      sx  z| j r| j jtju rW d S td | j}| jdd}t||dI d H | _ | 	dI d H  d| j
j| jd}i }| jrCt| j}| j
jrSt| j
j}|rS||d< | j
jr]| j
j|d	< |rc||d
< | j t|I d H  | j  I d H }t|}|d dkrtd|d  |d dkrtd|d  td W d S  ty } z| jd| |dI d H   d }~ww )NzConnecting to Gradium STTpipecat)z	x-api-keyzx-api-source)additional_headerson_connectedsetup)r   
model_nameinput_formatr(   r:   rF   r   errorzreceived error messagereadyzunexpected first message type zConnected to Gradium STTUnknown error occurred: 	error_msg	exception)r[   r   r   r   r   debugrY   rX   websocket_connect_call_event_handlerrB   rM   r^   r\   r   loadsr(   r5   r:   r   r   recvr   
push_error)rf   ws_urlheaders	setup_msgrF   gradium_language	ready_msgr   r%   r%   r&   r   q  sT   

z$GradiumSTTService._connect_websocketc                    sp   t   I d H  | jr| | jI d H  d | _| j  d| _| jr/| | jI d H  d | _|  I d H  d S )Nr   )	rV   ro   re   cancel_taskrc   clearrd   r]   _disconnect_websocketrk   ri   r%   r&   ro     s   
zGradiumSTTService._disconnectc              
      s   zUz| j r| j jtju rtd | j  I d H  W n ty: } z| jd| |dI d H  W Y d }~nd }~ww W d | _ | 	dI d H  d S W d | _ | 	dI d H  d S d | _ | 	dI d H  w )NzDisconnecting from Gradium STTr   r   on_disconnected)
r[   r   r   r   r   r   closer   r   r   )rf   r   r%   r%   r&   r     s$   
&z'GradiumSTTService._disconnect_websocketc                 C   s   | j r| j S td)NzWebsocket not connected)r[   r   rk   r%   r%   r&   _get_websocket  s   z GradiumSTTService._get_websocketc              	      s   |   2 z[3 d H W }zt|}W n tjy$   td|  Y qw |dd}|dkr:| |d I d H  q|dkrF|  I d H  q|dkrPt	d q|dkr`| j
d	| d
I d H  q6 d S )NzReceived non-JSON message: r   rN   textflushedend_of_streamz*Received end_of_stream message from serverr   zError: )r   )r   r   r   JSONDecodeErrorr   r$   get_handle_text_handle_flushedr   r   )rf   r   r   type_r%   r%   r&   _receive_messages  s&   z#GradiumSTTService._receive_messagesr   c                    sP   | j | d| j }| t|| jt | jjdI dH  | 	 I dH  dS )zHandle streaming transcription fragment.

        Accumulates text and pushes an InterimTranscriptionFrame with the
        full accumulated text so far.
         )r   user_id	timestampr(   N)
rc   appendjoin
push_framer   _user_idr   rB   r(   stop_processing_metrics)rf   r   accumulatedr%   r%   r&   r     s   
zGradiumSTTService._handle_textc                    s0   | j r| | j I dH  | |  d| _ dS )a7  Handle flush completion by starting a transcript aggregation timer.

        The "flushed" message confirms that buffered audio has been processed,
        but text tokens may still arrive after this point. A short timer allows
        trailing tokens to accumulate before finalizing the transcription.
        Ntranscript_aggregation)re   r   r   _transcript_aggregation_handlerrk   r%   r%   r&   r     s   
z!GradiumSTTService._handle_flushedc                    s$   t tI dH  |  I dH  dS )zEWait for trailing tokens then finalize the accumulated transcription.N)asynciosleepTRANSCRIPT_AGGREGATION_DELAY_finalize_accumulated_textrk   r%   r%   r&   r     s   z1GradiumSTTService._transcript_aggregation_handlerc                    sz   | j sdS d| _d| j }| j   td| d | t|| jt	 | j
jI dH  | j|d| j
jdI dH  dS )z@Join accumulated text, push TranscriptionFrame, and clear state.Nr   zFinal transcription: []T)r   r(   )rc   re   r   r   r   r   r   r   r   r   rB   r(   r   )rf   r   r%   r%   r&   r     s    

z,GradiumSTTService._finalize_accumulated_text)1r;   r<   r=   r>   r6   rS   r@   r   rC   r   r   r   r?   floatrW   boolrl   r   dictr   rn   r   rs   r
   rv   r	   rx   rz   r   r   r|   r~   r   r   r   r   r   r   rp   r   ro   r   r   r   r   r   r   r   __classcell__r%   r%   ri   r&   rA   s   sd   
 	
c		0rA   )9r>   r   r   r   dataclassesr   r   typingr   r   r   logurur   pydanticr   pipecat.frames.framesr	   r
   r   r   r   r   r   r   "pipecat.processors.frame_processorr   pipecat.services.settingsr   r   r   pipecat.services.stt_latencyr   pipecat.services.stt_servicer   pipecat.transcriptions.languager   r   pipecat.utils.timer   (pipecat.utils.tracing.service_decoratorsr   websockets.asyncio.clientr   r   websockets.protocolr   ModuleNotFoundErrorr   r   r   r   r   r?   r'   r5   r6   rA   r%   r%   r%   r&   <module>   s>   (

