o
    iRp                     @   s  d Z ddlZddlZddlmZmZ ddlmZmZm	Z	m
Z
mZ ddlmZ ddlmZ ddlmZmZmZmZmZmZmZmZmZmZ ddlmZ dd	lmZmZm Z  dd
l!m"Z"m#Z# ddl$m%Z% ddl&m'Z'm(Z( ddl)m*Z* ddl+m,Z, ddl-m.Z. zddl/m0Z1 ddl2m3Z3 W n e4y   dZ1dZ3Y nw eG dd de'j5Z6G dd de'Z7dZ8eG dd deZ9G dd de%Z:dS )a$  OpenAI Speech-to-Text service implementations.

Provides two STT services:

- ``OpenAISTTService``: REST-based transcription using the Audio API
  (Whisper / GPT-4o).
- ``OpenAIRealtimeSTTService``: WebSocket-based streaming transcription
  using the Realtime API in transcription-only mode.
    N)	dataclassfield)AnyAsyncGeneratorLiteralOptionalUnion)logger)create_stream_resampler)
CancelFrameEndFrameFrameInterimTranscriptionFrame
StartFrameTranscriptionFrameUserStartedSpeakingFrameUserStoppedSpeakingFrameVADUserStartedSpeakingFrameVADUserStoppedSpeakingFrame)FrameDirection)	NOT_GIVENSTTSettings	_NotGiven)OPENAI_REALTIME_TTFS_P99OPENAI_TTFS_P99)WebsocketSTTService)BaseWhisperSTTServiceTranscription)Language)time_now_iso8601)
traced_stt)connect)Statec                   @   s   e Zd ZdZdS )OpenAISTTSettingsz$Settings for the OpenAI STT service.N)__name__
__module____qualname____doc__ r(   r(   O/home/ubuntu/.local/lib/python3.10/site-packages/pipecat/services/openai/stt.pyr#   9   s    r#   c                       s   e Zd ZU dZeZeed< dddejddde	dde
e de
e de
e de
e d	e
e d
e
e de
e de
e f fddZdedefddZ  ZS )OpenAISTTServicezOpenAI Speech-to-Text service that generates text from audio.

    Uses OpenAI's transcription API to convert audio to text. Requires an OpenAI API key
    set via the api_key parameter or OPENAI_API_KEY environment variable.
    	_settingsN)modelapi_keybase_urllanguageprompttemperaturesettingsttfs_p99_latencyr,   r-   r.   r/   r0   r1   r2   r3   c                   s   |pt j}
| jd|
ddd}|dur| dd ||_|dur(| dd ||_|dur5| dd ||_|dur>|| t j	d||||d|	 dS )	uw  Initialize OpenAI STT service.

        Args:
            model: Model to use — either gpt-4o or Whisper.

                .. deprecated:: 0.0.105
                    Use ``settings=OpenAISTTService.Settings(model=...)`` instead.

            api_key: OpenAI API key. Defaults to None.
            base_url: API base URL. Defaults to None.
            language: Language of the audio input. Defaults to English.

                .. deprecated:: 0.0.105
                    Use ``settings=OpenAISTTService.Settings(language=...)`` instead.

            prompt: Optional text to guide the model's style or continue a previous segment.

                .. deprecated:: 0.0.105
                    Use ``settings=OpenAISTTService.Settings(prompt=...)`` instead.

            temperature: Optional sampling temperature between 0 and 1. Defaults to 0.0.

                .. deprecated:: 0.0.105
                    Use ``settings=OpenAISTTService.Settings(temperature=...)`` instead.

            settings: Runtime-updatable settings. When provided alongside deprecated
                parameters, ``settings`` values take precedence.
            ttfs_p99_latency: P99 latency from speech end to final transcript in seconds.
                Override for your deployment. See https://github.com/pipecat-ai/stt-benchmark
            **kwargs: Additional arguments passed to BaseWhisperSTTService.
        gpt-4o-transcribeN)r,   r/   r0   r1   r,   r0   r1   )r-   r.   r2   r3   r(   )
r   ENSettings"_warn_init_param_moved_to_settingsr,   r0   r1   apply_updatesuper__init__)selfr,   r-   r.   r/   r0   r1   r2   r3   kwargs	_languagedefault_settings	__class__r(   r)   r:   J   s4   
-

zOpenAISTTService.__init__audioreturnc                    s   | j jd us	J d|df| j j| j jd}| jr-| j jdv r)d|d< dg|d< nd	|d< | j jd ur9| j j|d
< | j jd urE| j j|d< | jjjj	di |I d H S )Nz	audio.wavz	audio/wav)filer,   r/   )r4   zgpt-4o-mini-transcribejsonresponse_formatlogprobsincludeverbose_jsonr0   r1   r(   )
r+   r/   r,   _include_prob_metricsr0   r1   _clientrA   transcriptionscreate)r;   rA   r<   r(   r(   r)   _transcribe   s    zOpenAISTTService._transcribe)r$   r%   r&   r'   r#   r6   __annotations__r   r5   r   r   strfloatr:   bytesr   rM   __classcell__r(   r(   r?   r)   r*   @   s<   
 	
Nr*   i]  c                   @   sR   e Zd ZU dZedd dZedB eB ed< edd dZ	e
d dB eB ed	< dS )
OpenAIRealtimeSTTSettingsa/  Settings for OpenAIRealtimeSTTService.

    Parameters:
        prompt: Optional prompt text to guide transcription style.
        noise_reduction: Noise reduction mode. ``"near_field"`` for close
            microphones, ``"far_field"`` for distant microphones, or ``None``
            to disable.
    c                   C      t S Nr   r(   r(   r(   r)   <lambda>       z"OpenAIRealtimeSTTSettings.<lambda>)default_factoryNr0   c                   C   rT   rU   rV   r(   r(   r(   r)   rW      rX   
near_field	far_fieldnoise_reduction)r$   r%   r&   r'   r   r0   rO   r   rN   r]   r   r(   r(   r(   r)   rS      s   
 	rS   c                       s:  e Zd ZU dZeZeed< ddejddddde	d	de
d	ee
 d
e
dee dee
 deeeed f  deed  dedee dee f fddZedede
fddZdefddZdedee
ef f fddZdef fddZdef fd d!Zdef fd"d#Zd$edee df fd%d&Z!de d'e"f fd(d)Z# fd*d+Z$ fd,d-Z%d.d/ Z&d0d1 Z'd2efd3d4Z(d5d6 Z)d$efd7d8Z*d9d: Z+d;d< Z,d=d> Z-d?efd@dAZ.d?efdBdCZ/d?efdDdEZ0d?efdFdGZ1e2	dTdHe
dIedee fdJdKZ3d?efdLdMZ4d?efdNdOZ5d?efdPdQZ6d?efdRdSZ7  Z8S )UOpenAIRealtimeSTTServiceu  OpenAI Realtime Speech-to-Text service using WebSocket transcription sessions.

    Uses OpenAI's Realtime API in transcription-only mode for real-time streaming
    speech recognition with optional server-side VAD and noise reduction. The model
    does not generate conversational responses — only transcription output.

    This service supports two VAD modes:

    **Local VAD** (default): Disable server-side VAD and use
    a local VAD processor in the pipeline instead. When a
    ``VADUserStoppedSpeakingFrame`` is received, the service commits the
    audio buffer so that the server begins transcription for the completed
    speech segment.

    **Server-side VAD** (``turn_detection=None``): The OpenAI server performs voice-activity
    detection. The service broadcasts ``UserStartedSpeakingFrame`` and
    ``UserStoppedSpeakingFrame`` when the server detects speech boundaries.
    Do **not** use a separate VAD processor in the pipeline in this mode.

    Audio is sent as 24 kHz 16-bit mono PCM as required by the OpenAI Realtime
    API. If the pipeline runs at a different sample rate (e.g. 16 kHz for Silero
    VAD compatibility), audio is automatically upsampled before sending.

    Example::

        stt = OpenAIRealtimeSTTService(
            api_key="sk-...",
            settings=OpenAIRealtimeSTTService.Settings(
                model="gpt-4o-transcribe",
                noise_reduction="near_field",
            ),
        )
    r+   Nz wss://api.openai.com/v1/realtimeFT)	r,   r.   r/   r0   turn_detectionr]   should_interruptr2   r3   r-   r,   r.   r/   r0   r_   r]   rZ   r`   r2   r3   c       
            s   t du rtd| jdtjddd}|dur| dd ||_|dur1|tjkr1| dd ||_|dur>| dd ||_|durK| dd ||_	|	durT|
|	 t jd|
|d	| || _|| _|| _|| _d| _d
| _t | _|d
u| _dS )a  Initialize the OpenAI Realtime STT service.

        Args:
            api_key: OpenAI API key for authentication.
            model: Transcription model. Supported values are
                ``"gpt-4o-transcribe"`` and ``"gpt-4o-mini-transcribe"``.

                .. deprecated:: 0.0.105
                    Use ``settings=OpenAIRealtimeSTTService.Settings(model=...)`` instead.

            base_url: WebSocket base URL for the Realtime API.
                Defaults to ``"wss://api.openai.com/v1/realtime"``.
            language: Language of the audio input. Defaults to English.

                .. deprecated:: 0.0.105
                    Use ``settings=OpenAIRealtimeSTTService.Settings(language=...)`` instead.

            prompt: Optional prompt text to guide transcription style
                or provide keyword hints.

                .. deprecated:: 0.0.105
                    Use ``settings=OpenAIRealtimeSTTService.Settings(prompt=...)`` instead.

            turn_detection: Server-side VAD configuration. Defaults to
                ``False`` (disabled), which relies on a local VAD
                processor in the pipeline. Pass ``None`` to use server
                defaults (``server_vad``), or a dict with custom
                settings (e.g. ``{"type": "server_vad", "threshold": 0.5}``).
            noise_reduction: Noise reduction mode. ``"near_field"`` for
                close microphones, ``"far_field"`` for distant
                microphones, or ``None`` to disable.

                .. deprecated:: 0.0.106
                    Use ``settings=OpenAIRealtimeSTTService.Settings(noise_reduction=...)`` instead.
            should_interrupt: Whether to interrupt bot output when
                speech is detected by server-side VAD. Only applies when
                turn detection is enabled. Defaults to True.
            settings: Runtime-updatable settings. When provided alongside deprecated
                parameters, ``settings`` values take precedence.
            ttfs_p99_latency: P99 latency from speech end to final transcript in seconds.
                Override for your deployment. See https://github.com/pipecat-ai/stt-benchmark
            **kwargs: Additional arguments passed to parent
                WebsocketSTTService.
        Nzdwebsockets is required for OpenAIRealtimeSTTService. Install it with: pip install pipecat-ai[openai]r4   )r,   r/   r0   r]   r,   r/   r0   r]   )r3   r2   Fr(   )websocket_connectImportErrorr6   r   r5   r7   r,   r/   r0   r]   r8   r9   r:   _api_key	_base_url_turn_detection_should_interrupt_receive_task_session_readyr
   
_resampler_server_vad_enabled)r;   r-   r,   r.   r/   r0   r_   r]   r`   r2   r3   r<   r>   r?   r(   r)   r:      sL   ;
z!OpenAIRealtimeSTTService.__init__rB   c                 C   s   t | dd  S )zConvert a Language enum value to an ISO-639-1 code.

        Args:
            language: The Language enum value.

        Returns:
            Two-letter ISO-639-1 language code.
        -r   )rO   splitlower)r/   r(   r(   r)   _language_to_code_  s   z*OpenAIRealtimeSTTService._language_to_codec                 C   s   dS )zCheck if the service can generate processing metrics.

        Returns:
            True, as this service supports metrics generation.
        Tr(   r;   r(   r(   r)   can_generate_metricsl  s   z-OpenAIRealtimeSTTService.can_generate_metricsdeltac                    s0   t  |I dH }|r| jr|  I dH  |S )aT  Apply a settings delta and send session update if needed.

        Sends a ``session.update`` to the server when the session is active.

        Args:
            delta: A :class:`STTSettings` (or ``OpenAIRealtimeSTTService.Settings``) delta.

        Returns:
            Dict mapping changed field names to their previous values.
        N)r9   _update_settingsrh   _send_session_update)r;   rq   changedr?   r(   r)   rr   t  s
   
z)OpenAIRealtimeSTTService._update_settingsframec                    &   t  |I dH  |  I dH  dS )zStart the service and establish WebSocket connection.

        Args:
            frame: The start frame triggering service initialization.
        N)r9   start_connectr;   ru   r?   r(   r)   rw        zOpenAIRealtimeSTTService.startc                    rv   )zStop the service and close WebSocket connection.

        Args:
            frame: The end frame triggering service shutdown.
        N)r9   stop_disconnectry   r?   r(   r)   r{     rz   zOpenAIRealtimeSTTService.stopc                    rv   )zCancel the service and close WebSocket connection.

        Args:
            frame: The cancel frame triggering service cancellation.
        N)r9   cancelr|   ry   r?   r(   r)   r}     rz   zOpenAIRealtimeSTTService.cancelrA   c                 C  s   |  |I dH  dV  dS )u  Send audio data to the transcription session.

        Audio is streamed over the WebSocket. Transcription results arrive
        asynchronously via the receive task and are pushed as
        ``InterimTranscriptionFrame`` or ``TranscriptionFrame``.

        Args:
            audio: Raw audio bytes (16-bit mono PCM at the pipeline
                sample rate). Automatically resampled to 24 kHz.

        Yields:
            None — results are delivered via the WebSocket receive task.
        N)_send_audio)r;   rA   r(   r(   r)   run_stt  s   
z OpenAIRealtimeSTTService.run_stt	directionc                    s\   t  ||I dH  | js*t|tr|  I dH  dS t|tr,|  I dH  dS dS dS )a  Process frames from the pipeline.

        Extends the base STT service to handle local VAD events when
        server-side VAD is disabled. On ``VADUserStoppedSpeakingFrame``,
        commits the audio buffer so the server begins transcription for
        the completed speech segment.

        Args:
            frame: The frame to process.
            direction: The direction of frame flow in the pipeline.
        N)r9   process_framerj   
isinstancer   start_processing_metricsr   _commit_audio_buffer)r;   ru   r   r?   r(   r)   r     s   

z&OpenAIRealtimeSTTService.process_framec                    sL   t   I dH  |  I dH  | jr"| js$| | | j| _dS dS dS )z:Connect to the transcription endpoint and start receiving.N)r9   rx   _connect_websocket
_websocketrg   create_task_receive_task_handler_report_errorro   r?   r(   r)   rx     s   z!OpenAIRealtimeSTTService._connectc                    sF   t   I dH  | jr| j| jddI dH  d| _|  I dH  dS )z)Disconnect and clean up background tasks.Ng      ?)timeout)r9   r|   rg   cancel_task_disconnect_websocketro   r?   r(   r)   r|     s   z$OpenAIRealtimeSTTService._disconnectc              
      s   z1| j r| j jtju rW dS d| _| j d}t|dd| j idI dH | _ | dI dH  W dS  t	yU } z| j
d| |d	I dH  d| _ W Y d}~dS d}~ww )
zAEstablish the WebSocket connection to the transcription endpoint.NFz?intent=transcriptionAuthorizationzBearer )uriadditional_headerson_connectedz)Error connecting to OpenAI Realtime STT: 	error_msg	exception)r   stater"   OPENrh   rd   ra   rc   _call_event_handler	Exception
push_error)r;   urler(   r(   r)   r     s(   z+OpenAIRealtimeSTTService._connect_websocketc              
      s   zLzd| _ | jr| j I dH  W n ty1 } z| jd| |dI dH  W Y d}~nd}~ww W d| _| dI dH  dS W d| _| dI dH  dS d| _| dI dH  w )zClose the WebSocket connection.FNzError disconnecting: r   on_disconnected)rh   r   closer   r   r   )r;   r   r(   r(   r)   r     s*   z.OpenAIRealtimeSTTService._disconnect_websocketmessagec              
      s   z| j s| jr| jt|I dH  W dS W dS W dS  tyI } z!| j s*| js1W Y d}~dS | jd| |dI dH  W Y d}~dS d}~ww )z|Send a JSON message over the WebSocket.

        Args:
            message: The message dict to serialize and send.
        NzError sending message: r   )_disconnectingr   sendrD   dumpsr   r   )r;   r   r   r(   r(   r)   _ws_send  s   z!OpenAIRealtimeSTTService._ws_sendc                    s   d| j ji}| j jr| | j jnd}|r||d< | j jr$| j j|d< dtd|d}| jdu r6d|d	< n
| jdur@| j|d	< | j jrLd
| j ji|d< | ddd|iddI dH  dS )z?Send ``session.update`` to configure the transcription session.r,   Nr/   r0   z	audio/pcm)typerate)formattranscriptionFr_   r   r]   zsession.updater   inputr   rA   )r   session)	r+   r,   r/   rn   r0   _OPENAI_SAMPLE_RATEre   r]   r   )r;   r   language_codeinput_audior(   r(   r)   rs     s8   
	


z-OpenAIRealtimeSTTService._send_session_updatec                    sL   | j || jtI dH }|sdS t|d}| d|dI dH  dS )zSend audio data via ``input_audio_buffer.append``.

        Resamples from the pipeline sample rate to 24 kHz if needed.

        Args:
            audio: Raw audio bytes at the pipeline sample rate.
        Nzutf-8zinput_audio_buffer.appendr   )ri   resamplesample_rater   base64	b64encodedecoder   )r;   rA   payloadr(   r(   r)   r~   A  s   z$OpenAIRealtimeSTTService._send_audioc                       |  ddiI dH  dS )z2Commit the current audio buffer for transcription.r   zinput_audio_buffer.commitNr   ro   r(   r(   r)   r   T     z-OpenAIRealtimeSTTService._commit_audio_bufferc                    r   )zClear the current audio buffer.r   zinput_audio_buffer.clearNr   ro   r(   r(   r)   _clear_audio_bufferX  r   z,OpenAIRealtimeSTTService._clear_audio_bufferc              	      sX  | j 2 z3 dH W }zt|}W n tjy    td Y qw |dd}|dkr4| |I dH  q|dkrA| |I dH  q|dkrN| 	|I dH  q|dkr[| 
|I dH  q|d	krh| |I dH  q|d
kru| |I dH  q|dkr| |I dH  q|dkrtd|dd  q|dkr| |I dH  qtd|  q6 dS )zReceive and dispatch server events from the transcription session.

        Called by ``WebsocketService._receive_task_handler`` which wraps
        this method with automatic reconnection on connection errors.
        Nz!Failed to parse WebSocket messager    zsession.createdzsession.updatedz1conversation.item.input_audio_transcription.deltaz5conversation.item.input_audio_transcription.completedz2conversation.item.input_audio_transcription.failedz!input_audio_buffer.speech_startedz!input_audio_buffer.speech_stoppedzinput_audio_buffer.committedz Audio buffer committed: item_id=item_iderrorzUnhandled event: )r   rD   loadsJSONDecodeErrorr	   warningget_handle_session_created_handle_session_updated_handle_transcription_delta_handle_transcription_completed_handle_transcription_failed_handle_speech_started_handle_speech_stoppedtrace_handle_error)r;   r   evtevt_typer(   r(   r)   _receive_messages`  s:   
z*OpenAIRealtimeSTTService._receive_messagesr   c                    s   t d |  I dH  dS )zHandle ``session.created``.

        Sent immediately after connecting. We respond by configuring the
        session with our desired settings.

        Args:
            evt: The session created event from the server.
        z4Transcription session created, sending configurationN)r	   debugrs   r;   r   r(   r(   r)   r     s   
	z0OpenAIRealtimeSTTService._handle_session_createdc                    s   t d d| _dS )zHandle ``session.updated``.

        The session is now fully configured and ready to transcribe.

        Args:
            evt: The session updated event from the server.
        z*Transcription session configured and readyTN)r	   r   rh   r   r(   r(   r)   r     s   

z0OpenAIRealtimeSTTService._handle_session_updatedc                    s:   | dd}|r| t|| jt |dI dH  dS dS )a+  Handle incremental transcription text.

        For ``gpt-4o-transcribe`` and ``gpt-4o-mini-transcribe``, deltas
        contain streaming partial text. For ``whisper-1``, each delta
        contains the full turn transcript.

        Args:
            evt: The delta event from the server.
        rq   r   resultN)r   
push_framer   _user_idr   )r;   r   rq   r(   r(   r)   r     s   
z4OpenAIRealtimeSTTService._handle_transcription_deltac                    sZ   | dd}|r+| t|| jt |dI dH  | |dI dH  |  I dH  dS dS )zHandle a completed transcription for a speech segment.

        Pushes a ``TranscriptionFrame`` and records the result for
        tracing.

        Args:
            evt: The completed event containing the full transcript.
        
transcriptr   r   NT)r   r   r   r   r   _handle_transcription_tracestop_processing_metrics)r;   r   r   r(   r(   r)   r     s   	
z8OpenAIRealtimeSTTService._handle_transcription_completedr   is_finalc                    s   dS )zRecord transcription result for tracing.

        Args:
            transcript: The transcribed text.
            is_final: Whether this is a final transcription result.
            language: Optional language of the transcription.
        Nr(   )r;   r   r   r/   r(   r(   r)   r     s   z4OpenAIRealtimeSTTService._handle_transcription_tracec                    sB   t d | tI dH  | jr|  I dH  |  I dH  dS )zHandle server-side VAD speech start.

        Broadcasts ``UserStartedSpeakingFrame`` and optionally triggers
        interruption of current bot output.

        Args:
            evt: The ``input_audio_buffer.speech_started`` event.
        zServer VAD: speech startedN)r	   r   broadcast_framer   rf   broadcast_interruptionr   r   r(   r(   r)   r     s   
	z/OpenAIRealtimeSTTService._handle_speech_startedc                    s    t d | tI dH  dS )a  Handle server-side VAD speech stop.

        Broadcasts ``UserStoppedSpeakingFrame``. The audio buffer is
        automatically committed by the server when VAD is enabled.

        Args:
            evt: The ``input_audio_buffer.speech_stopped`` event.
        zServer VAD: speech stoppedN)r	   r   r   r   r   r(   r(   r)   r     s   
	z/OpenAIRealtimeSTTService._handle_speech_stoppedc                    s6   | di }| dd}| jd| dI dH  dS )u   Handle a transcription failure for a speech segment.

        Logs the error but does not treat it as fatal — the session
        remains active for subsequent turns.

        Args:
            evt: The failed event containing error details.
        r   r   zTranscription failedzOpenAI Realtime STT error: r   N)r   r   )r;   r   r   r   r(   r(   r)   r     s   	z5OpenAIRealtimeSTTService._handle_transcription_failedc                    sP   | di }| dd}| dd}d| d| }| j|dI d	H  t|)
zHandle a fatal error from the transcription session.

        Raises an exception so that ``WebsocketService`` can decide
        whether to attempt reconnection.

        Args:
            evt: The error event.
        r   r   zUnknown errorcoder   zOpenAI Realtime STT error [z]: r   N)r   r   r   )r;   r   r   r   
error_codemsgr(   r(   r)   r     s   	z&OpenAIRealtimeSTTService._handle_errorrU   )9r$   r%   r&   r'   rS   r6   rN   r   r5   r   rO   r   r   dictr   boolrP   r:   staticmethodrn   rp   r   r   rr   r   rw   r   r{   r   r}   rQ   r   r   r   r   r   rx   r|   r   r   r   rs   r~   r   r   r   r   r   r   r   r    r   r   r   r   r   rR   r(   r(   r?   r)   r^      s   
 "
	
q			-$r^   );r'   r   rD   dataclassesr   r   typingr   r   r   r   r   logurur	   pipecat.audio.utilsr
   pipecat.frames.framesr   r   r   r   r   r   r   r   r   r   "pipecat.processors.frame_processorr   pipecat.services.settingsr   r   r   pipecat.services.stt_latencyr   r   pipecat.services.stt_servicer   !pipecat.services.whisper.base_sttr   r   pipecat.transcriptions.languager   pipecat.utils.timer   (pipecat.utils.tracing.service_decoratorsr    websockets.asyncio.clientr!   ra   websockets.protocolr"   ModuleNotFoundErrorr6   r#   r*   r   rS   r^   r(   r(   r(   r)   <module>   s<   
0t