o
    qmœi½>  ã                   @   s  d dl Z d dlmZmZ d dlZd dlZd dlm  m	Z
 ddlmZ ddlmZmZ ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlm Z  ddl!m"Z" zd dl#m$Z% W n e&yz   d dlm$Z% Y nw G dd„ dƒZ'G dd„ dƒZ(dS )é    N)ÚasynccontextmanagerÚcontextmanageré   )ÚApiError)ÚAsyncClientWrapperÚSyncClientWrapper)ÚRequestOptionsé   )Ú&AsyncSpeechToTextStreamingSocketClientÚ!SpeechToTextStreamingSocketClient)Ú SpeechToTextStreamingFlushSignal)Ú'SpeechToTextStreamingHighVadSensitivity)Ú$SpeechToTextStreamingInputAudioCodec)Ú!SpeechToTextStreamingLanguageCode)ÚSpeechToTextStreamingMode)ÚSpeechToTextStreamingModel)ÚSpeechToTextStreamingVadSignals)Úconnectc                   @   ó¦   e Zd Zdefdd„Zeddddddddddœ	dedeje	 deje
 d	eje d
eje deje deje deje deje deje deje fdd„ƒZdS )ÚRawSpeechToTextStreamingClientÚclient_wrapperc                C   ó
   || _ d S ©N©Ú_client_wrapper©Úselfr   © r   ú`/home/ubuntu/.local/lib/python3.10/site-packages/sarvamai/speech_to_text_streaming/raw_client.pyÚ__init__   ó   
z'RawSpeechToTextStreamingClient.__init__N©	ÚmodelÚmodeÚsample_rateÚhigh_vad_sensitivityÚvad_signalsÚflush_signalÚinput_audio_codecÚapi_subscription_keyÚrequest_optionsÚlanguage_coder"   r#   r$   r%   r&   r'   r(   r)   r*   Úreturnc       
      
   c   s”   | j  ¡ jd }t ¡ }|dur| d|¡}|dur!| d|¡}|dur+| d|¡}|dur5| d|¡}|dur?| d|¡}|durI| d|¡}|durS| d	|¡}|dur]| d
|¡}|d|›  }| j  ¡ }|	durst|	ƒ|d< |
r€d|
v r€| |
d ¡ z"t	j
||d}t|dV  W d  ƒ W dS 1 s›w   Y  W dS  tjjyÉ } z|j}|dkr¼t|t|ƒdd‚t|t|ƒdd‚d}~ww )u–  
        WebSocket channel for real-time speech to text streaming.

        **Note:** This API Reference page is provided for informational purposes only.
        The Try It playground may not provide the best experience for streaming audio.
        For optimal streaming performance, please use the SDK or implement your own WebSocket client.

        Parameters
        ----------
        language_code : SpeechToTextStreamingLanguageCode
            Specifies the language of the input audio in BCP-47 format.

            **Available Options (saarika:v2.5):**
            - `unknown` (default): Use when the language is not known; the API will auto-detect.
            - `hi-IN`: Hindi
            - `bn-IN`: Bengali
            - `gu-IN`: Gujarati
            - `kn-IN`: Kannada
            - `ml-IN`: Malayalam
            - `mr-IN`: Marathi
            - `od-IN`: Odia
            - `pa-IN`: Punjabi
            - `ta-IN`: Tamil
            - `te-IN`: Telugu
            - `en-IN`: English

            **Additional Options (saaras:v3 only):**
            - `as-IN`: Assamese
            - `ur-IN`: Urdu
            - `ne-IN`: Nepali
            - `kok-IN`: Konkani
            - `ks-IN`: Kashmiri
            - `sd-IN`: Sindhi
            - `sa-IN`: Sanskrit
            - `sat-IN`: Santali
            - `mni-IN`: Manipuri
            - `brx-IN`: Bodo
            - `mai-IN`: Maithili
            - `doi-IN`: Dogri

        model : typing.Optional[SpeechToTextStreamingModel]
            Specifies the model to use for speech-to-text conversion.

            - **saarika:v2.5** (default): Transcribes audio in the spoken language.

            - **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.

        mode : typing.Optional[SpeechToTextStreamingMode]
            Mode of operation. **Only applicable when using saaras:v3 model.**

            Example audio: 'à¤®à¥‡à¤°à¤¾ à¤«à¥‹à¤¨ à¤¨à¤‚à¤¬à¤° à¤¹à¥ˆ 9840950950'

            - **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
              - Output: `à¤®à¥‡à¤°à¤¾ à¤«à¥‹à¤¨ à¤¨à¤‚à¤¬à¤° à¤¹à¥ˆ 9840950950`

            - **translate**: Translates speech from any supported Indic language to English.
              - Output: `My phone number is 9840950950`

            - **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
              - Output: `à¤®à¥‡à¤°à¤¾ à¤«à¥‹à¤¨ à¤¨à¤‚à¤¬à¤° à¤¹à¥ˆ à¤¨à¥Œ à¤†à¤  à¤šà¤¾à¤° zero à¤¨à¥Œ à¤ªà¤¾à¤‚à¤š zero à¤¨à¥Œ à¤ªà¤¾à¤‚à¤š zero`

            - **translit**: Romanization - Transliterates speech to Latin/Roman script only.
              - Output: `mera phone number hai 9840950950`

            - **codemix**: Code-mixed text with English words in English and Indic words in native script.
              - Output: `à¤®à¥‡à¤°à¤¾ phone number à¤¹à¥ˆ 9840950950`

        sample_rate : typing.Optional[str]
            Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.

        high_vad_sensitivity : typing.Optional[SpeechToTextStreamingHighVadSensitivity]
            Enable high VAD (Voice Activity Detection) sensitivity

        vad_signals : typing.Optional[SpeechToTextStreamingVadSignals]
            Enable VAD signals in response

        flush_signal : typing.Optional[SpeechToTextStreamingFlushSignal]
            Signal to flush the audio buffer and finalize transcription

        input_audio_codec : typing.Optional[SpeechToTextStreamingInputAudioCodec]
            Audio codec/format of the input stream. Use this when sending raw PCM audio.
            Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.

        api_subscription_key : typing.Optional[str]
            API subscription key for authentication

        request_options : typing.Optional[RequestOptions]
            Request-specific configuration.

        Returns
        -------
        SpeechToTextStreamingSocketClient
        ú/speech-to-text/wsNúlanguage-coder"   r#   r$   r%   r&   r'   r(   ú?úApi-Subscription-KeyÚadditional_headers)r1   ©Ú	websocketé‘  ú/Websocket initialized with invalid credentials.©Ústatus_codeÚheadersÚbodyú8Unexpected error when initializing websocket connection.)r   Úget_environmentÚ
productionÚhttpxÚQueryParamsÚaddÚget_headersÚstrÚupdateÚwebsockets_sync_clientr   r   Ú
websocketsÚ
exceptionsÚInvalidStatusCoder7   r   Údict©r   r+   r"   r#   r$   r%   r&   r'   r(   r)   r*   Úws_urlÚquery_paramsr8   ÚprotocolÚexcr7   r   r   r   r      sX   €l
&ÿýý€øz&RawSpeechToTextStreamingClient.connect)Ú__name__Ú
__module__Ú__qualname__r   r   r   r   ÚtypingÚOptionalr   r   rA   r   r   r   r   r   ÚIteratorr   r   r   r   r   r   r      óF    ôýüûúùø	÷
öõôór   c                   @   r   )Ú#AsyncRawSpeechToTextStreamingClientr   c                C   r   r   r   r   r   r   r   r   ¶   r    z,AsyncRawSpeechToTextStreamingClient.__init__Nr!   r+   r"   r#   r$   r%   r&   r'   r(   r)   r*   r,   c       
      
   C  s¦  | j  ¡ jd }t ¡ }|dur| d|¡}|dur!| d|¡}|dur+| d|¡}|dur5| d|¡}|dur?| d|¡}|durI| d|¡}|durS| d	|¡}|dur]| d
|¡}|d|›  }| j  ¡ }|	durst|	ƒ|d< |
r€d|
v r€| |
d ¡ z+t	||d4 I dH š}t
|dV  W d  ƒI dH  W dS 1 I dH s¤w   Y  W dS  tjjyÒ } z|j}|dkrÅt|t|ƒdd‚t|t|ƒdd‚d}~ww )u›  
        WebSocket channel for real-time speech to text streaming.

        **Note:** This API Reference page is provided for informational purposes only.
        The Try It playground may not provide the best experience for streaming audio.
        For optimal streaming performance, please use the SDK or implement your own WebSocket client.

        Parameters
        ----------
        language_code : SpeechToTextStreamingLanguageCode
            Specifies the language of the input audio in BCP-47 format.

            **Available Options (saarika:v2.5):**
            - `unknown` (default): Use when the language is not known; the API will auto-detect.
            - `hi-IN`: Hindi
            - `bn-IN`: Bengali
            - `gu-IN`: Gujarati
            - `kn-IN`: Kannada
            - `ml-IN`: Malayalam
            - `mr-IN`: Marathi
            - `od-IN`: Odia
            - `pa-IN`: Punjabi
            - `ta-IN`: Tamil
            - `te-IN`: Telugu
            - `en-IN`: English

            **Additional Options (saaras:v3 only):**
            - `as-IN`: Assamese
            - `ur-IN`: Urdu
            - `ne-IN`: Nepali
            - `kok-IN`: Konkani
            - `ks-IN`: Kashmiri
            - `sd-IN`: Sindhi
            - `sa-IN`: Sanskrit
            - `sat-IN`: Santali
            - `mni-IN`: Manipuri
            - `brx-IN`: Bodo
            - `mai-IN`: Maithili
            - `doi-IN`: Dogri

        model : typing.Optional[SpeechToTextStreamingModel]
            Specifies the model to use for speech-to-text conversion.

            - **saarika:v2.5** (default): Transcribes audio in the spoken language.

            - **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.

        mode : typing.Optional[SpeechToTextStreamingMode]
            Mode of operation. **Only applicable when using saaras:v3 model.**

            Example audio: 'à¤®à¥‡à¤°à¤¾ à¤«à¥‹à¤¨ à¤¨à¤‚à¤¬à¤° à¤¹à¥ˆ 9840950950'

            - **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
              - Output: `à¤®à¥‡à¤°à¤¾ à¤«à¥‹à¤¨ à¤¨à¤‚à¤¬à¤° à¤¹à¥ˆ 9840950950`

            - **translate**: Translates speech from any supported Indic language to English.
              - Output: `My phone number is 9840950950`

            - **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
              - Output: `à¤®à¥‡à¤°à¤¾ à¤«à¥‹à¤¨ à¤¨à¤‚à¤¬à¤° à¤¹à¥ˆ à¤¨à¥Œ à¤†à¤  à¤šà¤¾à¤° zero à¤¨à¥Œ à¤ªà¤¾à¤‚à¤š zero à¤¨à¥Œ à¤ªà¤¾à¤‚à¤š zero`

            - **translit**: Romanization - Transliterates speech to Latin/Roman script only.
              - Output: `mera phone number hai 9840950950`

            - **codemix**: Code-mixed text with English words in English and Indic words in native script.
              - Output: `à¤®à¥‡à¤°à¤¾ phone number à¤¹à¥ˆ 9840950950`

        sample_rate : typing.Optional[str]
            Audio sample rate for the WebSocket connection. When specified as a connection parameter, only 16kHz and 8kHz are supported. 8kHz is only available via this connection parameter. If not specified, defaults to 16kHz.

        high_vad_sensitivity : typing.Optional[SpeechToTextStreamingHighVadSensitivity]
            Enable high VAD (Voice Activity Detection) sensitivity

        vad_signals : typing.Optional[SpeechToTextStreamingVadSignals]
            Enable VAD signals in response

        flush_signal : typing.Optional[SpeechToTextStreamingFlushSignal]
            Signal to flush the audio buffer and finalize transcription

        input_audio_codec : typing.Optional[SpeechToTextStreamingInputAudioCodec]
            Audio codec/format of the input stream. Use this when sending raw PCM audio.
            Supported values: wav, pcm_s16le, pcm_l16, pcm_raw.

        api_subscription_key : typing.Optional[str]
            API subscription key for authentication

        request_options : typing.Optional[RequestOptions]
            Request-specific configuration.

        Returns
        -------
        AsyncSpeechToTextStreamingSocketClient
        r-   Nr.   r"   r#   r$   r%   r&   r'   r(   r/   r0   r1   )Úextra_headersr2   r4   r5   r6   r:   )r   r;   r<   r=   r>   r?   r@   rA   rB   Úwebsockets_client_connectr
   rD   rE   rF   r7   r   rG   rH   r   r   r   r   ¹   sX   €l
2ÿýý€øz+AsyncRawSpeechToTextStreamingClient.connect)rM   rN   rO   r   r   r   r   rP   rQ   r   r   rA   r   r   r   r   r   ÚAsyncIteratorr
   r   r   r   r   r   rT   µ   rS   rT   ))rP   Ú
contextlibr   r   r=   Úwebsockets.exceptionsrD   Úwebsockets.sync.clientÚsyncÚclientrC   Úcore.api_errorr   Úcore.client_wrapperr   r   Úcore.request_optionsr   Úsocket_clientr
   r   Ú+types.speech_to_text_streaming_flush_signalr   Ú3types.speech_to_text_streaming_high_vad_sensitivityr   Ú0types.speech_to_text_streaming_input_audio_codecr   Ú,types.speech_to_text_streaming_language_coder   Ú#types.speech_to_text_streaming_moder   Ú$types.speech_to_text_streaming_modelr   Ú*types.speech_to_text_streaming_vad_signalsr   Úwebsockets.legacy.clientr   rV   ÚImportErrorr   rT   r   r   r   r   Ú<module>   s0   ÿ 