# This file was auto-generated by Fern from our API Definition.

import typing
from json.decoder import JSONDecodeError

from .. import core
from ..core.api_error import ApiError
from ..core.client_wrapper import AsyncClientWrapper, SyncClientWrapper
from ..core.http_response import AsyncHttpResponse, HttpResponse
from ..core.pydantic_utilities import parse_obj_as
from ..core.request_options import RequestOptions
from ..errors.bad_request_error import BadRequestError
from ..errors.forbidden_error import ForbiddenError
from ..errors.internal_server_error import InternalServerError
from ..errors.service_unavailable_error import ServiceUnavailableError
from ..errors.too_many_requests_error import TooManyRequestsError
from ..errors.unprocessable_entity_error import UnprocessableEntityError
from ..types.input_audio_codec import InputAudioCodec
from ..types.mode import Mode
from ..types.speech_to_text_language import SpeechToTextLanguage
from ..types.speech_to_text_model import SpeechToTextModel
from ..types.speech_to_text_response import SpeechToTextResponse
from ..types.speech_to_text_translate_model import SpeechToTextTranslateModel
from ..types.speech_to_text_translate_response import SpeechToTextTranslateResponse

# this is used as the default value for optional parameters
OMIT = typing.cast(typing.Any, ...)


class RawSpeechToTextClient:
    def __init__(self, *, client_wrapper: SyncClientWrapper):
        self._client_wrapper = client_wrapper

    def transcribe(
        self,
        *,
        file: core.File,
        model: typing.Optional[SpeechToTextModel] = OMIT,
        mode: typing.Optional[Mode] = OMIT,
        language_code: typing.Optional[SpeechToTextLanguage] = OMIT,
        input_audio_codec: typing.Optional[InputAudioCodec] = OMIT,
        request_options: typing.Optional[RequestOptions] = None,
    ) -> HttpResponse[SpeechToTextResponse]:
        """
        ## Speech to Text API

        This API transcribes speech to text in multiple Indian languages and English. Supports transcription for interactive applications.

        ### Available Options:
        - **REST API** (Current Endpoint): For quick responses under 30 seconds with immediate results
        - **Batch API**: For longer audio files, [Follow This Documentation](https://docs.sarvam.ai/api-reference-docs/api-guides-tutorials/speech-to-text/batch-api)
          - Supports diarization (speaker identification)

        ### Note:
        - Pricing differs for REST and Batch APIs
        - Diarization is only available in Batch API with separate pricing
        - Please refer to [here](https://docs.sarvam.ai/api-reference-docs/getting-started/pricing) for detailed pricing information

        Parameters
        ----------
        file : core.File
            See core.File for more documentation

        model : typing.Optional[SpeechToTextModel]
            Specifies the model to use for speech-to-text conversion.

            - **saarika:v2.5** (default): Transcribes audio in the spoken language.

            - **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.

        mode : typing.Optional[Mode]
            Mode of operation. **Only applicable when using saaras:v3 model.**

            Example audio: 'मेरा फोन नंबर है 9840950950'

            - **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
              - Output: `मेरा फोन नंबर है 9840950950`

            - **translate**: Translates speech from any supported Indic language to English.
              - Output: `My phone number is 9840950950`

            - **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
              - Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`

            - **translit**: Romanization - Transliterates speech to Latin/Roman script only.
              - Output: `mera phone number hai 9840950950`

            - **codemix**: Code-mixed text with English words in English and Indic words in native script.
              - Output: `मेरा phone number है 9840950950`

        language_code : typing.Optional[SpeechToTextLanguage]
            Specifies the language of the input audio in BCP-47 format.

            **Note:** This parameter is optional for `saarika:v2.5` model.

            **Available Options:**
            - `unknown`: Use when the language is not known; the API will auto-detect.
            - `hi-IN`: Hindi
            - `bn-IN`: Bengali
            - `kn-IN`: Kannada
            - `ml-IN`: Malayalam
            - `mr-IN`: Marathi
            - `od-IN`: Odia
            - `pa-IN`: Punjabi
            - `ta-IN`: Tamil
            - `te-IN`: Telugu
            - `en-IN`: English
            - `gu-IN`: Gujarati

            **Additional Options (saaras:v3 only):**
            - `as-IN`: Assamese
            - `ur-IN`: Urdu
            - `ne-IN`: Nepali
            - `kok-IN`: Konkani
            - `ks-IN`: Kashmiri
            - `sd-IN`: Sindhi
            - `sa-IN`: Sanskrit
            - `sat-IN`: Santali
            - `mni-IN`: Manipuri
            - `brx-IN`: Bodo
            - `mai-IN`: Maithili
            - `doi-IN`: Dogri

        input_audio_codec : typing.Optional[InputAudioCodec]
            Input Audio codec/format of the input file. PCM files are supported only at 16kHz sample rate.

        request_options : typing.Optional[RequestOptions]
            Request-specific configuration.

        Returns
        -------
        HttpResponse[SpeechToTextResponse]
            Successful Response
        """
        _response = self._client_wrapper.httpx_client.request(
            "speech-to-text",
            base_url=self._client_wrapper.get_environment().base,
            method="POST",
            data={
                "model": model,
                "mode": mode,
                "language_code": language_code,
                "input_audio_codec": input_audio_codec,
            },
            files={
                "file": file,
            },
            request_options=request_options,
            omit=OMIT,
            force_multipart=True,
        )
        try:
            if 200 <= _response.status_code < 300:
                _data = typing.cast(
                    SpeechToTextResponse,
                    parse_obj_as(
                        type_=SpeechToTextResponse,  # type: ignore
                        object_=_response.json(),
                    ),
                )
                return HttpResponse(response=_response, data=_data)
            if _response.status_code == 400:
                raise BadRequestError(
                    headers=dict(_response.headers),
                    body=typing.cast(
                        typing.Optional[typing.Any],
                        parse_obj_as(
                            type_=typing.Optional[typing.Any],  # type: ignore
                            object_=_response.json(),
                        ),
                    ),
                )
            if _response.status_code == 403:
                raise ForbiddenError(
                    headers=dict(_response.headers),
                    body=typing.cast(
                        typing.Optional[typing.Any],
                        parse_obj_as(
                            type_=typing.Optional[typing.Any],  # type: ignore
                            object_=_response.json(),
                        ),
                    ),
                )
            if _response.status_code == 422:
                raise UnprocessableEntityError(
                    headers=dict(_response.headers),
                    body=typing.cast(
                        typing.Optional[typing.Any],
                        parse_obj_as(
                            type_=typing.Optional[typing.Any],  # type: ignore
                            object_=_response.json(),
                        ),
                    ),
                )
            if _response.status_code == 429:
                raise TooManyRequestsError(
                    headers=dict(_response.headers),
                    body=typing.cast(
                        typing.Optional[typing.Any],
                        parse_obj_as(
                            type_=typing.Optional[typing.Any],  # type: ignore
                            object_=_response.json(),
                        ),
                    ),
                )
            if _response.status_code == 500:
                raise InternalServerError(
                    headers=dict(_response.headers),
                    body=typing.cast(
                        typing.Optional[typing.Any],
                        parse_obj_as(
                            type_=typing.Optional[typing.Any],  # type: ignore
                            object_=_response.json(),
                        ),
                    ),
                )
            if _response.status_code == 503:
                raise ServiceUnavailableError(
                    headers=dict(_response.headers),
                    body=typing.cast(
                        typing.Optional[typing.Any],
                        parse_obj_as(
                            type_=typing.Optional[typing.Any],  # type: ignore
                            object_=_response.json(),
                        ),
                    ),
                )
            _response_json = _response.json()
        except JSONDecodeError:
            raise ApiError(status_code=_response.status_code, headers=dict(_response.headers), body=_response.text)
        raise ApiError(status_code=_response.status_code, headers=dict(_response.headers), body=_response_json)

    def translate(
        self,
        *,
        file: core.File,
        prompt: typing.Optional[str] = OMIT,
        model: typing.Optional[SpeechToTextTranslateModel] = OMIT,
        input_audio_codec: typing.Optional[InputAudioCodec] = OMIT,
        request_options: typing.Optional[RequestOptions] = None,
    ) -> HttpResponse[SpeechToTextTranslateResponse]:
        """
        ## Speech to Text Translation API

        This API automatically detects the input language, transcribes the speech, and translates the text to English.

        ### Available Options:
        - **REST API** (Current Endpoint): For quick responses under 30 seconds with immediate results
        - **Batch API**: For longer audio files [Follow this documentation](https://docs.sarvam.ai/api-reference-docs/api-guides-tutorials/speech-to-text/batch-api)
          - Supports diarization (speaker identification)

        ### Note:
        - Pricing differs for REST and Batch APIs
        - Diarization is only available in Batch API with separate pricing
        - Please refer to [here](https://docs.sarvam.ai/api-reference-docs/getting-started/pricing) for detailed pricing information

        Parameters
        ----------
        file : core.File
            See core.File for more documentation

        prompt : typing.Optional[str]
            Conversation context can be passed as a prompt to boost model accuracy. However, the current system is at an experimentation stage and doesn't match the prompt performance of large language models.

        model : typing.Optional[SpeechToTextTranslateModel]
            Model to be used for speech to text translation.

            - **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
              - Example: Hindi audio → English text output

        input_audio_codec : typing.Optional[InputAudioCodec]
            Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files are supported only at 16kHz sample rate.

        request_options : typing.Optional[RequestOptions]
            Request-specific configuration.

        Returns
        -------
        HttpResponse[SpeechToTextTranslateResponse]
            Successful Response
        """
        _response = self._client_wrapper.httpx_client.request(
            "speech-to-text-translate",
            base_url=self._client_wrapper.get_environment().base,
            method="POST",
            data={
                "prompt": prompt,
                "model": model,
                "input_audio_codec": input_audio_codec,
            },
            files={
                "file": file,
            },
            request_options=request_options,
            omit=OMIT,
            force_multipart=True,
        )
        try:
            if 200 <= _response.status_code < 300:
                _data = typing.cast(
                    SpeechToTextTranslateResponse,
                    parse_obj_as(
                        type_=SpeechToTextTranslateResponse,  # type: ignore
                        object_=_response.json(),
                    ),
                )
                return HttpResponse(response=_response, data=_data)
            if _response.status_code == 400:
                raise BadRequestError(
                    headers=dict(_response.headers),
                    body=typing.cast(
                        typing.Optional[typing.Any],
                        parse_obj_as(
                            type_=typing.Optional[typing.Any],  # type: ignore
                            object_=_response.json(),
                        ),
                    ),
                )
            if _response.status_code == 403:
                raise ForbiddenError(
                    headers=dict(_response.headers),
                    body=typing.cast(
                        typing.Optional[typing.Any],
                        parse_obj_as(
                            type_=typing.Optional[typing.Any],  # type: ignore
                            object_=_response.json(),
                        ),
                    ),
                )
            if _response.status_code == 422:
                raise UnprocessableEntityError(
                    headers=dict(_response.headers),
                    body=typing.cast(
                        typing.Optional[typing.Any],
                        parse_obj_as(
                            type_=typing.Optional[typing.Any],  # type: ignore
                            object_=_response.json(),
                        ),
                    ),
                )
            if _response.status_code == 429:
                raise TooManyRequestsError(
                    headers=dict(_response.headers),
                    body=typing.cast(
                        typing.Optional[typing.Any],
                        parse_obj_as(
                            type_=typing.Optional[typing.Any],  # type: ignore
                            object_=_response.json(),
                        ),
                    ),
                )
            if _response.status_code == 500:
                raise InternalServerError(
                    headers=dict(_response.headers),
                    body=typing.cast(
                        typing.Optional[typing.Any],
                        parse_obj_as(
                            type_=typing.Optional[typing.Any],  # type: ignore
                            object_=_response.json(),
                        ),
                    ),
                )
            if _response.status_code == 503:
                raise ServiceUnavailableError(
                    headers=dict(_response.headers),
                    body=typing.cast(
                        typing.Optional[typing.Any],
                        parse_obj_as(
                            type_=typing.Optional[typing.Any],  # type: ignore
                            object_=_response.json(),
                        ),
                    ),
                )
            _response_json = _response.json()
        except JSONDecodeError:
            raise ApiError(status_code=_response.status_code, headers=dict(_response.headers), body=_response.text)
        raise ApiError(status_code=_response.status_code, headers=dict(_response.headers), body=_response_json)


class AsyncRawSpeechToTextClient:
    def __init__(self, *, client_wrapper: AsyncClientWrapper):
        self._client_wrapper = client_wrapper

    async def transcribe(
        self,
        *,
        file: core.File,
        model: typing.Optional[SpeechToTextModel] = OMIT,
        mode: typing.Optional[Mode] = OMIT,
        language_code: typing.Optional[SpeechToTextLanguage] = OMIT,
        input_audio_codec: typing.Optional[InputAudioCodec] = OMIT,
        request_options: typing.Optional[RequestOptions] = None,
    ) -> AsyncHttpResponse[SpeechToTextResponse]:
        """
        ## Speech to Text API

        This API transcribes speech to text in multiple Indian languages and English. Supports transcription for interactive applications.

        ### Available Options:
        - **REST API** (Current Endpoint): For quick responses under 30 seconds with immediate results
        - **Batch API**: For longer audio files, [Follow This Documentation](https://docs.sarvam.ai/api-reference-docs/api-guides-tutorials/speech-to-text/batch-api)
          - Supports diarization (speaker identification)

        ### Note:
        - Pricing differs for REST and Batch APIs
        - Diarization is only available in Batch API with separate pricing
        - Please refer to [here](https://docs.sarvam.ai/api-reference-docs/getting-started/pricing) for detailed pricing information

        Parameters
        ----------
        file : core.File
            See core.File for more documentation

        model : typing.Optional[SpeechToTextModel]
            Specifies the model to use for speech-to-text conversion.

            - **saarika:v2.5** (default): Transcribes audio in the spoken language.

            - **saaras:v3**: State-of-the-art model with flexible output formats. Supports multiple modes via the `mode` parameter: transcribe, translate, verbatim, translit, codemix.

        mode : typing.Optional[Mode]
            Mode of operation. **Only applicable when using saaras:v3 model.**

            Example audio: 'मेरा फोन नंबर है 9840950950'

            - **transcribe** (default): Standard transcription in the original language with proper formatting and number normalization.
              - Output: `मेरा फोन नंबर है 9840950950`

            - **translate**: Translates speech from any supported Indic language to English.
              - Output: `My phone number is 9840950950`

            - **verbatim**: Exact word-for-word transcription without normalization, preserving filler words and spoken numbers as-is.
              - Output: `मेरा फोन नंबर है नौ आठ चार zero नौ पांच zero नौ पांच zero`

            - **translit**: Romanization - Transliterates speech to Latin/Roman script only.
              - Output: `mera phone number hai 9840950950`

            - **codemix**: Code-mixed text with English words in English and Indic words in native script.
              - Output: `मेरा phone number है 9840950950`

        language_code : typing.Optional[SpeechToTextLanguage]
            Specifies the language of the input audio in BCP-47 format.

            **Note:** This parameter is optional for `saarika:v2.5` model.

            **Available Options:**
            - `unknown`: Use when the language is not known; the API will auto-detect.
            - `hi-IN`: Hindi
            - `bn-IN`: Bengali
            - `kn-IN`: Kannada
            - `ml-IN`: Malayalam
            - `mr-IN`: Marathi
            - `od-IN`: Odia
            - `pa-IN`: Punjabi
            - `ta-IN`: Tamil
            - `te-IN`: Telugu
            - `en-IN`: English
            - `gu-IN`: Gujarati

            **Additional Options (saaras:v3 only):**
            - `as-IN`: Assamese
            - `ur-IN`: Urdu
            - `ne-IN`: Nepali
            - `kok-IN`: Konkani
            - `ks-IN`: Kashmiri
            - `sd-IN`: Sindhi
            - `sa-IN`: Sanskrit
            - `sat-IN`: Santali
            - `mni-IN`: Manipuri
            - `brx-IN`: Bodo
            - `mai-IN`: Maithili
            - `doi-IN`: Dogri

        input_audio_codec : typing.Optional[InputAudioCodec]
            Input Audio codec/format of the input file. PCM files are supported only at 16kHz sample rate.

        request_options : typing.Optional[RequestOptions]
            Request-specific configuration.

        Returns
        -------
        AsyncHttpResponse[SpeechToTextResponse]
            Successful Response
        """
        _response = await self._client_wrapper.httpx_client.request(
            "speech-to-text",
            base_url=self._client_wrapper.get_environment().base,
            method="POST",
            data={
                "model": model,
                "mode": mode,
                "language_code": language_code,
                "input_audio_codec": input_audio_codec,
            },
            files={
                "file": file,
            },
            request_options=request_options,
            omit=OMIT,
            force_multipart=True,
        )
        try:
            if 200 <= _response.status_code < 300:
                _data = typing.cast(
                    SpeechToTextResponse,
                    parse_obj_as(
                        type_=SpeechToTextResponse,  # type: ignore
                        object_=_response.json(),
                    ),
                )
                return AsyncHttpResponse(response=_response, data=_data)
            if _response.status_code == 400:
                raise BadRequestError(
                    headers=dict(_response.headers),
                    body=typing.cast(
                        typing.Optional[typing.Any],
                        parse_obj_as(
                            type_=typing.Optional[typing.Any],  # type: ignore
                            object_=_response.json(),
                        ),
                    ),
                )
            if _response.status_code == 403:
                raise ForbiddenError(
                    headers=dict(_response.headers),
                    body=typing.cast(
                        typing.Optional[typing.Any],
                        parse_obj_as(
                            type_=typing.Optional[typing.Any],  # type: ignore
                            object_=_response.json(),
                        ),
                    ),
                )
            if _response.status_code == 422:
                raise UnprocessableEntityError(
                    headers=dict(_response.headers),
                    body=typing.cast(
                        typing.Optional[typing.Any],
                        parse_obj_as(
                            type_=typing.Optional[typing.Any],  # type: ignore
                            object_=_response.json(),
                        ),
                    ),
                )
            if _response.status_code == 429:
                raise TooManyRequestsError(
                    headers=dict(_response.headers),
                    body=typing.cast(
                        typing.Optional[typing.Any],
                        parse_obj_as(
                            type_=typing.Optional[typing.Any],  # type: ignore
                            object_=_response.json(),
                        ),
                    ),
                )
            if _response.status_code == 500:
                raise InternalServerError(
                    headers=dict(_response.headers),
                    body=typing.cast(
                        typing.Optional[typing.Any],
                        parse_obj_as(
                            type_=typing.Optional[typing.Any],  # type: ignore
                            object_=_response.json(),
                        ),
                    ),
                )
            if _response.status_code == 503:
                raise ServiceUnavailableError(
                    headers=dict(_response.headers),
                    body=typing.cast(
                        typing.Optional[typing.Any],
                        parse_obj_as(
                            type_=typing.Optional[typing.Any],  # type: ignore
                            object_=_response.json(),
                        ),
                    ),
                )
            _response_json = _response.json()
        except JSONDecodeError:
            raise ApiError(status_code=_response.status_code, headers=dict(_response.headers), body=_response.text)
        raise ApiError(status_code=_response.status_code, headers=dict(_response.headers), body=_response_json)

    async def translate(
        self,
        *,
        file: core.File,
        prompt: typing.Optional[str] = OMIT,
        model: typing.Optional[SpeechToTextTranslateModel] = OMIT,
        input_audio_codec: typing.Optional[InputAudioCodec] = OMIT,
        request_options: typing.Optional[RequestOptions] = None,
    ) -> AsyncHttpResponse[SpeechToTextTranslateResponse]:
        """
        ## Speech to Text Translation API

        This API automatically detects the input language, transcribes the speech, and translates the text to English.

        ### Available Options:
        - **REST API** (Current Endpoint): For quick responses under 30 seconds with immediate results
        - **Batch API**: For longer audio files [Follow this documentation](https://docs.sarvam.ai/api-reference-docs/api-guides-tutorials/speech-to-text/batch-api)
          - Supports diarization (speaker identification)

        ### Note:
        - Pricing differs for REST and Batch APIs
        - Diarization is only available in Batch API with separate pricing
        - Please refer to [here](https://docs.sarvam.ai/api-reference-docs/getting-started/pricing) for detailed pricing information

        Parameters
        ----------
        file : core.File
            See core.File for more documentation

        prompt : typing.Optional[str]
            Conversation context can be passed as a prompt to boost model accuracy. However, the current system is at an experimentation stage and doesn't match the prompt performance of large language models.

        model : typing.Optional[SpeechToTextTranslateModel]
            Model to be used for speech to text translation.

            - **saaras:v2.5** (default): Translation model that translates audio from any spoken Indic language to English.
              - Example: Hindi audio → English text output

        input_audio_codec : typing.Optional[InputAudioCodec]
            Audio codec/format of the input file. Our API automatically detects all codec formats, but for PCM files specifically (pcm_s16le, pcm_l16, pcm_raw), you must pass this parameter. PCM files are supported only at 16kHz sample rate.

        request_options : typing.Optional[RequestOptions]
            Request-specific configuration.

        Returns
        -------
        AsyncHttpResponse[SpeechToTextTranslateResponse]
            Successful Response
        """
        _response = await self._client_wrapper.httpx_client.request(
            "speech-to-text-translate",
            base_url=self._client_wrapper.get_environment().base,
            method="POST",
            data={
                "prompt": prompt,
                "model": model,
                "input_audio_codec": input_audio_codec,
            },
            files={
                "file": file,
            },
            request_options=request_options,
            omit=OMIT,
            force_multipart=True,
        )
        try:
            if 200 <= _response.status_code < 300:
                _data = typing.cast(
                    SpeechToTextTranslateResponse,
                    parse_obj_as(
                        type_=SpeechToTextTranslateResponse,  # type: ignore
                        object_=_response.json(),
                    ),
                )
                return AsyncHttpResponse(response=_response, data=_data)
            if _response.status_code == 400:
                raise BadRequestError(
                    headers=dict(_response.headers),
                    body=typing.cast(
                        typing.Optional[typing.Any],
                        parse_obj_as(
                            type_=typing.Optional[typing.Any],  # type: ignore
                            object_=_response.json(),
                        ),
                    ),
                )
            if _response.status_code == 403:
                raise ForbiddenError(
                    headers=dict(_response.headers),
                    body=typing.cast(
                        typing.Optional[typing.Any],
                        parse_obj_as(
                            type_=typing.Optional[typing.Any],  # type: ignore
                            object_=_response.json(),
                        ),
                    ),
                )
            if _response.status_code == 422:
                raise UnprocessableEntityError(
                    headers=dict(_response.headers),
                    body=typing.cast(
                        typing.Optional[typing.Any],
                        parse_obj_as(
                            type_=typing.Optional[typing.Any],  # type: ignore
                            object_=_response.json(),
                        ),
                    ),
                )
            if _response.status_code == 429:
                raise TooManyRequestsError(
                    headers=dict(_response.headers),
                    body=typing.cast(
                        typing.Optional[typing.Any],
                        parse_obj_as(
                            type_=typing.Optional[typing.Any],  # type: ignore
                            object_=_response.json(),
                        ),
                    ),
                )
            if _response.status_code == 500:
                raise InternalServerError(
                    headers=dict(_response.headers),
                    body=typing.cast(
                        typing.Optional[typing.Any],
                        parse_obj_as(
                            type_=typing.Optional[typing.Any],  # type: ignore
                            object_=_response.json(),
                        ),
                    ),
                )
            if _response.status_code == 503:
                raise ServiceUnavailableError(
                    headers=dict(_response.headers),
                    body=typing.cast(
                        typing.Optional[typing.Any],
                        parse_obj_as(
                            type_=typing.Optional[typing.Any],  # type: ignore
                            object_=_response.json(),
                        ),
                    ),
                )
            _response_json = _response.json()
        except JSONDecodeError:
            raise ApiError(status_code=_response.status_code, headers=dict(_response.headers), body=_response.text)
        raise ApiError(status_code=_response.status_code, headers=dict(_response.headers), body=_response_json)