o
    iUG                  
   @   s  d Z ddlZddlmZmZ ddlmZ ddlmZm	Z	 ddl
ZddlmZ ddlmZmZ ddlmZmZmZ dd	lmZmZmZ dd
lmZ ddlmZmZ ddlmZ ddl m!Z! erzddl"m#Z# W n  e$y Z% ze&de%  e&d e'de% dZ%[%ww zddl(Z(W n  e$y Z% ze&de%  e&d e'de% dZ%[%ww G dd deZ)G dd deZ*dede	e+ fddZ,eG dd deZ-eG dd deZ.G dd  d eZ/G d!d" d"e/Z0dS )#zWhisper speech-to-text services with locally-downloaded models.

This module implements Whisper transcription using locally-downloaded models,
supporting both Faster Whisper and MLX Whisper backends for efficient inference.
    N)	dataclassfield)Enum)AsyncGeneratorOptional)logger)TYPE_CHECKINGoverride)
ErrorFrameFrameTranscriptionFrame)	NOT_GIVENSTTSettings	_NotGiven)SegmentedSTTService)Languageresolve_language)time_now_iso8601)
traced_sttWhisperModelException: GIn order to use Whisper, you need to `pip install pipecat-ai[whisper]`.zMissing module: zKIn order to use Whisper, you need to `pip install pipecat-ai[mlx-whisper]`.c                   @   s0   e Zd ZdZdZdZdZdZdZdZ	dZ
d	Zd
S )Modela   Whisper model selection options for Faster Whisper.

    Provides various model sizes and specializations for speech recognition,
    balancing quality and performance based on use case requirements.

    Parameters:
        TINY: Smallest multilingual model, fastest inference.
        BASE: Basic multilingual model, good speed/quality balance.
        SMALL: Small multilingual model, better speed/quality balance than BASE.
        MEDIUM: Medium-sized multilingual model, better quality.
        LARGE: Best quality multilingual model, slower inference.
        LARGE_V3_TURBO: Fast multilingual model, slightly lower quality than LARGE.
        DISTIL_LARGE_V2: Fast multilingual distilled model.
        DISTIL_MEDIUM_EN: Fast English-only distilled model.
    tinybasesmallmediumzlarge-v3z)deepdml/faster-whisper-large-v3-turbo-ct2z&Systran/faster-distil-whisper-large-v2z'Systran/faster-distil-whisper-medium.enN)__name__
__module____qualname____doc__TINYBASESMALLMEDIUMLARGELARGE_V3_TURBODISTIL_LARGE_V2DISTIL_MEDIUM_EN r*   r*   P/home/ubuntu/.local/lib/python3.10/site-packages/pipecat/services/whisper/stt.pyr   -   s    r   c                   @   s(   e Zd ZdZdZdZdZdZdZdZ	dS )	MLXModelas  MLX Whisper model selection options for Apple Silicon.

    Provides various model sizes optimized for Apple Silicon hardware,
    including quantized variants for improved performance.

    Parameters:
        TINY: Smallest multilingual model for MLX.
        MEDIUM: Medium-sized multilingual model for MLX.
        LARGE_V3: Best quality multilingual model for MLX.
        LARGE_V3_TURBO: Finetuned, pruned Whisper large-v3, much faster with slightly lower quality.
        DISTIL_LARGE_V3: Fast multilingual distilled model for MLX.
        LARGE_V3_TURBO_Q4: LARGE_V3_TURBO quantized to Q4 for reduced memory usage.
    zmlx-community/whisper-tinyz mlx-community/whisper-medium-mlxz"mlx-community/whisper-large-v3-mlxz$mlx-community/whisper-large-v3-turboz%mlx-community/distil-whisper-large-v3z'mlx-community/whisper-large-v3-turbo-q4N)
r   r   r    r!   r"   r%   LARGE_V3r'   DISTIL_LARGE_V3LARGE_V3_TURBO_Q4r*   r*   r*   r+   r,   K   s    r,   languagereturnc                 C   s   i t jdt jdt jdt jdt jdt jdt jdt jdt j	d	t j
d
t jdt jdt jdt jdt jdt jdt jdt jdt jdt jdt jdt jdt jdt jdt jdt jdt jdt jdt jdt jdi}t| |dd S )!a@  Maps pipecat Language enum to Whisper language codes.

    Args:
        language: A Language enum value representing the input language.

    Returns:
        str or None: The corresponding Whisper language code, or None if not supported.

    Note:
        Only includes languages officially supported by Whisper.
    arbncsdadeelenesfafifrhihuiditjakonlplptrorusksvthtrukurvizhT)use_base_code) r   ARBNCSDADEELENESFAFIFRHIHUIDITJAKONLPLPTRORUSKSVTHTRUKURVIZHr   )r0   LANGUAGE_MAPr*   r*   r+   language_to_whisper_languagec   sd   
 "$?rp   c                   @   s,   e Zd ZU dZedd dZeeB ed< dS )WhisperSTTSettingszSettings for WhisperSTTService.

    Parameters:
        no_speech_prob: Probability threshold for filtering non-speech segments.
    c                   C      t S Nr   r*   r*   r*   r+   <lambda>       zWhisperSTTSettings.<lambda>default_factoryno_speech_probN)	r   r   r    r!   r   ry   floatr   __annotations__r*   r*   r*   r+   rq      s   
 rq   c                   @   s`   e Zd ZU dZedd dZeeB ed< edd dZ	eeB ed< edd dZ
eeB ed	< d
S )WhisperMLXSTTSettingszSettings for WhisperMLXSTTService.

    Parameters:
        no_speech_prob: Probability threshold for filtering non-speech segments.
        temperature: Sampling temperature (0.0-1.0).
        engine: Whisper engine identifier.
    c                   C   rr   rs   rt   r*   r*   r*   r+   ru      rv   zWhisperMLXSTTSettings.<lambda>rw   ry   c                   C   rr   rs   rt   r*   r*   r*   r+   ru      rv   temperaturec                   C   rr   rs   rt   r*   r*   r*   r+   ru      rv   engineN)r   r   r    r!   r   ry   rz   r   r{   r}   r~   strr*   r*   r*   r+   r|      s
   
 r|   c                       s   e Zd ZU dZeZeed< ddddddddeee	B  ded	ed
ee
 dee dee f fddZdefddZdedee fddZdd Ze	ddededee fddZdedeedf fddZ  ZS )WhisperSTTServicezClass to transcribe audio with a locally-downloaded Whisper model.

    This service uses Faster Whisper to perform speech-to-text transcription on audio
    segments. It supports multiple languages and various model sizes.
    	_settingsNautodefault)modeldevicecompute_typery   r0   settingsr   r   r   ry   r0   r   c          	         s   | j tjjtjdd}|dur | dd t|tr|n|j|_	|dur-| dd ||_
|dur:| dd ||_|durC|| t jdd|i| || _|| _d| _|   dS )	a  Initialize the Whisper STT service.

        Args:
            model: The Whisper model to use for transcription. Can be a Model enum or string.

                .. deprecated:: 0.0.105
                    Use ``settings=WhisperSTTService.Settings(model=...)`` instead.

            device: The device to run inference on ('cpu', 'cuda', or 'auto').
                Defaults to ``"auto"``.
            compute_type: The compute type for inference ('default', 'int8',
                'int8_float16', etc.). Defaults to ``"default"``.
            no_speech_prob: Probability threshold for filtering out non-speech segments.

                .. deprecated:: 0.0.105
                    Use ``settings=WhisperSTTService.Settings(no_speech_prob=...)`` instead.

            language: The default language for transcription.

                .. deprecated:: 0.0.105
                    Use ``settings=WhisperSTTService.Settings(language=...)`` instead.

            settings: Runtime-updatable settings. When provided alongside deprecated
                parameters, ``settings`` values take precedence.
            **kwargs: Additional arguments passed to SegmentedSTTService.
        g?)r   r0   ry   Nr   ry   r0   r   r*   )Settingsr   r)   valuer   rW   "_warn_init_param_moved_to_settings
isinstancer   r   ry   r0   apply_updatesuper__init___device_compute_type_model_load)	selfr   r   r   ry   r0   r   kwargsdefault_settings	__class__r*   r+   r      s2   &

zWhisperSTTService.__init__r1   c                 C      dS )zIndicates whether this service can generate metrics.

        Returns:
            bool: True, as this service supports metric generation.
        Tr*   r   r*   r*   r+   can_generate_metrics  s   z&WhisperSTTService.can_generate_metricsc                 C   s   t |S )zConvert from pipecat Language to Whisper language code.

        Args:
            language: The Language enum value to convert.

        Returns:
            str or None: The corresponding Whisper language code, or None if not supported.
        )rp   )r   r0   r*   r*   r+   language_to_service_language'  s   	z.WhisperSTTService.language_to_service_languagec              
   C   s   zddl m} td || jj| j| jd| _td W dS  t	yB } zt
d|  t
d d| _W Y d}~dS d}~ww )	zLoads the Whisper model.

        Note:
            If this is the first time this model is being run,
            it will take time to download from the Hugging Face model hub.
        r   r   zLoading Whisper model...)r   r   zLoaded Whisper modelr   r   N)faster_whisperr   r   debugr   r   r   r   r   ModuleNotFoundErrorerror)r   r   er*   r*   r+   r   2  s   

zWhisperSTTService._load
transcriptis_finalc                       dS z+Handle a transcription result with tracing.Nr*   r   r   r   r0   r*   r*   r+   _handle_transcriptionF     z'WhisperSTTService._handle_transcriptionaudioc                 C  s   | j stdV  dS |  I dH  tj|tjdtjd }tj	| j j
|| jjdI dH \}}d}|D ]}|j| jjk rF||j d7 }q5|  I dH  |rs| |d| jjI dH  td	| d
 t|| jt | jjV  dS dS )a  Transcribe audio data using Whisper.

        Args:
            audio: Raw audio bytes in 16-bit PCM format.

        Yields:
            Frame: Either a TranscriptionFrame containing the transcribed text
                  or an ErrorFrame if transcription fails.

        Note:
            The audio is expected to be 16-bit signed PCM data.
            The service will normalize it to float32 in the range [-1, 1].
        zWhisper model not availableNdtype      @)r0     TTranscription: [])r   r
   start_processing_metricsnp
frombufferint16astypefloat32asyncio	to_thread
transcriber   r0   ry   textstop_processing_metricsr   r   r   r   _user_idr   )r   r   audio_floatsegments_r   segmentr*   r*   r+   run_sttM  s2   

zWhisperSTTService.run_sttrs   )r   r   r    r!   rq   r   r{   r   r   r   rz   r   r   boolr   r   r   r   r   bytesr   r   r   __classcell__r*   r*   r   r+   r      sH   
 
J"r   c                   @   s   e Zd ZU dZeZeed< dddddddeee	B  dee
 dee dee
 d	ee f
d
dZedd Ze	ddededee fddZededeedf fddZdS )WhisperSTTServiceMLXzSubclass of `WhisperSTTService` with MLX Whisper model support.

    This service uses MLX Whisper to perform speech-to-text transcription on audio
    segments. It's optimized for Apple Silicon and supports multiple languages and quantizations.
    r   N)r   ry   r0   r}   r   r   ry   r0   r}   r   c                K   s   | j tjjtjdddd}|dur"| dd t|tr|n|j|_	|dur/| dd ||_
|dur<| dd ||_|durI| d	d	 ||_|durR|| tj| fd
|i| dS )a  Initialize the MLX Whisper STT service.

        Args:
            model: The MLX Whisper model to use for transcription. Can be an MLXModel enum or string.

                .. deprecated:: 0.0.105
                    Use ``settings=WhisperSTTServiceMLX.Settings(model=...)`` instead.

            no_speech_prob: Probability threshold for filtering out non-speech segments.

                .. deprecated:: 0.0.105
                    Use ``settings=WhisperSTTServiceMLX.Settings(no_speech_prob=...)`` instead.

            language: The default language for transcription.

                .. deprecated:: 0.0.105
                    Use ``settings=WhisperSTTServiceMLX.Settings(language=...)`` instead.

            temperature: Temperature for sampling. Can be a float or tuple of floats.

                .. deprecated:: 0.0.105
                    Use ``settings=WhisperSTTServiceMLX.Settings(temperature=...)`` instead.

            settings: Runtime-updatable settings. When provided alongside deprecated
                parameters, ``settings`` values take precedence.
            **kwargs: Additional arguments passed to SegmentedSTTService.
        g333333?        mlx)r   r0   ry   r}   r~   Nr   ry   r0   r}   r   )r   r,   r"   r   r   rW   r   r   r   r   ry   r0   r}   r   r   r   )r   r   ry   r0   r}   r   r   r   r*   r*   r+   r     s8   &	

zWhisperSTTServiceMLX.__init__c                 C   r   )z7MLX Whisper loads models on demand, so this is a no-op.Nr*   r   r*   r*   r+   r     s   zWhisperSTTServiceMLX._loadr   r   c                    r   r   r*   r   r*   r*   r+   r     r   z*WhisperSTTServiceMLX._handle_transcriptionr   r1   c              
   C  s\  zddl }|  I dH  tj|tjdtjd }tj|j	|| j
j| j
j| j
jdI dH }d}|dg D ] }|ddd	krBq7|d
d| j
jk rW||dd d7 }q7t| dkrbd}|  I dH  |r| |d| j
jI dH  td| d t|| jt | j
jV  W dS W dS  ty } ztd| dV  W Y d}~dS d}~ww )a  Transcribe audio data using MLX Whisper.

        The audio is expected to be 16-bit signed PCM data.
        MLX Whisper will handle the conversion internally.

        Args:
            audio: Raw audio bytes in 16-bit PCM format.

        Yields:
            Frame: Either a TranscriptionFrame containing the transcribed text
                  or an ErrorFrame if transcription fails.
        r   Nr   r   )path_or_hf_repor}   r0   r   r   compression_ratiogrq?ry   r   r   r   Tr   r   zUnknown error occurred: )r   )mlx_whisperr   r   r   r   r   r   r   r   r   r   r   r}   r0   getry   lenstripr   r   r   r   r   r   r   	Exceptionr
   )r   r   r   r   chunkr   r   r   r*   r*   r+   r     sH   
 zWhisperSTTServiceMLX.run_sttrs   )r   r   r    r!   r|   r   r{   r   r   r,   rz   r   r   r	   r   r   r   r   r   r   r   r   r*   r*   r*   r+   r   y  sB   
 

K
 r   )1r!   r   dataclassesr   r   enumr   typingr   r   numpyr   logurur   typing_extensionsr   r	   pipecat.frames.framesr
   r   r   pipecat.services.settingsr   r   r   pipecat.services.stt_servicer   pipecat.transcriptions.languager   r   pipecat.utils.timer   (pipecat.utils.tracing.service_decoratorsr   r   r   r   r   r   r   r   r   r,   r   rp   rq   r|   r   r   r*   r*   r*   r+   <module>   sR   

N
 /