o
    ie                  
   @   s  d Z ddlZddlmZ ddlmZmZ ddlm	Z	m
Z
mZmZmZ ddlmZ ddlmZ ddlmZmZmZmZmZmZmZ dd	lmZmZmZ dd
lmZ ddl m!Z!m"Z" ddl#m$Z$m%Z% ddl&m'Z' ddl(m)Z) zddl*Z+W n  e,y Z- ze.de-  e.d e/de- dZ-[-ww de$dee0 fddZ1eG dd deZ2eG dd deZ3G dd de"Z4G dd de!Z5dS )zYNVIDIA Riva Speech-to-Text service implementations for real-time and batch transcription.    N)CancelledError)	dataclassfield)AnyAsyncGeneratorListMappingOptional)logger)	BaseModel)CancelFrameEndFrame
ErrorFrameFrameInterimTranscriptionFrame
StartFrameTranscriptionFrame)	NOT_GIVENSTTSettings	_NotGiven)NVIDIA_TTFS_P99)SegmentedSTTService
STTService)Languageresolve_language)time_now_iso8601)
traced_sttzException: zNIn order to use NVIDIA Riva STT, you need to `pip install pipecat-ai[nvidia]`.zMissing module: languagereturnc                 C   s   i t jdt jdt jdt jdt jdt jdt jdt jdt j	dt j
dt jdt jdt jdt jdt jd	t jd	t jd
t jd
t jdt jdt jdt jdt jdi}t| |ddS )a@  Maps Language enum to NVIDIA Riva ASR language codes.

    Source:
    https://docs.nvidia.com/deeplearning/riva/user-guide/docs/asr/asr-riva-build-table.html?highlight=fr%20fr

    Args:
        language: Language enum value.

    Returns:
        Optional[str]: NVIDIA Riva language code or None if not supported.
    zar-ARen-USzen-GBzfr-FRzde-DEzhi-INzit-ITzja-JPzko-KRzpt-BRzru-RUzes-ESzes-USF)use_base_code)r   ARENEN_USEN_GBFRFR_FRDEDE_DEHIHI_INITIT_ITJAJA_JPKOKO_KRPTPT_BRRURU_RUESES_ESES_USr   )r   LANGUAGE_MAP r9   O/home/ubuntu/.local/lib/python3.10/site-packages/pipecat/services/nvidia/stt.py language_to_nvidia_riva_language*   sV   	%r;   c                   @   s   e Zd ZdZdS )NvidiaSTTSettingszSettings for NvidiaSTTService.N)__name__
__module____qualname____doc__r9   r9   r9   r:   r<   ^   s    r<   c                   @   s   e Zd ZU dZedd dZeeB ed< edd dZ	eeB ed< edd dZ
eeB ed	< ed
d dZee dB eB ed< edd dZeeB ed< dS )NvidiaSegmentedSTTSettingsa  Settings for NvidiaSegmentedSTTService.

    Parameters:
        profanity_filter: Whether to filter profanity from results.
        automatic_punctuation: Whether to add automatic punctuation.
        verbatim_transcripts: Whether to return verbatim transcripts.
        boosted_lm_words: List of words to boost in language model.
        boosted_lm_score: Score boost for specified words.
    c                   C      t S Nr   r9   r9   r9   r:   <lambda>q       z#NvidiaSegmentedSTTSettings.<lambda>)default_factoryprofanity_filterc                   C   rB   rC   rD   r9   r9   r9   r:   rE   r   rF   automatic_punctuationc                   C   rB   rC   rD   r9   r9   r9   r:   rE   s   rF   verbatim_transcriptsc                   C   rB   rC   rD   r9   r9   r9   r:   rE   t   rF   Nboosted_lm_wordsc                   C   rB   rC   rD   r9   r9   r9   r:   rE   u   rF   boosted_lm_score)r=   r>   r?   r@   r   rH   boolr   __annotations__rI   rJ   rK   r   strrL   floatr9   r9   r9   r:   rA   e   s   
 
"rA   c                       sd  e Zd ZU dZeZeed< G dd deZddddd	d	d
d	e	dde
de
dee
e
f dee dee dedee dee f fddZdd Zdd ZdefddZde
fddZd ef fd!d"Zd ef fd#d$Zd ef fd%d&Zd'd( Zd)d* Zd+d, Ze		d;d-e
d.ed/ee fd0d1Z d2d3 Z!d4e"de#e$d	f fd5d6Z%de"fd7d8Z&d9d: Z'  Z(S )<NvidiaSTTServicea  Real-time speech-to-text service using NVIDIA Riva streaming ASR.

    Provides real-time transcription capabilities using NVIDIA's Riva ASR models
    through streaming recognition. Supports interim results and continuous audio
    processing for low-latency applications.
    	_settingsc                   @   s$   e Zd ZU dZejZee ed< dS )zNvidiaSTTService.InputParamsa  Configuration parameters for NVIDIA Riva STT service.

        .. deprecated:: 0.0.105
            Use ``settings=NvidiaSTTService.Settings(...)`` instead.

        Parameters:
            language: Target language for transcription. Defaults to EN_US.
        r   N)	r=   r>   r?   r@   r   r#   r   r	   rN   r9   r9   r9   r:   InputParams   s   
 	rS   grpc.nvcf.nvidia.com:443z$1598d209-5e27-4d3c-8079-4751568b1081zparakeet-ctc-1.1b-asrfunction_id
model_nameNTservermodel_function_mapsample_rateparamsuse_sslsettingsttfs_p99_latencyapi_keyrY   rZ   r[   r\   r]   r^   r_   c                   s   | j |dtjd}
|dur| d |s|j|
_|dur#|
| t jd
|||
d|	 || _	|| _
|| _d| _d| _d| _d| _d| _d| _d| _|d	| _d| _d| _d| _d| _dS )a/  Initialize the NVIDIA Riva STT service.

        Args:
            api_key: NVIDIA API key for authentication.
            server: NVIDIA Riva server address. Defaults to NVIDIA Cloud Function endpoint.
            model_function_map: Mapping containing 'function_id' and 'model_name' for the ASR model.
            sample_rate: Audio sample rate in Hz. If None, uses pipeline default.
            params: Additional configuration parameters for NVIDIA Riva.

                .. deprecated:: 0.0.105
                    Use ``settings=NvidiaSTTService.Settings(...)`` instead.

            use_ssl: Whether to use SSL for the NVIDIA Riva server. Defaults to True.
            settings: Runtime-updatable settings. When provided alongside deprecated
                parameters, ``settings`` values take precedence.
            ttfs_p99_latency: P99 latency from speech end to final transcript in seconds.
                Override for your deployment. See https://github.com/pipecat-ai/stt-benchmark
            **kwargs: Additional arguments passed to STTService.
        rW   )modelr   Nr\   r[   r_   r^          rV   r9   )Settingsgetr   r#   "_warn_init_param_moved_to_settingsr   apply_updatesuper__init___server_api_key_use_ssl_start_history_start_threshold_stop_history_stop_threshold_stop_history_eou_stop_threshold_eou_custom_configuration_function_id_asr_service_queue_config_thread_taskselfr`   rY   rZ   r[   r\   r]   r^   r_   kwargsdefault_settings	__class__r9   r:   rk      s@   $


zNvidiaSTTService.__init__c                 C   sB   d| j gdd| j gg}tjd | j| j|}tj|| _d S )Nfunction-idauthorizationBearer )	rv   rm   rivaclientAuthrn   rl   
ASRServicerw   r|   metadataauthr9   r9   r:   _initialize_client   s
   z#NvidiaSTTService._initialize_clientc                 C   sn   t jjt jjt jjj| jjddddd| jdd	dd}t j	|| j
| j| j| j| j| j t j|| j |S )5Create the NVIDIA Riva ASR recognition configuration.re      FT)	encodinglanguage_codera   max_alternativesrH   enable_automatic_punctuationrJ   sample_rate_hertzaudio_channel_count)configinterim_results)r   r   StreamingRecognitionConfigRecognitionConfigAudioEncoding
LINEAR_PCMrR   r   r[   !add_endpoint_parameters_to_configro   rp   rq   rs   rr   rt   "add_custom_configuration_to_configru   )r|   r   r9   r9   r:   _create_recognition_config   s2   	z+NvidiaSTTService._create_recognition_configr   c                 C      dS )zCheck if this service can generate processing metrics.

        Returns:
            False - this service does not support metrics generation.
        Fr9   r|   r9   r9   r:   can_generate_metrics     z%NvidiaSTTService.can_generate_metricsra   c                    sR   ddl }|  |d |jdtdd W d   dS 1 s"w   Y  dS )a  Set the ASR model for transcription.

        .. deprecated:: 0.0.104
            Model cannot be changed after initialization for NVIDIA Riva streaming STT.
            Set model and function id in the constructor instead, e.g.::

                NvidiaSTTService(
                    api_key=...,
                    model_function_map={"function_id": "<UUID>", "model_name": "<model_name>"},
                )

        Args:
            model: Model name to set.
        r   Nalwaysa  'set_model' is deprecated. Model cannot be changed after initialization for NVIDIA Riva streaming STT. Set model and function id in the constructor instead, e.g.: NvidiaSTTService(api_key=..., model_function_map={'function_id': '<UUID>', 'model_name': '<model_name>'})   )
stacklevel)warningscatch_warningssimplefilterwarnDeprecationWarning)r|   ra   r   r9   r9   r:   	set_model  s   

"zNvidiaSTTService.set_modelframec                    s^   t  |I dH  |   |  | _t | _| js#| 	| 
 | _td| jj  dS )zStart the NVIDIA Riva STT service and initialize streaming configuration.

        Args:
            frame: StartFrame indicating pipeline start.
        Nz)Initialized NvidiaSTTService with model: )rj   startr   r   ry   asyncioQueuerx   rz   create_task_thread_task_handlerr
   debugrR   ra   r|   r   r   r9   r:   r   (  s   

zNvidiaSTTService.startc                    &   t  |I dH  |  I dH  dS )zStop the NVIDIA Riva STT service and clean up resources.

        Args:
            frame: EndFrame indicating pipeline stop.
        N)rj   stop_stop_tasksr   r   r9   r:   r   9     zNvidiaSTTService.stopc                    r   )zCancel the NVIDIA Riva STT service operation.

        Args:
            frame: CancelFrame indicating operation cancellation.
        N)rj   cancelr   r   r   r9   r:   r   B  r   zNvidiaSTTService.cancelc                    s(   | j r| | j I d H  d | _ d S d S rC   )rz   cancel_taskr   r9   r9   r:   r   K  s
   
zNvidiaSTTService._stop_tasksc                 C   s>   | j j| | jd}|D ]}|jsqt| ||   qd S )N)audio_chunksstreaming_config)rw   streaming_response_generatorry   resultsr   run_coroutine_threadsafe_handle_responseget_event_loop)r|   	responsesresponser9   r9   r:   _response_handlerP  s   z"NvidiaSTTService._response_handlerc                    s:   zd| _ t| jI d H  W d S  tjy   d| _  w )NTF)_thread_runningr   	to_threadr   r   r   r9   r9   r:   r   Z  s   z%NvidiaSTTService._thread_task_handler
transcriptis_finalr   c                       dS z+Handle a transcription result with tracing.Nr9   r|   r   r   r   r9   r9   r:   _handle_transcriptionb     z&NvidiaSTTService._handle_transcriptionc              
      s   |j D ]U}|r|jsq|jd j}|rYt|dkrY|jrF|  I d H  | t|| jt	 | j
j|dI d H  | j||j| j
jdI d H  q| t|| jt	 | j
j|dI d H  qd S )Nr   )result)r   r   r   )r   alternativesr   lenr   stop_processing_metrics
push_framer   _user_idr   rR   r   r   r   )r|   r   r   r   r9   r9   r:   r   i  sB   


	
z!NvidiaSTTService._handle_responseaudioc                 C  s,   |   I dH  | j|I dH  dV  dS )zProcess audio data for speech-to-text transcription.

        Args:
            audio: Raw audio bytes to transcribe.

        Yields:
            None - transcription results are pushed to the pipeline via frames.
        N)start_processing_metricsrx   put)r|   r   r9   r9   r:   run_stt  s   	
zNvidiaSTTService.run_sttc                 C   s>   | j stzt| j |  }| W S  ty   tw )zGet the next audio chunk for NVIDIA Riva processing.

        Returns:
            Audio bytes from the queue.

        Raises:
            StopIteration: When the thread is no longer running.
        )	r   StopIterationr   r   rx   rg   r   r   FuturesCancelledError)r|   futurer9   r9   r:   __next__  s   	
zNvidiaSTTService.__next__c                 C   s   | S )zdReturn iterator for audio chunk processing.

        Returns:
            Self as iterator.
        r9   r   r9   r9   r:   __iter__  r   zNvidiaSTTService.__iter__rC   ))r=   r>   r?   r@   r<   rf   rN   r   rS   r   rO   r   r	   intrM   rP   rk   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   bytesr   r   r   r   r   __classcell__r9   r9   r   r:   rQ   x   sl   
 
	
M			
"rQ   c                       s>  e Zd ZU dZeZeed< G dd deZddddd	d	d
d	e	dde
de
dee
e
f dee dee dedee dee f fddZdedee
 fddZdd Zde
fddZdd Zdefd d!Zd"ef fd#d$Zd%edee
ef f fd&d'Ze		d/d(e
d)edee fd*d+Zd,ede e!d	f fd-d.Z"  Z#S )0NvidiaSegmentedSTTServicea  Speech-to-text service using NVIDIA Riva's offline/batch models.

    By default, his service uses NVIDIA's Riva Canary ASR API to perform speech-to-text
    transcription on audio segments. It inherits from SegmentedSTTService to handle
    audio buffering and speech detection.
    rR   c                   @   sh   e Zd ZU dZejZee ed< dZ	e
ed< dZe
ed< dZe
ed< dZeee  ed	< d
Zeed< dS )z%NvidiaSegmentedSTTService.InputParamsaz  Configuration parameters for NVIDIA Riva segmented STT service.

        .. deprecated:: 0.0.105
            Use ``settings=NvidiaSegmentedSTTService.Settings(...)`` instead.

        Parameters:
            language: Target language for transcription. Defaults to EN_US.
            profanity_filter: Whether to filter profanity from results.
            automatic_punctuation: Whether to add automatic punctuation.
            verbatim_transcripts: Whether to return verbatim transcripts.
            boosted_lm_words: List of words to boost in language model.
            boosted_lm_score: Score boost for specified words.
        r   FrH   TrI   rJ   NrK         @rL   )r=   r>   r?   r@   r   r#   r   r	   rN   rH   rM   rI   rJ   rK   r   rO   rL   rP   r9   r9   r9   r:   rS     s   
 rS   rT   z$ee8dc628-76de-4acc-8595-1836e7e857bdzcanary-1b-asrrU   NTrX   r`   rY   rZ   r[   r\   r]   r^   r_   c             	      s   | j |dtjdddddd}
|dur6| d |s6|jp tj|
_|j|
_|j|
_|j|
_|j	|
_	|j
|
_
|dur?|
| t jd|||
d|	 || _|| _|| _|d	| _d
| _d| _d
| _d| _d
| _d| _d| _d| _d| _dS )aR  Initialize the NVIDIA Riva segmented STT service.

        Args:
            api_key: NVIDIA API key for authentication
            server: NVIDIA Riva server address (defaults to NVIDIA Cloud Function endpoint)
            model_function_map: Mapping of model name and its corresponding NVIDIA Cloud Function ID
            sample_rate: Audio sample rate in Hz. If not provided, uses the pipeline's rate
            params: Additional configuration parameters for NVIDIA Riva

                .. deprecated:: 0.0.105
                    Use ``settings=NvidiaSegmentedSTTService.Settings(...)`` instead.

            use_ssl: Whether to use SSL for the NVIDIA Riva server. Defaults to True.
            settings: Runtime-updatable settings. When provided alongside deprecated
                parameters, ``settings`` values take precedence.
            ttfs_p99_latency: P99 latency from speech end to final transcript in seconds.
                Override for your deployment. See https://github.com/pipecat-ai/stt-benchmark
            **kwargs: Additional arguments passed to SegmentedSTTService
        rW   FTNr   )ra   r   rH   rI   rJ   rK   rL   r\   rb   rV   rc   rd   re   r9   )rf   rg   r   r#   rh   r   rH   rI   rJ   rK   rL   ri   rj   rk   rm   rl   rn   rv   ro   rp   rq   rr   rs   rt   ru   ry   rw   r{   r   r9   r:   rk     sP   $


z"NvidiaSegmentedSTTService.__init__r   r   c                 C   s   t |S )zConvert pipecat Language enum to NVIDIA Riva's language code.

        Args:
            language: Language enum value.

        Returns:
            NVIDIA Riva language code or None if not supported.
        )r;   )r|   r   r9   r9   r:   language_to_service_language,  s   	z6NvidiaSegmentedSTTService.language_to_service_languagec                 C   sP   | j durdS d| jgdd| j gg}tjd| j| j|}tj|| _ dS )zCInitialize the NVIDIA Riva ASR client with authentication metadata.Nr   r   r   )	rw   rv   rm   r   r   r   rn   rl   r   r   r9   r9   r:   r   7  s   
z,NvidiaSegmentedSTTService._initialize_clientc                 C   s   | j jpdS )z1Get the current NVIDIA Riva language code string.r   )rR   r   r   r9   r9   r:   _get_language_codeF  s   z,NvidiaSegmentedSTTService._get_language_codec              	   C   s~   | j }tjj|  d|j|j|jd}|jr tj	||j|j
 tj|| j| j| j| j| j| j | jr=tj|| j |S )r   r   )r   r   rH   r   rJ   )rR   r   r   r   r   rH   rI   rJ   rK   add_word_boosting_to_configrL   r   ro   rp   rq   rs   rr   rt   ru   r   )r|   sr   r9   r9   r:   r   J  s,   	z4NvidiaSegmentedSTTService._create_recognition_configc                 C   r   )zCheck if this service can generate processing metrics.

        Returns:
            True - this service supports metrics generation.
        Tr9   r   r9   r9   r:   r   k  r   z.NvidiaSegmentedSTTService.can_generate_metricsr   c                    s>   t  |I dH  |   |  | _td| jj  dS )zInitialize the service when the pipeline starts.

        Args:
            frame: StartFrame indicating pipeline start.
        Nz2Initialized NvidiaSegmentedSTTService with model: )	rj   r   r   r   ry   r
   r   rR   ra   r   r   r9   r:   r   s  s
   
zNvidiaSegmentedSTTService.startdeltac                    s&   t  |I dH }|r|  | _|S )zApply a settings delta and sync internal state.

        Args:
            delta: A :class:`STTSettings` (or ``NvidiaSegmentedSTTService.Settings``) delta.

        Returns:
            Dict mapping changed field names to their previous values.
        N)rj   _update_settingsr   ry   )r|   r   changedr   r9   r:   r   ~  s
   	
z*NvidiaSegmentedSTTService._update_settingsr   r   c                    r   r   r9   r   r9   r9   r:   r     r   z/NvidiaSegmentedSTTService._handle_transcriptionr   c              
   C  s  z| j dusJ d| jdusJ d|  I dH  | j j|| jdd}|  I dH  t|dr6| }n|}d}t|dg }|D ]6}t|dg }|rx|d	 j	 }|rxt
d
| d t|| jt | jjV  d}| |d| jjI dH  qB|st
|  d W dS W dS  ty }	 zt
|  d|	  t|  dt|	 V  W Y d}	~	dS d}	~	w ty }
 zt
|  d|
  t|  d|
 dV  W Y d}
~
dS d}
~
ww )zTranscribe an audio segment.

        Args:
            audio: Raw audio bytes in WAV format (already converted by base class).

        Yields:
            Frame: TranscriptionFrame containing the transcribed text.
        NzASR service not initializedzRecognition config not createdF)r   r   r   r   r   zTranscription: []Tz8: No transcription results found in NVIDIA Riva responsez2: Unexpected response structure from NVIDIA Riva: z*: Unexpected NVIDIA Riva response format: z exception: z error: )error)rw   ry   r   offline_recognizer   hasattrr   getattrr   stripr
   r   r   r   r   rR   r   r   AttributeErrorr   r   rO   	Exception)r|   r   raw_responser   transcription_foundr   r   r   textaeer9   r9   r:   r     sN   	

&$z!NvidiaSegmentedSTTService.run_sttrC   )$r=   r>   r?   r@   rA   rf   rN   r   rS   r   rO   r   r	   r   rM   rP   rk   r   r   r   r   r   r   r   r   r   dictr   r   r   r   r   r   r   r   r   r9   r9   r   r:   r     s`   
 
	
X!"r   )6r@   r   concurrent.futuresr   r   dataclassesr   r   typingr   r   r   r   r	   logurur
   pydanticr   pipecat.frames.framesr   r   r   r   r   r   r   pipecat.services.settingsr   r   r   pipecat.services.stt_latencyr   pipecat.services.stt_servicer   r   pipecat.transcriptions.languager   r   pipecat.utils.timer   (pipecat.utils.tracing.service_decoratorsr   riva.clientr   ModuleNotFoundErrorr   r   r   rO   r;   r<   rA   rQ   r   r9   r9   r9   r:   <module>   s>   $	
4  =