o
    ui6                     @   s   d dl Z d dlmZmZmZmZ d dlZd dlZd dl	Z	d dl
mZmZmZmZ d dlmZmZ d dlmZmZmZ d dlZG dd deZ	 								
		ddededededee dedee dedee dedefddZdS )    N)ListUnionOptional
NamedTuple)	N_SAMPLESSAMPLE_RATE
load_audiolog_mel_spectrogram)TranscriptionResultSingleSegment)WhisperModelFasterWhisperPipelinefind_numeral_symbol_tokensc                       s   e Zd ZdZ					ddedeeed	f d
ef fddZ	de
jfddZ							ddeee
jf dee defddZ		ddee deee
jf dedededefddZ  ZS )VadFreeFasterWhisperPipelinez(
    FasterWhisperModel without VAD
    NptenFoptionsdeviceztorch.devicesuppress_numeralsc           	         s*   t  jd|di ||||||d	| dS )a  
        Initialize the VadFreeFasterWhisperPipeline.

        Args:
            model: The Whisper model instance.
            options: Transcription options.
            tokenizer: The tokenizer instance.
            device: Device to run the model on.
            framework: The framework to use ('pt' for PyTorch).
            language: The language for transcription.
            suppress_numerals: Whether to suppress numeral tokens.
            **kwargs: Additional keyword arguments.

        Returns:
            None
        N)	modelvad
vad_paramsr   	tokenizerr   	frameworklanguager    )super__init__)	selfr   r   r   r   r   r   r   kwargs	__class__r   =/home/ubuntu/sommelier/podcast-pipeline/models/whisper_asr.pyr      s   

z%VadFreeFasterWhisperPipeline.__init__audioc                 C   s   | j jd}|jd tkr#tjd|jd t }|||t  }n|dt }|jd tkr2dnt|jd  }t||durA|nd|d}| j 	|}| j j 
|}|d d \}	}
|	dd }||
fS )z
        Detect the language of the audio.

        Args:
            audio (np.ndarray): The input audio signal.

        Returns:
            tuple: Detected language and its probability.
        feature_sizer   NP   )n_melspadding   )r   feat_kwargsgetshaper   nprandomrandintr	   encodedetect_language)r   r$   model_n_melsstart_indexaudio_sampler(   segmentencoder_outputresultslanguage_tokenlanguage_probabilityr   r   r   r#   r2   >   s    
 z,VadFreeFasterWhisperPipeline.detect_languager      vad_segmentsreturnc
              
   C   sR  t |tr	t|}dd }
|du r"| |}t |tr |d }n|}t |tr+|d }ddg}|durA||vrAtd| d d}| jdu r[|pId}tjj| j	j
| j	j	j||d	| _n?|p`| jj}t| jd
rt| jj|v rt|pr| jj}n|pwd}||vr~d}|| jjks|| jjkrtjj| j	j
| j	j	j||d	| _| jr| jj}t| j}|| jj }tt|}| jj|d| _g }|p| j}t|}tj|dd}t| j|
||||dD ]5\}}|r|d |d }|dv r|d }||t|| d dt|| d d|| ddd q| jdu rd| _| jr$| jj|d| _||dS )a  
        Transcribe the audio into text.

        Args:
            audio (Union[str, np.ndarray]): The input audio signal or path to audio file.
            vad_segments (List[dict]): List of VAD segments.
            batch_size (int, optional): Batch size for transcription. Defaults to None.
            num_workers (int, optional): Number of workers for loading data. Defaults to 0.
            language (str, optional): Language for transcription. Defaults to None.
            task (str, optional): Task type ('transcribe' or 'translate'). Defaults to None.
            chunk_size (int, optional): Size of chunks for processing. Defaults to 30.
            print_progress (bool, optional): Whether to print progress. Defaults to False.
            combined_progress (bool, optional): Whether to combine progress. Defaults to False.

        Returns:
            TranscriptionResult: The transcription result containing segments and language.
        c                 s   sB    |D ]}t |d t }t |d t }d| || iV  qd S )Nstartendinputs)intr   )r$   segmentssegf1f2r   r   r#   data{   s   z5VadFreeFasterWhisperPipeline.transcribe.<locals>.dataNr   
transcribe	translatezWarning: Invalid task 'z', defaulting to 'transcribe'taskr   rJ   )suppress_tokensTranscribing)totaldesc)
batch_sizenum_workers   text)r   rQ   Nr>      r?   speaker)rR   r>   r?   rT   )rB   r   ) 
isinstancestrr   r2   tupleprintr   faster_whisper	Tokenizerr   hf_tokenizeris_multilinguallanguage_codehasattrrJ   r   r   rK   r   listset_replace_batch_sizelentqdm	enumerate__call__updateappendroundr,   preset_language)r   r$   r<   rO   rP   r   rJ   
chunk_sizeprint_progresscombined_progressrF   detected_languagevalid_tasksprevious_suppress_tokensnumeral_symbol_tokensnew_suppressed_tokensrB   total_segmentsprogressidxoutrR   r   r   r#   rG   [   s   











z'VadFreeFasterWhisperPipeline.transcribecudarB   r   return_char_alignmentsc              
   C   s   t |tr
t|}n|}ztj||d\}}W n% ty< }	 ztd| d|	  td ||dW  Y d}	~	S d}	~	ww ztj||||||d}
|
W S  tym }	 ztd|	  td ||dW  Y d}	~	S d}	~	ww )	au  
        Align transcribed segments to get word-level timestamps using WhisperX alignment.

        Args:
            segments (List[dict]): Transcribed segments from transcribe().
            audio (Union[str, np.ndarray]): The input audio signal or path to audio file.
            language (str): Language code for the alignment model.
            device (str, optional): Device to run alignment on. Defaults to "cuda".
            return_char_alignments (bool, optional): Whether to return character-level alignments. Defaults to False.

        Returns:
            dict: Aligned segments with word-level timestamps.
        )r]   r   z6Warning: Could not load alignment model for language 'z': z0Returning segments without word-level alignment.)rB   word_segmentsN)rx   zWarning: Alignment failed: )rU   rV   r   whisperxload_align_model	ExceptionrX   align)r   rB   r$   r   r   rx   audio_arraymodel_ametadataealigned_resultr   r   r#   align_segments   s<   

z+VadFreeFasterWhisperPipeline.align_segments)Nr   r   r   F)Nr   NNr;   FF)rw   F)__name__
__module____qualname____doc__r   r   rA   rV   boolr   r.   ndarrayr2   r   dictr
   rG   r   __classcell__r   r   r!   r#   r      sZ    (!
 r   float16r   rG      whisper_archr   device_indexcompute_typeasr_optionsr   r   rJ   download_rootthreadsr=   c                 C   s$  |  drd}|pt| ||||
|d}|dur%tjj|j|jj|	|d}ntd d}i ddd	dd
ddddddddg dddddddddddddddddd d!gd"dd#dd$d%ddddddd&
}|dury|	| |d' }|d'= tj
jd)i |}t|||||d(S )*a  
    Load a Whisper model for inference.

    Args:
        whisper_arch (str): The name of the Whisper model to load.
        device (str): The device to load the model on.
        device_index (int, optional): The device index. Defaults to 0.
        compute_type (str, optional): The compute type to use for the model. Defaults to "float16".
        asr_options (Optional[dict], optional): Options for ASR. Defaults to None.
        language (Optional[str], optional): The language of the model. Defaults to None.
        vad_model: The VAD model instance. Defaults to None.
        vad_options: Options for VAD. Defaults to None.
        model (Optional[WhisperModel], optional): The WhisperModel instance to use. Defaults to None.
        task (str, optional): The task type ('transcribe' or 'translate'). Defaults to "transcribe".
        download_root (Optional[str], optional): The root directory to download the model to. Defaults to None.
        threads (int, optional): The number of CPU threads to use per worker. Defaults to 4.

    Returns:
        VadFreeFasterWhisperPipeline: The loaded Whisper pipeline.

    Raises:
        ValueError: If the whisper architecture is not recognized.
    z.enr   )r   r   r   r   cpu_threadsNrI   z`No language specified, language will be detected for each audio file (increases inference time).	beam_size   best_ofpatiencerQ   length_penaltyrepetition_penaltyno_repeat_ngram_sizer   temperatures)        g?g?333333?g?g      ?compression_ratio_thresholdg333333@log_prob_thresholdg      no_speech_thresholdr   condition_on_previous_textFprompt_reset_on_temperatureg      ?initial_promptprefixsuppress_blankTrK   r   without_timestampsr   u   "'“¿([{-u   "'.。,，!！?？:：”)]}、)
max_initial_timestampword_timestampsprepend_punctuationsappend_punctuationsr   max_new_tokensclip_timestampshallucination_silence_thresholdmultilingualhotwordsr   )r   r   r   r   r   r   )endswithr   rY   r   rZ   r[   r   r\   rX   rg   rG   TranscriptionOptionsr   )r   r   r   r   r   r   	vad_modelvad_optionsr   rJ   r   r   r   default_asr_optionsr   r   r   r#   load_asr_model  s   
&	


r   )
r   r   Nr   NNNrG   Nr   )rY   typingr   r   r   r   torchnumpyr.   rd   whisperx.audior   r   r   r	   whisperx.typesr
   r   whisperx.asrr   r   r   rz   r   rV   rA   r   r   r   r   r   r#   <module>   sZ     	
