o
    9wi?                     @   sN  d dl Z d dlmZmZmZ d dlmZ d dlZd dlZd dl	Z
d dlZd dlmZ d dlmZmZ d dlmZ d dlmZ d dlmZmZmZmZ d d	lmZmZ d d
lmZmZm Z  dd Z!G dd dej"Z"G dd deZ#	 											d"de$de$dee% dee$ dee dee$ dee% dee" dee$ de#fd d!Z&dS )#    N)ListOptionalUnion)replace)	Tokenizer)TranscriptionOptionsget_ctranslate2_storage)Pipeline)PipelineIterator)	N_SAMPLESSAMPLE_RATE
load_audiolog_mel_spectrogram)SingleSegmentTranscriptionResult)VadSileroPyannotec                 C   sJ   g }t | jD ]}| |gd}tdd |D }|r"|| q|S )N c                 s   s    | ]}|d v V  qdS )u   0123456789%$£N ).0cr   r   I/home/ubuntu/sommelier/.venv/lib/python3.10/site-packages/whisperx/asr.py	<genexpr>   s    z-find_numeral_symbol_tokens.<locals>.<genexpr>)rangeeotdecoderemoveprefixanyappend)	tokenizernumeral_symbol_tokensitokenhas_numeral_symbolr   r   r   find_numeral_symbol_tokens   s   
r%   c                   @   sB   e Zd ZdZ	ddejdedefddZdejde	j
fd	d
ZdS )WhisperModelz
    FasterWhisperModel provides batched inference for faster-whisper.
    Currently only works in non-timestamp mode and fixed prompt for all samples in batch.
    Nfeaturesr    optionsc              
      s   |j d }g }d}|jd urd|j  } |}	||	 ||d  }
| j |
|j|j|jd}| |}t	t
|j| j }| jj||g| |j|j|j| j|j|jd}dd |D }dttt	  dtf fd	d
}||}|S )Nr   r   )without_timestampsprefixhotwords)	beam_sizepatiencelength_penalty
max_lengthsuppress_blanksuppress_tokensc                 S   s   g | ]}|j d  qS )r   )sequences_idsr   xr   r   r   
<listcomp>J   s    z9WhisperModel.generate_segment_batched.<locals>.<listcomp>tokensreturnc                    s2   g }| D ]}|  fdd|D  q j|S )Nc                    s   g | ]	}| j k r|qS r   )r   )r   r#   r    r   r   r5   O   s    zOWhisperModel.generate_segment_batched.<locals>.decode_batch.<locals>.<listcomp>)r   r    decode_batch)r6   restkr8   r   r   r9   L   s   z;WhisperModel.generate_segment_batched.<locals>.decode_batch)shapeinitial_promptstripencodeextend
get_promptr)   r*   r+   introundmax_initial_timestamptime_precisionmodelgenerater,   r-   r.   r/   r0   r1   r   str)selfr'   r    r(   encoder_output
batch_size
all_tokensprompt_reset_sincer=   initial_prompt_tokensprevious_tokenspromptmax_initial_timestamp_indexresulttokens_batchr9   textr   r8   r   generate_segment_batched"   sB   




z%WhisperModel.generate_segment_batchedr7   c                 C   sN   | j jdkot| j jdk}t|jdkrt|d}t|}| j j||dS )Ncuda      r   )to_cpu)	rF   devicelendevice_indexr<   npexpand_dimsr   r?   )rI   r'   rY   r   r   r   r?   W   s
   zWhisperModel.encodeN)__name__
__module____qualname____doc__r]   ndarrayr   r   rU   ctranslate2StorageViewr?   r   r   r   r   r&      s    

5r&   c                       s   e Zd ZdZ					d(dededed	ee d
e	e
edf dee def fddZdd Zdd Zdd Zdd Zde
de
dededef
ddZ					 			d)d!e	eejf dee
 dee d"ee d#ef
d$d%Zd!ejd#efd&d'Z  ZS )*FasterWhisperPipelinez>
    Huggingface Pipeline wrapper for FasterWhisperModel.
    NptFrF   
vad_paramsr(   r    rZ   ztorch.devicelanguagesuppress_numeralsc
                    s   || _ || _|| _|| _|	| _|
dd | _d| _| jdi |
\| _	| _
| _d| _|| _| jdkr]t|tjr<|| _n$t|trHt|| _n|dk rStd| _ntd| | _n|| _tt|   || _|| _d S )NrK   rW   r   ri   cpuzcuda:r   )rF   r    r(   preset_languagerl   pop_batch_size_num_workers_sanitize_parameters_preprocess_params_forward_params_postprocess_params
call_count	framework
isinstancetorchrZ   rH   superr	   __init__	vad_model_vad_params)rI   rF   vadrj   r(   r    rZ   rw   rk   rl   kwargs	__class__r   r   r{   j   s,   


zFasterWhisperPipeline.__init__c                 K   s"   i }d|v r|d |d< |i i fS )Nr    	maybe_argr   )rI   r   preprocess_kwargsr   r   r   rr      s   
z*FasterWhisperPipeline._sanitize_parametersc                 C   sB   |d }| j jd}t||d ur|ndt|jd  d}d|iS )Ninputsfeature_sizeP   r   n_melspadding)rF   feat_kwargsgetr   r   r<   )rI   audiomodel_n_melsr'   r   r   r   
preprocess   s   z FasterWhisperPipeline.preprocessc                 C   s    | j |d | j| j}d|iS )Nr   rT   )rF   rU   r    r(   )rI   model_inputsoutputsr   r   r   _forward   s   zFasterWhisperPipeline._forwardc                 C   s   |S r_   r   )rI   model_outputsr   r   r   postprocess   s   z!FasterWhisperPipeline.postprocessnum_workersrK   preprocess_paramsforward_paramspostprocess_paramsc                 C   sd   t || j|}dtjvrdtjd< dd }tjjj||||d}	t |	| j||d}
t |
| j	|}|S )NTOKENIZERS_PARALLELISMfalsec                 S   s   dt dd | D iS )Nr   c                 S   s   g | ]}|d  qS )r   r   r3   r   r   r   r5      s    zEFasterWhisperPipeline.get_iterator.<locals>.stack.<locals>.<listcomp>)ry   stack)itemsr   r   r   r      s   z1FasterWhisperPipeline.get_iterator.<locals>.stack)r   rK   
collate_fn)loader_batch_size)
r
   r   osenvironry   utilsdata
DataLoaderforwardr   )rI   r   r   rK   r   r   r   datasetr   
dataloadermodel_iteratorfinal_iteratorr   r   r   get_iterator   s   	

z"FasterWhisperPipeline.get_iteratorr      r   taskr7   c
                 C   st  t |tr	t|}dd }
tt| jtr | j|}| jj}nt	|}t	j}| |t
d}|||| jd | jd d}| jd u r]|pI| |}|pMd}t| jj| jjj||d| _n&|pb| jj}|ph| jj}|| jjksu|| jjkrt| jj| jjj||d| _| jr| jj}t| j}td	 || jj }tt|}t| j|d
| _g }|p| j}t|}t| j|
||||dD ]`\}}|r|d | d }|r|d n|}td|dd |d }|dv r|d }|	rtdt || d d dt || d d d|  |!|t || d dt || d dd q| j"d u r)d | _| jr5t| j|d
| _||dS )Nc                 s   sB    |D ]}t |d t }t |d t }d| || iV  qd S )Nstartendr   )rB   r   )r   segmentssegf1f2r   r   r   r      s   z.FasterWhisperPipeline.transcribe.<locals>.data)waveformsample_rate	vad_onset
vad_offset)onsetoffset
transcriber   rk   z%Suppressing numeral and symbol tokens)r1   )rK   r   rW   d   rX   z
Progress: .2fz%...rT   )r   rW   Nr   zTranscript: [r      z --> r   z] )rT   r   r   )r   rk   )#rx   rH   r   
issubclasstyper|   r   preprocess_audiomerge_chunksr   r   r}   r    detect_languager   rF   hf_tokenizeris_multilinguallanguage_coder   rl   r(   r1   r%   printlistsetr   rp   r[   	enumerate__call__rC   r   rn   )rI   r   rK   r   rk   r   
chunk_sizeprint_progresscombined_progressverboser   r   r   vad_segmentsprevious_suppress_tokensr!   new_suppressed_tokensr   total_segmentsidxoutbase_progresspercent_completerT   r   r   r   r      s   
	





"6	
z FasterWhisperPipeline.transcribec           	      C   s   |j d tk rtd | jjd}t|d t |d ur|nd|j d tkr(dnt|j d  d}| j|}| jj|}|d d \}}|dd }td| d	|d
d |S )Nr   zIWarning: audio is shorter than 30s, language detection may be inaccurate.r   r   r   rX   zDetected language: z (r   z) in first 30s of audio...)	r<   r   r   rF   r   r   r   r?   r   )	rI   r   r   segmentrJ   resultslanguage_tokenlanguage_probabilityrk   r   r   r   r     s   z%FasterWhisperPipeline.detect_language)Nrh   ri   NF)Nr   NNr   FFF)r`   ra   rb   rc   r&   dictr   r   r   r   rB   rH   boolr{   rr   r   r   r   r   r]   rd   r   r   r   __classcell__r   r   r   r   rg   b   sr    	
'


arg   float16pyannoter   F   whisper_archrZ   asr_optionsrk   r|   
vad_methodvad_optionsrF   download_rootr7   c                 C   s  |  drd}|	pt| ||||||d}	|dur$t|	j|	jj|
|d}ntd d}i ddd	dd
ddddddddg dddddddddddddddddd d!gd"dd#dd$d%|	jjdddddd&
}|durz|| |d' }|d'= td1i |}d(dd)d*}|dur|| |durtd+ |}n%|d,krt	d1i |}n|d-krt
t|fd.di|}ntd/| t|	||||||d0S )2aA  Load a Whisper model for inference.
    Args:
        whisper_arch - The name of the Whisper model to load.
        device - The device to load the model on.
        compute_type - The compute type to use for the model.
        vad_method - The vad method to use. vad_model has higher priority if is not None.
        options - A dictionary of options to use for the model.
        language - The language of the model. (use English for now)
        model - The WhisperModel instance to use.
        download_root - The root directory to download the model to.
        local_files_only - If `True`, avoid downloading the file and return the path to the local cached file if it exists.
        threads - The number of cpu threads to use per worker, e.g. will be multiplied by num workers.
    Returns:
        A Whisper pipeline.
    z.enen)rZ   r\   compute_typer   local_files_onlycpu_threadsNr   ziNo language specified, language will be first be detected for each audio file (increases inference time).r,      best_ofr-   rW   r.   repetition_penaltyno_repeat_ngram_sizer   temperatures)        g?g?333333?g?g      ?compression_ratio_thresholdg333333@log_prob_thresholdg      no_speech_thresholdr   condition_on_previous_textFprompt_reset_on_temperatureg      ?r=   r*   r0   Tr1   rh   r)   r   u   "'“¿([{-u   "'.。,，!！?？:：”)]}、)
rD   word_timestampsprepend_punctuationsappend_punctuationsmultilingualrl   max_new_tokensclip_timestampshallucination_silence_thresholdr+   rl   r   gZd;?)r   r   r   z7Use manually assigned vad_model. vad_method is ignored.sileror   use_auth_tokenzInvalid vad_method: )rF   r~   r(   r    rk   rl   rj   r   )endswithr&   r   r   rF   r   r   updater   r   r   ry   rZ   
ValueErrorrg   )r   rZ   r\   r   r   rk   r|   r   r   rF   r   r   r   threadsr    default_asr_optionsrl   default_vad_optionsr   r   r   
load_model-  s   
 	


r  )r   r   NNNr   NNr   NFr   )'r   typingr   r   r   dataclassesr   re   faster_whispernumpyr]   ry   faster_whisper.tokenizerr   faster_whisper.transcriber   r   transformersr	   transformers.pipelines.pt_utilsr
   whisperx.audior   r   r   r   whisperx.typesr   r   whisperx.vadsr   r   r   r%   r&   rg   rH   r   r  r   r   r   r   <module>   sf    	F O	
