o
    Xi?                     @   sJ  d dl Z d dlZd dlZd dlmZmZmZmZmZ d dl	Z
d dlZd dlZddlmZmZmZmZmZ ddlmZmZ ddlmZmZmZ ddlmZmZmZmZmZm Z m!Z!m"Z" erbddl#m$Z$ dd	d
dddddddee%e
j&ej'f dee( dee)ee)df f dee) dee) dee) de(fddZ*dd Z+e,dkre+  dS dS )    N)ListOptionalTupleUnionTYPE_CHECKING   )SAMPLE_RATEN_FRAMES
HOP_LENGTHpad_or_trimlog_mel_spectrogram)DecodingOptionsDecodingResult)	LANGUAGESTO_LANGUAGE_CODEget_tokenizer)	exact_divformat_timestampoptional_intoptional_floatstr2bool	write_txt	write_vtt	write_srt)Whisper)g        皙?g?333333?g?g      ?333333@      r   T)verbosetemperaturecompression_ratio_thresholdlogprob_thresholdno_speech_thresholdcondition_on_previous_textmodelr   audior   r    .r!   r"   r#   r$   c          )         s   ddr	tjntj}	jtdkr+tj rtd |	tjkr+td tj}	|	tjkr4dd< t	|}
 dddu r{j
sHd	d< n3rNtd
 t|
tj|	}|\}}t||j dd< dur{tdtd     d } dd}tj
||ddtjdtffdd}dttjj}|t t }g }g  d}ddpg }|rʈd|  }|| dtdtdtjdtf fdd}|
j d }}t!j!|ddud9}|k r!tt t }t|
dddf tj|	}|j d t t }||d d < ||}t"|j#}|durN|j$|k}durC|j%krCd}|rN|j d 7 q|&j'}t(|dd |d!d @ d )d!} t*| dkrd}!| D ]1}"||!|" }#|#d + j' }$|#d + j' }%|||$|  ||%|  |#d!d |d" |"}!qs||!d!  + j' }&|&| 7 ||d|!d!  ,  n@|}'||- .  }(t*|(dkr|(d + j'kr|(d + j' }&|&| }'||||' ||d" |j d 7 ||,  |r|j/d#krt*|}|0t1||  }|k sW d   n	1 s,w   Y  t23|t*|d  |d$S )%a  
    Transcribe an audio file using Whisper

    Parameters
    ----------
    model: Whisper
        The Whisper model instance

    audio: Union[str, np.ndarray, torch.Tensor]
        The path to the audio file to open, or the audio waveform

    verbose: bool
        Whether to display the text being decoded to the console. If True, displays all the details,
        If False, displays minimal details. If None, does not display anything

    temperature: Union[float, Tuple[float, ...]]
        Temperature for sampling. It can be a tuple of temperatures, which will be successfully used
        upon failures according to either `compression_ratio_threshold` or `logprob_threshold`.

    compression_ratio_threshold: float
        If the gzip compression ratio is above this value, treat as failed

    logprob_threshold: float
        If the average log probability over sampled tokens is below this value, treat as failed

    no_speech_threshold: float
        If the no_speech probability is higher than this value AND the average log probability
        over sampled tokens is below `logprob_threshold`, consider the segment as silent

    condition_on_previous_text: bool
        if True, the previous output of the model is provided as a prompt for the next window;
        disabling may make the text inconsistent across windows, but the model becomes less prone to
        getting stuck in a failure loop, such as repetition looping or timestamps going out of sync.

    decode_options: dict
        Keyword arguments to construct `DecodingOptions` instances

    Returns
    -------
    A dictionary containing the resulting text ("text") and segment-level details ("segments"), and
    the spoken language ("language"), which is detected when `decode_options["language"]` is None.
    fp16Tcpuz2Performing inference on CPU when CUDA is availablez0FP16 is not supported on CPU; using FP32 insteadFlanguageNenz]Detecting language using up to the first 30 seconds. Use `--language` to specify the language)keyzDetected language: task
transcribe)r)   r,   segmentreturnc                    s   t ttfr
gn}d }|D ]K}i }|dkr'|dd  |dd  n|dd  tdi |d|i}| |}d} d urK|j krKd}d urV|jk rVd}|s[ |S q|S )	Nr   	beam_sizepatiencebest_ofr    FT )
isinstanceintfloatpopr   decodecompression_ratioavg_logprob)r.   temperaturesdecode_resulttkwargsoptionsneeds_fallback)r!   decode_optionsr"   r%   r    r3   F/home/ubuntu/.local/lib/python3.10/site-packages/whisper/transcribe.pydecode_with_fallbackf   s(   z(transcribe.<locals>.decode_with_fallbackr   initial_prompt startendtext_tokensresultc                    s    fdd|D }t| dkrd S  t | ||| |j|j|j|jd
 rCt	dt
|  dt
| d|  d S d S )Nc                    s   g | ]	}| j k r|qS r3   )eot).0token)	tokenizerr3   rB   
<listcomp>   s    z3transcribe.<locals>.add_segment.<locals>.<listcomp>r   )
idseekrF   rG   texttokensr    r:   r9   no_speech_prob[z --> z] )r8   lenstripappendtolistr    r:   r9   rS   printr   )rF   rG   rH   rI   rQ   )all_segmentsrP   rM   r   r3   rB   add_segment   s&   &ztranscribe.<locals>.add_segmentframes)totalunitdisablepromptr   )rF   rG   rH   rI   g      ?)rQ   segmentsr)   )4gettorchfloat16float32devicecudais_availablewarningswarnr   is_multilingualrY   r   r	   todetect_languagemaxr   titler   Tensorr   r   dimsn_audio_ctxr
   r   r7   encoderV   extendr6   shapetqdmtensorrR   rS   r:   getimestamp_beginwhereadd_rU   itemrX   nonzeroflattenr    updatemindictr8   ))r%   r&   r   r    r!   r"   r#   r$   rA   dtypemelr.   _probsr)   r,   rC   input_stridetime_precision
all_tokensprompt_reset_sincerD   r[   
num_framesprevious_seek_valuepbartimestamp_offsetsegment_durationrI   rR   should_skiptimestamp_tokensconsecutive
last_slicecurrent_slicesliced_tokensstart_timestamp_positionend_timestamp_positionlast_timestamp_positionduration
timestampsr3   )	rZ   r!   rA   r"   r%   rP   r    rM   r   rB   r-      s   6





 



(

(


" Hr-   c               	   C   s  ddl m}  tjtjd}|jddtdd |jdd	|  d
d |jdtd dd |jdtj	 r3dnddd |jddtddd |jdt
ddd |jdtdddgdd |jd td tt td!d" t D  d#d |jd$td%d&d |jd'td(d)d |jd*td(d+d |jd,td d-d |jd.td d/d |jd0td1d2d |jd3td d4d |jd5t
dd6d |jd7t
dd8d |jd9td:d;d |jd<td=d>d |jd?td@dAd |jdBtdCdDd |jdEtd%dFd | j}|dG}|dH}|dI}|dJ}tj|ddK |dLr3|dM dNvr3|dM d ur/t| dO|dM  dP dQ|dM< |dR}|dS}|d urLtt|dT|}n|g}|dU}	|	d%kr^t|	 ddVl m}
 |
|||dW}|dD ]}t||fdR|i|}tj !|}t"tj #||dX dYdZd[}t$|d\ |d] W d    n	1 sw   Y  t"tj #||d^ dYdZd[}t%|d\ |d] W d    n	1 sw   Y  t"tj #||d_ dYdZd[}t&|d\ |d] W d    n	1 sw   Y  qpd S )`Nr   )available_models)formatter_classr&   +zaudio file(s) to transcribe)nargstypehelpz--modelsmallz name of the Whisper model to use)defaultchoicesr   z--model_dirz>the path to save model files; uses ~/.cache/whisper by default)r   r   r   z--devicerh   r(   z#device to use for PyTorch inference)r   r   z--output_dirz-o.zdirectory to save the outputsz	--verboseTz4whether to print out the progress and debug messagesz--taskr-   	translatezawhether to perform X->X speech recognition ('transcribe') or X->English translation ('translate'))r   r   r   r   z
--languagec                 S   s   g | ]}|  qS r3   )rp   )rK   kr3   r3   rB   rN     s    zcli.<locals>.<listcomp>zHlanguage spoken in the audio, specify None to perform language detectionz--temperaturer   ztemperature to use for samplingz	--best_of   z<number of candidates when sampling with non-zero temperaturez--beam_sizezHnumber of beams in beam search, only applicable when temperature is zeroz
--patiencezoptional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam searchz--length_penaltyzoptional token length penalty coefficient (alpha) as in https://arxiv.org/abs/1609.08144, uses simple length normalization by defaultz--suppress_tokensz-1zcomma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuationsz--initial_promptz:optional text to provide as a prompt for the first window.z--condition_on_previous_textzif True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loopz--fp16z5whether to perform inference in fp16; True by defaultz#--temperature_increment_on_fallbackr   zhtemperature to increase when falling back when the decoding fails to meet either of the thresholds belowz--compression_ratio_thresholdr   zUif the gzip compression ratio is higher than this value, treat the decoding as failedz--logprob_thresholdr   zUif the average log probability is lower than this value, treat the decoding as failedz--no_speech_thresholdr   zif the probability of the <|nospeech|> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silencez	--threadsz]number of threads used by torch for CPU inference; supercedes MKL_NUM_THREADS/OMP_NUM_THREADSr%   	model_dir
output_dirrg   )exist_okz.enr)   >   r*   Englishz) is an English-only model but receipted 'z'; using English instead.r*   r    !temperature_increment_on_fallbackgzo ?threads)
load_model)rg   download_rootz.txtwzutf-8)encodingrb   )filez.vttz.srt)' r   argparseArgumentParserArgumentDefaultsHelpFormatteradd_argumentstrrd   rh   ri   r   sortedr   keysr   r6   r   r   
parse_args__dict__r7   osmakedirsendswithrj   rk   tuplenparangeset_num_threadsr   r-   pathbasenameopenjoinr   r   r   )r   parserargs
model_namer   r   rg   r    r   r   r   r%   
audio_pathrI   audio_basenametxtvttsrtr3   r3   rB   cli   st   2










r   __main__)-r   r   rj   typingr   r   r   r   r   numpyr   rd   rw   r&   r   r	   r
   r   r   decodingr   r   rM   r   r   r   utilsr   r   r   r   r   r   r   r   r%   r   r   ndarrayrq   boolr6   r-   r   __name__r3   r3   r3   rB   <module>   sR    (	
 gK
