o
    ۷iv                      @   s  d dl Z d dlZd dlZd dlZd dlmZmZmZmZm	Z	 d dl
Zd dlZd dlZddlmZmZmZmZmZmZmZ ddlmZmZ ddlmZ ddlmZmZmZ ddlm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z' erpdd	l(m)Z) dd
ddddddddddddddde	e*ej+ej,f dee- de	e.ee.df f dee. dee. dee. de-dee* de-d e-d!e*d"e*d#e	e*ee. f d$ee. fd%d&Z/d'd( Z0e1d)kre0  dS dS )*    N)TYPE_CHECKINGListOptionalTupleUnion   )FRAMES_PER_SECOND
HOP_LENGTHN_FRAMES	N_SAMPLESSAMPLE_RATElog_mel_spectrogrampad_or_trim)DecodingOptionsDecodingResult)add_word_timestamps)	LANGUAGESTO_LANGUAGE_CODEget_tokenizer)	exact_divformat_timestampget_end
get_writer	make_safeoptional_floatoptional_intstr2bool)Whisper)        皙?g?333333?g?      ?333333@      r    TF   "'“¿([{-   "'.。,，!！?？:：”)]}、0)verbosetemperaturecompression_ratio_thresholdlogprob_thresholdno_speech_thresholdcondition_on_previous_textinitial_promptcarry_initial_promptword_timestampsprepend_punctuationsappend_punctuationsclip_timestampshallucination_silence_thresholdmodelr   audior'   r(   .r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   c          S         s~	   ddr	tjntj}jtdkr+tj rtd |tjkr+td tj}|tjkr4dd< t	|j
jtd}|jd t }t|t t } d	d
d
u rjs\dd	< n3|rbtd t|tj|}|\}}t||j dd	< |d
urtdtd	     d	 } dd}tjj||dt|trdd |r|dng D }dd |D }t|dkr| d t|d dkr| | t!t"|d
d
d |dd
d }d|
r|dkrtd dtj#dt$f fdd}d}|| d t%tj
j&}|t t }g }g } d}!j
j'd d }"|d
urA(d |)  }#|*|# |"t|#8 }"ng }#d!td"td#tj#d$t$ffd%d&}$t+j+|d'|dud(G}%d)}&|t|k r|| \}'}(|'k rv|'|(kr|d7 }|t|k r|| d qbtt t })tt t t }*t,t| |( }+|d
d
|+ f }|+t t },t|tj|}|	rtt|#|!}-||-d
 |" d
 }.|#|. d*< n||!d
 d*< ||}/t-|/j.}0d
ur|/j/k}1d
ur|/j0krd}1|1r|+7 qb}2g }3d+t1dtfd,d-	dt2t1 dt3f	fd.d/}4d0t4t1 dt2t1 fd1d2}5|05j6}6|6d3d
 7 ddgk}7t8|6d
d |6dd
 @ d }8|89d t|8dkr|87 }9|7r{|9 t|0 d}:|9D ]0};|0|:|; }<|<d : j6 }=|<d : j6 }>|3 |$|)|=|  |)|>|  |<|/d4 |;}:q|7r|+7 nK|0|:d  : j6 }?|?| 7 n9|,}@|0|6; <  }At|Adkr|Ad : j6kr|Ad : j6 }?|?| }@|3 |$|)|)|@ |0|/d4 |+7 |
rt=|3||+|||&d5 |7s)t>|3}B|Bd
ur)|B|)kr)t?|Bt@ |d
ur|}C|7sUt>|3}B|Bd
urU|B|)krU|*|B }D|D|CkrQt?|Bt@ n|2|+ |5|3}E|Ed
urx|4|Erx|Ed! |) }F|F|Ckrx|2t?|Ft@  qb|&}GtAt|3D ]}H|3|H }I|Id6 sq|4|Ir|5|3|Hd d
 }J|Jd
ur|Jd6 d d! }Kn|)|, }K|Id! |G |Ckp|Id! |Ck p|Id! |) d7k }L|K|Id"  |Ckp|4|Jp|*|Id"  d7k }M|Lr|Mrt?t|)d |Id! t@ ||Id"  |Ck r|g |3|Hd
<  n|Id" }Gqt>|3}B|Bd
ur|B}&|r?|3D ]&}I|Id! |Id" |Id8 }N}O}Pd9tB|N d:tB|O d;|P }QttC|Q qtD|3D ]#\}R}I|Id! |Id" ksY|Id8 ) d<kred<|Id8< g |Id#< g |Id6< qC| *d=d tD|3t| d>D  |*d?d |3D  |r|/jEd@krt|}!|%Ft,||2  |t|k siW d
   n	1 sw   Y  t1G|t|#d
 | |dAS )Ba  
    Transcribe an audio file using Whisper

    Parameters
    ----------
    model: Whisper
        The Whisper model instance

    audio: Union[str, np.ndarray, torch.Tensor]
        The path to the audio file to open, or the audio waveform

    verbose: bool
        Whether to display the text being decoded to the console. If True, displays all the details,
        If False, displays minimal details. If None, does not display anything

    temperature: Union[float, Tuple[float, ...]]
        Temperature for sampling. It can be a tuple of temperatures, which will be successively used
        upon failures according to either `compression_ratio_threshold` or `logprob_threshold`.

    compression_ratio_threshold: float
        If the gzip compression ratio is above this value, treat as failed

    logprob_threshold: float
        If the average log probability over sampled tokens is below this value, treat as failed

    no_speech_threshold: float
        If the no_speech probability is higher than this value AND the average log probability
        over sampled tokens is below `logprob_threshold`, consider the segment as silent

    condition_on_previous_text: bool
        if True, the previous output of the model is provided as a prompt for the next window;
        disabling may make the text inconsistent across windows, but the model becomes less prone to
        getting stuck in a failure loop, such as repetition looping or timestamps going out of sync.

    word_timestamps: bool
        Extract word-level timestamps using the cross-attention pattern and dynamic time warping,
        and include the timestamps for each word in each segment.

    prepend_punctuations: str
        If word_timestamps is True, merge these punctuation symbols with the next word

    append_punctuations: str
        If word_timestamps is True, merge these punctuation symbols with the previous word

    initial_prompt: Optional[str]
        Optional text to provide as a prompt for the first window. This can be used to provide, or
        "prompt-engineer" a context for transcription, e.g. custom vocabularies or proper nouns
        to make it more likely to predict those word correctly.

    carry_initial_prompt: bool
        If carry_initial_prompt is True, `initial_prompt` is prepended to the prompt of each internal
        `decode()` call. If there is not enough context space at the start of the prompt, it is
        left-sliced to make space.

    decode_options: dict
        Keyword arguments to construct `DecodingOptions` instances

    clip_timestamps: Union[str, List[float]]
        Comma-separated list start,end,start,end,... timestamps (in seconds) of clips to process.
        The last end timestamp defaults to the end of the file.

    hallucination_silence_threshold: Optional[float]
        When word_timestamps is True, skip silent periods longer than this threshold (in seconds)
        when a possible hallucination is detected

    Returns
    -------
    A dictionary containing the resulting text ("text") and segment-level details ("segments"), and
    the spoken language ("language"), which is detected when `decode_options["language"]` is None.
    fp16Tcpuz2Performing inference on CPU when CUDA is availablez0FP16 is not supported on CPU; using FP32 insteadF)paddinglanguageNenz]Detecting language using up to the first 30 seconds. Use `--language` to specify the language)keyzDetected language: task
transcribe)num_languagesr:   r=   c                 S   s   g | ]}t |qS  )float.0tsr@   r@   H/home/ubuntu/vllm_env/lib/python3.10/site-packages/whisper/transcribe.py
<listcomp>   s    ztranscribe.<locals>.<listcomp>,c                 S   s   g | ]}t |t qS r@   )roundr   rB   r@   r@   rE   rF          r      r   u*   "'“¿([{-"'.。,，!！?？:：”)]}、	translatez:Word-level timestamps on translations may not be reliable.segmentreturnc                    s   t ttfr
gn}d }|D ]_}i }|dkr'|dd  |dd  n|dd  tdi |d|i}| |}d} d urK|j krKd}d urV|jk rVd}d urj|jkrjd urj|jk rjd}|so |S q|S )	Nr   	beam_sizepatiencebest_ofr(   FTr@   )	
isinstanceintrA   popr   decodecompression_ratioavg_logprobno_speech_prob)rL   temperaturesdecode_resulttkwargsoptionsneeds_fallback)r)   decode_optionsr*   r4   r+   r(   r@   rE   decode_with_fallback   s8   



z(transcribe.<locals>.decode_with_fallback startendtokensresultc              
      s@   |  }fdd|D } | ||||j|j|j|jd	S )Nc                    s   g | ]	}| j k r|qS r@   )eot)rC   token)	tokenizerr@   rE   rF      s    z3transcribe.<locals>.new_segment.<locals>.<listcomp>)	seekra   rb   textrc   r(   rV   rU   rW   )tolistrT   r(   rV   rU   rW   )ra   rb   rc   rd   text_tokens)rh   rg   r@   rE   new_segment   s   ztranscribe.<locals>.new_segmentframes)totalunitdisabler   promptwordc                 S   s`   |  dd}| d | d  }d}|dk r|d7 }|dk r$|d| d 7 }|d	kr.||d	 7 }|S )
Nprobabilityr   rb   ra   g333333?r!   g/$?          @)get)rr   rs   durationscorer@   r@   rE   word_anomaly_score<  s   z&transcribe.<locals>.word_anomaly_scorec                    sd   | d u s| d s
dS  fdd| d D }|d d }t fdd|D }|dkp1|d	 t|kS )
NwordsFc                    s   g | ]
}|d   vr|qS )rr   r@   rC   w)punctuationr@   rE   rF   K  s    z:transcribe.<locals>.is_segment_anomaly.<locals>.<listcomp>   c                 3   s    | ]} |V  qd S )Nr@   r{   )ry   r@   rE   	<genexpr>M  s    z9transcribe.<locals>.is_segment_anomaly.<locals>.<genexpr>   g{Gz?)sumlen)rL   rz   rx   )r}   ry   r@   rE   is_segment_anomalyH  s   z&transcribe.<locals>.is_segment_anomalysegmentsc                 S   s   t dd | D d S )Nc                 s   s    | ]	}|d  r|V  qdS )rz   Nr@   )rC   sr@   r@   rE   r   Q  s    z9transcribe.<locals>.next_words_segment.<locals>.<genexpr>)next)r   r@   r@   rE   next_words_segmentP  s   z&transcribe.<locals>.next_words_segment)ra   rb   rc   rd   )r   r4   rg   mel
num_framesr0   r1   last_speech_timestamprz   ru   ri   [z --> z]  c                 S   s   g | ]
\}}d |i|qS )idr@   )rC   irL   r@   r@   rE   rF     s    
)ra   c                 S   s   g | ]}|d  D ]}|qqS )rc   r@   )rC   rL   rf   r@   r@   rE   rF     s    g      ?)ri   r   r:   )Hrv   torchfloat16float32devicecudais_availablewarningswarnr   dimsn_melsr   shaper
   rA   r	   r   is_multilingualprintr   todetect_languagemaxr   titler   r?   rQ   strsplitr   appendlistzipTensorr   r   n_audio_ctx
n_text_ctxencodestripextendtqdmmintensorrc   rW   rV   dictr   boolr   getimestamp_beginrj   whereadd_itemnonzeroflattenr   r   rH   r   ranger   r   	enumerater(   updaterT   )Sr4   r5   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r^   dtyper   content_framescontent_durationmel_segment_probsr:   r=   seek_points
seek_clipsr_   clip_idxinput_stridetime_precision
all_tokensall_segmentsprompt_reset_sinceremaining_prompt_lengthinitial_prompt_tokensrl   pbarr   seek_clip_startseek_clip_endtime_offsetwindow_end_timesegment_sizesegment_durationnignoredremaining_promptrd   rc   should_skipprevious_seekcurrent_segmentsr   r   timestamp_tokenssingle_timestamp_endingconsecutiveslices
last_slicecurrent_slicesliced_tokensstart_timestamp_posend_timestamp_poslast_timestamp_posrw   
timestampslast_word_end	thresholdremaining_durationfirst_segmentgaphal_last_endsirL   next_segmenthal_next_startsilence_beforesilence_afterra   rb   ri   liner   r@   )
r)   r^   r*   r4   r+   r}   rh   r(   rg   ry   rE   r>   &   s  Y








"
"*








"











$  wr>   c                     s  ddl m fdd} tjtjd}|jddtdd	 |jd
d| dd |jdtd dd |jdtj	 r8dnddd |jddtddd |jddtdg ddd |jd t
d!d"d |jd#td$d$d%gd&d |jd'td tt td(d) t D  d*d |jd+td,d-d |jd.td/d0d |jd1td/d2d |jd3td d4d |jd5td d6d |jd7td8d9d |jd:td d;d |jd<t
d=d>d |jd?t
d!d@d |jdAt
d!dBd |jdCtdDdEd |jdFtdGdHd |jdItdJdKd |jdLtdMdNd |jdOt
d=dPd |jdQtdRdSd |jdTtdUdVd |jdWt
d=dXd |jdYtd dZd |jd[td d\d |jd]td d^d |jd_td,d`d |jdatdbdcd |jddtdedf | j  dg} dh} di} dj} dk}tj|d!dl |dmr dn dovr dn d urt| dp dn  dq dr dn<  ds} dt }d urtt|du|}n|g} dv }	d,krt|	 ddwl m}
 |
|||dx}t||}g dy} dz s|D ]} | r| d{| d| q d} r d~ std  d r d~ rtd  fdd|D } dD ]C}zt!||fds|i }|||fi | W q) t"yl } zt#$  t%d| dt&|j' dt|  W Y d }~q)d }~ww d S )Nr   available_modelsc                    s,   |   v st j| r| S td   d)Nzmodel should be one of z or path to a model checkpoint)ospathexists
ValueError)namer   r@   rE   valid_model_name  s
   zcli.<locals>.valid_model_name)formatter_classr5   +zaudio file(s) to transcribe)nargstypehelpz--modelturboz name of the Whisper model to use)defaultr   r   z--model_dirz>the path to save model files; uses ~/.cache/whisper by default)r   r   r   z--devicer   r7   z#device to use for PyTorch inference)r   r   z--output_dirz-o.zdirectory to save the outputsz--output_formatz-fall)txtvttsrttsvjsonr   zSformat of the output file; if not specified, all available formats will be produced)r   r   choicesr   z	--verboseTz4whether to print out the progress and debug messagesz--taskr>   rK   zawhether to perform X->X speech recognition ('transcribe') or X->English translation ('translate')z
--languagec                 S   s   g | ]}|  qS r@   )r   )rC   kr@   r@   rE   rF     s    zcli.<locals>.<listcomp>zHlanguage spoken in the audio, specify None to perform language detectionz--temperaturer   ztemperature to use for samplingz	--best_of   z<number of candidates when sampling with non-zero temperaturez--beam_sizezHnumber of beams in beam search, only applicable when temperature is zeroz
--patiencezoptional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam searchz--length_penaltyzoptional token length penalty coefficient (alpha) as in https://arxiv.org/abs/1609.08144, uses simple length normalization by defaultz--suppress_tokensz-1zcomma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuationsz--initial_promptz:optional text to provide as a prompt for the first window.z--carry_initial_promptFz{if True, prepend initial_prompt to every internal decode() call. May reduce the effectiveness of condition_on_previous_textz--condition_on_previous_textzif True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loopz--fp16z5whether to perform inference in fp16; True by defaultz#--temperature_increment_on_fallbackr   zhtemperature to increase when falling back when the decoding fails to meet either of the thresholds belowz--compression_ratio_thresholdr"   zUif the gzip compression ratio is higher than this value, treat the decoding as failedz--logprob_thresholdr#   zUif the average log probability is lower than this value, treat the decoding as failedz--no_speech_thresholdr    zif the probability of the <|nospeech|> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silencez--word_timestampszQ(experimental) extract word-level timestamps and refine the results based on themz--prepend_punctuationsr$   zNif word_timestamps is True, merge these punctuation symbols with the next wordz--append_punctuationsr%   zRif word_timestamps is True, merge these punctuation symbols with the previous wordz--highlight_wordszT(requires --word_timestamps True) underline each word as it is spoken in srt and vttz--max_line_widthze(requires --word_timestamps True) the maximum number of characters in a line before breaking the linez--max_line_countzJ(requires --word_timestamps True) the maximum number of lines in a segmentz--max_words_per_linezk(requires --word_timestamps True, no effect with --max_line_width) the maximum number of words in a segmentz	--threadsz]number of threads used by torch for CPU inference; supercedes MKL_NUM_THREADS/OMP_NUM_THREADSz--clip_timestampsr&   zcomma-separated list start,end,start,end,... timestamps (in seconds) of clips to process, where the last end timestamp defaults to the end of the filez!--hallucination_silence_thresholdz(requires --word_timestamps True) skip silent periods longer than this threshold (in seconds) when a possible hallucination is detected)r   r   r4   	model_dir
output_diroutput_formatr   )exist_okz.enr:   >   r;   Englishz) is an English-only model but receipted 'z'; using English instead.r;   r(   !temperature_increment_on_fallbackgzo ?threads)
load_model)r   download_root)highlight_wordsmax_line_countmax_line_widthmax_words_per_liner/   z--z  requires --word_timestamps Truer  r  z7--max_line_count has no effect without --max_line_widthr  z8--max_words_per_line has no effect with --max_line_widthc                    s   i | ]}|  |qS r@   )rS   )rC   arg)argsr@   rE   
<dictcomp>d  rI   zcli.<locals>.<dictcomp>z	Skipping z due to z: )(r   r   argparseArgumentParserArgumentDefaultsHelpFormatteradd_argumentr   r   r   r   r   sortedr   keysr   rA   r   r   
parse_args__dict__rS   r   makedirsendswithr   r   tuplenparangeset_num_threadsr  r   errorr>   	Exception	traceback	print_excr   r   __name__)r   parser
model_namer  r  r	  r   r(   	incrementr  r  r4   writerword_optionsoptionwriter_args
audio_pathrd   er@   )r  r   rE   cli  s   2












2r3  __main__)2r  r   r'  r   typingr   r   r   r   r   numpyr"  r   r   r5   r   r	   r
   r   r   r   r   decodingr   r   timingr   rg   r   r   r   utilsr   r   r   r   r   r   r   r   r4   r   r   ndarrayr   r   rA   r>   r3  r)  r@   r@   r@   rE   <module>   s    $	(	

   bi
