o
    i                  	   @   s  d dl Z d dlZd dlmZmZmZ d dlmZ d dlm	Z	m
Z
 d dlZd dlZd dlmZ d dlmZmZmZ d dlmZ d dlmZ d d	lmZmZmZmZ d d
lmZ d dlmZ d dl m!Z!m"Z"m#Z# d dl$m%Z% d dl&m'Z' d dl(m)Z)m*Z*m+Z+ d dl,m-Z-m.Z.m/Z/ d dl0m1Z1 d dl2m3Z3 d dl4m5Z5 d dl6m7Z7 d dl8m9Z9 d dl:m;Z; d dl<m=Z=m>Z>m?Z? d dl@mAZAmBZB d dlCmDZDmEZEmFZFmGZGmHZH d dlImJZJ d dlKmLZL d dlMmNZNmOZO d dlPmQZQ d dlRmSZS dd lTmUZUmVZVmWZW dd!lXmYZYmZZZm[Z[m\Z\m]Z] e%e^Z_G d"d# d#e j`ZaG d$d% d%eNZbG d&d' d'e+ZcG d(d) d)ejdZeG d*d+ d+ejfZgG d,d- d-egZhG d.d/ d/ejfZiG d0d1 d1ejfZjG d2d3 d3ejfZkG d4d5 d5ejfZled d6d7d8G d9d: d:ejfZmG d;d< d<ejfZnG d=d> d>eEZoG d?d@ d@eDeo ZpG dAdB dBeFeo Zqe;jreqeoepdCG dDdE dEejfeWeVZsdFeeteuejvf  dGeudHeeteuejvf  fdIdJZwdS )K    N)IterableMappingSequence)nullcontext)	AnnotatedLiteral)nn)BatchFeatureWhisperConfigWhisperFeatureExtractor)	sinusoids)support_torch_compile)CacheConfigModelConfigSpeechToTextConfig
VllmConfig)BaseDummyOptions)$get_tensor_model_parallel_world_size)ExplicitEncoderDecoderPrompt
PromptType
TextPrompt)init_logger)
get_act_fn)	AttentionCrossAttentionMMEncoderAttention)ColumnParallelLinearQKVParallelLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)ParallelLMHead)default_weight_loader)ISO639_1_SUPPORTED_LANGS)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)MultiModalDataItemsMultiModalDataParser)BaseDummyInputsBuilderBaseProcessingInfoEncDecMultiModalProcessorPromptReplacementPromptUpdate)cached_processor_from_config)json_map_leaves)TensorSchemaTensorShape)set_default_torch_dtype)AttentionType   )MultiModalEmbeddingsSupportsMultiModalSupportsTranscription)AutoWeightsLoaderWeightsMappercast_overflow_tensorsmake_layersmaybe_prefixc                   @   s   e Zd ZdZdZdZdS )WhisperPosEmbedType
sinusoidalropelearnedN)__name__
__module____qualname__
SINUSOIDALROPELEARNED rH   rH   X/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/model_executor/models/whisper.pyr>   N   s    r>   c                   @   s4   e Zd ZU dZeeej dB edddf e	d< dS )WhisperAudioInputszl
    Dimensions:
        - b: Batch size
        - nmb: Number of mel bins
        - t: Time frames (M)
    Nbnmbtinput_features)
rB   rC   rD   __doc__r   listtorchTensorr2   __annotations__rH   rH   rH   rI   rJ   T   s   
 
rJ   c                       s:   e Zd ZdZdejdejdejdejf fddZ  ZS )WhisperEncoderAttentionzBMulti-headed attention for Whisper encoder with 2D tensor support.querykeyvaluereturnc                    sP   |  dk}|r|d}|d}|d}t |||}|r&|d}|S )zo
        Input shape: batch_size x seq_len x hidden_size
                     or seq_len x hidden_size
           r   )dim	unsqueezesuperforwardsqueeze)selfrU   rV   rW   is_2dout	__class__rH   rI   r]   e   s   




zWhisperEncoderAttention.forward)rB   rC   rD   rO   rQ   rR   r]   __classcell__rH   rH   rb   rI   rT   b   s    rT   c                       s.   e Zd Zdedef fddZdd Z  ZS )WhisperPositionalEmbeddingnum_positionsembedding_dimc                    s   t  || d S N)r\   __init__)r_   rf   rg   rb   rH   rI   ri      s   z#WhisperPositionalEmbedding.__init__c                 C   s
   | j | S rh   )weight)r_   position_idsrH   rH   rI   r]         
z"WhisperPositionalEmbedding.forward)rB   rC   rD   intri   r]   rd   rH   rH   rb   rI   re   ~   s    re   c                       s   e Zd ZdejddddfdedededededB d	edB d
edB de	f fddZ
			ddeded
edB de	ddf
ddZdejfddZ  ZS )WhisperAttentionTN 	embed_dim	num_headsbias	attn_typeper_layer_sliding_windowcache_configquant_configprefixc	           
         s  t    || _t }	|| _| j|	 dksJ | j|	 | _| j|	kr,| j|	 dks+J n	|	| j dks5J td| j|	 | _| j| j | _| j| j | _	| j| j | _
|| _| j| | jkrjtd| j d| d| jd | _| j||||d t||||| dd	| _|tjkrt| j| j| j| jd
| _d S | jtjkrt| j| j| j| j||| d| jd| _d S t| j| j| j| j||| d| j|d	| _d S )Nr   r5   z;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      ࿩rw   z	.out_proj
input_sizeoutput_sizerr   rv   rw   )num_kv_headsz.attn)r|   ru   rv   rw   rs   )r|   ru   rv   rw   rs   rt   )r\   ri   rp   r   total_num_headsrq   maxr|   head_dimq_sizekv_sizers   
ValueErrorscaling	_init_qkvr   out_projr4   ENCODERrT   attnENCODER_DECODERr   r   )
r_   rp   rq   rr   rs   rt   ru   rv   rw   tp_sizerb   rH   rI   ri      sx   


zWhisperAttention.__init__rX   c              	   C   s(   t || j| j| j||| dd| _d S )Nz	.qkv_projhidden_size	head_sizer}   total_num_kv_headsrr   rv   rw   )r   r   r}   qkv_projr_   rp   rr   rv   rw   rH   rH   rI   r      s   zWhisperAttention._init_qkvhidden_statesc           	      C   sN   |  |\}}|j| j| j| jgdd\}}}| |||}| |\}}|S NrZ   )r   splitr   r   r   r   )	r_   r   qkv_qkvattn_outputoutputrH   rH   rI   r]      s
    zWhisperAttention.forwardTNro   )rB   rC   rD   r4   DECODERrm   boolr   r    strri   r   rQ   rR   r]   rd   rH   rH   rb   rI   rn      sR    	P
rn   c                       s   e Zd Z				ddededededB dedB d	ef fd
dZ			ddedededB d	eddf
ddZ	de
jde
jdB fddZ  ZS )WhisperCrossAttentionTNro   rp   rq   rr   ru   rv   rw   c              	      s    t  j||||||tjd d S )N)rp   rq   rr   ru   rv   rw   rs   )r\   ri   r4   r   )r_   rp   rq   rr   ru   rv   rw   rb   rH   rI   ri      s   	
zWhisperCrossAttention.__init__rX   c              	   C   s@   t ||||| dd| _t|| jd| j||| dd| _d S )Nz.q_projry   r   z.kv_projr   )r   q_projr   r   r}   kv_projr   rH   rH   rI   r     s    zWhisperCrossAttention._init_qkvr   encoder_hidden_statesc           
      C   sh   |  |\}}|d ur | |\}}|j| j| jgdd\}}nd  }}| |||}| |\}	}|	S r   )r   r   r   r   r   r   )
r_   r   r   r   r   kvr   r   r   r   rH   rH   rI   r]     s   zWhisperCrossAttention.forward)TNNro   r   )rB   rC   rD   rm   r   r   r    r   ri   r   rQ   rR   r]   rd   rH   rH   rb   rI   r      sJ    
r   c                       sL   e Zd Z		ddededededB def
 fdd	Zd
ejfddZ	  Z
S )
WhisperMLPNro   rp   ffn_dimact_fnrv   rw   c                    sH   t    t|| _t|||| dd| _t|||| dd| _d S )Nz.fc1)rz   r{   rv   rw   z.fc2)r\   ri   r   activation_fnr   fc1r   fc2)r_   rp   r   r   rv   rw   rb   rH   rI   ri   6  s   

zWhisperMLP.__init__r   c                 C   s*   |  |\}}| |}| |\}}|S rh   )r   r   r   )r_   r   r   rH   rH   rI   r]   N  s   
zWhisperMLP.forward)Nro   )rB   rC   rD   rm   r   r    ri   rQ   rR   r]   rd   rH   rH   rb   rI   r   5  s    r   c                       s<   e Zd Zdddedef fddZdejfdd	Z  Z	S )
WhisperEncoderLayerro   rx   vllm_configrw   c             	      s   t    |jj}t|dd }|j}|j}|j| _t	| j|j
tj|||| dd| _t| j| _t|j|j|j|| dd| _t| j| _d S )Nsliding_window
.self_attn)rp   rq   rs   rt   ru   rv   rw   .mlprp   r   r   rv   rw   )r\   ri   model_config	hf_configgetattrru   rv   d_modelrp   rn   encoder_attention_headsr4   r   	self_attnr   	LayerNormself_attn_layer_normr   encoder_ffn_dimactivation_functionmlpfinal_layer_norm)r_   r   rw   configr   ru   rv   rb   rH   rI   ri   V  s0   
	zWhisperEncoderLayer.__init__r   c                 C   sN   |}|  |}| j|d}|| }|}| |}| |}|| }t|}|S )Nr   )r   r   r   r   r;   )r_   r   residualrH   rH   rI   r]   q  s   


zWhisperEncoderLayer.forward
rB   rC   rD   r   r   ri   rQ   rR   r]   rd   rH   rH   rb   rI   r   U  s
    r   c                       sF   e Zd Zdddedef fddZdejdejd	B fd
dZ  Z	S )WhisperDecoderLayerro   rx   r   rw   c                   s   t    |jj}|j}|j}t|j|jt	j
||| dd| _t|j| _t|j|j||| dd| _t|j| _t|j|j|j|| dd| _t|j| _d S )Nr   )rp   rq   rs   ru   rv   rw   z.encoder_attn)rp   rq   ru   rv   rw   r   r   )r\   ri   r   r   ru   rv   rn   r   decoder_attention_headsr4   r   r   r   r   r   r   encoder_attnencoder_attn_layer_normr   decoder_ffn_dimr   r   r   )r_   r   rw   r   ru   rv   rb   rH   rI   ri     s:   
zWhisperDecoderLayer.__init__r   r   Nc                 C   sj   |}|  |}| j|d}|| }|}| |}| j||d}|| }|}| |}| |}|| }|S )Nr   )r   r   )r   r   r   r   r   r   )r_   r   r   r   rH   rH   rI   r]     s    



zWhisperDecoderLayer.forwardr   rH   rH   rb   rI   r     s     r   c                       sR   e Zd Zddddededef fddZd	eje	ej B d
ejfddZ
  ZS )WhisperEncoderro   F)rw   init_in_fp32r   rw   r   c             	      s  t     jj}|j}tt|dd| _|j| _|j	| _	|j
r%t|nd| _tj| j|ddd| _tj||dddd| _| jjd	 | jjd	  | _t|j fd
d| dd\| _| _| _t|j| _| jtjtjfvrxtd| j |rttj nt! }t" 7 | t#| j	|| _$| j$j%&t'| j$j%j(  W d    n1 sw   Y  W d    d S W d    d S 1 sw   Y  d S )N	pos_embedr?         ?   r5   )kernel_sizepaddingrY   )strider   r   r   c                       t  |  ddS N.layersr   rw   )r   rx   r   rH   rI   <lambda>      
z)WhisperEncoder.__init__.<locals>.<lambda>r   rx   z\Only sinusoidal or learned position embeddings are supported for non-causal models, but got ))r\   ri   r   r   r   r>   r   pos_embed_typenum_mel_binsmax_source_positionsscale_embeddingmathsqrtembed_scaler   Conv1dconv1conv2r   total_strider<   encoder_layersstart_layer	end_layerlayersr   
layer_normrE   rG   r   r3   rQ   float32r   no_grad	Embeddingembed_positionsrj   copy_r   shape)r_   r   rw   r   r   rp   maybe_fp32_init_ctxrb   r   rI   ri     sL   


PzWhisperEncoder.__init__rN   rX   c                 C   s   g }d}|D ]9}t j| |}t j| |}|dd}|| jjd |dd d f  	|j
}|| |jdk}q|rHt|}ntj|dd}| jD ]}||}qR| |}|S )NFr   rY   r   r   )r   
functionalgelur   r   	transposer   rj   sizetodtypeappendndimrQ   catstackr   r   )r_   rN   r   input_is_batchedfeaturesembedsencoder_layerrH   rH   rI   r]     s$   "



zWhisperEncoder.forward)rB   rC   rD   r   r   r   ri   rQ   rR   rP   r]   rd   rH   rH   rb   rI   r     s    1r   r   )	input_ids	positions)dynamic_arg_dimsc                       s\   e Zd Zdddedef fddZdejdejd	B fd
dZdejdejfddZ	  Z
S )WhisperDecoderro   rx   r   rw   c                   s   t     jj}|j| _|j| _|j| _|j	| _	|j
r"t|jnd| _t|j|j| j| _t| j|j| _t|j fdd| dd\| _| _| _t|j| _d S )Nr   c                    r   r   )r   rx   r   rH   rI   r   !  r   z)WhisperDecoder.__init__.<locals>.<lambda>r   rx   )r\   ri   r   r   decoder_layerdrop	layerdroppad_token_idpadding_idxmax_target_positionsr   r   r   r   r   r   r   r   
vocab_sizeembed_tokensre   r   r<   decoder_layersr   r   r   r   r   )r_   r   rw   r   rb   r   rI   ri     s&   

zWhisperDecoder.__init__r   r   Nc                 C   sB   |  |}| |}|| }| jD ]}|||d}q| |}|S )N)r   )embed_input_idsr   r   r   )r_   r   r   r   inputs_embedsr   decoder_layerrH   rH   rI   r]   (  s   



zWhisperDecoder.forwardr   rX   c                 C   s
   |  |S rh   )r  )r_   r   rH   rH   rI   r  ;  rl   zWhisperDecoder.embed_input_ids)rB   rC   rD   r   r   ri   rQ   rR   r]   r  rd   rH   rH   rb   rI   r     s    
r   c                       s   e Zd Zdddedef fddZdejdB d	ejd
eej dejfddZ	dejeej B dB dejdB fddZ
deeeejf  dee fddZ  ZS )WhisperModelro   rx   r   rw   c                   s6   t    t|| dd| _t|| dd| _d S )Nz.encoderr   z.decoder)r\   ri   r   encoderr   decoder)r_   r   rw   rb   rH   rI   ri   @  s   


zWhisperModel.__init__r   Nr   encoder_outputsrX   c                 C   s.   t |rtj|ddnd }| j|||d}|S )Nr   r   )r   r   r   )lenrQ   r   r  )r_   r   r   r  
enc_statesdecoder_outputsrH   rH   rI   r]   I  s   zWhisperModel.forwardrN   c                 C   s   |d u rd S |  |S rh   )r  )r_   rN   rH   rH   rI   get_encoder_outputsW  s   
z WhisperModel.get_encoder_outputsweightsc                 C   s   g d}t |  }t }|D ]M\}}|D ](\}}}	||vrq|||}|dr/||vr/q|| }
|
j}||
||	  n|drH||vrHq|| }
t|
dt}||
| || q|S )N)).self_attn.qkv_projz.self_attn.q_projr   )r  z.self_attn.k_projr   )r  z.self_attn.v_projr   ).encoder_attn.kv_projz.encoder_attn.k_projr   )r  z.encoder_attn.v_projr   z.biasweight_loader)	dictnamed_parameterssetreplaceendswithr  r   r"   add)r_   r  stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
param_nameweight_nameshard_idparamr  rH   rH   rI   load_weights_  s*   
zWhisperModel.load_weights)rB   rC   rD   r   r   ri   rQ   rR   rP   r]   r  r   tupler  r%  rd   rH   rH   rb   rI   r
  ?  s"    	

,r
  c                   @   s|   e Zd ZdefddZdd ZedefddZde	e
edB f fd	d
ZdedefddZdefddZdefddZdS )WhisperProcessingInforX   c                 C   s   | j tS rh   )ctxget_hf_configr
   r_   rH   rH   rI   r)    s   z#WhisperProcessingInfo.get_hf_configc                 C   s    |   }t|j|  |  dS )N)	target_srtarget_channelsexpected_hidden_size)get_feature_extractorr)   sampling_rateget_target_channels_get_expected_hidden_size)r_   feature_extractorrH   rH   rI   get_data_parser  s   z%WhisperProcessingInfo.get_data_parserc                 C      dS )NTrH   r*  rH   rH   rI   skip_prompt_length_check     z.WhisperProcessingInfo.skip_prompt_length_checkNc                 C   s   ddiS )Naudior5   rH   r*  rH   rH   rI   get_supported_mm_limits  s   z-WhisperProcessingInfo.get_supported_mm_limitskwargsc                 K   s(   | j di |}|j}t|tsJ |S )NrH   )get_hf_processorr2  
isinstancer   )r_   r9  hf_processorr2  rH   rH   rI   r.    s   z+WhisperProcessingInfo.get_feature_extractorc                 C   r4  )z7Return target audio channels for Whisper models (mono).r5   rH   r*  rH   rH   rI   r0    r6  z)WhisperProcessingInfo.get_target_channelsc                 C   s
   |   jS rh   )r)  r   r*  rH   rH   rI   get_num_audio_tokens  rl   z*WhisperProcessingInfo.get_num_audio_tokens)rB   rC   rD   r
   r)  r3  propertyr   r5  r   r   rm   r8  objectr   r.  r0  r=  rH   rH   rH   rI   r'    s    	r'  c                	   @   sX   e Zd Zdeeef defddZ	d
dedeeef deeef dB defdd	Z	dS )WhisperDummyInputsBuilder	mm_countsrX   c                 C   s   | dd}d| S )Nr7  r   z<|startoftranscript|>)get)r_   rA  
num_audiosrH   rH   rI   get_dummy_text  s   z(WhisperDummyInputsBuilder.get_dummy_textNseq_len
mm_optionsc           	      C   sL   | j  }|j}|j| }|dd}|r|dnd }d| j|||diS )Nr7  r   )lengthrC  	overrides)infor.  r/  chunk_lengthrB  _get_dummy_audios)	r_   rE  rA  rF  r2  r/  	audio_lenrC  audio_overridesrH   rH   rI   get_dummy_mm_data  s   

z+WhisperDummyInputsBuilder.get_dummy_mm_datarh   )
rB   rC   rD   r   r   rm   rD  r   r%   rN  rH   rH   rH   rI   r@    s    	
r@  c                
       s   e Zd Zdeee B dedeee B fddZdedeee	f deee	f deee	f de
f
 fd	d
Zde
deee	f deeef fddZdedeee	f dedee fddZ  ZS )WhisperMultiModalProcessorpromptmm_itemsrX   c                 C   s   dgS )Nr   rH   )r_   rP  rQ  rH   rH   rI   create_encoder_prompt  s   	z0WhisperMultiModalProcessor.create_encoder_promptmm_data	mm_kwargs
tok_kwargsc                    s~   |r| j jd	i |}t|dd}td	i |d|ji}dd | D }t j||||d}d|v r=|d|d< |S )
Naudios)r7  r/  c                 S   s   i | ]\}}|d vr||qS ))
truncation
max_lengthrH   ).0r   r   rH   rH   rI   
<dictcomp>  s    zAWhisperMultiModalProcessor._call_hf_processor.<locals>.<dictcomp>)rP  rS  rT  rU  labelsr   rH   )rI  r.  r  popr/  itemsr\   _call_hf_processor)r_   rP  rS  rT  rU  r2  processed_outputsrb   rH   rI   r^    s(   	z-WhisperMultiModalProcessor._call_hf_processor	hf_inputshf_processor_mm_kwargsc                 C   s   t tddS )Nr7  rN   )r  r&   batched)r_   r`  ra  rH   rH   rI   _get_mm_fields_config  s   z0WhisperMultiModalProcessor._get_mm_fields_configout_mm_kwargsc                 C   s"   | j  }tddgdg| dgS )Nr7  r   )modalitytargetreplacement)rI  r=  r-   )r_   rQ  ra  re  
num_tokensrH   rH   rI   _get_prompt_updates  s   
z.WhisperMultiModalProcessor._get_prompt_updates)rB   rC   rD   r   rP   rm   r(   rR  r   r?  r	   r^  r&   rd  r'   r   r.   rj  rd   rH   rH   rb   rI   rO    sF    





 



rO  )rI  dummy_inputsc                       s  e Zd Zg dddgdZeddddZd	Zd	ZeZ	e
d
edB dedB f fddZe
dejdeded
edB ded dededB defddZe
dedededB fddZe
dededefddZe
dededededB fdd Zd!d"d#ed$ef fd%d&Z	d>d'ejd(ejd)eej dB dejfd*d+Zd,ede fd-d.Z!	d>dd/d0d'ejd1e dB d2ejdB d3e"dejf
d4d5Z#d,ede$fd6d7Z%d8ejdejfd9d:Z&d;e'e(eejf  de)e fd<d=Z*  Z+S )?WhisperForConditionalGeneration)zself_attn.q_projzself_attn.k_projzself_attn.v_projzencoder_attn.k_projzencoder_attn.v_proj)zself_attn.qkv_projzencoder_attn.kv_projz	.mlp.fc1.z	.mlp.fc2.)z.fc1.z.fc2.)orig_to_new_substrTlanguageNrX   c                    s"   |d u rt d d}t |S )NzDefaulting to language='en'. If you wish to transcribe audio in a different language, pass the `language` field in the TranscriptionRequest.en)loggerwarningr\   validate_language)clsrn  rb   rH   rI   rr    s   z1WhisperForConditionalGeneration.validate_languager7  r   
stt_config	task_type)
transcribe	translaterequest_promptto_languagec           	      C   sX   |d u rt d|rd| ndd| d| d }ttdd||jfidt|d	d
S )Nz;Language must be specified when creating the Whisper promptz<|prev|>ro   z<|startoftranscript|><|z|><|z|><|notimestamps|>r7  )rP  multi_modal_data)rP  )encoder_promptdecoder_prompt)r   r   r   sample_rate)	rs  r7  r   rt  rn  ru  rx  ry  decoder_textrH   rH   rI   get_generation_prompt+  s   z5WhisperForConditionalGeneration.get_generation_promptrf  ic                 C   s   | drd S td)Nr7  z Only audio modality is supported)
startswithr   )rs  rf  r  rH   rH   rI   get_placeholder_strG  s   
z3WhisperForConditionalGeneration.get_placeholder_strc                 C   s   t |}t|jj|jjdS )N)max_audio_clip_sr}  )r/   r   r2  rJ  r/  )rs  r   ru  	processorrH   rH   rI   get_speech_to_text_configN  s
   z9WhisperForConditionalGeneration.get_speech_to_text_configaudio_duration_sc                 C   s0   t |}|jj}|d usJ t||j | S rh   )r/   r2  
hop_lengthr   ceilr}  )rs  r  rt  r   r  r  rH   rH   rI   r=  Y  s   z4WhisperForConditionalGeneration.get_num_audio_tokensro   rx   r   rw   c                   s   t    |jj}|j}|| _|jj| _| j|tdt	id t
||d| _W d    n1 s0w   Y  t|j|j|t|dd| _| j| jjj| _t|dd}t|j|d| _d S )	Nr7  )language_targetstower_targetsr   proj_out)rv   rw   logit_scaler   )scale)r\   ri   r   r   rv   r   r   _mark_composite_modelr   r   r
  modelr!   r  r   r=   r  tie_weightsr  r  r   r   logits_processor)r_   r   rw   r   rv   r  rb   rH   rI   ri   i  s*   

z(WhisperForConditionalGeneration.__init__r   r   r  c                 K   s    |d u rg }| j |||d}|S )N)r   r   r  )r  )r_   r   r   r  r9  r  rH   rH   rI   r]     s   z'WhisperForConditionalGeneration.forwardr9  c                 K   s,   | j di |}| j|d }|jddS )NrN   r   r   rH   )_parse_and_validate_audio_inputr  r  unbind)r_   r9  audio_input
enc_outputrH   rH   rI   embed_multimodal  s   z0WhisperForConditionalGeneration.embed_multimodalF)is_multimodalhandle_oov_mm_tokenmultimodal_embeddingsr  r  c                C   s   | j j|S rh   )r  r  r  )r_   r   r  r  r  rH   rH   rI   r    s   
z/WhisperForConditionalGeneration.embed_input_idsc                    s0   | dd }|d urt fdd|}t|dS )NrN   c                    s   |   jS rh   )r   r   )xr*  rH   rI   r     s    zQWhisperForConditionalGeneration._parse_and_validate_audio_input.<locals>.<lambda>rb  )r\  r0   rJ   )r_   r9  rN   rH   r*  rI   r    s   
z?WhisperForConditionalGeneration._parse_and_validate_audio_inputr   c                 C   s   |  | j|}|S rh   )r  r  )r_   r   logitsrH   rH   rI   compute_logits  s   z.WhisperForConditionalGeneration.compute_logitsr  c                 C   s(   t | dgd}t|d}|j|| jdS )Nz	proj_out.)skip_prefixesz.k_proj.weight)mapper)r9   _create_fake_bias_for_k_projr%  hf_to_vllm_mapper)r_   r  loaderrH   rH   rI   r%    s   
z,WhisperForConditionalGeneration.load_weightsrh   ),rB   rC   rD   packed_modules_mappingr:   r  supports_transcription_onlysupports_segment_timestampr#   supported_languagesclassmethodr   rr  npndarrayr   r   r   r   r  rm   r  r  floatr=  r   ri   rQ   rR   rP   r]   r?  r6   r  r   r  rJ   r  r  r   r&  r  r%  rd   rH   rH   rb   rI   rl    s    		 	


,rl  r  fake_bias_key_namerX   c                 c   sZ    | D ]'\}}| |r%t|d}|dd}||f||fgE dH  ||fV  qdS )z
    Create full zeros bias for k_proj weight in self-attn and x-attn layers.
    So that the bias for k_proj in qkv_proj can be initialized with zeros.
    r   rj   rr   N)r  rQ   zerosr   r  )r  r  r  rj   rr   	bias_namerH   rH   rI   r    s   
r  )xenumr   collections.abcr   r   r   
contextlibr   typingr   r   numpyr  rQ   r   transformersr	   r
   r   ,transformers.models.whisper.modeling_whisperr   vllm.compilation.decoratorsr   vllm.configr   r   r   r   vllm.config.multimodalr   vllm.distributedr   vllm.inputs.datar   r   r   vllm.loggerr   %vllm.model_executor.layers.activationr   $vllm.model_executor.layers.attentionr   r   r   !vllm.model_executor.layers.linearr   r   r   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr    3vllm.model_executor.layers.vocab_parallel_embeddingr!   -vllm.model_executor.model_loader.weight_utilsr"   (vllm.model_executor.models.whisper_utilsr#   vllm.multimodalr$   vllm.multimodal.inputsr%   r&   r'   vllm.multimodal.parser(   r)   vllm.multimodal.processingr*   r+   r,   r-   r.   !vllm.transformers_utils.processorr/   vllm.utils.jsontreer0   vllm.utils.tensor_schemar1   r2   vllm.utils.torch_utilsr3   vllm.v1.attention.backendr4   
interfacesr6   r7   r8   utilsr9   r:   r;   r<   r=   rB   rp  Enumr>   rJ   rT   r   re   Modulern   r   r   r   r   r   r   r
  r'  r@  rO  register_processorrl  r&  r   rR   r  rH   rH   rH   rI   <module>   s   mB .;P0D"C
 2