o
    -i<                  	   @   s  d dl Z d dlZd dlmZmZmZ d dlmZ d dlm	Z	m
Z
mZ d dlZd dlZd dlmZ d dlmZmZmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZmZmZ d dlm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z( d dl)m*Z* d dl+m,Z, d dl-m.Z.m/Z/m0Z0 d dl1m2Z2 d dl3m4Z4 d dl5m6Z6 d dl7m8Z8 d dl9m:Z: d dl;m<Z< d dl=m>Z>m?Z?m@Z@ d dlAmBZBmCZC d dlDmEZEmFZFmGZGmHZHmIZI d dlJmKZK d dlLmMZM d dlNmOZOmPZP d dlQmRZR d d lSmTZT d!d"lUmVZVmWZWmXZX d!d#lYmZZZm[Z[m\Z\m]Z]m^Z^ e&e_Z`G d$d% d%e jaZbG d&d' d'eOZcG d(d) d)e,ZdG d*d+ d+ejeZfG d,d- d-ejgZhG d.d/ d/ehZiG d0d1 d1ejgZjG d2d3 d3ejgZkG d4d5 d5ejgZlG d6d7 d7ejgZmed d8d9d:G d;d< d<ejgZnG d=d> d>ejgZoG d?d@ d@eFZpG dAdB dBeEep ZqG dCdD dDeGep Zre<jserepeqdEG dFdG dGejgeXeWZtdHeeuevejwf  dIevdJeeuevejwf  fdKdLZxdS )M    N)IterableMappingSequence)nullcontext)	AnnotatedLiteralcast)nn)BatchFeatureWhisperConfigWhisperFeatureExtractor)	sinusoids)	Attention)support_torch_compile)CacheConfigModelConfigSpeechToTextConfig
VllmConfig)BaseDummyOptions)$get_tensor_model_parallel_world_size)
PromptType)init_logger)
get_act_fn)CrossAttention)MMEncoderAttention)ColumnParallelLinearQKVParallelLinearRowParallelLinear)LogitsProcessor)QuantizationConfig)ParallelLMHead)default_weight_loader)ISO639_1_SUPPORTED_LANGS)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItems)MultiModalDataItemsMultiModalDataParser)BaseDummyInputsBuilderBaseProcessingInfoEncDecMultiModalProcessorPromptReplacementPromptUpdate)cached_processor_from_config)json_map_leaves)TensorSchemaTensorShape)set_default_torch_dtype)AttentionType   )MultiModalEmbeddingsSupportsMultiModalSupportsTranscription)AutoWeightsLoaderWeightsMappercast_overflow_tensorsmake_layersmaybe_prefixc                   @   s   e Zd ZdZdZdZdS )WhisperPosEmbedType
sinusoidalropelearnedN)__name__
__module____qualname__
SINUSOIDALROPELEARNED rG   rG   _/home/ubuntu/veenaModal/venv/lib/python3.10/site-packages/vllm/model_executor/models/whisper.pyr=   L   s    r=   c                   @   s4   e Zd ZU dZeeej dB edddf e	d< dS )WhisperAudioInputszl
    Dimensions:
        - b: Batch size
        - nmb: Number of mel bins
        - t: Time frames (M)
    Nbnmbtinput_features)
rA   rB   rC   __doc__r   listtorchTensorr1   __annotations__rG   rG   rG   rH   rI   R   s   
 
rI   c                       s:   e Zd ZdZdejdejdejdejf fddZ  ZS )WhisperEncoderAttentionzBMulti-headed attention for Whisper encoder with 2D tensor support.querykeyvaluereturnc                    sP   |  dk}|r|d}|d}|d}t |||}|r&|d}|S )zo
        Input shape: batch_size x seq_len x hidden_size
                     or seq_len x hidden_size
           r   )dim	unsqueezesuperforwardsqueeze)selfrT   rU   rV   is_2dout	__class__rG   rH   r\   c   s   




zWhisperEncoderAttention.forward)rA   rB   rC   rN   rP   rQ   r\   __classcell__rG   rG   ra   rH   rS   `   s    rS   c                       s.   e Zd Zdedef fddZdd Z  ZS )WhisperPositionalEmbeddingnum_positionsembedding_dimc                    s   t  || d S N)r[   __init__)r^   re   rf   ra   rG   rH   rh   }   s   z#WhisperPositionalEmbedding.__init__c                 C   s
   | j | S rg   )weight)r^   position_idsrG   rG   rH   r\         
z"WhisperPositionalEmbedding.forward)rA   rB   rC   intrh   r\   rc   rG   rG   ra   rH   rd   |   s    rd   c                       s   e Zd ZdejddddfdedededededB d	edB d
edB de	f fddZ
			ddeded
edB de	ddf
ddZdejfddZ  ZS )WhisperAttentionTN 	embed_dim	num_headsbias	attn_typeper_layer_sliding_windowcache_configquant_configprefixc	           
         s  t    || _t }	|| _| j|	 dksJ | j|	 | _| j|	kr,| j|	 dks+J n	|	| j dks5J td| j|	 | _| j| j | _| j| j | _	| j| j | _
|| _| j| | jkrjtd| j d| d| jd | _| j||||d t||||| dd	| _|tjkrt| j| j| j| jd
| _d S | jtjkrt| j| j| j| j||| d| jd| _d S t| j| j| j| j||| d| j|d	| _d S )Nr   r4   z;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      ࿩rv   z	.out_proj
input_sizeoutput_sizerq   ru   rv   )num_kv_headsz.attn)r{   rt   ru   rv   rr   )r{   rt   ru   rv   rr   rs   )r[   rh   ro   r   total_num_headsrp   maxr{   head_dimq_sizekv_sizerr   
ValueErrorscaling	_init_qkvr   out_projr3   ENCODERrS   attnENCODER_DECODERr   r   )
r^   ro   rp   rq   rr   rs   rt   ru   rv   tp_sizera   rG   rH   rh      sx   


zWhisperAttention.__init__rW   c              	   C   s(   t || j| j| j||| dd| _d S )Nz	.qkv_projhidden_size	head_sizer|   total_num_kv_headsrq   ru   rv   )r   r~   r|   qkv_projr^   ro   rq   ru   rv   rG   rG   rH   r      s   zWhisperAttention._init_qkvhidden_statesc           	      C   sN   |  |\}}|j| j| j| jgdd\}}}| |||}| |\}}|S NrY   )r   splitr   r   r   r   )	r^   r   qkv_qkvattn_outputoutputrG   rG   rH   r\      s
    zWhisperAttention.forwardTNrn   )rA   rB   rC   r3   DECODERrl   boolr   r   strrh   r   rP   rQ   r\   rc   rG   rG   ra   rH   rm      sR    	P
rm   c                       s   e Zd Z				ddededededB dedB d	ef fd
dZ			ddedededB d	eddf
ddZ	de
jde
jdB fddZ  ZS )WhisperCrossAttentionTNrn   ro   rp   rq   rt   ru   rv   c              	      s    t  j||||||tjd d S )N)ro   rp   rq   rt   ru   rv   rr   )r[   rh   r3   r   )r^   ro   rp   rq   rt   ru   rv   ra   rG   rH   rh      s   	
zWhisperCrossAttention.__init__rW   c              	   C   s@   t ||||| dd| _t|| jd| j||| dd| _d S )Nz.q_projrx   r   z.kv_projr   )r   q_projr   r~   r|   kv_projr   rG   rG   rH   r     s    zWhisperCrossAttention._init_qkvr   encoder_hidden_statesc           
      C   sh   |  |\}}|d ur | |\}}|j| j| jgdd\}}nd  }}| |||}| |\}	}|	S r   )r   r   r   r   r   r   )
r^   r   r   r   r   kvr   r   r   r   rG   rG   rH   r\     s   zWhisperCrossAttention.forward)TNNrn   r   )rA   rB   rC   rl   r   r   r   r   rh   r   rP   rQ   r\   rc   rG   rG   ra   rH   r      sJ    
r   c                       sL   e Zd Z		ddededededB def
 fdd	Zd
ejfddZ	  Z
S )
WhisperMLPNrn   ro   ffn_dimact_fnru   rv   c                    sH   t    t|| _t|||| dd| _t|||| dd| _d S )Nz.fc1)ry   rz   ru   rv   z.fc2)r[   rh   r   activation_fnr   fc1r   fc2)r^   ro   r   r   ru   rv   ra   rG   rH   rh   4  s   

zWhisperMLP.__init__r   c                 C   s*   |  |\}}| |}| |\}}|S rg   )r   r   r   )r^   r   r   rG   rG   rH   r\   L  s   
zWhisperMLP.forward)Nrn   )rA   rB   rC   rl   r   r   rh   rP   rQ   r\   rc   rG   rG   ra   rH   r   3  s    r   c                       s<   e Zd Zdddedef fddZdejfdd	Z  Z	S )
WhisperEncoderLayerrn   rw   vllm_configrv   c             	      s   t    |jj}t|dd }|j}|j}|j| _t	| j|j
tj|||| dd| _t| j| _t|j|j|j|| dd| _t| j| _d S )Nsliding_window
.self_attn)ro   rp   rr   rs   rt   ru   rv   .mlpro   r   r   ru   rv   )r[   rh   model_config	hf_configgetattrrt   ru   d_modelro   rm   encoder_attention_headsr3   r   	self_attnr	   	LayerNormself_attn_layer_normr   encoder_ffn_dimactivation_functionmlpfinal_layer_norm)r^   r   rv   configr   rt   ru   ra   rG   rH   rh   T  s0   
	zWhisperEncoderLayer.__init__r   c                 C   sN   |}|  |}| j|d}|| }|}| |}| |}|| }t|}|S )Nr   )r   r   r   r   r:   )r^   r   residualrG   rG   rH   r\   o  s   


zWhisperEncoderLayer.forward
rA   rB   rC   r   r   rh   rP   rQ   r\   rc   rG   rG   ra   rH   r   S  s
    r   c                       sF   e Zd Zdddedef fddZdejdejd	B fd
dZ  Z	S )WhisperDecoderLayerrn   rw   r   rv   c                   s   t    |jj}|j}|j}t|j|jt	j
||| dd| _t|j| _t|j|j||| dd| _t|j| _t|j|j|j|| dd| _t|j| _d S )Nr   )ro   rp   rr   rt   ru   rv   z.encoder_attn)ro   rp   rt   ru   rv   r   r   )r[   rh   r   r   rt   ru   rm   r   decoder_attention_headsr3   r   r   r	   r   r   r   encoder_attnencoder_attn_layer_normr   decoder_ffn_dimr   r   r   )r^   r   rv   r   rt   ru   ra   rG   rH   rh     s:   
zWhisperDecoderLayer.__init__r   r   Nc                 C   sj   |}|  |}| j|d}|| }|}| |}| j||d}|| }|}| |}| |}|| }|S )Nr   )r   r   )r   r   r   r   r   r   )r^   r   r   r   rG   rG   rH   r\     s    



zWhisperDecoderLayer.forwardr   rG   rG   ra   rH   r     s     r   c                       sR   e Zd Zddddededef fddZd	eje	ej B d
ejfddZ
  ZS )WhisperEncoderrn   F)rv   init_in_fp32r   rv   r   c             	      s  t     jj}|j}tt|dd| _|j| _|j	| _	|j
r%t|nd| _tj| j|ddd| _tj||dddd| _| jjd	 | jjd	  | _t|j fd
d| dd\| _| _| _t|j| _| jtjtjfvrxtd| j |rttj nt! }t" 7 | t#| j	|| _$| j$j%&t'| j$j%j(  W d    n1 sw   Y  W d    d S W d    d S 1 sw   Y  d S )N	pos_embedr>         ?   r4   )kernel_sizepaddingrX   )strider   r   r   c                       t  |  ddS N.layersr   rv   )r   rw   r   rG   rH   <lambda>      
z)WhisperEncoder.__init__.<locals>.<lambda>r   rw   z\Only sinusoidal or learned position embeddings are supported for non-causal models, but got ))r[   rh   r   r   r   r=   r   pos_embed_typenum_mel_binsmax_source_positionsscale_embeddingmathsqrtembed_scaler	   Conv1dconv1conv2r   total_strider;   encoder_layersstart_layer	end_layerlayersr   
layer_normrD   rF   r   r2   rP   float32r   no_grad	Embeddingembed_positionsri   copy_r   shape)r^   r   rv   r   r   ro   maybe_fp32_init_ctxra   r   rH   rh     sL   


PzWhisperEncoder.__init__rM   rW   c                 C   s   g }d}|D ]9}t j| |}t j| |}|dd}|| jjd |dd d f  	|j
}|| |jdk}q|rHt|}ntj|dd}| jD ]}||}qR| |}|S )NFr   rX   r   r   )r	   
functionalgelur   r   	transposer   ri   sizetodtypeappendndimrP   catstackr   r   )r^   rM   r   input_is_batchedfeaturesembedsencoder_layerrG   rG   rH   r\     s$   "



zWhisperEncoder.forward)rA   rB   rC   r   r   r   rh   rP   rQ   rO   r\   rc   rG   rG   ra   rH   r     s    1r   r   )	input_ids	positions)dynamic_arg_dimsc                       s\   e Zd Zdddedef fddZdejdejd	B fd
dZdejdejfddZ	  Z
S )WhisperDecoderrn   rw   r   rv   c                   s   t     jj}|j| _|j| _|j| _|j	| _	|j
r"t|jnd| _t|j|j| j| _t| j|j| _t|j fdd| dd\| _| _| _t|j| _d S )Nr   c                    r   r   )r   rw   r   rG   rH   r     r   z)WhisperDecoder.__init__.<locals>.<lambda>r   rw   )r[   rh   r   r   decoder_layerdrop	layerdroppad_token_idpadding_idxmax_target_positionsr   r   r   r   r   r   r	   r   
vocab_sizeembed_tokensrd   r   r;   decoder_layersr   r   r   r   r   )r^   r   rv   r   ra   r   rH   rh     s&   

zWhisperDecoder.__init__r   r   Nc                 C   sB   |  |}| |}|| }| jD ]}|||d}q| |}|S )N)r   )embed_input_idsr   r   r   )r^   r   r   r   inputs_embedsr   decoder_layerrG   rG   rH   r\   &  s   



zWhisperDecoder.forwardr   rW   c                 C   s
   |  |S rg   )r  )r^   r   rG   rG   rH   r  9  rk   zWhisperDecoder.embed_input_ids)rA   rB   rC   r   r   rh   rP   rQ   r\   r  rc   rG   rG   ra   rH   r     s    
r   c                       s   e Zd Zdddedef fddZdejdB d	ejd
eej dejfddZ	dejeej B dB dejdB fddZ
deeeejf  dee fddZ  ZS )WhisperModelrn   rw   r   rv   c                   s6   t    t|| dd| _t|| dd| _d S )Nz.encoderr   z.decoder)r[   rh   r   encoderr   decoder)r^   r   rv   ra   rG   rH   rh   >  s   


zWhisperModel.__init__r   Nr   encoder_outputsrW   c                 C   s.   t |rtj|ddnd }| j|||d}|S )Nr   r   )r   r   r   )lenrP   r   r  )r^   r   r   r  
enc_statesdecoder_outputsrG   rG   rH   r\   G  s   zWhisperModel.forwardrM   c                 C   s   |d u rd S |  |S rg   )r
  )r^   rM   rG   rG   rH   get_encoder_outputsU  s   
z WhisperModel.get_encoder_outputsweightsc                 C   s   g d}t |  }t }|D ]M\}}|D ](\}}}	||vrq|||}|dr/||vr/q|| }
|
j}||
||	  n|drH||vrHq|| }
t|
dt}||
| || q|S )N)).self_attn.qkv_projz.self_attn.q_projr   )r  z.self_attn.k_projr   )r  z.self_attn.v_projr   ).encoder_attn.kv_projz.encoder_attn.k_projr   )r  z.encoder_attn.v_projr   z.biasweight_loader)	dictnamed_parameterssetreplaceendswithr  r   r!   add)r^   r  stacked_params_mappingparams_dictloaded_paramsnameloaded_weight
param_nameweight_nameshard_idparamr  rG   rG   rH   load_weights]  s*   
zWhisperModel.load_weights)rA   rB   rC   r   r   rh   rP   rQ   rO   r\   r  r   tupler  r$  rc   rG   rG   ra   rH   r	  =  s"    	

,r	  c                   @   st   e Zd ZdefddZedefddZdee	e
dB f fddZd	edefd
dZde
fddZde
fddZdS )WhisperProcessingInforW   c                 C   s   | j tS rg   )ctxget_hf_configr   r^   rG   rG   rH   r(    s   z#WhisperProcessingInfo.get_hf_configc                 C      dS )NTrG   r)  rG   rG   rH   skip_prompt_length_check     z.WhisperProcessingInfo.skip_prompt_length_checkNc                 C   s   ddiS )Naudior4   rG   r)  rG   rG   rH   get_supported_mm_limits  s   z-WhisperProcessingInfo.get_supported_mm_limitskwargsc                 K   s(   | j di |}|j}t|tsJ |S )NrG   )get_hf_processorfeature_extractor
isinstancer   )r^   r/  hf_processorr1  rG   rG   rH   get_feature_extractor  s   z+WhisperProcessingInfo.get_feature_extractorc                 C   r*  )z7Return target audio channels for Whisper models (mono).r4   rG   r)  rG   rG   rH   get_target_channels  r,  z)WhisperProcessingInfo.get_target_channelsc                 C   s
   |   jS rg   )r(  r   r)  rG   rG   rH   get_num_audio_tokens  rk   z*WhisperProcessingInfo.get_num_audio_tokens)rA   rB   rC   r   r(  propertyr   r+  r   r   rl   r.  objectr   r4  r5  r6  rG   rG   rG   rH   r&    s    r&  c                	   @   sX   e Zd Zdeeef defddZ	d
dedeeef deeef dB defdd	Z	dS )WhisperDummyInputsBuilder	mm_countsrW   c                 C   s   | dd}d| S )Nr-  r   z<|startoftranscript|>)get)r^   r:  
num_audiosrG   rG   rH   get_dummy_text  s   z(WhisperDummyInputsBuilder.get_dummy_textNseq_len
mm_optionsc           	      C   sL   | j  }|j}|j| }|dd}|r|dnd }d| j|||diS )Nr-  r   )lengthr<  	overrides)infor4  sampling_ratechunk_lengthr;  _get_dummy_audios)	r^   r>  r:  r?  r1  rC  	audio_lenr<  audio_overridesrG   rG   rH   get_dummy_mm_data  s   

z+WhisperDummyInputsBuilder.get_dummy_mm_datarg   )
rA   rB   rC   r   r   rl   r=  r   r$   rH  rG   rG   rG   rH   r9    s    	
r9  c                
       s   e Zd ZdefddZdeee B dedeee B fddZ	dede
eef de
eef d	e
eef def
 fd
dZdede
eef de
eef fddZdede
eef dedee fddZ  ZS )WhisperMultiModalProcessorrW   c                 C   s   | j  }t|j| j  dS )N)	target_srtarget_channels)rB  r4  r(   rC  r5  )r^   r1  rG   rG   rH   _get_data_parser  s
   
z+WhisperMultiModalProcessor._get_data_parserpromptmm_datac                 C   s   dgS )Nr   rG   )r^   rM  rN  rG   rG   rH   create_encoder_prompt  s   	z0WhisperMultiModalProcessor.create_encoder_prompt	mm_kwargs
tok_kwargsc                    sl   |r| j jdi |}t|dd}tdi |d|ji}t j||||d}d|v r4|d|d< |S )Naudios)r-  rC  )rM  rN  rP  rQ  labelsr   rG   )rB  r4  r  poprC  r[   _call_hf_processor)r^   rM  rN  rP  rQ  r1  processed_outputsra   rG   rH   rU    s"   z-WhisperMultiModalProcessor._call_hf_processor	hf_inputshf_processor_mm_kwargsc                 C   s   t tddS )Nr-  rM   )r  r%   batched)r^   rW  rX  rG   rG   rH   _get_mm_fields_config  s   z0WhisperMultiModalProcessor._get_mm_fields_configmm_itemsout_mm_kwargsc                 C   s"   | j  }tddgdg| dgS )Nr-  r   )modalitytargetreplacement)rB  r6  r,   )r^   r\  rX  r]  
num_tokensrG   rG   rH   _get_prompt_updates  s   
z.WhisperMultiModalProcessor._get_prompt_updates)rA   rB   rC   r(   rL  r   rO   rl   r$   rO  r   r8  r
   rU  r%   r[  r'   r&   r   r-   rb  rc   rG   rG   ra   rH   rI    sH    









rI  )rB  dummy_inputsc                       s  e Zd Zg dddgdZeddddZd	Zd	ZeZ	e
d
edB dedB f fddZe
dejdeded
edB ded dededB defddZe
dedededB fddZe
dededefddZe
dededededB fdd Zd!d"d#ed$ef fd%d&Z	d>d'ejd(ejd)eej dB dejfd*d+Zd,ede fd-d.Z!	d>dd/d0d'ejd1e dB d2ejdB d3e"dejf
d4d5Z#d,ede$fd6d7Z%d8ejdejfd9d:Z&d;e'e(eejf  de)e fd<d=Z*  Z+S )?WhisperForConditionalGeneration)zself_attn.q_projzself_attn.k_projzself_attn.v_projzencoder_attn.k_projzencoder_attn.v_proj)zself_attn.qkv_projzencoder_attn.kv_projz	.mlp.fc1.z	.mlp.fc2.)z.fc1.z.fc2.)orig_to_new_substrTlanguageNrW   c                    s"   |d u rt d d}t |S )NzDefaulting to language='en'. If you wish to transcribe audio in a different language, pass the `language` field in the TranscriptionRequest.en)loggerwarningr[   validate_language)clsrf  ra   rG   rH   rj    s   z1WhisperForConditionalGeneration.validate_languager-  r   
stt_config	task_type)
transcribe	translaterequest_promptto_languagec           	      C   sZ   |d u rt ddd||jfid|rd| ndd| d d| d	 d
}tt|S )Nz;Language must be specified when creating the Whisper promptrn   r-  )rM  multi_modal_dataz<|prev|>z<|startoftranscript|><|z|>z<|z|><|notimestamps|>)encoder_promptdecoder_prompt)r   sample_rater   r   )	rk  r-  r   rl  rf  rm  rp  rq  rM  rG   rG   rH   get_generation_prompt  s   



z5WhisperForConditionalGeneration.get_generation_promptr^  ic                 C   s   | drd S td)Nr-  z Only audio modality is supported)
startswithr   )rk  r^  rw  rG   rG   rH   get_placeholder_str>  s   
z3WhisperForConditionalGeneration.get_placeholder_strc                 C   s   t |}t|jj|jjdS )N)max_audio_clip_sru  )r.   r   r1  rD  rC  )rk  r   rm  	processorrG   rG   rH   get_speech_to_text_configE  s
   z9WhisperForConditionalGeneration.get_speech_to_text_configaudio_duration_sc                 C   s0   t |}|jj}|d usJ t||j | S rg   )r.   r1  
hop_lengthr   ceilru  )rk  r}  rl  r   r{  r~  rG   rG   rH   r6  P  s   z4WhisperForConditionalGeneration.get_num_audio_tokensrn   rw   r   rv   c                   s   t    |jj}|j}|| _|jj| _| j|tdt	id t
||d| _W d    n1 s0w   Y  t|j|j|t|dd| _| j| jjj| _t|dd}t|j|d| _d S )	Nr-  )language_targetstower_targetsr   proj_out)ru   rv   logit_scaler   )scale)r[   rh   r   r   ru   r   r   _mark_composite_modelr   r   r	  modelr    r  r   r<   r  tie_weightsr  r  r   r   logits_processor)r^   r   rv   r   ru   r  ra   rG   rH   rh   `  s*   

z(WhisperForConditionalGeneration.__init__r   r   r  c                 K   s    |d u rg }| j |||d}|S )N)r   r   r  )r  )r^   r   r   r  r/  r  rG   rG   rH   r\   x  s   z'WhisperForConditionalGeneration.forwardr/  c                 K   s,   | j di |}| j|d }|jddS )NrM   r   r   rG   )_parse_and_validate_audio_inputr  r  unbind)r^   r/  audio_input
enc_outputrG   rG   rH   embed_multimodal  s   z0WhisperForConditionalGeneration.embed_multimodalF)is_multimodalhandle_oov_mm_tokenmultimodal_embeddingsr  r  c                C   s   | j j|S rg   )r  r  r  )r^   r   r  r  r  rG   rG   rH   r    s   
z/WhisperForConditionalGeneration.embed_input_idsc                    s0   | dd }|d urt fdd|}t|dS )NrM   c                    s   |   jS rg   )r   r   )xr)  rG   rH   r     s    zQWhisperForConditionalGeneration._parse_and_validate_audio_input.<locals>.<lambda>rY  )rT  r/   rI   )r^   r/  rM   rG   r)  rH   r    s   
z?WhisperForConditionalGeneration._parse_and_validate_audio_inputr   c                 C   s   |  | j|}|S rg   )r  r  )r^   r   logitsrG   rG   rH   compute_logits  s   z.WhisperForConditionalGeneration.compute_logitsr  c                 C   s(   t | dgd}t|d}|j|| jdS )Nz	proj_out.)skip_prefixesz.k_proj.weight)mapper)r8   _create_fake_bias_for_k_projr$  hf_to_vllm_mapper)r^   r  loaderrG   rG   rH   r$    s   
z,WhisperForConditionalGeneration.load_weightsrg   ),rA   rB   rC   packed_modules_mappingr9   r  supports_transcription_onlysupports_segment_timestampr"   supported_languagesclassmethodr   rj  npndarrayr   r   r   r   rv  rl   ry  r|  floatr6  r   rh   rP   rQ   rO   r\   r8  r5   r  r   r  rI   r  r  r   r%  r  r$  rc   rG   rG   ra   rH   rd    s    		 	


,rd  r  fake_bias_key_namerW   c                 c   sZ    | D ]'\}}| |r%t|d}|dd}||f||fgE dH  ||fV  qdS )z
    Create full zeros bias for k_proj weight in self-attn and x-attn layers.
    So that the bias for k_proj in qkv_proj can be initialized with zeros.
    r   ri   rq   N)r  rP   zerosr   r  )r  r  r  ri   rq   	bias_namerG   rG   rH   r    s   
r  )yenumr   collections.abcr   r   r   
contextlibr   typingr   r   r   numpyr  rP   r	   transformersr
   r   r   ,transformers.models.whisper.modeling_whisperr   vllm.attention.layerr   vllm.compilation.decoratorsr   vllm.configr   r   r   r   vllm.config.multimodalr   vllm.distributedr   vllm.inputs.datar   vllm.loggerr   %vllm.model_executor.layers.activationr   4vllm.model_executor.layers.attention.cross_attentionr   9vllm.model_executor.layers.attention.mm_encoder_attentionr   !vllm.model_executor.layers.linearr   r   r   +vllm.model_executor.layers.logits_processorr   'vllm.model_executor.layers.quantizationr   3vllm.model_executor.layers.vocab_parallel_embeddingr    -vllm.model_executor.model_loader.weight_utilsr!   (vllm.model_executor.models.whisper_utilsr"   vllm.multimodalr#   vllm.multimodal.inputsr$   r%   r&   vllm.multimodal.parser'   r(   vllm.multimodal.processingr)   r*   r+   r,   r-   !vllm.transformers_utils.processorr.   vllm.utils.jsontreer/   vllm.utils.tensor_schemar0   r1   vllm.utils.torch_utilsr2   vllm.v1.attention.backendr3   
interfacesr5   r6   r7   utilsr8   r9   r:   r;   r<   rA   rh  Enumr=   rI   rS   r   rd   Modulerm   r   r   r   r   r   r   r	  r&  r9  rI  register_processorrd  r%  r   rQ   r  rG   rG   rG   rH   <module>   s   mB .;P0DB
 5