o
    
۾iڀ                  
   @   s  d dl Z d dlmZmZmZ d dlmZmZ d dl mZ d dl	m
Z
mZ d dlZd dlZd dlZd dlmZ d dlmZ d dlmZmZmZ d dlmZ d d	lmZ d d
lmZ d dlm Z m!Z! d dl"m#Z#m$Z$m%Z% d dl&m'Z' d dl(m)Z)m*Z*m+Z+ d dl,m-Z- d dl.m/Z/m0Z0 d dl1m2Z2 d dl3m4Z4 d dl5m6Z6 d dl7m8Z8 d dl9m:Z: d dl;m<Z<m=Z= d dl>m?Z? d dl@mAZA d dlBmCZCmDZDmEZEmFZFmGZG d dlHmIZImJZJmKZK d dlLmMZMmNZN d dlOmPZPmQZQmRZRmSZSmTZT d dlUmVZV d dlWmXZX d dlYmZZZ d d!l[m\Z\m]Z]m^Z^ d d"l_m`Z`maZa e2ebZcd#d$d%d&d'd(d)d*d+d,	ZdG d-d. d.ZeG d/d0 d0eQZfG d1d2 d2eMef ZgG d3d4 d4ePef ZheAjiehefegd5G d6d7 d7ejje]e8e\e^ZkG d8d9 d9ejjZlG d:d; d;ejjZmdS )<    N)IterableMappingSequence)cached_propertypartial)ceil)Literalcast)mel_filter_bank)
AudioChunkRawAudio	TextChunk)UserMessage)ChatCompletionRequest)TranscriptionRequest)AudioAudioEncoder)BatchFeature
TensorTypeWhisperConfig)	TextInput)ModelConfigSpeechToTextConfig
VllmConfig)BaseDummyOptions)
PromptTypeTokensPrompt)init_logger)QuantizationConfig)default_weight_loader)
SupportsPP)MultiModelKeys)WhisperEncoder_create_fake_bias_for_k_proj)WhisperCausalEncoder)MULTIMODAL_REGISTRY)MultiModalDataDictMultiModalFieldConfigMultiModalKwargsItemsMultiModalUUIDDictNestedTensors)AudioProcessorItemsMultiModalDataItemsMultiModalDataParser)BaseDummyInputsBuilderProcessorInputs)BaseMultiModalProcessorBaseProcessingInfoMultiModalProcessingInfoPromptReplacementPromptUpdate)IntermediateTensors)cached_tokenizer_from_config)MistralTokenizer   )SupportsLoRASupportsMultiModalSupportsTranscription)init_vllm_registered_modelmaybe_prefixArabicDutchEnglishFrenchGermanHindiItalian
PortugueseSpanish)	arnlenfrdehiitptesc                       s   e Zd ZdZdeddf fddZedefddZede	fd	d
Z
ede	fddZede	fddZedefddZde	de	fddZ			ddeee B dB dejeej B dB deeB dB deeef fddZ  ZS )VoxtralProcessorAdapterzv
    Provide a HF-compatible interface for
    :class:`mistral_common.tokens.tokenizers.multimodal.AudioEncoder`.
    	tokenizerreturnNc                    s   t    || _d S N)super__init__rQ   selfrQ   	__class__ V/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/voxtral.pyrU   X   s   

z VoxtralProcessorAdapter.__init__c                 C   s   | j jj}t|tsJ |S rS   )rQ   instructaudio_encoder
isinstancer   )rW   r]   rZ   rZ   r[   _audio_processor\   s   
z(VoxtralProcessorAdapter._audio_processorc                 C   
   | j jjS rS   )r_   special_idsaudiorW   rZ   rZ   r[   audio_token_idb      
z&VoxtralProcessorAdapter.audio_token_idc                 C   r`   rS   )r_   ra   begin_audiorc   rZ   rZ   r[   begin_audio_token_idf   re   z,VoxtralProcessorAdapter.begin_audio_token_idc                 C   r`   rS   )r_   audio_configsampling_raterc   rZ   rZ   r[   ri   j   re   z%VoxtralProcessorAdapter.sampling_ratec                 C   r`   rS   )r_   rh   
frame_raterc   rZ   rZ   r[   rj   n   re   z"VoxtralProcessorAdapter.frame_rateaudio_lengthc                 C   s   t || j| j  S rS   )r   ri   rj   )rW   rk   rZ   rZ   r[   get_num_audio_tokensr   s   z,VoxtralProcessorAdapter.get_num_audio_tokenstextaudiosreturn_tensorsc           
      K   s0  |d u rg }t |ts|g}|d u rg }t |ts|g}|s+| |j}dt|iS tdd |D r8tdttj  }ttj  }|D ]?}t |t	j
sPJ |jdksWJ | jjjsf| jj|| jdd}| jg| jg| t|  }	|t|	 |t| qFtt|d  t|d|d	S )
N	input_idsc                 s   s    | ]	}t |d kV  qdS )r   N)len).0trZ   rZ   r[   	<genexpr>   s    z3VoxtralProcessorAdapter.__call__.<locals>.<genexpr>zYou've passed text inputs instead of token inputs. Make sure to process your input via `mistral_common`'s tokenizer or pass a chat completion request. For more info, see: https://github.com/vllm-project/vllm/issues/8411.r8   F)is_online_streaming)rp   audio_arrays)r^   listrQ   rp   torchtensorany
ValueErrorTensornpndarrayndimr_   rh   is_streamingpadri   rg   rd   rl   rq   appendr   catexpand)
rW   rm   rn   ro   kwargsrp   audios_tokensaudios_processedrb   audio_tokensrZ   rZ   r[   __call__x   sH   


z VoxtralProcessorAdapter.__call__)NNN)__name__
__module____qualname____doc__r7   rU   r   r   r_   intrd   rg   ri   floatrj   rl   r   rx   r~   r   strr   r   r*   r   __classcell__rZ   rZ   rX   r[   rP   R   s<    


rP   c                   @   s   e Zd ZdefddZdefddZdd Zdee	e
dB f fd	d
Zde
dee	e
f dee	e
f fddZde
fddZde
fddZdS )VoxtralProcessingInforR   c                 C   s"   t | jj}t|tstd|S )Nz.This model requires `--tokenizer-mode mistral`)r6   ctxmodel_configr^   r7   r|   rV   rZ   rZ   r[   get_tokenizer   s   
z#VoxtralProcessingInfo.get_tokenizerc                 C   s   t |  S rS   )rP   r   rc   rZ   rZ   r[   get_hf_processor   s   z&VoxtralProcessingInfo.get_hf_processorc                 C   s   t |  j|  dS )N)	target_srexpected_hidden_size)r-   r   ri   _get_expected_hidden_sizerc   rZ   rZ   r[   get_data_parser   s   z%VoxtralProcessingInfo.get_data_parserNc                 C   s   ddiS )Nrb      rZ   rc   rZ   rZ   r[   get_supported_mm_limits   s   z-VoxtralProcessingInfo.get_supported_mm_limitsseq_len	mm_countsc                 C   s   d|   iS Nrb   )get_max_audio_tokens)rW   r   r   rZ   rZ   r[   get_mm_max_tokens_per_item   s   z0VoxtralProcessingInfo.get_mm_max_tokens_per_itemc                 C   r`   rS   )r   r   max_model_lenrc   rZ   rZ   r[   r      s   
z*VoxtralProcessingInfo.get_max_audio_tokensc                 C   s    |   }|  t|j|j  S rS   )r   r   r   ri   rj   )rW   	processorrZ   rZ   r[   get_max_audio_array_len   s   
z-VoxtralProcessingInfo.get_max_audio_array_len)r   r   r   r7   r   rP   r   r   r   r   r   r   r   r   r   rZ   rZ   rZ   r[   r      s    


r   c                	   @   s   e Zd Zdeeef defddZ	ddedeeef deeef dB defdd	Z		ddedeeef deeef dB de
fd
dZdS )VoxtralDummyInputsBuilderr   rR   c                 C   s   dS )N rZ   )rW   r   rZ   rZ   r[   get_dummy_text   s   z(VoxtralDummyInputsBuilder.get_dummy_textNr   
mm_optionsc                 C   s<   | dd}| j }|r| dnd }d| j|||diS )Nrb   r   )length
num_audios	overrides)getinfor   _get_dummy_audios)rW   r   r   r   r   target_lengthaudio_overridesrZ   rZ   r[   get_dummy_mm_data   s   
z+VoxtralDummyInputsBuilder.get_dummy_mm_datac                 C   s   | j  }| |}| |||}|dg }g }d}	|D ]}
t|
| j  j|	d}tt	
|d}|| qttt|dg|dgd}|j|}|j}| j i |ddd	 |jD i}t||d
S )Nrb   wav)audio_arrayri   format)input_audio)rm   )content)messagesc                 S   s   g | ]}|j qS rZ   )r   rr   arZ   rZ   r[   
<listcomp>  s    zHVoxtralDummyInputsBuilder.get_dummy_processor_inputs.<locals>.<listcomp>)promptmm_items)r   r   r   r   r   r   r   ri   r   r   
from_audior   r   r   r   mistralencode_chat_completiontokensparse_mm_datarn   r/   )rW   r   r   r   rQ   
dummy_textdummy_mm_datadummy_audiosaudio_chunksr   rb   
audio_itemchunkrequestresdummy_tokensdummy_mm_inputsrZ   rZ   r[   get_dummy_processor_inputs   s0   


z4VoxtralDummyInputsBuilder.get_dummy_processor_inputsrS   )r   r   r   r   r   r   r   r   r&   r   r/   r   rZ   rZ   rZ   r[   r      s,    


r   c                       s   e Zd Zdeeef deeef deeef fddZde	deeef de
dee fdd	Z	
ddeee B de	deeef deeef ded
B deee eef f fddZ  ZS )VoxtralMultiModalProcessor	hf_inputshf_processor_mm_kwargsrR   c                 C   s   t tddS )Nrb   )rw   )dictr'   batched)rW   r   r   rZ   rZ   r[   _get_mm_fields_config  s   z0VoxtralMultiModalProcessor._get_mm_fields_configr   out_mm_kwargsc                    s>   | j jdi |j dtf fdd}tdd|dgS )Nitem_idxc                    s*    dt}|| }|} g| S r   )	get_itemsr+   get_audio_lengthrl   )r   rn   	audio_lennb_audio_tokensaudio_idr   r   rZ   r[   get_replacement%  s   


zGVoxtralMultiModalProcessor._get_prompt_updates.<locals>.get_replacementrb   r   )modalitytargetreplacementrZ   )r   r   rd   r   r3   )rW   r   r   r   r   rZ   r   r[   _get_prompt_updates  s   	z.VoxtralMultiModalProcessor._get_prompt_updatesNr   mm_data_itemstokenization_kwargsmm_uuidsc           	         s&   t  j|||||d\}}}||dfS )N)r   r   r   r   r   T)rT   _cached_apply_hf_processor)	rW   r   r   r   r   r   
prompt_idsmm_info_rX   rZ   r[   r   5  s   
	z5VoxtralMultiModalProcessor._cached_apply_hf_processorrS   )r   r   r   r   r   r*   objectr'   r   r,   r(   r   r4   r   rx   r   r)   tupler2   boolr   r   rZ   rZ   rX   r[   r     s>    





 


r   )r   dummy_inputsc                       s  e Zd ZeZdZg dddgdZddded	ef fd
dZ	de
fddZ		d4dejdB dejdedB dejdB dedejeB fddZdeej ejB eejdf B dB fddZdedeej dB fddZdejdejdB fddZeded edefd!d"Zed#ejded$ed%edB d ed& d'ed(edB defd)d*Zed+ed$edede dB fd,d-Z!d.e"eeejf  de#e fd/d0Z$d1e%de%fd2d3Z&  Z'S )5VoxtralForConditionalGenerationTq_projk_projv_proj	gate_projup_proj)qkv_projgate_up_projr   prefixvllm_configr   c                   s   t    t|j| _t|dr| |j|_|jj}|| _	| j	j
j| _| | t||jt|dd| _W d    n1 s@w   Y  | |d% t||j
t|dd| _t|j
j| j |jjd| _W d    d S 1 ssw   Y  d S )Nquant_configlanguage_model)r   	hf_configr   rb   whisper_encoderr   )hidden_sizedim)rT   rU   r6   r   rQ   hasattrmaybe_update_quant_configr   r   configrh   downsample_factor_mark_language_modelr<   text_configr=   r   _mark_tower_modelVoxtralEncoderModelwith_hf_configr   AudioLanguageAdapterd_modelr   audio_language_adapter)rW   r   r   r   rX   rZ   r[   rU   [  s4   




"z(VoxtralForConditionalGeneration.__init__rR   c                 C   s   t jdddgdS )z?Get module prefix for multimodal models to filter LoRA modules.r   r	  r   )r   	connectortower_model)r!   from_string_fieldrc   rZ   rZ   r[   get_mm_mapping{  s
   z.VoxtralForConditionalGeneration.get_mm_mappingNrp   	positionsintermediate_tensorsinputs_embedsr   c                 K   s$   |d urd }| j j||||d}|S )N)r  )r   model)rW   rp   r  r  r  r   hidden_statesrZ   rZ   r[   forward  s   z'VoxtralForConditionalGeneration.forward.c           
   	   K   s   | j di |}|d u rd S | |}t|D ]0\}}|j\}}| jt|| j  }tjj	
|ddd|| f}||| j || j ||< qtj|dd}	| |	}	tj|	dd |D dd}|S )Nr   r   c                 S   s   g | ]}|j d  qS )r   )shaper   rZ   rZ   r[   r     s    zDVoxtralForConditionalGeneration.embed_multimodal.<locals>.<listcomp>rZ   ) _parse_and_validate_audio_arraysr   	enumerater  r  mathr   ry   nn
functionalr   reshaper   r	  split)
rW   r   audio_inputsaudio_embeddingsiaudio_embeddingr   r   target_seq_lenaudio_embeddings_packedrZ   rZ   r[   embed_multimodal  s,   



z0VoxtralForConditionalGeneration.embed_multimodalc                 K   sX   | dd }|d u rd S t|tjtfstdt| t|tjr*t|d}|S )Nrw   z*Incorrect type of audio_arrays. Got type: r   )popr^   ry   r}   rx   r|   typeunbind)rW   r   rw   rZ   rZ   r[   r    s   z@VoxtralForConditionalGeneration._parse_and_validate_audio_arraysr  c                 C   s   | j |S rS   )r   compute_logits)rW   r  rZ   rZ   r[   r'    s   z.VoxtralForConditionalGeneration.compute_logitsr   	task_typec                 C   s,   t |}|jjj}|j}|j}t||d dS )N)max_audio_clip_ssample_ratemin_energy_split_window_size)r6   r\   r]   rh   chunk_length_sri   r   )clsr   r(  rQ   rh   r)  r*  rZ   rZ   r[   get_speech_to_text_config  s   
z9VoxtralForConditionalGeneration.get_speech_to_text_configrb   
stt_configlanguage)
transcribe	translaterequest_promptto_languagec                 C   s^   t |}t|t|jdd}t|jt||d}	|j	|	}
t
|
jd|
jd j|jfidS )Nr   )r   )r  rb   r0  rb   r   )prompt_token_idsmulti_modal_data)r6   r   r   r*  r   r  r   r   r\   encode_transcriptionr   r   rn   r   )r-  rb   r   r/  r0  r(  r3  r4  rQ   req	tokenizedrZ   rZ   r[   get_generation_prompt  s   z5VoxtralForConditionalGeneration.get_generation_promptaudio_duration_sc                 C   s$   t |}t|}|t||j S )z
        Map from audio duration to number of audio tokens produced by the ASR
        model, without running a forward pass.
        This is used for estimating the amount of processing for this audio.
        )r6   rP   rl   r   r*  )r-  r;  r/  r   rQ   adapterrZ   rZ   r[   rl     s
   z4VoxtralForConditionalGeneration.get_num_audio_tokensweightsc                    s   g dt tdji  tdt  fdd}j| D ]
}	d|  q)d}|vr?	| S )N)z,mm_streams_embeddings.embedding_module\.(.*)\1)zmm_whisper_embeddings\.(.*)r?  )zaudio_language_projection\.(.*)zaudio_language_adapter.\1)z!audio_language_adapter\.0\.weightz"audio_language_adapter.w_in.weight)z!audio_language_adapter\.2\.weightz#audio_language_adapter.w_out.weightr	  z
.wk.weightc               	   3   s    D ]w\} }d}dD ]}||  |o$|  | d o$|  | d O }qD ]\}}t|| r:t||| } q)|rNj| |f} d|   q|  v ru |  }t  t	|| W d    n1 sjw   Y  |  q| |fV  qd S )NF)mm_whisper_embeddingsz&mm_streams_embeddings.embedding_modulez.tok_embeddingsz.audio_language_projectionzwhisper_encoder.)

startswithre	fullmatchsubr   load_weightaddry   no_gradr   )namew
is_encoderkpatternreplparamaudio_paramsloaded_weightsremapping_rulesrW   r=  rZ   r[   llm_weights_generator"  s4   

zKVoxtralForConditionalGeneration.load_weights.<locals>.llm_weights_generatorzlanguage_model.z6whisper_encoder.whisper_encoder.embed_positions.weight)
r   r  
ModuleDictr	  named_parametersr#   setr   load_weightsrF  )rW   r=  rS  rH  sin_keyrZ   rO  r[   rW    s"   

z,VoxtralForConditionalGeneration.load_weightsr   c                 C   s   g d}t |dr/g }|jD ]}|}|D ]\}}t||r%t|||}q|| q||_t |drq|j}|D ]4}	d||	 v rgg }
||	 d D ]}|}|D ]\}}t||r`t|||}qO|
| qI|
||	 d< q9||_|S )z
        Update quant config to so that ignored module and target module names
        match the vLLM model names.
        Right now this is specific for compressed-tensors format and
        load_format mistral.
        ))outputzlanguage_model.lm_head)zlayers\.(\d+)\.attention\.woz1language_model.model.layers.\1.self_attn.out_proj)zlayers\.(\d+)\.attention\.w(.*)z0language_model.model.layers.\1.self_attn.\2_proj)zlayers\.(\d+)\.feed_forward\.w1z,language_model.model.layers.\1.mlp.gate_proj)zlayers\.(\d+)\.feed_forward\.w2z,language_model.model.layers.\1.mlp.down_proj)zlayers\.(\d+)\.feed_forward\.w3z*language_model.model.layers.\1.mlp.up_proj)zSmm_whisper_embeddings\.whisper_encoder\.transformer\.layers\.(\d+)\.attention.w(.*)zBwhisper_encoder.whisper_encoder.layers.\1.layers.self_attn.\2_proj)zPmm_whisper_embeddings\.whisper_encoder\.transformer\.layers\.(\d+)\.attention.wozCwhisper_encoder.whisper_encoder.layers.\1.layers.self_attn.out_proj)zWmm_whisper_embeddings\.whisper_encoder\.transformer\.layers\.(\d+)\.feed_forward.w(\d+)z9whisper_encoder.whisper_encoder.layers.\1.layers.mlp.fc\2)z6mm_whisper_embeddings\.whisper_encoder\.conv_layers\.0z%whisper_encoder.whisper_encoder.conv1)z6mm_whisper_embeddings\.whisper_encoder\.conv_layers\.1z%whisper_encoder.whisper_encoder.conv2)z3mm_whisper_embeddings\.audio_language_projection\.0zaudio_language_adapter.w_in)z3mm_whisper_embeddings\.audio_language_projection\.2zaudio_language_adapter.w_outignoreconfig_groupstargets)r   rZ  rB  rC  rD  r   r[  )rW   r   rR  mistral_ignorerH  mistral_namerL  rM  r[  
group_namer\  rZ   rZ   r[   r   L  s4   	
5

z9VoxtralForConditionalGeneration.maybe_update_quant_config)NN)(r   r   r   ISO639_1_SUPPORTED_LANGSsupported_languagesskip_warmup_audio_preprocessingpacked_modules_mappingr   r   rU   r!   r  ry   r}   r5   r   r  rx   r   r#  r  r'  classmethodr   r   r.  r~   r   r   r   r:  r   r   rl   r   rV  rW  r   r   r   rZ   rZ   rX   r[   r   I  s     
 
 

	$Dr   c                       s@   e Zd Zdededdf fddZdejdejfdd	Z  ZS )
r  r   r   rR   Nc                    s<   t    tj||dd| _t | _tj||dd| _d S )NF)bias)rT   rU   r  Linearw_inGELUgeluw_out)rW   r   r   rX   rZ   r[   rU     s   

zAudioLanguageAdapter.__init__xc                 C   s   |  | | |S rS   )rj  ri  rg  )rW   rk  rZ   rZ   r[   r    s   zAudioLanguageAdapter.forward)	r   r   r   r   rU   ry   r}   r  r   rZ   rZ   rX   r[   r    s    r  c                       s   e Zd Zdg diZg dZdddededd	f fd
dZdej	dej	fddZ
edefddZedefddZdeej	 deej	ee f fddZdej	eej	 B deej	 fddZdeeej	f defddZ  ZS )r  r   r   )r>  )z.whisper_encoder\.conv_layers\.0\.(weight|bias)whisper_encoder.conv1.\1)z.whisper_encoder\.conv_layers\.1\.(weight|bias)whisper_encoder.conv2.\1)z4whisper_encoder\.conv_layers\.0\.conv\.(weight|bias)rl  )z4whisper_encoder\.conv_layers\.1\.conv\.(weight|bias)rm  )zOwhisper_encoder\.transformer\.layers\.(\d+)\.attention\.w([qkv])\.(weight|bias)z.whisper_encoder.layers.\1.self_attn.\2_proj.\3)zIwhisper_encoder\.transformer\.layers\.(\d+)\.attention\.wo\.(weight|bias)z/whisper_encoder.layers.\1.self_attn.out_proj.\2)zJwhisper_encoder\.transformer\.layers\.(\d+)\.attention_norm\.(weight|bias)z1whisper_encoder.layers.\1.self_attn_layer_norm.\2)zLwhisper_encoder\.transformer\.layers\.(\d+)\.feed_forward\.w1\.(weight|bias)z$whisper_encoder.layers.\1.mlp.fc1.\2)zLwhisper_encoder\.transformer\.layers\.(\d+)\.feed_forward\.w2\.(weight|bias)z$whisper_encoder.layers.\1.mlp.fc2.\2)zLwhisper_encoder\.transformer\.layers\.(\d+)\.feed_forward\.w3\.(weight|bias)z$whisper_encoder.layers.\1.mlp.fc3.\2)zDwhisper_encoder\.transformer\.layers\.(\d+)\.ffn_norm\.(weight|bias)z-whisper_encoder.layers.\1.final_layer_norm.\2)z1whisper_encoder\.transformer\.norm\.(weight|bias)zwhisper_encoder.layer_norm.\1r   r   r   r   rR   Nc                   s   t    tt|jj| _|jj| _t| jdd| _	| j	r t
}nttdd}||t|dd| _td| jjd  | jjd	d
| jjd}tj|tjd| _d S )N	is_causalFT)init_in_fp32r   )r   r   r8      g        g     @@)num_frequency_binsnum_mel_binsmin_frequencymax_frequencyri   )dtype)rT   rU   r	   r   r   r   r   ru  getattrrn  r$   r   r"   r=   r   r
   window_sizerr  ri   ry   rz   float32mel_filters)rW   r   r   WhisperEncoderClsry  rX   rZ   r[   rU     s&   

zVoxtralEncoderModel.__init__audio_waveformsc           
      C   s   |j }t| jj|j}tj|| jj| jj|dd}|dd df 	 d }| j
j| }tj|dd }| jj }rUt|tsJtd|d	tj||j|j d
}	n| }	t||	d }|d d }||S )NT)windowreturn_complex.rv   rp  g|=)minzglobal_log_mel_max=z needs to be of type float.)deviceru  g       @g      @)ru  ry   hann_windowr   rw  tor  stft
hop_lengthabsry  Tclamplog10global_log_mel_maxr^   r   	TypeErrorrz   maxmaximum)
rW   r{  input_dtyper|  r  
magnitudesmel_speclog_specr  log_spec_maxrZ   rZ   r[   compute_whisper_melspec  s0   

z+VoxtralEncoderModel.compute_whisper_melspecc                 C   s   | j jjd | j jjd  S )Nr   )r   conv1strideconv2rc   rZ   rZ   r[   r  $  s   z%VoxtralEncoderModel.downsample_factorc                 C   s   | j j| j S rS   )r   max_source_positionsr  rc   rZ   rZ   r[   
chunk_size*  s   zVoxtralEncoderModel.chunk_sizec                    sf   t |tsJ  fdd|D }g }g }|D ]}|j jdd}||7 }|t| qt||fS )Nc                    s   g | ]}  | jqS rZ   )r  r  ru  )rr   rb   rc   rZ   r[   r   4  s    z?VoxtralEncoderModel.prepare_inputs_for_conv.<locals>.<listcomp>rv   r  )r^   rx   r  r  r   rq   ry   stack)rW   r{  input_featureschunked_featureschunks_per_examplefeaturechunksrZ   rc   r[   prepare_inputs_for_conv.  s   
z+VoxtralEncoderModel.prepare_inputs_for_convr  c           	      C   sj   t |ts|g}| |\}}| |g}d}g }|D ]}||||  dd}|| ||7 }q|S )Nr   r8   )r^   rx   r  r   flattenr   )	rW   r  input_embedsr  out	chunk_idxresultsn_chunksresultrZ   rZ   r[   r  C  s   


zVoxtralEncoderModel.forwardweightc                 C   s   g d}g }| j r|ddg |dg t|  }|\}}| jD ]\}}t||r4t|||}q#|D ]\}	}
}|
|vrAq7||
|	}|| }|j	}||||  |S |D ]\}	}
|
|vrbqY||
|	}qY|| }t
|dt}||| |S )N))r   r   q)r   r   rK  )r   r   v).mlp.gate_up_projz.mlp.fc1r   )r  z.mlp.fc3r8   )z.mlp.down_projz.mlp.fc2weight_loader)rn  extendr   rU  mistral_remappingrB  rC  rD  replacer  rv  r   )rW   r  stacked_params_mappingparams_mappingparams_dictrH  loaded_weightrL  rM  
param_nameweight_nameshard_idrN  r  rZ   rZ   r[   rE  Y  sF   
zVoxtralEncoderModel.load_weight)r   r   r   rc  r  r   r   rU   ry   r}   r  propertyr   r  r  rx   r   r  r  rE  r   rZ   rZ   rX   r[   r    s>    8
 

$r  )nr  collections.abcr   r   r   	functoolsr   r   r   typingr   r	   numpyr~   regexrB  ry   torch.nnr  mistral_common.audior
   &mistral_common.protocol.instruct.chunkr   r   r   )mistral_common.protocol.instruct.messagesr   (mistral_common.protocol.instruct.requestr   -mistral_common.protocol.transcription.requestr   &mistral_common.tokens.tokenizers.audior   r   transformersr   r   r   $transformers.tokenization_utils_baser   vllm.configr   r   r   vllm.config.multimodalr   vllm.inputs.datar   r   vllm.loggerr   'vllm.model_executor.layers.quantizationr   -vllm.model_executor.model_loader.weight_utilsr   vllm.model_executor.modelsr    )vllm.model_executor.models.module_mappingr!   "vllm.model_executor.models.whisperr"   r#   )vllm.model_executor.models.whisper_causalr$   vllm.multimodalr%   vllm.multimodal.inputsr&   r'   r(   r)   r*   vllm.multimodal.parser+   r,   r-   vllm.multimodal.processingr.   r/   $vllm.multimodal.processing.processorr0   r1   r2   r3   r4   vllm.sequencer5   vllm.tokenizersr6   vllm.tokenizers.mistralr7   
interfacesr9   r:   r;   utilsr<   r=   r   loggerr`  rP   r   r   r   register_processorModuler   r  r  rZ   rZ   rZ   r[   <module>   s   ^%>6
  Z