o
    
۾iR                  	   @   sJ  d Z ddlmZmZmZ ddlmZmZ ddlZ	ddl
Z
ddlmZ ddlmZ ddlmZ ddlmZmZmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZmZmZm Z m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z'm(Z(m)Z) ddl*m+Z+m,Z,m-Z-m.Z. ddl/m0Z0 ddl1m2Z2 ddl3m4Z4m5Z5m6Z6m7Z7m8Z8m9Z9 ddl:m;Z;m<Z<m=Z=m>Z>m?Z? ddl@mAZAmBZBmCZCmDZD ddlEmFZF ddlGmHZH ddlImJZJmKZK ddlLmMZM ddlNmOZO eePZQdZRde
jSfddZTG dd deBZUG d d! d!eAeU ZVd"eeWe
jSf fd#d$ZXG d%d& d&e?ZYG d'd( d(e)ZZe2j[eZeUeVd)G d*d+ d+ej\ee ee!Z]dS ),zInference-only Qwen3-ASR model.    )IterableMappingSequence)AnyLiteralN)BatchFeature)WhisperFeatureExtractor)ModelConfigSpeechToTextConfig
VllmConfig)BaseDummyOptions)
PromptTypeTokensPrompt)init_logger)MultiModalEmbeddingsSupportsMRoPESupportsMultiModal
SupportsPPSupportsTranscription)MultiModelKeys)Qwen3ForCausalLM)Qwen2_5OmniAudioFeatureInputsQwen3OmniMoeAudioEncoder&Qwen3OmniMoeThinkerMultiModalProcessor)AutoWeightsLoaderWeightsMapper_merge_multimodal_embeddingsmaybe_prefix)ISO639_1_SUPPORTED_LANGS)MULTIMODAL_REGISTRY)	AudioItemModalityDataMultiModalDataDictMultiModalFeatureSpecMultiModalFieldConfigMultiModalKwargsItems)AudioProcessorItemsDictEmbeddingItemsModalityDataItemsMultiModalDataItemsMultiModalDataParser)BaseDummyInputsBuilderBaseProcessingInfoPromptReplacementPromptUpdate)IntermediateTensors)cached_tokenizer_from_config)Qwen3ASRConfigQwen3ASRThinkerConfig)cached_processor_from_config)Qwen3ASRProcessorz
<asr_text>input_lengthsc                 C   sD   | d }|d d d }|d d d d d d | d d  }|S )Nd             )r5   input_lengths_leavefeat_lengthsoutput_lengthsr:   r:   X/home/ubuntu/.local/lib/python3.10/site-packages/vllm/model_executor/models/qwen3_asr.py _get_feat_extract_output_lengths`   s
   &r?   c                   @   s`   e Zd Zdd ZdedefddZdedefddZde	e
ed	B f fd
dZdefddZd	S )Qwen3ASRProcessingInfoc                 C   s   | j tjS N)ctxget_hf_configr1   thinker_configselfr:   r:   r>   rC   j   s   z$Qwen3ASRProcessingInfo.get_hf_configkwargsreturnc                 K   s4   | j jtfd|ddi|}t|dsd|_|S )Nuse_fastTaudio_tokenz<|audio_pad|>)rB   get_hf_processorr4   pophasattrrJ   )rF   rG   	processorr:   r:   r>   rK   m   s   

z'Qwen3ASRProcessingInfo.get_hf_processorc                 K   s(   | j di |}|j}t|tsJ |S )Nr:   )rK   feature_extractor
isinstancer   )rF   rG   hf_processorrO   r:   r:   r>   get_feature_extractorw   s   z,Qwen3ASRProcessingInfo.get_feature_extractorNc                 C   s   dd iS )Naudior:   rE   r:   r:   r>   get_supported_mm_limits}   s   z.Qwen3ASRProcessingInfo.get_supported_mm_limitsc                 C   s   |   }t|j|  dS )N)	target_srexpected_hidden_size)rR   Qwen3ASRMultiModalDataParsersampling_rate_get_expected_hidden_size)rF   rO   r:   r:   r>   get_data_parser   s
   z&Qwen3ASRProcessingInfo.get_data_parser)__name__
__module____qualname__rC   objectr4   rK   r   rR   r   strintrT   r*   rZ   r:   r:   r:   r>   r@   i   s    
r@   c                	   @   sX   e Zd Zdeeef defddZ	d
dedeeef deeef dB defdd	Z	dS )Qwen3ASRDummyInputsBuilder	mm_countsrH   c                 C   s$   | dd}| j }|j}|| S )NrS   r   )getinforK   rJ   )rF   rb   
num_audiosrQ   rJ   r:   r:   r>   get_dummy_text   s   
z)Qwen3ASRDummyInputsBuilder.get_dummy_textNseq_len
mm_optionsc                 C   sN   | dd}| j }t|jd|j }|r| dnd }d| j|||diS )NrS   r      )lengthre   	overrides)rc   rd   rR   minchunk_lengthrX   _get_dummy_audios)rF   rg   rb   rh   re   rO   target_audio_lengthaudio_overridesr:   r:   r>   get_dummy_mm_data   s    
z,Qwen3ASRDummyInputsBuilder.get_dummy_mm_datarA   )
r[   r\   r]   r   r_   r`   rf   r   r"   rq   r:   r:   r:   r>   ra      s    
ra   	hf_inputsc                 C   s8   |  dtd}ttjd|ddtdtddS )Naudio_feature_lengths)r   rS   r7   dim)input_audio_featuresfeature_attention_maskrs   )rc   torchemptydictr$   flat_from_sizesbatched)rr   rs   r:   r:   r>   _qwen3asr_field_config   s   r}   c                       sD   e Zd Zdeeejf ee B de	e
e
f dB f fddZ  ZS )rW   datarH   Nc                    s*   t |trt|dddhtdS t |S )NrS   rv   rs   )modalityrequired_fieldsfields_factory)rP   rz   r'   r}   super_parse_audio_data)rF   r~   	__class__r:   r>   r      s   
z.Qwen3ASRMultiModalDataParser._parse_audio_data)r[   r\   r]   rz   r_   rx   Tensorr!   r    r(   r   r   __classcell__r:   r:   r   r>   rW      s    rW   c                	   @   sX   e Zd Zdedeeef deeef fddZde	deee
f dedee fdd	Zd
S )Qwen3ASRMultiModalProcessorrr   hf_processor_mm_kwargsrH   c                 C   s   t |S rA   )r}   )rF   rr   r   r:   r:   r>   _get_mm_fields_config   s   z1Qwen3ASRMultiModalProcessor._get_mm_fields_configmm_itemsout_mm_kwargsc                    s   | j jd	i |}| j  }| }|j}|| | }|d}	|d}
|	d u r2|
d u r2g  n$|	d ur?t|	}|  n|
d urVt	|
t
jsKJ t|
d}|  dtf fdd}td||dgS )
Nrs   rw   item_idxc                    sJ    |  }|dkr  dt}|| }td| dt| dg| S )Nr   rS   z
The audio z (len=z1) is too short to be represented inside the model)	get_itemsr&   rc   
ValueErrorlen)r   num_featuresaudiosrS   audio_output_lengthsaudio_token_idr   r:   r>   get_replacement_qwen2_audio   s   

zTQwen3ASRMultiModalProcessor._get_prompt_updates.<locals>.get_replacement_qwen2_audiorS   )r   targetreplacementr:   )rd   rK   get_tokenizer	get_vocabrJ   get_datarc   r?   tolistrP   rx   r   sumr`   r-   )rF   r   r   r   rN   	tokenizervocabrJ   out_mm_datars   rw   audio_output_lensr   r:   r   r>   _get_prompt_updates   s4   



z/Qwen3ASRMultiModalProcessor._get_prompt_updatesN)r[   r\   r]   r   r   r_   r^   r$   r   r)   r   r%   r   r.   r   r:   r:   r:   r>   r      s"    



r   )rd   dummy_inputsc                       s"  e Zd ZeZedddddZedede	ded	B fd
dZ
dddedef fddZdeded	B fddZdedefddZ				dEdedee d	B dejd	B dejfddZdeded	B fddZ		dFd	dddejd ed	B d!ejd	B d"edejf
d#d$Z				dEdejd%ejd&ed	B d'ejd	B dedejeB fd(d)Zd*ejdejd	B fd+d,Zd-eeeejf  dee fd.d/Z d0ee	 d1ee! deeje	f fd2d3Z"de#fd4d5Z$ed6e%d7ede&fd8d9Z'ed:e(j)d6e%d;e&d<ed	B d7e*d= d>ed?ed	B de+fd@dAZ,edBedefdCdDZ-  Z.S )G Qwen3ASRForConditionalGenerationzlanguage_model.lm_head.zlanguage_model.model. )zthinker.lm_head.zthinker.model.zthinker.)orig_to_new_prefixr   irH   Nc                 C   s   | drdS td)NrS   z)<|audio_start|><|audio_pad|><|audio_end|>z Only audio modality is supported)
startswithr   )clsr   r   r:   r:   r>   get_placeholder_str  s   
z4Qwen3ASRForConditionalGeneration.get_placeholder_strprefixvllm_configr   c                   s   t    || _|jjj}|j}|jj}|| _|| _|| _| 	|d t
|jt|dd| _W d    n1 s9w   Y  | | t|j|jdgdt|dd| _W d    n1 s_w   Y  | jj| _d S )NrS   audio_towerr   r   )architectureslanguage_model)r   r   )r   __init__r   model_config	hf_configrD   quant_configmultimodal_configconfig_mark_tower_modelr   audio_configr   r   _mark_language_modelr   with_hf_configtext_configr   make_empty_intermediate_tensors)rF   r   r   rD   r   r   r   r:   r>   r   !  s2   


	z)Qwen3ASRForConditionalGeneration.__init__rG   c                 K   s@   | dd }| dd }| dd }|d u rd S td|||dS )Nrv   rs   rw   audio_features)typeinput_featuresrs   rw   )rL   r   )rF   rG   rv   rs   rw   r:   r:   r>   _parse_and_validate_audio_input?  s   z@Qwen3ASRForConditionalGeneration._parse_and_validate_audio_inputc                 K   s6   i }|D ]}|dv rd|vr| j di ||d< q|S )Nrv   rS   r:   )r   )rF   rG   mm_input_by_modality	input_keyr:   r:   r>   %_parse_and_validate_multimodal_inputsO  s   
zFQwen3ASRForConditionalGeneration._parse_and_validate_multimodal_inputsaudio_inputaudio_hashescached_audio_featuresc                 C   s@   |d }|d }t |}| j|| jj||d}|| S )Nr   rs   )feature_lensaftercnn_lens)r?   r   todtypesplitr   )rF   r   r   r   r   rs   r   r   r:   r:   r>   _process_audio_input^  s   z5Qwen3ASRForConditionalGeneration._process_audio_inputc                 K   sP   | j di |}|sg S d}|D ]}|| }|dkr%| |}|t|7 }q|S )Nr:   rS   )r   r   tuple)rF   rG   r   multimodal_embeddingsr   multimodal_inputaudio_embeddingsr:   r:   r>   embed_multimodalp  s   
z1Qwen3ASRForConditionalGeneration.embed_multimodalFis_multimodalhandle_oov_mm_token	input_idsr   r   r   c                C   s@   | j || jj||d}|d u st|dkr|S t|||d}|S )Nr   r   )inputs_embedsr   r   )_embed_text_input_idsr   embed_input_idsr   r   )rF   r   r   r   r   r   r:   r:   r>   r     s   z0Qwen3ASRForConditionalGeneration.embed_input_ids	positionsintermediate_tensorsr   c                 K   s$   |d urd }| j j||||d}|S )N)r   )r   model)rF   r   r   r   r   rG   hidden_statesr:   r:   r>   forward  s   z(Qwen3ASRForConditionalGeneration.forwardr   c                 C   s   | j |S rA   )r   compute_logits)rF   r   r:   r:   r>   r     s   z/Qwen3ASRForConditionalGeneration.compute_logitsweightsc                 C   s$   t | ddgd}|j|| jd}|S )Nztalker.z	code2wav.)skip_prefixes)mapper)r   load_weightshf_to_vllm_mapper)rF   r   loaderloaded_weightsr:   r:   r>   r     s   z-Qwen3ASRForConditionalGeneration.load_weightsinput_tokensmm_featuresc                 C   s  t |}|stj|tjddddd}| dfS g }d}t|dd dD ]e}|jj	}|j
d	 j
}	t|	tjr>|	 }	tt|	 }
|| }|rU|d  d nd}tj|tjddddd| }|| || }tj|
tjddddd| }|| ||
 }q(||k r|r|d  d nd}|| }tj|tjddddd| }|| tj|dd
dd}|jd |krtd| d |  }||fS )N)r   r7   r      r   c                 S   s   | j jS rA   )mm_positionoffset)fr:   r:   r>   <lambda>  s    zLQwen3ASRForConditionalGeneration.get_mrope_input_positions.<locals>.<lambda>)keyrs   rt   z2Position ids length mismatch with input ids length)r   rx   arangelongviewexpandclonesortedr   r   r~   rP   r   itemr?   tensormaxappendcatreshapeshapeRuntimeError)rF   r   r   rg   llm_positionsllm_pos_ids_listst
mm_featurer   audio_feature_length	audio_lentext_lenst_idxtext_positionsaudio_positionsfinal_text_positionsmrope_position_deltar:   r:   r>   get_mrope_input_positions  sX   



z:Qwen3ASRForConditionalGeneration.get_mrope_input_positionsc                 C   s   t jddgdS )z<
        Get the module prefix in multimodal models
        r   zaudio_tower.)r   tower_model)r   from_string_fieldrE   r:   r:   r>   get_mm_mapping  s   z/Qwen3ASRForConditionalGeneration.get_mm_mappingr   	task_typec                 C   s   t |}|j}t|j|jdS )N)max_audio_clip_ssample_rate)r3   rO   r
   rm   rX   )r   r   r  rN   rO   r:   r:   r>   get_speech_to_text_config  s   z:Qwen3ASRForConditionalGeneration.get_speech_to_text_configrS   
stt_configlanguage
transcribe	translaterequest_promptto_languagec                 C   s~   t |}| dd}	|dvrtd| d| j||}
|du r(d|	 d}n
d|	 d	|
 t }||}t|d|id
S )z@Get the generation prompt to be used for transcription requests.rS   r   r  zUnsupported task_type 'z9'. Supported task types are 'transcribe' and 'translate'.Nz<|im_start|>user
z!<|im_end|>
<|im_start|>assistant
z*<|im_end|>
<|im_start|>assistant
language )prompt_token_idsmulti_modal_data)r0   r   r   supported_languagesrc   _ASR_TEXT_TAGencoder   )r   rS   r   r  r  r  r  r  r   audio_placeholderfull_lang_name_topromptr  r:   r:   r>   get_generation_prompt  s*   


z6Qwen3ASRForConditionalGeneration.get_generation_prompttextc                 C   s(   |sdS t |vr
|S |t d\}}|S )z
        Post-process Qwen3-ASR raw output to extract clean transcription.

        The model outputs in format: "language {lang}<asr_text>{transcription}"
        This method strips the language prefix and asr_text tags.
        r   r7   )r  rsplit)r   r!  _	text_partr:   r:   r>   post_process_output:  s   z4Qwen3ASRForConditionalGeneration.post_process_output)NNrA   )/r[   r\   r]   r   r  r   r   classmethodr_   r`   r   r   r   r^   r   r   rz   r   listrx   r   r   r   r   boolr   r/   r   r   r   r   setr   r#   r	  r   r  r	   r
   r  npndarrayr   r   r   r%  r   r:   r:   r   r>   r     s    





$	
@	
	&r   )^__doc__collections.abcr   r   r   typingr   r   numpyr*  rx   torch.nnnn%transformers.feature_extraction_utilsr   transformers.models.whisperr   vllm.configr	   r
   r   vllm.config.multimodalr   vllm.inputs.datar   r   vllm.loggerr   %vllm.model_executor.models.interfacesr   r   r   r   r   )vllm.model_executor.models.module_mappingr    vllm.model_executor.models.qwen3r   1vllm.model_executor.models.qwen3_omni_moe_thinkerr   r   r    vllm.model_executor.models.utilsr   r   r   r   "vllm.model_executor.models.whisperr   vllm.multimodalr   vllm.multimodal.inputsr    r!   r"   r#   r$   r%   vllm.multimodal.parser&   r'   r(   r)   r*   vllm.multimodal.processingr+   r,   r-   r.   vllm.sequencer/   vllm.tokenizersr0   )vllm.transformers_utils.configs.qwen3_asrr1   r2   !vllm.transformers_utils.processorr3   ,vllm.transformers_utils.processors.qwen3_asrr4   r[   loggerr  r   r?   r@   ra   r_   r}   rW   r   register_processorModuler   r:   r:   r:   r>   <module>   sb    	&
;

