o
    ٷi:                    @   sB  d dl mZmZmZ d dlZd dlZd dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlm Z  d dl!m"Z" d dl#m$Z$m%Z%m&Z& d dl'm(Z( d dl)m*Z* erd dl+m,Z, nede- dZ.ede- dZ/ee0Z1G dd de$Z2dS )    )TYPE_CHECKINGAnycastN)CUDAGraphWrapper)CUDAGraphMode)get_pp_group)set_forward_context)init_logger)MRotaryEmbedding)supports_mrope)VllmModelForPooling)SamplingType)
instrument)
LazyLoader)cdiv)EagleProposer)CachedRequestState)GPUModelRunnerIntermediateTensorsPerLayerAttnMetadata)maybe_create_ubatch_slices)
OmniOutput)SchedulerOutputxgrxgrammarxgr_torch_compilez:xgrammar.kernels.apply_token_bitmask_inplace_torch_compilec                       sr  e Zd Z fddZ fddZedddR fd	d
ZdefddZdS fddZ	dTddZ
dTddZe dejeej B eB defddZe 											dUdededB deded ed!ed"ed#ed$ed%ed&ed'edB deejejf fd(d)ZdTd*d+Zdee fd,d-Zdeeeef  fd.d/Zdefd0d1Zdejd2ed3ejddddf
d4d5Zd3ejdee ef fd6d7Z!d8e d9eddfd:d;Z"dTd<d=Z#dedB d>edB d?e dedB fd@dAZ$	dVdddBedCe%dB fdDdEZ&dFee  dGejddfdHdIZ'				dWdJejdB dKejdB dCe%dB dGejdB dLee e(f f
 fdMdNZ)d?e dOedB ddfdPdQZ*  Z+S )XOmniGPUModelRunnerc                    s(   t  j|i | d | _d | _d | _d S N)super__init__$_omni_per_req_additional_information_omni_num_scheduled_tokens_np_omni_last_model_output)selfargskwargs	__class__ U/home/ubuntu/.local/lib/python3.10/site-packages/vllm_omni/worker/gpu_model_runner.pyr   &   s   
zOmniGPUModelRunner.__init__c           	         s   t  || | jD ];}|D ]6}|jD ]0}t|dd}t|dd}|durC|dkrC| jj| d }|jd |k rCtj	||j
|jd|_qqq
dS )a  Override to fix scheduler_metadata buffer size for FA3 + CUDA graph.

        The upstream FlashAttentionMetadataBuilder pre-allocates
        scheduler_metadata with (max_num_seqs + 1) entries, but FA3's
        get_scheduler_metadata() can return up to
        (max_num_seqs * max_num_splits + 1) entries, causing a RuntimeError
        during CUDA graph capture.  After calling the parent implementation
        we resize any too-small buffers.
        scheduler_metadataNmax_num_splitsr      )dtypedevice)r   initialize_metadata_buildersattn_groupsmetadata_buildersgetattrscheduler_configmax_num_seqsshapetorchzerosr-   r.   r*   )	r#   kv_cache_configkernel_block_sizeskv_cache_group
attn_groupbuildersmr+   requiredr&   r(   r)   r/   ,   s$   


z/OmniGPUModelRunner.initialize_metadata_builderszLoading (GPU))	span_namereturnNc                    s   t  j|i | t| jdd }|d ur~|| _| jj}|d us!J t| jdd d u}| r:|r:t|| j	t
jd| _tt| jddpHt| jjd}t| j| jj}| j|tjd| _| j||| jdd	| _| j||| jdd	| _| j||| jdd	| _d S d S )
N
talker_mtptalker)runtime_modemtp_hidden_sizer   hidden_sizer-   F)r-   numpy)r   
load_modelr2   modelrA   compilation_configcudagraph_modehas_full_cudagraphsr   vllm_configr   FULLintmodel_confighf_text_configmaxmax_num_reqsmax_cudagraph_capture_size_make_bufferr6   int32talker_mtp_input_idsr-   talker_mtp_inputs_embedslast_talker_hidden	text_step)r#   r$   r%   rA   rK   has_separate_talkerrE   max_batch_sizer&   r(   r)   rH   F   s(   
zOmniGPUModelRunner.load_model	req_statec              
   C   s4  g }g }g }g }d}|j D ]Y}|j}|du rq| }	|	d }
dur+||
  |	d }
dur;||
  |	d }
durI||
 |	d }
durW||
 |	d}|durft| }qt| 	 r| j
j|j|j | jj|||||d\|_|_dS tj|j| jj|||||d	\|_|_dS )
a  Initialize M-RoPE positions for multimodal inputs.

        Extracts multimodal feature metadata (image grids, video grids,
        audio features) and computes M-RoPE positions for proper positional
        encoding of multimodal tokens.

        Args:
            req_state: Cached request state containing multimodal features

        Raises:
            AssertionError: If the model does not support M-RoPE
        FNimage_grid_thwvideo_grid_thwsecond_per_grid_tsaudio_feature_lengthsuse_audio_in_video)mm_features	hf_configr^   r_   r`   ra   rb   )rd   r^   r_   r`   ra   rb   )rc   dataget_datagetappendtolistboolitemr   	get_modelrI   get_mrope_input_positionsprompt_token_idsrP   rd   mrope_positionsmrope_position_deltar
   get_input_positions_tensor)r#   r]   r^   r_   r`   ra   rb   
mm_featuremm_itemmm_inputtuse_audio_in_video_valuer(   r(   r)   _init_mrope_positionsc   sT   



z(OmniGPUModelRunner._init_mrope_positionsscheduler_outputr   c                    s.   t  | t|  ddsdS | | dS )a@  Calculate M-RoPE positions for scheduled tokens.

        Delegates to the upstream implementation first, then applies a fixup
        pass for models that pre-compute 2D spatial decode positions (e.g.
        GLM-Image).  This avoids duplicating the full upstream method while
        still supporting non-linear decode position patterns.

        Models opt-in by declaring ``precomputed_mrope_decode = True`` as a
        class attribute.  When set, ``get_mrope_input_positions`` is expected
        to return positions covering **both** prefill and decode tokens.
        precomputed_mrope_decodeFN)r   _calc_mrope_positionsr2   rl   )_fixup_precomputed_mrope_decode_positions)r#   rx   r&   r(   r)   rz      s   z(OmniGPUModelRunner._calc_mrope_positionsc                 C   s  ddl m} d}t| jjD ]r\}}| j| }|jdusJ | jj| }|j| }||j	|j
}	|| |	krEtd|	| }
td||
 }n|}
d}||
7 }|dkr|}||
 }|| }|jjd }||kr||jdd||f | jjdd||| f< ||7 }qdS )a  Overwrite linear decode M-RoPE positions with pre-computed ones.

        For image-generation models (like GLM-Image) that output tokens in 2D
        grid order, ``get_mrope_input_positions`` returns positions for the
        full sequence (prefill + decode).  The upstream runner only uses the
        prefill portion and falls back to linear increments for decode.  This
        method patches the decode slice with the correct pre-computed values.
        r   )&length_from_prompt_token_ids_or_embedsNr,   )
vllm.utilsr|   	enumerateinput_batchreq_idsrequestsro   num_computed_tokens_cpunum_scheduled_tokensrn   prompt_embedsrR   r5   cpu)r#   rx   r|   mrope_pos_ptrindexreq_idreqnum_computed_tokensr   num_prompt_tokensprompt_part_lencompletion_part_len	dst_startdecode_start
decode_endtotal_precomputedr(   r(   r)   r{      s4   	

z<OmniGPUModelRunner._fixup_precomputed_mrope_decode_positionsc           2      C   s|  |j D ]}| j|d | j|d q|j D ]}| j| q|jD ]	}| j|d q#|j	 }| jj
	 }|jj}|||  }|D ]}| j| qDg }|jD ]w}	|	j}|	j}
|	j}|
rt|
jtjkrttj| jd}||
j nd}| jr|dusJ |j}|dusJ dtt|  }|j|}| | t!||	j"|	j#|	j$|
|||	j%|	j&g |	j'd}|| j|< z>t(|	dddur|	j#}t(t)|j*}t)j+|j,|d}|-|j.}t/|}t0| j| d| z||	_#W n	 t1y   Y nw W n t1y } zt23d|  W Y d}~nd}~ww zit(|	d	ddurx|	j4}i }t5|t6r(|}nDd
dl7m8} t5||rl|j9: D ]2\}}|j;duret)*t(|dd}t)j+|j;|d}|-|j<}t/|= ||< q9|j>||< q9|rxt0| j| d| W n t1y } zt23d|  W Y d}~nd}~ww |
r|
j?dur|
j?dkr| jj@n|
j?| j|< | jAr| B| | jCd
kr| D| |E| j|  qRtF jG}|j}|jH}| I }tJ|jKD ]:\} }| j| }|j&|  }!|jL|  }"||jv }#|jM|  }$| jj
N|}%|jOr;| jPr;|%du rd
|_On'| jjQdusJ | jjQ| }&||& d }'|jO|' }(|!|(8 }!|jRSdg|'  |!|_&|sn|jT|  })|!tU|) |jV }*|*dkr]|jRE|)d  n2|*d
krm|jRS|)|* d  n!|$tU|jRk r|jR|$d= |%dur| jjW|% |$ }+|+| jjX|%< |#s|"durtY|j%|"D ]
\},}-|,S|- qn|%du sJ |"dusJ |"|_%| jj
N|}%|%du r| jPr|$d
kr|jZ| }.|.|$ d |_R|E| q|!| jj[|%< |"dur| jj\]|"|% |s|!}/|!tU|) }0|)| jj^|%|/|0f< |0| jjX|%< | j_|| q|D ]}1| j`|1 | j_|1| q| ja  | b| | jc  dS )ab  Update the cached states and the persistent batch with the scheduler
        output.

        The updated states are used by the `_prepare_inputs` function to create
        the input GPU tensors for the model.

        The SamplingMetadata is updated and copied to the GPU if there is a
        new/resumed/paused/finished request in the batch.
        N)r.   z!You did not set `task` in the API)r   rn   r   rc   sampling_paramspooling_params	generator	block_idsr   output_token_idslora_requestr   rF   prompt_embeds_cpuzError decoding prompt embeds: additional_informationr   )AdditionalInformationPayloadtensor_dtypefloat32additional_information_cpuz'Error decoding additional information: r,   )dfinished_req_idsr   popnum_prompt_logprobsr   remove_requestfree_encoder_mm_hashesencoder_cacher   keysreq_id_to_indexscheduled_cached_reqsresumed_req_idsscheduled_new_reqsr   r   r   sampling_typer   RANDOM_SEEDr6   	Generatorr.   manual_seedseedis_pooling_modeltaskr   r   rl   poolerget_pooling_updatesapplyr   rn   r   rc   r   r   r   r2   npr-   
frombufferre   reshaper5   
from_numpysetattr	Exceptionloggererrorr   
isinstancedictvllm_omni.enginer   entriesitemstensor_datatensor_shapecopy	list_dataprompt_logprobs
vocab_size
uses_mroperw   uses_xdrope_dim_init_xdrope_positionsrh   r   is_last_rankscheduled_spec_decode_tokens_get_valid_sampled_token_countr~   r   new_block_idsnum_output_tokensrg   prev_num_draft_lenuse_async_schedulingprev_req_id_to_indexr   extendnew_token_idslen
num_tokensr   num_tokens_no_speczipall_token_idsr   block_table
append_rowtoken_ids_cpuupdate_req_spec_token_idsadd_requestcondense_may_reorder_batchrefresh_metadata)2r#   rx   r   mm_hashscheduled_req_idscached_req_idsr   unscheduled_req_idsreqs_to_addnew_req_datar   r   r   r   rI   	to_updater]   payloadr-   arrpe_cpuepayload_info	info_dictr   kentrydtr   req_datascheduled_spec_tokensvalid_sampled_token_countir   r   resumed_from_preemptionr   	req_indexprev_req_indexnum_acceptednum_rejectedr   num_new_tokensend_idxr   new_idsresumed_token_idsstart_token_indexend_token_indexrequestr(   r(   r)   _update_states   sD  



























z!OmniGPUModelRunner._update_stateshidden_statesc                 C   s   t | jdr| jjrt|tr|j}|j}||fS t|tjr'|}i }||fS t|t	s1t|t
r;|d }i }||fS tdt| )Nhave_multimodal_outputsr   zInvalid hidden states type: )hasattrrI   r  r   r   text_hidden_statesmultimodal_outputsr6   Tensorlisttuple
ValueErrortype)r#   r  r  r  r(   r(   r)   extract_multimodal_outputs  s$   

z-OmniGPUModelRunner.extract_multimodal_outputsFTr   r   cudagraph_runtime_modeforce_attentionuniform_decodeallow_microbatching	skip_eplb
is_profilecreate_mixed_batchremove_lorais_graph_capturingnum_active_lorasactivate_lorac           3      C   s  |du r|dk}| j jj}|r|jrtg tg fS |du s&| s&J |r+| jn|}|| jj	ks5J | jj
}|r[|r?J t|d |d }|| }|d }dg| |g }|}n8|r{|raJ t|t||}|g| }|| dkrz|| |d< nt||}|| }|g| }|d  || 7  < t||ksJ t||ksJ tj|tjd}t| }tj|tjd}| j||||d||p|tjk|||d
\}}}}}|du r|}n||ksJ d	| d
| d|j}|jdur|jn|}t||||| j jj\}} td||  d}!| j|||| d\}"}#|s|tjkrq|r-dg| |d g }$n|}$|$| j jd|< d| j j|d< | j !  | "|\}%}|%| j#jd|d < | j#!  |tjk}&| j$||||&ri| n||
|"d\}!}| %| j&||||	 || j'ksJ | ( }'| j)r| jj*s| +|\}(})i |'| ,|}'n9| j-rd}(| j.j/d| })| ( }'n&t0t0| ddddr| j1j/d| }(| j.j/d| })n
| j1j/d| }(d})| j2r| j3j/ddd|f }*n| j4dkr | j5j/ddd|f }*n| j6j/d| }*t7 j8rd}+n| j9du r$| j:j;| j'| jj<| j=d| _9| >|dd}+| dur@| d j}|dur@||dd< | ?|(|)w t@|!| j ||||| |#dY t0| j:dddurtA| j:dr|},|,| j'kru| jBj/jCd },| D| jBj/d|, | jEj/d|, | jFj/d|, | jGj/d|, }-d| jH_I| j:d|(|*|+|)d|'}-W d   n	1 sw   Y  W d   n	1 sw   Y  | jJr|-\}.}n|-}.| K|.\}.}/| jLr| jLM rtN| jOtPsJ |
r|tjQkp|
 o|tjko| jLjR }0| jHjSr|rd}0| jOjT||0|
|#d W d   n	1 s"w   Y  | U  |s5| jVd|d tW|d }1tX|1jY| j=dd}2|.|.|2 fS )a=  
        Run a dummy forward pass to warm up/profile run or capture the
        CUDA graph for the model.

        Args:
            num_tokens: Number of tokens to run the dummy forward pass.
            cudagraph_runtime_mode: used to control the behavior.
                - if not set will determine the cudagraph mode based on using
                    the self.cudagraph_dispatcher.
                - CUDAGraphMode.NONE: No cudagraph, for warm up and profile run
                - CUDAGraphMode.PIECEWISE: Piecewise cudagraph.
                - CUDAGraphMode.FULL: Full cudagraph, attention metadata is
                    needed.
            force_attention: If True, always create attention metadata. Used to
                warm up attention backend when mode is NONE.
            uniform_decode: If True, the batch is a uniform decode batch.
            skip_eplb: If True, skip EPLB state update.
            is_profile: If True, this is a profile run.
            create_mixed_batch: If True, create a mixed batch with both decode
                (1 token) and prefill (multiple tokens) requests.
            remove_lora: If False, dummy LoRAs are not destroyed after the run
            num_active_loras: Number of active LoRAs to capture for.
            activate_lora: Backward-compatible override for LoRA activation.
        Nr   r,      r   rF   F)
r   num_reqsnum_scheduled_tokens_npmax_num_scheduled_tokensuse_cascade_attnr  force_eagerforce_uniform_decodeforce_has_loraforce_num_active_lorasz7Cudagraph runtime mode mismatch in dummy_run. Expected z
, but got .z+ubatch_slices: %s, ubatch_slices_padded: %s)num_tokens_paddednum_reqs_paddednum_tokens_unpaddedubatch_slices)r   r  max_query_lenr%  for_cudagraph_captureslot_mappingsrI   has_preprocess)
batch_sizer-   r.   )r   num_tokens_across_dpr  batch_descriptorr%  slot_mappingrB   rA   	input_ids	positionsintermediate_tensorsinputs_embeds)use_cudagraphsr  r(  T)is_dummyr  )non_blockingr(   )ZrM   rP   multimodal_configmm_encoder_onlyr6   tensorvalid_runtime_modesuniform_decode_query_lenr3   max_num_batched_tokensr4   minr   sumr   r   arrayrV   rO   ones&_determine_batch_execution_and_paddingr   NONEr   r  r   parallel_confignum_ubatchesr   debug_get_slot_mappingsrN   seq_lenscopy_to_gpu_get_cumsum_and_arangequery_start_loc_build_attention_metadatamaybe_dummy_run_with_loralora_configmax_num_tokens_init_model_kwargssupports_mm_inputsis_encoder_decoder_prepare_mm_inputs_dummy_mm_kwargsenable_prompt_embedsr2  gpur2   r/  r   ro   r   xdrope_positionsr0  r   is_first_rankr1  rI   make_empty_intermediate_tensorsr-   r.   #sync_and_slice_intermediate_tensorsmaybe_randomize_inputsr   r  rW   r5   rA   rX   rY   rZ   rJ   	cache_diruse_aux_hidden_state_outputsr  speculative_config	use_eagler   drafterr   	PIECEWISEenforce_eagercudagraph_specialize_lora	dummy_run_register_layerwise_nvtx_hooks	eplb_stepcumsumr   to)3r#   r   r  r  r  r  r  r  r  r  r  r  r  	mm_configr&  rS   num_decode_tokensnum_prefill_tokensr  num_scheduled_tokens_listmin_tokens_per_reqr   r$  num_sampled_tokens_cudagraph_mode
batch_descshould_ubatchr+  _r"  r#  r%  ubatch_slices_paddedattn_metadataslot_mappings_by_groupr(  rF  cum_num_tokenspad_attnmodel_kwargsr/  r2  r0  r1  num_tokens_padded_talker_mtpoutputsr  r  r3  logit_indiceslogit_indices_devicer(   r(   r)   
_dummy_run  s  (






	



	





"  
		zOmniGPUModelRunner._dummy_runc              
   C   s  zt |dg }|sW dS |D ]}t |ddpt |dd}|du r!qt |dd}d}|durmt|tjr=| d }n0t |dd}t |dd}|durm|durmtt |d	d
}	tj	||	d}
|

|}
t|
 }|dur|| jv rt| j| d| t |dd}|duri }t|tr|}nIt |dd}t|tr| D ]9\}}t |dd}|durtt |dd
}	tj	||	d}
|

t |dd}
t|
 ||< qt |dd||< q|r|| jv rt| j| d| qW dS  ty } ztd|  W Y d}~dS d}~ww )zDecode per-request prompt_embeds and additional_information for newly
        scheduled requests and store them to CPU in the request state.
        This version avoids hard dependency on payload classes by duck-typing.r   Nr   
request_idr   r   re   r5   r-   r   rF   r   r   r   r   r   r   r(   r   r   z7Error decoding prompt_embeds / additional_information: )r2   r   r6   r  detachrf  
contiguousr   r-   r   r   r   r   r   r   r   r   r   r   r   )r#   rx   new_reqsnrr   
payload_per   re   r5   r   r   r   r   r   r   r   r   r   r(   r(   r)   "_decode_and_store_request_payloadsL  sZ   


+z5OmniGPUModelRunner._decode_and_store_request_payloadsc                 C   s   g }| j jD ]?}| j|}|durt|ddnd}|r@t|tr@|| d|v r?|d }t|dr?t	
d| d|j  q|i  q|S )zQGather per-request additional_information stored in request state in batch order.Nr   thinker_reply_part_per_requestr5   z[OMNI] req=z1 has thinker_reply_part_per_request queue shape: )r   r   r   rg   r2   r   r   rh   r  r   rD  r5   )r#   per_req_runtime_infor   r]   infoqr(   r(   r)   &_gather_runtime_additional_information  s   

z9OmniGPUModelRunner._gather_runtime_additional_informationc                 C   sL   g }t t| jjD ]}t| jj| }t|| }|||| f q
|S )zUCompute (start, end) token spans for each request within the flattened step sequence.)ranger   r   r   rO   rI  r   rh   )r#   r  req_token_spansr   start_offsetsched_tokensr(   r(   r)   _compute_request_token_spans  s   z/OmniGPUModelRunner._compute_request_token_spansc              
   C   s^   i }z	|   |d< W |S  ty. } ztd|  ddl}|  W Y d}~|S d}~ww )zBuild extra keyword arguments passed to the model for this step, including:
        - runtime_additional_information: per-request additional information stored in request state
        runtime_additional_informationz0[OMNI DEBUG] Error building model_kwargs_extra: r   N)r  r   r   r   	traceback	print_exc)r#   model_kwargs_extrar   r  r(   r(   r)   _build_model_kwargs_extra  s   z,OmniGPUModelRunner._build_model_kwargs_extrar  r  c                 C   s  z[t | jdrS| jjrVt| jjD ]G\}}| j|}|dur%t|ddnd}t	| j
j| }	t	|| }
|	|	|
 }}||| }| jj|fi |}| || qW dS W dS W dS  ty } ztd| jj d| d|  ddl}|  W Y d}~dS d}~ww )z_Process model-provided per-request additional_information updates and merge into request state.has_postprocessNr   zError merging for requests:z  additional information update: z!, with the multimodal_outputs as r   )r  rI   r  r~   r   r   r   rg   r2   rO   rI  r   postprocess$_merge_additional_information_updater   r   r   r  r  )r#   r  r  r  rx   r   r   r]   	req_infosr  r  sr   hidden_states_sliceupdate_dictr  r(   r(   r)   '_process_additional_information_updates  s4   z:OmniGPUModelRunner._process_additional_information_updatesc                 C   s   t | jjD ]_\}}| j| }t|dd}t| jj| }t|j}t	d|| }t|| }	t
|	|}
|
dkr9q|
dkre|dure||||
  j| j| jdd}t| jj| }| j|||
  | qdS )zOverlay per-request prompt_embeds for the prefill portion and collect
        additional_information slices for this step. Returns a map req_id -> dict.r   Nr   T)r-   r.   r5  )r~   r   r   r   r2   rO   r   r   rn   rR   r<  rf  r-   r.   rI  r   r2  copy_)r#   r  r   r   r]   r   r   
prompt_lenprompt_remainingr  overlay_lensrcr  r(   r(   r)   +_collect_additional_information_for_prefill  s$   



z>OmniGPUModelRunner._collect_additional_information_for_prefillr|  r   c                 C   sL   | j |}|du rdS t|dd}t|tr"|dur$|| dS dS dS )zBUpdate per-request additional_information stored in request state.Nr   )r   rg   r2   r   r   update)r#   r|  r   r]   r   r(   r(   r)   _update_request_information  s   z.OmniGPUModelRunner._update_request_informationc                 C   s~   |j D ]}t|dd }t|tr| |j| qt|jdr9t|jdi }t|tr;| D ]\}}| || q.d S d S d S )Nr   )	r   r2   r   r   r  r   r  r   r   )r#   rx   new_reqr   cached_infosr   r  r(   r(   r)   _update_additional_information  s   


z1OmniGPUModelRunner._update_additional_informationr  r   c                 C   s^   |du s| j jjdkr|S t|trt|ni }t|dd}|r)|ds)||d< ||d< |S )zAttach MiMoAudio-specific fields into req_infos if applicable.

        This helper is intentionally small and self-contained so that it can be
        unit-tested to prevent regressions when updating MiMoAudio handling.
        N!MiMoAudioForConditionalGenerationrc   r   )rI   r'   __name__r   r   r2   rg   )r#   r]   r  r   rc   r(   r(   r)   "_maybe_attach_mimo_audio_req_infos  s   z5OmniGPUModelRunner._maybe_attach_mimo_audio_req_infosnum_input_tokensr1  c           '         s   j }t j}| jj}d}| jre|re|se| j | jd}|   | 	 \}}	W d   n1 s3w   Y  | j
j| jjd| ||	d}
| jjd| |
 | |\}}i |  |  }nl| jr|r| jjd| jddd}| dkr| jj| }| j
j|d}|| jj|< | jjd| }|  }| jjd| }n*t| j
d	dr| jjd| }| jjd| }|  }n| jjd| }d}|  }| jr| jjddd|f }n| jdkr| jjddd|f }n| jjd| }|rd}n|dusJ | ||d
}|r" jr"|  }| d|i | j!j"}t#j$ fdd|D t#j%d}|| _&|durB| '| t(| j
d	rY| j
j)rYg }| j*jj+r[| ,  t-| j!j"D ]\}}| j./|}|durvt|ddnd}| 0|||}t1| j2j3| }t1|| }||| }}t1|t1| }|dur||| nd}| j
j4d||| |d|\} }!}"|du rt5j6|j7d |!j7d f|!j8|!j9d}t(| j
dr|dkr|":d\}#}$t;t<|t<|d }%| j=j|% |  | j>j|% |! | j?j|% |# | j@j|% |$ |A| | B||" tC||!j7d }&|!d|& ||||& < tD| t5jErJ|  |&krJ| ||||& < qat(| j
drY| F|| ||||||fS )zIAlign with v0.14.0 preprocess and omni's additional information handling.N)r   )multimodal_embeddingsis_multimodalF)as_tupler,   r   )r/  r)  Tencoder_outputsc                    s   g | ]} j | qS r(   )r   ).0ridrx   r(   r)   
<listcomp>w  s    z2OmniGPUModelRunner._preprocess.<locals>.<listcomp>rF   r   )r/  input_embedsr   )r.   r-   rA   
mtp_inputsr(   )Gtotal_num_scheduled_tokensr   rV  rP   rP  rO  maybe_get_ec_connector_outputr   _execute_mm_encoder_gather_mm_embeddingsrI   embed_input_idsr/  rT  r2  r  rQ  rN  _extract_mm_kwargsrS  is_token_idsnonzerosqueezenumelr2   r   ro   r   rU  r0  rX  scheduled_encoder_inputsr  r   r   r   r>  rV   r!   r  r  r)  rM   async_chunkr  r~   r   rg   r  rO   rI  r   
preprocessr6   emptyr5   r.   r-   r   slicer   rW   rX   rY   rZ   rh   r  r<  r   r  _talker_mtp_forward)'r#   rx   r  r1  r   rV  rP  ec_connector_output	mm_embedsis_mm_embedinputs_embeds_scheduledr/  r2  rv  token_ids_idx	token_idstokens_to_embedsr0  r  r   r  decode_req_idsr   r   r]   r  r  r  r  r   span_lenembed_slicereq_input_ids
req_embedsr  rY   rZ   decode_sliceseg_lenr(   r  r)   _preprocess  s   










zOmniGPUModelRunner._preprocessr  r2  c                 C   sl  t |}|dkr
d S | j||tj|tjdddd\}}}}}t| jts(tj	}|j
}| jjd | }| jjd | }	| jjd | }
| jjd | }td | j||d | ||	|
|\}	}W d    n1 siw   Y  | d }t| jdd	}t|D ]1\}}| jj|}t| jj| }|	||d  |||d < ||||d  i}| || qd S )
Nr   rF   r,   F)r   r  r  r  r  )r  r,  r   talker_mtp_output_keycode_predictor_codes)r   r@  r   r?  rV   r   rA   r   r   rA  r   rW   rT  rX   rY   rZ   r   rM   r}  rf  r~  r2   rI   r~   r   r   r   rO   rI  r   r  )r#   r  r2  decode_batch_sizerm  rn  rp  r"  r  r  rY   rZ   r  code_predictor_codes_cpuout_keyidxr   r   r  r  r(   r(   r)   r    s>   
z&OmniGPUModelRunner._talker_mtp_forwardr/  r0  rv  c                    s\   |   }t jd||||d||}t|ts)t| jdr)| jj|fi |}|| _|S )z?Inject omni-specific kwargs into forward and cache model outputr.  make_omni_outputNr(   )	r  r   _model_forwardr   r   r  rI   r  r"   )r#   r/  r0  r1  r2  rv  r  model_outputr&   r(   r)   r    s   	z!OmniGPUModelRunner._model_forwardupdc                 C   s   t |tsd S | j|}|d u rd S t|di }t |ts i }t|}| D ])\}}t |tjr>| 	d
 ||< q(t |trMdd |D ||< q(|||< q(t|d| d S )Nr   r   c                 S   s.   g | ]}t |tjr| d  n|qS )r   )r   r6   r  r}  rf  r~  )r  rk   r(   r(   r)   r    s    "zKOmniGPUModelRunner._merge_additional_information_update.<locals>.<listcomp>)r   r   r   rg   r2   r   r6   r  r}  rf  r~  r  r   )r#   r   r  r]   existingmergedr   vr(   r(   r)   r    s$   



z7OmniGPUModelRunner._merge_additional_information_update)r@   N)rx   r   )rx   r   r@   N)NFFTFFFTFr   Nr   )NNNN),r  
__module____qualname__r   r/   r   rH   r   rw   rz   r{   r  r6   inference_moder  r  r   r   r  rO   r   rj   r	  r{  r  r  r  r  objectr   ndarrayr  strr  r  r  r  r   r  r  r   r  r  __classcell__r(   r(   r&   r)   r   %   s    <

,  $	
  
I6	
"





 1#
"r   )3typingr   r   r   rG   r   r6   vllm.compilation.cuda_graphr   vllm.configr   vllm.distributed.parallel_stater   vllm.forward_contextr   vllm.loggerr	   +vllm.model_executor.layers.rotary_embeddingr
   %vllm.model_executor.models.interfacesr   *vllm.model_executor.models.interfaces_baser   vllm.sampling_paramsr   vllm.tracingr   vllm.utils.import_utilsr   vllm.utils.math_utilsr   vllm.v1.spec_decode.eagler   vllm.v1.worker.gpu_input_batchr   vllm.v1.worker.gpu_model_runnerr   r   r   vllm.v1.worker.ubatch_utilsr   0vllm_omni.model_executor.models.output_templatesr   vllm.v1.core.sched.outputr   globalsr   r   r  r   r   r(   r(   r(   r)   <module>   s<    