o
    پi^                     @   s  U d Z ddlZddlZddlZddlmZ ddlmZ ddlm	Z	 ddl
mZmZmZmZmZmZmZ ddlZddlZddlmZ ddlmZ dd	lmZ dd
lmZmZmZmZ ddlm Z m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z' ddl(m)Z)m*Z*m+Z+ ddl,m-Z- e* Z.ed Z/da0eej1 e2d< da3dZ4dZ5da6dd Z7dd Z8dd Z9dej1deej1 fddZ:G dd dej1Z;G dd  d Z<G d!d" d"e<Z=G d#d$ d$e<Z>da?ee! e2d%< d~d&e@fd'd(ZAd)ej1d*e@d+e@d,eee@e@f  deej1e@e@f f
d-d.ZBd/ee d0ee@ d1ee@ d2eeee@e@f   deej1 f
d3d4ZCeee gej1e%B f ZDd5ed6 d*e@d+e@d,eee@e@f  ded6 f
d7d8ZEd9eDd:ee d;ee@ d0ee@ d1ee@ d2eeee@e@f   d<ej1deFej1dB ej1f fd=d>ZGd)ej1d*e@d+e@d,eee@e@f  deeej1 e@e@f f
d?d@ZHd9eed6 gej1f d:ed6 d;ee@ d0ee@ d1ee@ d2eeee@e@f   deej1 fdAdBZId<ej1dCej1dej1fdDdEZJd)ej1dFej1dej1fdGdHZKd9eDd:ee dCej1d<ej1d;ee@ d0ee@ d1ee@ d2eeee@e@f   deej1dB ej1dB ej1f fdIdJZLdddi fdKee dLee@ dMee@ d<ej1dNejMdOejNdPeeeDf dQeOeee@ f dReeePf deej1 fdSdTZQdddi fd<ej1dUe#dVejNdOeejN dWeeeDf dQeeOeee@ f  dReeePf dej1fdXdYZRd<ej1dZee@ d[eee@e@f  dej1fd\d]ZSde@fd^d_ZTde@fd`daZUdbdc ZVddej1dee@dej1fdfdgZWdhdi ZXdjdk ZYdleOdme@dne@doe@dpe@dqee@ fdrdsZZdtdu Z[G dvdw dwZ\dxdy Z]dzd{ Z^d|d} Z_dS )z
Multi-modality utils
    N)abstractmethod)defaultdict)shared_memory)AnyCallableDictListLiteralOptionalTuple)nn)envs)gpu_tensor_hash)CudaIpcTensorTransportProxyModalityMultimodalDataItemMultimodalInputs)EmbeddingResultMultiModalStaticCache)ForwardBatch)EVSEmbeddingResult)get_global_server_args)flatten_nested_listis_npuprint_warning_once)logger)cuda_ipcautodefault_GPU_FEATURE_BUFFERc              
   C   s   | dkst j dkstd urd S z$t j }t|d d d }tj|tj| dat	d| d W d S  t
yJ } z	d aW Y d }~d S d }~ww )Ncpur   i      dtypedevicezPreallocated zMB GPU buffer)r   SGLANG_MM_BUFFER_SIZE_MBgetr   inttorchemptyfloat32r   infoRuntimeError)r$   size_mbnum_elementse r0   P/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/managers/mm_utils.pyinit_feature_buffer5   s   
r2   c                   C   s   da d S Nr   )_BUFFER_OFFSETr0   r0   r0   r1   reset_buffer_offsetH   s   r5   c                   C   s   t d u rdS dS )NFT)r   r0   r0   r0   r1   is_feature_buffer_initializedM   s   r6   tensorreturnc                 C   sb   t d u r| S |  }t| t  kr/t tt|  }|j|  dd || j}t|7 a|S | S )NTnon_blocking)r   numelr4   copy_flattenviewshape)r7   tensor_sizebuffer_viewresultr0   r0   r1   try_add_to_bufferT   s   rC   c                   @   s   e Zd ZdZe			ddejdee dee	ee
f  defdd	Zd
d Zde	ee
f fddZedee fddZede	ee
f fddZedefddZdS )TransportProxyTensorz
    A convenient torch.Tensor subclass that carries extra metadata and supports
    efficient inter-process communications
    Nr   datanamefieldstransport_modec                 O   sF   t |tjstdt| || }||d ur|ni |d|_|S )Nz-Input 'data' must be a torch.Tensor, but got )rF   rG   rH   )
isinstancer(   Tensor	TypeErrortypeas_subclass	_metadata)clsrE   rF   rG   rH   argskwargsinstancer0   r0   r1   __new__l   s   
zTransportProxyTensor.__new__c              
   C   s   | j ddd}| j dd}|dkrY| jrYz"|  }| }|| j| j|  | jj	| 
 d|d< d|d< W |S  tyX } zd|d	 d< | tj|d< W Y d}~|S d}~ww d|d	 d< | tj|d< |S )
zM
        Called during pickling. Implements the serialization logic.
        N)metadatatensor_data	ipc_extrarH   r   r   )handler?   r#   stridedevice_indexstorage_offsetrV   rU   rT   )rN   r&   is_cudauntyped_storage_share_cuda_r?   r#   rX   r$   indexrZ   	ExceptionrM   r(   rJ   )selfstaterH   storagerW   r/   r0   r0   r1   __getstate__   s6   

	z!TransportProxyTensor.__getstate__ra   c              
   C   s8  |d | _ | j dd}|dkr|d dur|d }|d |d |d	 |d
 |d |d f\}}}}}}	z=td| }
tj|
$ tjj| }tjd||
dj||	||d}| | W d   W dS 1 sjw   Y  W dS  t	y } z
t
d| d |d}~ww |d dur| |d  dS td)zQ
        Called during unpickling. Implements the deserialization logic.
        rT   rH   r   r   rV   NrW   r?   r#   rX   rY   rZ   zcuda:r   r"   )rZ   sizerX   z3Error: Failed to deserialize from CUDA IPC handle (z).rU   z=Invalid state for TransportProxyTensor: no tensor data found.)rN   r&   r(   r$   cudaUntypedStorage_new_shared_cudar)   set_r_   printpickleUnpicklingError)r`   ra   rH   rV   rW   r?   r#   rX   source_device_indexs_offsettarget_devicerb   reconstructed_tensorr/   r0   r0   r1   __setstate__   s@   
	
&z!TransportProxyTensor.__setstate__r8   c                 C   s   | j dS )NrF   rN   r&   r`   r0   r0   r1   rF         zTransportProxyTensor.namec                 C   s   | j di S )NrG   rq   rr   r0   r0   r1   rG         zTransportProxyTensor.fieldsc                 C   s   | j ddS )NrH   r   rq   rr   r0   r0   r1   rH      rt   z#TransportProxyTensor.transport_mode)NNr   )__name__
__module____qualname____doc__staticmethodr(   rJ   r
   strr   r   TensorTransportModerS   rc   rp   propertyrF   rG   rH   r0   r0   r0   r1   rD   f   s.    $&rD   c                   @   s2   e Zd ZdZedee dedee fddZdS )MultiModalityDataPaddingPatternz
    Data tokens (like image tokens) often need special handling during padding
    to maintain model compatibility. This class provides the interface for
    implementing different padding strategies for data tokens
    	input_ids	mm_inputsr8   c                 C   s   dS )ze
        Pad the input ids sequence containing data tokens, and replace them with pad_values
        Nr0   )r`   r~   r   r0   r0   r1   pad_input_tokens   s   z0MultiModalityDataPaddingPattern.pad_input_tokensN)	ru   rv   rw   rx   r   r   r'   r   r   r0   r0   r0   r1   r}      s    r}   c                   @   s`   e Zd ZdZ	ddeeeeef   deee  ddfddZdee d	e	dee fd
dZ
dS ))MultiModalityDataPaddingPatternTokenPairsa_  In this pattern, data tokens should be enclosed by special token pairs (e.g. <image>...</image>, data_token_pairs)

    The padded value in a region enclosed by a token pair with be the same one, as the MultimodalDataItem's pad value

    This strategy should be applied when data content is marked by start/end token pairs in the input sequence.
    Ndata_token_pairsdata_start_token_idsr8   c                 C   s   || _ |pdd |D | _dS )z

        Args:
            data_start_token_ids marks the start of a single multimodal data
            See Minicpmo's slice_start_id for example
        c                 S   s   g | ]\}}|qS r0   r0   .0s_er0   r0   r1   
<listcomp>  s    zFMultiModalityDataPaddingPatternTokenPairs.__init__.<locals>.<listcomp>N)data_token_id_pairsr   )r`   r   r   r0   r0   r1   __init__   s   
z2MultiModalityDataPaddingPatternTokenPairs.__init__r~   r   c                    sj  dd |j D }| j}g |_|du r|j|jg}|du r"td |S dd |D dd |D  g }d}d	}fd
dt|D } fddt|D }	t|t|	krV|S t||	D ]B\}
}|	|||
d   ||
 | j
v r}|d7 }| j|
g7  _|t|krt|d }||
 d }|| }|	|g|  |}q[|	||d  t|t|ksJ d|S )zc
        This function will replace the data-tokens in between with pad_values accordingly
        c                 S      g | ]}|j qS r0   	pad_valuer   itemr0   r0   r1   r         zNMultiModalityDataPaddingPatternTokenPairs.pad_input_tokens.<locals>.<listcomp>NzANo data_token_pairs provided, RadixAttention might be influenced.c                 S      h | ]\}}|qS r0   r0   r   r0   r0   r1   	<setcomp>      zMMultiModalityDataPaddingPatternTokenPairs.pad_input_tokens.<locals>.<setcomp>c                 S      h | ]\}}|qS r0   r0   r   _sr/   r0   r0   r1   r     r   r   c                       g | ]
\}}| v r|qS r0   r0   r   ix)start_token_idsr0   r1   r         c                    r   r0   r0   r   )end_tokens_idsr0   r1   r     r      zLength validation fails)mm_itemsr   data_offsetsim_start_id	im_end_idr   	enumeratelenzipextendr   )r`   r~   r   
pad_valuesr   
padded_idslast_idxdata_idxstart_indicesend_indices	start_idxend_idx
num_tokensr   r0   )r   r   r1   r     sB   z:MultiModalityDataPaddingPatternTokenPairs.pad_input_tokensN)ru   rv   rw   rx   r
   r   r   r'   r   r   r   r0   r0   r0   r1   r      s"    


r   c                   @   s.   e Zd ZdZdee dedee fddZdS )/MultiModalityDataPaddingPatternMultimodalTokenszIn this pattern, data tokens should be represented as repetitions of a single token
    e.g. <image><image>....<image>, or <audio><audio>...<audio>
    r~   r   r8   c                 C   sz  |r|j s|S t|}tj ritt}|j D ]
}||j 	| qt
j|jt
j|jt
j|jt
j|ji}| D ].\}}||}	|rH|	du rIq9t|D ]\}
}||
 jD ]}|j||d |d d < qVqMq9nNi }|j D ]9}| r|jdur|j||j< qn| r|jdur|j||j< qn| r|jdur|j||j< qntd|j | D ]
\}	}||||	k< q| }|S )z
        Replaces multimodal tokens in input_ids with corresponding pad_values from mm_items.
        Each modality (image, audio, video) is handled separately based on its token_id.
        Nr   r   z$No multimodal token id provided for )r   r(   	as_tensorr   SGLANG_ENABLE_MM_SPLITTINGr&   r   listmodalityappendr   IMAGEim_token_idMULTI_IMAGESAUDIOaudio_token_idVIDEOvideo_token_iditemsr   offsetsr   is_imageis_audiois_video
ValueErrortolist)r`   r~   r   input_ids_tensoritems_by_modalityr   token_id_mapr   r   token_idr   offsettoken_to_pad_mappingr   ret_input_idsr0   r0   r1   r   >  sJ   






z@MultiModalityDataPaddingPatternMultimodalTokens.pad_input_tokensN)ru   rv   rw   rx   r   r'   r   r   r0   r0   r0   r1   r   9  s    r   embedding_cachemax_sizec                 C   s   t | ad S r   )r   r   )r   r0   r0   r1   init_mm_embedding_cachez  rs   r   	embeddingextend_prefix_lenextend_seq_lenitems_offsetc                 C   s   d\}}|}|| d }|D ]<\}}	||kr!||	kr!||| 7 }n||	kr-||	| d 7 }||kr>||	kr>||| d 7 }q||	krJ||	| d 7 }q|  d| jd } | || }
|
||fS )a  
    Extract a chunk of embeddings based on the specified prefix length, sequence length, and offset ranges.

    Args:
        embedding: The full embedding tensor to extract a chunk from
        extend_prefix_len: The starting position (prefix length) for extraction
        extend_seq_len: The number of tokens to extract
        items_offset: List of [start, end] offset ranges for multimodal items in the input sequence

    Returns:
        A tuple containing:
        - The extracted embedding chunk as a tensor
        - The start index used for extraction
        - The end index used for extraction

    Note:
        If there's no overlap between the requested range and the offset ranges,
        an empty tensor is returned with zeros for start and end indices.
    )r   r   r   r   )reshaper?   )r   r   r   r   start_index	end_indexextend_start_indexextend_end_indexstartendembedding_chunkr0   r0   r1   get_embedding_chunk  s    
r   r   prefix_lengthextend_lengthitems_offset_listc              
   C   sN  g }t | D ]x\}}|jdu r|d q|| }|||  d }g }	g }
|| D ]:\}}||kr3 n1||krE|	t|| || d  ||krc|
t|| d || d || d || d  q)tt|	}	tt|
}
||j|	|	|
   qtdd |D rtdd |D st	dt
|}|d|jd }|S dS )z
    If all items have precomputed_embeddings, return their concatenation.
    If some but not all have precomputed_embeddings, raise NotImplementedError.
    If none have precomputed_embeddings, return None.
    Nr   c                 s       | ]}|d uV  qd S r   r0   r   featurer0   r0   r1   	<genexpr>      z-_get_precomputed_embedding.<locals>.<genexpr>c                 s   r   r   r0   r   r0   r0   r1   r     r   z0MM inputs where only some items are precomputed.r   )r   precomputed_embeddingsr   minr'   npsumanyallNotImplementedErrorr(   concatr   r?   )r   r   r   r   r   idxr   seq_start_idxseq_end_idxprefix_embedding_lengthextend_embedding_lengthmm_start_idx
mm_end_idxrB   r0   r0   r1   _get_precomputed_embedding  sX   






r   embedding_items_per_reqr   c                 C   s   t | t |ksJ dt |  dt | d|dkrg S |}|| }td|t }|t }g }t| |D ]\}	\}
}|
|kr@q5||krM|
|k rM||	 q5|S )aQ  
    From all multimodal items of a request, select the subset that is "relevant to
    this prefill chunk", and allow a small amount of extra padding on both sides
    of the chunk boundary (for easier caching or cross-chunk reuse).

    Assumptions:
        - len(embedding_items_per_req) == len(items_offset)
        - items_offset[j] = (start, end), meaning the multimodal tokens of the j-th
        item correspond to [start, end) (left-closed, right-open) in the entire
        token sequence
        - The item order in embedding_items_per_req is one-to-one aligned with
        items_offset

    Args:
        embedding_items_per_req: all items of this modality under the current
            request (e.g. each frame in a 500-frame video)
        extend_prefix_len: number of tokens already prefilled before the current
            chunk
        extend_seq_len: number of tokens in the current chunk
        items_offset: (start, end) position of each item in the whole sentence

    Returns:
        The subset of items to feed into ViT for this chunk (preserving the
        original order)
    zitems_per_req(z) vs items_offset(z
) mismatchr   )r   max_EXTRA_PRE_TOKENS_EXTRA_POST_TOKENSr   r   )r   r   r   r   chunk_start	chunk_endwindow_start
window_endselected_itemsr   r   r   r0   r0   r1   0get_embedding_items_per_chunk_with_extra_padding  s&   
r   data_embedding_funcembedding_items
items_sizer~   c                    sz  g }t t|d t}t|D ] |  | d  krq||  | d   }	|  }
|
d us6J |
t fdd|
D rCqdd |	D }t|}t|}|d u rs| |	}t|t	j
rgt|dn|}t||sstd   } t|k r|  nd}t|tr|	d }|j||
|||d\}}
t|j|||
d	\}}}|| qt|dkrd |fS t	j|dd
|fS )Nr   c                       g | ]
\}}|  k qS r0   r0   r   _
offset_endr   r   r0   r1   r   ;  r   z2_get_chunked_prefill_embedding.<locals>.<listcomp>c                 S   r   r0   hashr   r0   r0   r1   r   =  r   )r   zMultimodal embedding cache is full. This typically occurs when a single embedding exceeds the cache size limit. Consider increasing the `SGLANG_VLM_CACHE_SIZE_MB` environment variable or reducing the input embedding size.r   )r   r   r   )r   r   r   r   dim)r   r   ranger   r   combine_hashesr   r&   rI   r(   rJ   r   setr   r   'redistribute_pruned_frames_placeholdersr   r   r   r   )r   r   r  r   r   r   r~   embedding_listmax_iterationsr   r   item_hashesembedding_items_hashembedding_per_reqr   r   r   r   embedding_per_req_chunkr  r0   r  r1   _get_chunked_prefill_embedding'  sZ   





r  c                 C   sT  | du s
|   dkrdS |}|| }|dks||krdS td|t }|t }d}g }	d}
d}|D ]e\}}||kr:q1|| }||koE||k }|sIq1t||}t||}||k r|| }|| }|| }|| }|	| ||  |
td|7 }
|td|| 7 }n||kr|
|7 }
n||kr||7 }||7 }q1|	sd|
|fS tj|	dd}||
|fS )a  
    From the embedding computed on "items related to this chunk + extra padding",
    trim out the token embeddings that are not needed for the current chunk, and
    keep only those mm tokens covered by
    [extend_prefix_len, extend_prefix_len + extend_seq_len).

    Assumptions:
        - Each (start, end) in items_offset represents an item's multimodal token
        interval [start, end) in the whole token sequence, and their order is
        consistent with the order of items in `embedding`.
        - The layout of `embedding`: each selected item is concatenated in order,
        and item j occupies seg_len_j = end_j - start_j rows.

    Args:
        embedding: output of data_embedding_func(embedding_items_per_chunk),
                shape = (T_total, D)
        extend_prefix_len: number of tokens before the chunk (prefix_len)
        extend_seq_len: number of tokens in this chunk (chunk_len)
        items_offset: list of (start, end) for all items of the current request

    Returns:
        - trimmed_embedding: embedding that contains only the mm tokens needed
        by this chunk, concatenated in token order
        - num_tokens_before: number of mm tokens "before the chunk" that are
        trimmed off (optional info, not used by the current caller)
        - num_tokens_after: number of mm tokens "after the chunk" that are
        trimmed off (optional info, not used by the current caller)
    Nr   )Nr   r   r	  )r;   r   r   r   r   r   r(   cat)r   r   r   r   r   r   r   r   embedding_idxkept_slicesnum_tokens_beforenum_tokens_afterr   r   seg_lenselectedoverlap_startoverlap_endlocal_start	local_endslice_start	slice_endtrimmed_embeddingr0   r0   r1   (get_embedding_chunk_remove_extra_paddingj  sL   "





r$  c                    s^  g }t t|d t}t|D ] |  | d  krq||  | d   }|  }	|	dus6J |	t fdd|	D rCqt|   t|k rR|  nd|	d}
|
sZqdd |
D }t|}t|}|du r| |
}|	 
 }t||std n|d jj}|j|kr||}|dur| dkr|| q|sdS tj|dd	S )
an  
    Multi-modal embedding computation for chunked prefill.

    For each request:
    1. Use items_size to split embedding_items into per-request sublists embedding_items_per_req;
    2. Use get_embedding_items_per_chunk_with_extra_padding to select the subset of items related to this chunk;
    3. Call data_embedding_func (ViT) on this subset to obtain embedding_per_chunk;
    4. Concatenate embedding_per_req_chunk for all requests in order.

    In this way, the ViT for each request only processes the frames / images related to the current chunk,
    avoiding OOM caused by processing all the frames at once.
    r   Nc                    r  r0   r0   r  r  r0   r1   r     r   zD_get_chunked_prefill_embedding_for_chunked_items.<locals>.<listcomp>r   )r   r   r   c                 S   r   r0   r  r   r0   r0   r1   r     r   z[WARN] Multimodal embedding cache is full. Consider increasing `SGLANG_VLM_CACHE_SIZE_MB` or reducing video frame count / resolution for a single request.r	  )r   r   r  r   r   r   r  r   r&   detachr    r  ri   r   r$   tor;   r   r(   r  )r   r   r  r   r   r   r  r  r   r   embedding_items_per_chunkr  r  embedding_per_chunkembedding_for_cachern   r0   r  r1   0_get_chunked_prefill_embedding_for_chunked_items  sL   




r*  placeholder_tensorc                 C   s   t | |dS )Nr   )r(   isin	unsqueeze)r~   r+  r0   r0   r1   _get_multimodal_mask4  s   r.  maskc                 C   s   | j d }|  }||kr^|d| d| d ||k rSt j}|dkr,|d |  dkr?| | d d d f } | S || j d  }| | d d d f } | S td|d	|d
| S )Nr   zUNumber of tokens in multimodal embedding does not match those in the input text. Got z tokens in the text but z# tokens from multimodal embeddings.r   z`You may want to avoid this issue by raising `chunked_prefill_size`, or disabling chunked prefill   zEInsufficient multimodal embedding length: num_mm_tokens_in_input_ids=z vs num_mm_tokens_in_embedding=z. This is an internal error)r?   r   r   warningr   chunked_prefill_sizer
  r,   )r   r/  r   num_mm_tokens_in_embeddingnum_mm_tokens_in_input_idsr2  num_multimodalr0   r0   r1   _adjust_embedding_length:  s4   
r6  c           
      C   sr   t ||||}|du r t| ||||||\}}|du r dd|fS tr)tj   t||}	t||	t	}||	|fS )a  
    Generate multimodal embeddings and create a mask for identifying their positions in the input sequence.

    Args:
        data_embedding_func: Function that generates embeddings for multimodal items
        embedding_items: List of multimodal items to embed
        placeholder_tensor: Tensor containing token IDs that serve as placeholders for multimodal content
        input_ids: The input token IDs tensor
        items_size: Cumulative sizes of multimodal items per request
        prefix_length: Prefix lengths for each request
        extend_length: Sequence lengths for each request
        items_offset_list: List of offset ranges for multimodal items in each request

    Returns:
        A tuple containing:
        - The generated embeddings tensor
        - A boolean mask tensor indicating where these embeddings should be placed
        - If EVS is used, the pruned input ids tensor; otherwise, the original input ids tensor
    N)
r   r  _is_npur(   npucurrent_streamsynchronizer.  r6  r   )
r   r   r+  r~   r  r   r   r   r   special_multimodal_maskr0   r0   r1   get_embedding_and_maskZ  s(   	


r<  mm_inputs_listextend_prefix_lensextend_seq_lensinput_embeddingmultimodal_modeldata_embedding_func_mappingplaceholder_tokensuse_deepstackc	           !         s  i }	| du rdS g }
| D ]}|
dd |j D 7 }
qg g g g f\}}}}t D ]  fdd|
D }|du r8dn| d}|du rQ j }t|d| dd}t|dkr|dusbJ d  tj	d	d |D |j
d
}tjt| d td}g }t| D ]"\}} fdd|j D }t|||d < |tdd |D  qtj|dd }t||||||||d\}}}| dr|dur||\}}||g7 }| g7 }||g7 }||g7 }q'|j}|jd|d d ||}|rt|j}|jdd |jd | f }tj||j
|jd}||	d< ttt||||D ];\} }}|du s1|du r3q!t|jddd } ||j
|j|| < | dr[|| |j
|j|| < q!||	fS )a!  
    Embed multimodal inputs and integrate them with text token embeddings.

    Args:
        mm_inputs_list: List of multimodal inputs to process
        extend_prefix_lens: Prefix lengths for each request
        extend_seq_lens: Sequence lengths for each request
        input_ids: Input token IDs tensor
        input_embedding: Embedding layer for text tokens
        placeholder_tokens: Token IDs for multimodal placeholders (uses pad_values if None)

    Returns:
        Combined embedding tensor with multimodal content integrated
    Nc                 S      g | ]}|d ur|qS r   r0   r   r0   r0   r1   r     s    z#embed_mm_inputs.<locals>.<listcomp>c                       g | ]
}|j  d r|qS r   is_modalityr   rH  r0   r1   r     s
    get__featurer   zno embedding method found for c                 S   r   r0   r   r   r0   r0   r1   r     r   r$   r   r#   c                    rF  rG  rI  r   rH  r0   r1   r     s    
c                 S   r   r0   )r   r   r0   r0   r1   r     r   r	  )r   r   r+  r~   r  r   r   r   )r   r   r   )r$   r#   input_deepstack_embeds)r   r   r   r&   rF   lowergetattrr   r(   r   r$   zerosr'   r   r   r   cumsumr   r<  separate_deepstack_embedsnum_embeddingsclamp_deepstack_visual_indexesr?   r#   r   r  wheresqueezer&  )!r=  r>  r?  r~   r@  rA  rB  rC  rD  
other_infoitem_flatten_listr   
modalities
embeddingsmasksdeepstack_embeddingsr   embeddermodality_idr+  r  items_offsetsr   r   r   r/  deepstack_embedding
vocab_sizeinput_embedsnum_deepstack_embeddingsdeepstack_embedding_shaperO  indicesr0   rH  r1   embed_mm_inputs  s   








ri  forward_batchlanguage_modeldata_embedding_funcsc                    sZ  t |dsJ | }t |dr|jjr j s j s  rdd  jD }	 fddt	 j
D }
 fddt	 jD }t|	|
|| |||||d	\}}|rY|d |d< |	r|	D ]&}|rt |d	r|jD ]}t|d
d}t|tjr|jr|jddd|_qiq]d _| _n|| } jdur j|  j}nd}|dd |d|}|S )a  
    Process multimodal inputs and forward through language model.

    Args:
        input_ids: Input token IDs tensor
        forward_batch: Batch information for model forward pass
        language_model: Base language model to use
        data_embedding_funcs: A dictionary mapping from modality type to the corresponding embedding function.
        placeholder_tokens: Token IDs for multimodal placeholders
        use_deepstack: Whether to use deepstack embeddings for each modality, default False
        **kwargs: Additional arguments passed to language model

    Returns:
        Hidden states from language model forward pass
    get_input_embeddingspp_groupc                 S   rE  r   r0   )r   mm_inputr0   r0   r1   r   9  s    z,general_mm_embed_routine.<locals>.<listcomp>c                    "   g | ]\}} j | d ur|qS r   r   )r   r   
prefix_lenrj  r0   r1   r   <  
    c                    rp  r   rq  )r   r   seq_lenrs  r0   r1   r   A  rt  )	r=  r>  r?  r~   rA  r@  rB  rC  rD  rO  r   r   Nr    Tr9   )r~   rj  re  r0   )hasattrrm  rn  is_first_rankforward_mode	is_decodeis_target_verifycontains_mm_inputsr   r   extend_prefix_lens_cpuextend_seq_lens_cpuri  r   rQ  rI   r(   rJ   r[   r&  r   mm_input_embedsre  r<   )r~   rj  rk  rA  rl  rC  rD  rQ   embed_tokensr=  r>  r?  re  rZ  mm_input_objmm_itemr   hidden_statesr0   rs  r1   general_mm_embed_routine  sn   


	

r  r   token_pairsc                 C   s  dd |D }dd |D }t dd |D sJ t dd |D s$J t| tj|| jd}t| tj|| jd}t|\}t|\}|  }	|  }
t|	t|
krt|	d t|
kr| d	 	 |v r|
r|	r|
d	 |	d	 k r|	
d	d	 tt|	t|
}|d	krtjd
| jdS g }t|D ]}|	| }|
| }||k r||d |d f q|stjd
| jdS tj|| jd}|S )z
    Returns a tensor indicating the bounds of multimodal data (images, video, audio, etc.)

    Returns:
        [bounds_count, 2]
    c                 S   r   r0   r0   r   r0   r0   r1   r     r   z-get_multimodal_data_bounds.<locals>.<setcomp>c                 S   r   r0   r0   r   r0   r0   r1   r     r   c                 s       | ]}t |tV  qd S r   rI   r'   r   tr0   r0   r1   r         z-get_multimodal_data_bounds.<locals>.<genexpr>c                 s   r  r   r  r  r0   r0   r1   r     r  rM  r   r   )r   r0  )r   r(   r,  r   r$   rX  r    r   r   r   insertr   rR  r  r   )r~   r   r  start_tokens
end_tokens
start_condend_conddata_start_tokensdata_end_tokensdata_start_tokens_cpudata_end_tokens_cpuvalid_mm_data_numsvalid_pairsr   start_token	end_tokenvalid_pairs_tensorr0   r0   r1   get_multimodal_data_boundsw  sJ   
r  c                 C   s&   t |  d d }tj|dddS )N   bigF)	byteordersigned)hashlibsha256digestr'   
from_bytes)rE   
hash_bytesr0   r0   r1   	data_hash  s   r  c                 C   s   | }t | trt| } dd | D } t| }|jr t| S | 	 }|j
tjkr0| }t |tjs8J | }t| }t| S )z(
    hash a tensor or a tensor list
    c                 S   s$   g | ]}t |tjr| n|qS r0   )rI   r(   rJ   r=   )r   r   r0   r0   r1   r     s    ztensor_hash.<locals>.<listcomp>)rI   r   r   r(   r   r[   r   re   r%  
contiguousr#   bfloat16floatrJ   r    
memoryviewnumpyr  tobytes)tensor_listr7   
tensor_cpumvr0   r0   r1   tensor_hash  s    

r  c                 C   s   t | trt | d tjrt| S ttt| S t | tj	r,t
| }| }t|S t | tjr7t| gS t | trI| tj }t|gS t| S r3   )rI   r   r(   rJ   r  r  tupler   r   ndarrayascontiguousarrayr  r   reconstruct_on_target_devicere   current_device)farr	arr_bytesreconstruct_tr0   r0   r1   hash_feature  s   




r  mrope_positionsoutput_ids_lenc                 C   sb   |dkr| S | dddf }|d d }t j||| t j| jdddd}t j| |gddS )a}  
    Extend mrope_positions for retracted requests by appending positions for output_ids.

    When a request is retracted and has multimodal inputs with mrope_positions,
    we need to extend the positions to cover the output_ids that were already generated.
    For pure text tokens, all three dimensions use the same incremental sequence.

    Args:
        mrope_positions: The original mrope positions tensor, shape (3, origin_input_ids_len)
        output_ids_len: The number of output tokens to generate positions for

    Returns:
        Extended mrope_positions tensor with shape (3, origin_input_ids_len + output_ids_len)
    r   Nr   r   r"      r	  )r(   arangeint64r$   r-  expandr  )r  r  last_position	start_posoutput_positionsr0   r0   r1   ,extend_mrope_positions_for_retracted_request  s   r  c                 C   sn   | d u rd S t | tjr| jdkr| jd S d S t | tjr*| jdkr(| jd S d S t | ttfr5t	| S d S r3   )
rI   r(   rJ   ndimr?   r   r  r   r  r   )valuer0   r0   r1   _get_length  s   r  c                 C   s   t | tjr| || S t | tjr| || S t | tr#| || S t | tr.| || S z| || W S  ty@   |  Y S w r   )rI   r(   rJ   r   r  r   r  r_   )r  r   r   r0   r0   r1   _slice_value  s   

r  rE   r^   r   r   	num_itemstotal_feature_lenc           
      C   sj   i }|   D ],\}}t|}	|	|krt|||d ||< q|d ur.|	|kr.t|||||< q|||< q|S )Nr   )r   r  r  )
rE   r^   r   r   r  r  slicedkeyr  lengthr0   r0   r1   _slice_model_data(  s   
r  c              
   C   s>  g }| D ]}|j d uot|j dk}|rt|j }| r|jd}t|}|d u s1||kr7|| qg }|D ]}tj|tj	d}	|t
t|	  q;tjtj|tj	ddd}
dg|
  }t|j}|d u rtt|j}|d u s~|d |kr|| q|}t|D ]G}|| ||d  }}t|}|jd urt|j|||_|jd urt|j|||_|j | g|_ t|j|||||d|_d |_|| qq| r|jd}|d u r|| qt|}|}g }d}t|D ]/}|| }t|tjrt
|d  }ntj|tj	d}	t
|	d  }|| ||7 }q||kr1|| qg }t|D ]0}|| }t|tjrR|t
t|  q7tj|tj	d}	|t
t|	  q7tjtj|tj	ddd}
dg|
  }t|j}|d u rt|j}|d u s|d |kr|| q|}dg}t|D ]}||d ||   qt|D ]V}|| ||d  }}|| ||d  }}t|}|jd urt|j|||_|jd urt|j|||_|j || |_ t|j|||||d|_d |_|| qq|| q|| q|S )	Nr   image_grid_thwrN  r   r	  r   )r^   r   r   r  r  video_grid_thw)r   r   r   model_specific_datar&   r  r   r(   r   longr'   prodr   rS  r7   r   r   r   r  copydeepcopyr  r  r  r   rI   rJ   )original_mm_itemsexpanded_mm_itemsr   
is_bundledr  r  grid_lenpatches_per_itemgridgrid_tensor
cumulativeslice_indicesfeature_lenr  r   r   r   new_itemr  
num_videosframes_per_videototal_framesTpatches_per_videoframe_start_indices	video_idxframe_start	frame_endr0   r0   r1   get_new_expanded_mm_items<  s   





















r  c                   @   s0   e Zd ZdZdejfddZdd Zdd Zd	S )
ShmPointerMMDataz
    Wraps a tensor to be sent via a shared memory handle.
    This acts as a "pointer" to the tensor data across process boundaries.
    r7   c                 C   s   |   | _| jj| _| jj| _| j | j  }tjd|d| _	z#t
j|ft
j| j	jd}| jtj  |d d < W | j	  d S | j	  w )NTcreaterd   r#   buffer)r    r  
cpu_tensorr?   r#   r;   element_sizer   SharedMemoryshmr   r  uint8bufr>   r(   r  r=   close)r`   r7   nbytesshm_viewr0   r0   r1   r     s   

 zShmPointerMMData.__init__c                 C   s   t | dr
| jd u rht| dd }|d u rt| dd }|d u r"td|  }|j| _|j| _| |	  }t
jd|d| _z!tj|ftj| jjd}|tj  |d d < W | j  n| j  w | jj| j| jdS )	Nr  r  r7   z=ShmPointerMMData cannot recreate shared memory without tensorTr  r  )shm_namer?   r#   )rv  r  rQ  r,   r    r  r?   r#   r;   r  r   r  r   r  r  r  r>   r(   r  r=   r  rF   )r`   r7   r  r  r  r0   r0   r1   rc     s*   zShmPointerMMData.__getstate__c                 C   s|   |d | _ |d | _|d | _d | _tj| j d}ztj|j| jd	| j
 | _W |  |  d S |  |  w )Nr  r?   r#   )rF   rN  )r  r?   r#   r  r   r  r(   
frombufferr  r   cloner7   r  unlink)r`   ra   
shm_handler0   r0   r1   rp     s   



zShmPointerMMData.__setstate__N)	ru   rv   rw   rx   r(   rJ   r   rc   rp   r0   r0   r0   r1   r    s
    r  c                  C   s&   t d u rddlm}  | t dka t S )Nr    _determine_tensor_transport_moder   )_is_default_tensor_transport%sglang.srt.managers.tokenizer_managerr  r   r  r0   r0   r1   _get_is_default_transport  s
   r  c                 C   sj   t  st jr	| S t| dr3| jr3| jdg }|D ]}t|dr2t|jtj	r2|jj
r2t|j|_q| S )zO
    Scan the object for multimodal tensors and wrap them in SHM pointers.
    r   r   r   )r  r   skip_tokenizer_initrv  r   r&   rI   r   r(   rJ   is_cpur  objr   r   r0   r0   r1   wrap_shm_features'  s   r  c                 C   sT   t  st jr	| S t| dr(| jr(| jdg }|D ]}t|jtr'|jj	|_q| S )zM
    Restore ShmPointerMMData wrappers back into standard torch.Tensors.
    r   r   )
r  r   r  rv  r   r&   rI   r   r  r7   r  r0   r0   r1   unwrap_shm_features:  s   
r  )r   )`rx   r  r  rj   abcr   collectionsr   multiprocessingr   typingr   r   r   r   r	   r
   r   r  r   r(   r   sglang.srt.environr   sglang.srt.layers.multimodalr   "sglang.srt.managers.schedule_batchr   r   r   r   %sglang.srt.mem_cache.multimodal_cacher   r   ,sglang.srt.model_executor.forward_batch_infor   sglang.srt.multimodal.evsr   sglang.srt.server_argsr   sglang.srt.utilsr   r   r   sglang.utilsr   r7  r{   r   rJ   __annotations__r4   r   r   r  r2   r5   r6   rC   rD   r}   r   r   r   r'   r   r   r   DataEmbeddingFuncr   r  r  r$  r*  r.  r6  r<  	EmbeddingModuledictboolri  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r0   r0   r0   r1   <module>   s   $	wK>
-
9
=
C
p
Z

 	
<

	

 

	
_
<
*
 D