o
    پi+                     @   st  d Z ddlZddlZddlmZmZmZmZmZm	Z	 ddl
Z
ddlZddlmZ ddl
mZ ddlmZmZ ddlmZ ddlmZmZmZmZmZmZmZmZmZmZm Z  ddl!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z' dd	l(m)Z) dd
l*m+Z+m,Z,m-Z- ddl.m/Z/ ddl0m1Z1m2Z2 e3e4Z5e1 Z6ej7G dd dZ8ej7G dd dZ9G dd dej:Z;ej<dej=fddZ>dd Z?dS )zLogits processing.    N)AnyDictListOptionalTupleUnion)nn)$get_tensor_model_parallel_world_size tensor_model_parallel_all_gather)envs)DpPaddingModeattn_tp_all_gatherattn_tp_all_gather_into_tensordp_gather_replicate
dp_scatterget_attention_dp_rankget_attention_dp_sizeget_attention_tp_sizeget_dp_deviceget_dp_dtypeget_dp_hidden_size)InputLogprobsResult&compute_temp_top_p_normalized_logprobsget_token_ids_logprobs_chunkget_token_ids_logprobs_prefillget_top_logprobs_chunkget_top_logprobs_prefill)VocabParallelEmbedding)CaptureHiddenModeForwardBatchForwardMode)get_global_server_args)is_npuuse_intel_amx_backendc                   @   s6  e Zd ZU eej ed< dZeej ed< dZeej ed< dZ	ee
 ed< dZee
 ed< dZee
ee
e ejf   ed< dZee
 ed< dZeej ed	< dZee
 ed
< dZee
 ed< dZee
ee
e ejf   ed< dZee
 ed< dZeej ed< dZeeee
e f  ed< dZeej ed< dS )LogitsProcessorOutputnext_token_logitsNhidden_statesnext_token_logprobsnext_token_top_logprobs_valnext_token_top_logprobs_idx!next_token_token_ids_logprobs_val!next_token_token_ids_logprobs_idxinput_token_logprobsinput_top_logprobs_valinput_top_logprobs_idxinput_token_ids_logprobs_valinput_token_ids_logprobs_idxfull_logitscustomized_infomm_input_embeds)__name__
__module____qualname__r   torchTensor__annotations__r&   r'   r(   r   r)   r*   r   floatr+   r,   r-   r.   r/   r0   r1   r2   r   strr   r3    r<   r<   V/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/logits_processor.pyr$   A   s(   
  r$   c                   @   s  e Zd ZU eed< ejZeed< dZe	e
j ed< dZeed< dZeed< dZeed< dZe	e
j ed	< dZe	ee  ed
< dZe	ee  ed< dZe	ee  ed< dZe	ee  ed< dZe	e
j ed< dZe	eee   ed< dZeed< dZe
jed< dZeed< dZe
jed< dZe	e
j ed< dZe	e
j ed< dZe	e
j ed< dZ e	e ed< dZ!e	e
j ed< dZ"e	e
j ed< dZ#e	e$ ed< dZ%eed< dZ&eed< dZ'e	e
j ed< e(de)fd d!Z*d"d# Z+dS )$LogitsMetadataforward_modecapture_hidden_modeNnext_token_logits_bufferFextend_return_logprobextend_return_top_logprobextend_token_ids_logprobextend_seq_lensextend_seq_lens_cpuextend_logprob_start_lens_cpuextend_logprob_pruned_lens_cputop_logprobs_nums"extend_input_logprob_token_ids_gputoken_ids_logprobstemp_scaled_logprobstemperaturetop_p_normalized_logprobstop_pglobal_num_tokens_gpudp_local_start_posdp_local_num_tokensglobal_dp_buffer_len!global_num_tokens_for_logprob_cpu!global_num_tokens_for_logprob_gpudp_padding_modepadded_static_lenis_prefill_onlyr3   forward_batchc                 C   sL  |j  rA|jrA|j  sAtdd |jD }tdd |jD }d}g }t|j|j	D ]\}}|| dkr8d}|
||  q,nd } } }}| di d|j d|jd	|jd
|d|d|d|jd|jd|j	d|d|jd|jd|jd|jd|jd|jd|jd|jd|jd|jd|jdtjd|jS )Nc                 s   s    | ]}|d kV  qdS )r   Nr<   .0xr<   r<   r=   	<genexpr>       
z4LogitsMetadata.from_forward_batch.<locals>.<genexpr>c                 s   s    | ]}|d uV  qd S Nr<   r[   r<   r<   r=   r^      r_   Fr   Tr?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rK   rJ   rX   rY   rP   rQ   rR   rS   rT   rU   rV   r3   r<   )r?   	is_extendreturn_logprobis_target_verifyanyrI   rK   ziprF   rG   appendr@   rA   rE   rJ   rX   rY   rP   rQ   rR   rS   rT   rU   r   SUM_LENr3   )clsrZ   rC   rD   rB   rH   
extend_len	start_lenr<   r<   r=   from_forward_batch   s   

	
z!LogitsMetadata.from_forward_batchc                 C   s   t j| jdd}t }|dkrt | jd }n||d  }|| _| j| | _t }t }t	 }| j
d ur<t| j
| _n| j| _t j| j|f||d| _d S )Nr   dim   dtypedevice)r7   cumsumrU   r   
zeros_likerQ   rR   r   r   r   rT   sumrS   emptygathered_buffer)self	cumtokensdp_rankrQ   hidden_sizerp   rq   r<   r<   r=   compute_dp_attention_metadata   s,   
z,LogitsMetadata.compute_dp_attention_metadata),r4   r5   r6   r    r9   r   NULLr@   rA   r   r7   r8   rB   boolrC   rD   rE   rF   r   intrG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   r   rX   rY   r3   classmethodr   rk   r{   r<   r<   r<   r=   r>   n   s>   
 4r>   c                       sR  e Zd Z			d5dedee def fddZ		d6ded	ee	e
f d
eej deej def
ddZdejdeej d
eej d	e	fddZdejdeej d
eeej  dejdeej deeej  deej d	e	deej fddZd	e	dejfddZd	e	fddZdejdejdejdee ded	e	deeejf fdd Z	d7dejded	e	d!eej dejf
d"d#Z	d7dejded!eej dejfd$d%Zdejd	e	deejejf fd&d'Zd(ejdejfd)d*Zd(ejd+ejd	e	dejfd,d-Zd(ejd	e	dejfd.d/Zdejded	e	defd0d1Z ded	ee	e
f d2efd3d4Z!  Z"S )8LogitsProcessorFNskip_all_gatherlogit_scalereturn_full_logitsc                    s   t    || _|j| _|| _t j| _t j| _	| jr-t
 | _| o'| jdk| _d| _n| o4t dk| _| jo=t dk| _t| jdd | _| jd urT| jdk rTd | _|| _t j| _tj | _tj | _d S )Nrn   Ffinal_logit_softcappingr   )super__init__config
vocab_sizer   r!   enable_dp_lm_headuse_attn_tp_groupenable_fp32_lm_headuse_fp32_lm_headr   attn_tp_sizedo_tensor_parallel_all_gather%do_tensor_parallel_all_gather_dp_attnr	   r   getattrr   r   multi_item_scoring_delimitermulti_item_delimiterr   $SGLANG_ENABLE_LOGITS_PROCESSER_CHUNKgetenable_logprobs_chunk"SGLANG_LOGITS_PROCESSER_CHUNK_SIZElogprobs_chunk_size)rw   r   r   r   r   	__class__r<   r=   r      s4   




zLogitsProcessor.__init__lm_headlogits_metadataaux_hidden_stateshidden_states_before_normreturnc              
   C   sd  t |tr
t|}| jd ur|jr| ||||| jS |j r(| 	|||S | 
||||\}}}	}
}}| ||||||	|
|}~|js_| |||}|
d urU||
 n|}t|||jdS | ||j | j pt|jd | jkpt| j}|r| |||}|
d ur||
 n|}|| }~| ||}n| ||
||||\}}t|||j|j|j|j|j|jdS )N)r%   r&   r3   r   )r%   r&   r,   r-   r.   r/   r0   r3   )
isinstancer   r>   rk   r   rY   'compute_logprobs_for_multi_item_scoringr?   is_dllm_extend_get_dllm_logits_get_pruned_states_get_hidden_states_to_storerB   _get_logitsr$   r3   _expand_metadata_for_logprobsrq   r   shaper   r   process_input_logprobsprocess_input_logprobs_by_chunkr,   r-   r.   r/   r0   )rw   	input_idsr&   r   r   r   r   pruned_statespruned_states_before_normaux_pruned_statessample_indicesinput_logprob_indicestoken_to_seq_idxhidden_states_to_storelogitssampled_logitsshould_skip_chunkinginput_logitslogprobs_resultr<   r<   r=   forward  s   
	

	

	zLogitsProcessor.forwardr&   c                    sP  d }d }g }|j  s|j  s|j  r)|}|}|d ur$dd |D }d }	d }
n|j  ru|jsu|jdk rAtj|j	ddd ntj
t|j	|j	jd}||j |j	 d | }|d urc| }|d urpfdd|D }d }	d }
nd}g }	d g }
dg g }}}tt|j|jD ]d\}\}}||kr|d }n|}||ksJ |||| ||   |d ur|||| ||   ||g||   ||7 }||| 7 }|	| |
 fd	dt|| D   || 7  q|t|jd  t|}|d urt|}tj|	|jtjd
}	tj|
|jtjd
}
||||	|
|fS )Nc                 S   s   g | ]}|qS r<   r<   r\   hiddenr<   r<   r=   
<listcomp>  s    z6LogitsProcessor._get_pruned_states.<locals>.<listcomp>r   rl   rn   rq   c                    s   g | ]}|  qS r<   r<   r   )
last_indexr<   r=   r     s    rW   c                    s   g | ]} | qS r<   r<   )r\   i)input_logprob_indices_ptr<   r=   r     s    rq   rp   )r?   is_decode_or_idlerc   is_draft_extend_v2ra   rB   rX   r7   rr   rE   arangelenrq   	enumeratere   rG   rF   rf   extendrangecattensorint64)rw   r&   r   r   r   r   r   r   r   r   r   idxsample_index_ptptpruned_states_listpruned_states_before_norm_listextend_logprob_start_lenri   rj   r<   )r   r   r=   r     s   









z"LogitsProcessor._get_pruned_statesr   r   r   r   c	                 C   s   d }	d }
|j  rY|j  r!|d urtj|dd}|}	n|}	|}
n8|j  rU|d ur<tj|dd}|d ur9|| n|}	n|d urD|| n|}	|d urT|d urR|| n|}
nJ d|
d ur_|
}	|	S )NrW   rl   FzShould never reach)r@   need_captureis_fullr7   r   is_last)rw   r&   r   r   r   r   r   r   r   r   "hidden_states_to_store_before_normr<   r<   r=   r     s<   


z+LogitsProcessor._get_hidden_states_to_storerq   c                 C   sR   t j|j|d}|jrt |jd|dd|_|jr't |j||_d S d S )Nr   rW   rn   )	r7   r   rH   rL   repeat_interleaverM   viewrN   rO   )rw   r   rq   pruned_lensr<   r<   r=   r   N  s"   

z-LogitsProcessor._expand_metadata_for_logprobsc           	      C   sx   t ||}|jrt||\}}nd  }}|jrt||\}}nd  }}|tj|jd |jd|j	f }t
|||||dS )Nr   r   r,   r-   r.   r/   r0   )r   rC   r   rD   r   r7   r   r   rq   rJ   r   )	rw   r   r   input_logprobsr-   r.   r/   r0   r,   r<   r<   r=   r   `  s6   z&LogitsProcessor.process_input_logprobsr   r   c           %   	   C   sN  | j }|jd }|| d | }	g }
|jrg }g }nd}d}|jr&g }g }nd}d}d}d}t|	D ]}|| }t|d | |}||k||k @ }|| }|| }tj|ddd }||| }| |||}|dkr}tj	|jd |jd f|j
|jd}||k||k @ }| r|| | }|| ||< | dkrq2|| }|jr|jdur|j| nd}|jr|jdur|j| nd}t||||}t|| || d } |jr|j|  }!|j|  }"t|||!|"|||}|jr|j|  }#|j|  }"t||#|"|||}|tj|jd |jd|j| f }$|
|$ q2tj|
dd}
t|
||||d	|fS )
aZ  
        compute logprobs for the output token from the hidden states.
        To avoid using too much memory, we split pruned_states into chunks of
        rows to compute input_logprobs separately, then concatenate the results.

        Returns:
            InputLogprobsResult: logprobs result
            torch.Tensor: sampled logits
        r   rn   NTas_tuplero   r   rl   r   )r   r   rC   rD   r   minr7   nonzeror   ru   rp   rq   rd   numelrL   rM   rN   rO   r   slicerI   rH   r   rK   r   r   rJ   rf   r   r   )%rw   r   r   r   r   r   r   
chunk_size
total_size
num_chunksr,   r-   r.   r/   r0   split_len_topksplit_len_token_idsr   	start_idxend_idx
chunk_maskglobal_indiceschunk_indicesmask_indiceschunk_stateschunk_logitsr   chunk_sample_maskchunk_sample_indiceschunk_input_logprobschunk_temperaturechunk_top_pchunk_slice
top_k_numsr   rK   chunk_input_token_logprobsr<   r<   r=   r     s   







z/LogitsProcessor.process_input_logprobs_by_chunkembedding_biasc                 C   s   |  ||\}}| |||}| jdur|| j | jr*| jr&| |}nt|}| |||}| 	||}| j
rOtsDt|| j
 |S | j
t|| j
  }|S )a	  Get logits from hidden_states.

        If sampled_logits_only is True, it means hidden_states only contain the
        last position (e.g., extend without input logprobs). The caller should
        guarantee the given hidden_states follow this constraint.
        N)_gather_dp_attn_hidden_states_compute_lm_headr   mul_r   r   _gather_attn_tp_logitsr
   _scatter_dp_attn_logits_copy_logits_to_bufferr   _is_npufused_softcapr7   tanh)rw   r&   r   r   r   local_hidden_statesr   r<   r<   r=   r   )  s,   
zLogitsProcessor._get_logitsc                 C   s.  t |drt |dr||}|S t |drc| jr*t|tj|jtjj}|S t|r@tj	j
||jj|jd d}|S t jd urTt| |jj }|S t||jj|jj}|S | jrtjjjdd |j||tj|}W d    |S 1 sw   Y  |S |j|||}|S )Nset_lora
apply_loraweightTF)enabled)hasattrr   r7   matmultofloat32r  Tr#   ops
sgl_kernelweight_packed_linearrp   r!   rl_on_policy_targetbfloat16cudaampautocastquant_methodapply)rw   r&   r   r   r   r<   r<   r=   r   U  sN   #

z LogitsProcessor._compute_lm_headc                 C   s4   | j r|  |}|j}t||| ||fS ||fS r`   )r   r{   rv   r   )rw   r&   r   r   r<   r<   r=   r     s   z-LogitsProcessor._gather_dp_attn_hidden_statesr   c                 C   s   | j | j dkr3tj| j|jd | j | j f|j|jd}t|| |ddd	|jd | j }|S tj| j |jd f|j|jd}|j
}tt|j| jdd| |S )Nr   r   rn      rW   rl   )r   r   r7   ru   r   rq   rp   r   permutereshaper  r   listtensor_split)rw   r   global_logitsr<   r<   r=   r     s2   

	z&LogitsProcessor._gather_attn_tp_logitsr   c                 C   s>   | j r|}tj|jd |jd f|j|jd}t||| |S )Nr   rn   r   )r   r7   ru   r   rq   rp   r   )rw   r   r   r   r  r<   r<   r=   r     s   z'LogitsProcessor._scatter_dp_attn_logitsc                 C   sb   |j d ur"|j }|jtjksJ ||d d d | jf  |}|S |d d d | jf  }|S r`   )rA   rp   r7   r:   copy_r   )rw   r   r   logits_bufferr<   r<   r=   r     s   
z&LogitsProcessor._copy_logits_to_bufferc                 C   s$   | j sJ | |||}t|d dS )N)r1   r%   )r   r   r$   )rw   r&   r   r   r1   r<   r<   r=   r     s   
z LogitsProcessor._get_dllm_logitsdelimiter_tokenc              	   C   s  ||kj ddd d }|| }| |||}tjjj|dd}	g }
g }d}d}|js.|jreg |_|j	durYd}|j	D ]}||||  }||k
  }|j| ||7 }q;n||k
  }|g|_|jrqt|	|dd\}
}|jr{t|	|\}}|	dd|f }td||||
||jd	S )
a  
        Compute logprobs for multi-item scoring using delimiter-based token extraction.

        This method is designed for scenarios where you want to score multiple items/candidates
        against a single query by combining them into one sequence separated by delimiters.

        Sequence format: Query<delimiter>Item1<delimiter>Item2<delimiter>...
        Scoring positions: Extracts logprobs at positions before each <delimiter>

        Args:
            input_ids (torch.Tensor): Input token IDs containing query and items separated by delimiters.
                Shape: [total_sequence_length] for single request or [batch_total_length] for batch.
            hidden_states (torch.Tensor): Hidden states from the model.
                Shape: [sequence_length, hidden_dim].
            lm_head (VocabParallelEmbedding): Language model head for computing logits.
            logits_metadata (Union[LogitsMetadata, ForwardBatch]): Metadata containing batch info
                and token ID specifications for logprob extraction.
            delimiter_token (int): Token ID used as delimiter between query and items.

        Returns:
            LogitsProcessorOutput: Contains:
                - next_token_logits: None (not needed for scoring-only requests)
                - input_token_logprobs: Logprobs of delimiter tokens at scoring positions
                - input_top_logprobs_val: Top-k logprobs at delimiter positions (if requested)
                - input_top_logprobs_idx: Top-k token indices at delimiter positions (if requested)
                - input_token_ids_logprobs_val: Logprobs for user-requested token IDs (if any)
                - input_token_ids_logprobs_idx: Indices for user-requested token IDs (if any)
        Tr   r   rn   rW   rl   N)delay_cpu_copy)r%   r,   r-   r.   r/   r0   r3   )r   r   r7   r   
functionallog_softmaxrK   rC   rH   rF   rt   itemrf   rD   r   r   r$   r3   )rw   r   r&   r   r   r  multi_item_indicessliced_hiddensliced_logitssliced_logprobsr/   r0   r-   r.   input_ptreq_seq_lenreq_input_idsdelimiter_counttotal_delimitersr,   r<   r<   r=   r     sf   $


	z7LogitsProcessor.compute_logprobs_for_multi_item_scoring)FNF)NNr`   )#r4   r5   r6   r}   r   r:   r   r   r   r>   r   r7   r8   r$   r   r   r   r   rq   r   r   r  r~   r   r   r   r   r   r   r   r   r   r   r   __classcell__r<   r<   r   r=   r      s   3

q
 	

5
$
 +
0
-




r   
BLOCK_SIZEc           
      C   s   t dt j}|| }|t d| }||k }t j| | |d}|| }t d| }	|	d |	d  }|| }t j| | ||d d S )Nr   )maskr  rn   )tl
program_idr  r   r   loadexpstore)
full_logits_ptrsoftcapping_value
n_elementsr)  pidblock_startoffsetsr*  r]   exp2xr<   r<   r=   fused_softcap_kernel=  s   r7  c                 C   s:   |   }d}|| d | ddf}t| | |||d | S )Ni   rn   )r0  r1  r2  r)  )r   r7  )r1   r   r2  r)  gridr<   r<   r=   r   Y  s   r   )@__doc__dataclassesloggingtypingr   r   r   r   r   r   r7   tritontriton.languagelanguager+  r   sglang.srt.distributedr	   r
   sglang.srt.environr   sglang.srt.layers.dp_attentionr   r   r   r   r   r   r   r   r   r   r   sglang.srt.layers.utils.logprobr   r   r   r   r   r   *sglang.srt.layers.vocab_parallel_embeddingr   ,sglang.srt.model_executor.forward_batch_infor   r   r    sglang.srt.server_argsr!   sglang.srt.utils.commonr"   r#   	getLoggerr4   loggerr   	dataclassr$   r>   Moduler   jit	constexprr7  r   r<   r<   r<   r=   <module>   sF    4 
,       S