o
    پiw                     @   s  U d dl Z d dlmZmZmZmZmZ d dlZd dlm	Z
 d dlmZ d dlmZ d dlmZmZ d dlmZ d dlmZ d dlmZmZ d d	lmZ d d
lmZ d dlmZ d dlmZm Z m!Z!m"Z" e! rrd dl#m$Z$m%Z%m&Z&m'Z' e" ryd dl(Z(e )e*Z+e dZ,e dZ-i Z.ee/eg df f e0d< h dZ1G dd dej2Z3de/deg df ddfddZ4d9dee/ ddfddZ5dej6dej6dej6dej6de7d eej6 d!ej6fd"d#Z8d$ej6dej6dej6dej6de7f
d%d&Z9ej:d'd(d)ej6d*ej6d!ej6dej6fd+d,Z;		d:dej6d eej6 d!eej6 fd-d.Z<dej6dej6fd/d0Z=d)ej6d1eee>  deeef fd2d3Z?	4d;d$ej6d5ed6e>fd7d8Z@dS )<    N)CallableDictListOptionalTuple)nn)get_tp_group)get_attention_tp_groupis_dp_attention_enabled)LogitsProcessorOutput)murmur_hash32)get_token_ids_logprobsget_top_logprobs)SamplingBatchInfo)	TOP_K_ALL)get_global_server_args)crash_on_warningsget_bool_env_varis_cudais_npu)min_p_sampling_from_probstop_k_renorm_probtop_k_top_p_sampling_from_probstop_p_renorm_probSYNC_TOKEN_IDS_ACROSS_TPSGLANG_RETURN_ORIGINAL_LOGPROBSampler_CUSTOM_SAMPLER_FACTORIES>   ascendpytorch
flashinferc                       sl  e Zd Z fddZdejdedejfddZdeded	e	d
e
e de
e
e  dejfddZdejdedejde	dejf
ddZdejdedejdejfddZdejdede	dejfddZdejdede	d	e	deejeej f f
ddZdedejd
e
e de
e
e  dedejfddZdejdefddZdeded	e	d
e
e de
e
e  ddfd d!Z  ZS )"r   c                    s`   t    t j| _t j| _t rt	 j| _t j
| _
t j| _| j
d u| _t jdk| _d S )Nr   )super__init__r   enable_nan_detectionuse_nan_detectionr   device_grouptp_sync_groupr
   r	   rl_on_policy_targetenable_deterministic_inferenceenable_deterministicuse_log_softmax_logprobsampling_backenduse_ascend_backend)self	__class__ M/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/sampler.pyr"   )   s   




zSampler.__init__logitssampling_inforeturnc                 C   s^   |j rt|| | jr-tt|r-td tt|t	|d|}t
 r-td|S )z7Apply custom logit processors and handle NaN detection.z3Detected errors during sampling! NaN in the logits.g     j)has_custom_logit_processorapply_custom_logit_processorr$   torchanyisnanloggerwarningwhere	full_liker   
ValueError)r-   r2   r3   r0   r0   r1   _preprocess_logits9   s   

zSampler._preprocess_logitslogits_outputreturn_logprobtop_logprobs_numstoken_ids_logprobs	positionsc                 C   sn  |j }| ||}|jr t|d}|rtjjj|dd }	}
n|j o+|j	 o+|j
 }|r7tr7tj|dd}	d}| jdurP| |j }tj|dd}~| jr^| ||||\}}
nA| jrt| jrt|rt| |||}|rstss|}
n+||j tj|dd|dd< |}| ||||}|rts|dur|nt|}
~|rtr|	}
| ||
|||| | || |S )a  Run a sampler & compute logprobs and update logits_output accordingly.

        Args:
            logits_output: The logits from the model forward
            sampling_info: Metadata for sampling
            return_logprob: If set, store the output logprob information to
                logits_output
            top_logprobs_nums: Number of top lobprobs per sequence in a batch
            token_ids_logprobs: Per-sequence list of specific token IDs to retrieve
                logprobs for. Each element is a list of token IDs (or None) for one
                sequence in the batch. This is used in speculative decoding.
            positions: The positions of the tokens in the sequence. Used for deterministic sampling
                to get the unique seed for each position.
        dimN)next_token_logitsr?   is_all_greedyr7   argmaxr   
functionallog_softmaxneed_top_p_samplingneed_top_k_samplingneed_min_p_samplingr   r'   bfloat16divtemperaturesr,   _forward_ascend_backendr*   r)   _sample_from_logprobsdiv_softmax_sample_from_probslog_attach_logprobs_to_output_sync_token_ids_across_tp)r-   r@   r3   rA   rB   rC   rD   r2   batch_next_token_idsoriginal_logprobslogprobssimple_sampling_caselogprobs_via_logsoftmax_kernellogits_div_temperatureprobsr0   r0   r1   forwardL   s   


	zSampler.forwardra   r^   c                 C   s   |rt ||j|d}|S t j}|dkrD|jdu sJ d|jr4t||j}t||j}t	||j
}|S t| |j|jd| jd}|S |dkrYt||j|j|j
|j|j|}|S td| )	zSample from probability distribution (after softmax).

        Used for standard sampling with flashinfer/pytorch backends.
        Handles both simple (direct multinomial) and complex (top-k/top-p/min-p) cases.
        )sampling_seedrD   r    Nz5Sampling seed is not supported for flashinfer backendjoint)filter_apply_order	check_nanr   zInvalid sampling backend: )sampling_from_probs_torchrc   r   r+   rO   r   top_ksr   top_psr   min_psr   
contiguousr$   +top_k_top_p_min_p_sampling_from_probs_torchr>   )r-   ra   r3   rD   r^   r[   backendr0   r0   r1   rW      sN   &zSampler._sample_from_probsr]   c                 C   s2   |j dus	J dt||j |}|dtjS )zSample from log-probabilities using the Gumbel trick.

        Used for deterministic sampling with simple cases (no top-k/top-p/min-p).
        Requires sampling_seed to be set in sampling_info.
        Nz4sampling_seed is required for sampling from logprobsrE   )rc   multinomial_with_seedviewtor7   int32)r-   r]   r3   rD   sampled_indexr0   r0   r1   rT      s   zSampler._sample_from_logprobsc                 C   sd   |rt j|dd}t j|ddd}|t jS | js J dt||j|j	|j
|j}|t jS )zSample from temperature-scaled logits without softmax.

        Used for the Ascend NPU backend which handles softmax internally.
        rE   rF      num_samplesz1Only ascend backend supports sampling from logits)r7   rV   multinomialro   rp   rq   r,   -top_k_top_p_min_p_sampling_from_logits_ascendrh   ri   rj   rO   )r-   r2   r3   r^   ra   r[   r0   r0   r1   _sample_from_logits  s    
zSampler._sample_from_logitsc                 C   s<   | |j | |||}d}|rtstj|dd}||fS )am  Handle the full Ascend backend sampling path.

        Ascend backend has fused kernels that handle softmax internally,
        so we sample directly from temperature-scaled logits.

        Returns:
            A tuple of (batch_next_token_ids, logprobs). logprobs is None
            when return_logprob is False or SGLANG_RETURN_ORIGINAL_LOGPROB is set.
        NrE   rF   )rU   rR   rx   r   r7   rL   )r-   r2   r3   r^   rA   r[   r]   r0   r0   r1   rS     s   zSampler._forward_ascend_backendr[   c                 C   s   |j t|jjd tdd |D rt||\|_|_tdd |D r/t	||\|_
|_|tjt||jd|f |_d S )Nminc                 s       | ]}|d kV  qdS r   Nr0   .0xr0   r0   r1   	<genexpr>E      z5Sampler._attach_logprobs_to_output.<locals>.<genexpr>c                 s   s    | ]}|d uV  qd S Nr0   r}   r0   r0   r1   r   K  r   device)clamp_r7   finfodtyperz   r8   r   next_token_top_logprobs_valnext_token_top_logprobs_idxr   !next_token_token_ids_logprobs_val!next_token_token_ids_logprobs_idxarangelenr   next_token_logprobs)r-   r@   r]   rB   rC   r3   r[   r0   r0   r1   rY   8  s    

z"Sampler._attach_logprobs_to_outputc                 C   s*   t s|jrtjj|tjj| jd d S d S )N)opgroup)	r   grammarsr7   distributed
all_reducedistReduceOpMINr&   )r-   r[   r3   r0   r0   r1   rZ   V  s   

z!Sampler._sync_token_ids_across_tpNc           
      C   s   |j du rtd dS tdd |D }tdd |D }|s$|s$dS | |j |}tjjj|dd}	|r?t	|	|\|_
|_|rLt|	|\|_|_dS dS )z
        Compute logprobs for requested token IDs without performing sampling.

        Optimized for prefill-only scoring requests that need token probabilities
        but don't require next token generation.
        Nz+No logits available for logprob computationc                 s   s$    | ]}|d uot |dkV  qd S )Nr   r   r~   	token_idsr0   r0   r1   r   {  s
    
z0Sampler.compute_logprobs_only.<locals>.<genexpr>c                 s   r{   r|   r0   r}   r0   r0   r1   r     r   rE   rF   )rH   r:   r;   r8   r?   r7   r   rK   rL   r   r   r   &get_token_ids_logprobs_batch_optimizedr   r   )
r-   r@   r3   rA   rB   rC   needs_token_ids_logprobsneeds_top_logprobsr2   r]   r0   r0   r1   compute_logprobs_onlyg  s,   

zSampler.compute_logprobs_only)__name__
__module____qualname__r"   r7   Tensorr   r?   r   boolr   intrb   rW   rT   rx   r   r   rS   rY   rZ   r   __classcell__r0   r0   r.   r1   r   (   s    


p
5






rm   factoryr4   c                 C   sB   | st dddlm} | tv rtd|  ||  |t| < dS )z7Register a custom sampler factory for a backend string.z"backend must be a non-empty stringr   )SAMPLING_BACKEND_CHOICESz4Overriding existing sampler factory for backend '%s'N)r>   sglang.srt.server_argsr   r   r:   r;   add)rm   r   r   r0   r0   r1   register_sampler_backend  s   
r   c                 C   sn   t  }| p|r
|jnd} | tv r$t|   }t|ts"td|  d|S | du s,| tv r/t S td|  d)z7Create a sampler honoring custom backend registrations.Nz$Custom sampler factory for backend 'z' must return a SamplerzUnknown sampling backend 'z.'. Register it via register_sampler_backend().)r   r+   r   
isinstancer   	TypeError_BUILT_IN_SAMPLING_BACKENDSr>   )rm   server_argssamplerr0   r0   r1   create_sampler  s   



r   ra   rh   ri   rj   rO   rc   rD   c                 C   s  | j ddd\}}tj|dd}	d|tjd| jd | jddd|ddk< d||	| |ddk< |rR|d	u s>J d
|d	d	df | }
d|||
ddk < |d	u r^tj|dd}n|tj	}~|
  t|||}|tj}tj|d|dd}|S )z
    A top-k, top-p and min-p sampling implementation with native pytorch operations.
    When sampling_seed is not None, deterministic inference will be enabled, it will sample
    with the sampling_seed of each request.
    rE   TrG   
descendingrF           r   r   rs   NzDWith sampling seed, multinomial_with_seed will provide wrong resultsrt   rG   index)sortr7   cumsumr   shaper   ro   rv   rp   float64log_rn   rq   gather)ra   rh   ri   rj   rO   rc   rD   
probs_sort	probs_idx	probs_summin_p_thresholdsrr   r]   r[   r0   r0   r1   rl     s0   

rl   r2   c                 C   s  t tdr=t|dk|dk@ r=t| ||}|jdd}|r5|jdd| }||ddk }||d tj	|dd}	ntj| dd}
|
j
ddd	\}}|tk}|||
jd  tjd
|
jd |
jddd|ddk}||d tj|dd}|| |ddk}||d |r|ddd
f | }||ddk }||d tj	|dd}|tj}tj|d|d}	|	dS )zA top-k, top-p and min-p sampling implementation for ascend npu with torch_npu interface.

    Takes temperature-scaled logits as input (softmax is applied internally).
    npu_top_k_top_pi   rs   rE   rF   r   rt   Tr   r   r   Nr   )hasattr	torch_npur7   allr   rV   maxro   masked_fill_rv   r   r   r   r   r   r   rp   rq   r   )r2   rh   ri   rj   rO   logits_top_k_top_pprobs_top_k_top_pr   
min_p_maskr[   ra   r   r   topk_all_mask
top_k_maskr   
top_p_maskrr   r0   r0   r1   rw     s>   

rw   T)dynamicr]   seedc                 C   s   | j \}}|tj}tj|| jd}t|||}|tjttj	j
 }| jt|jjd  |   || tj tj|dddS )aL  
    Samples n elements from an input tensor `inputs` of shape (n, m) using
    a unique random seed for each row. This is a deterministic batched alternative to
    `torch.multinomial`.

    Args:
        inputs: A float tensor of shape (n, m) representing n categorical
                distributions with m categories each. The values are treated
                as weights and do not need to sum to 1.
        seed:   An integer tensor of shape (n,) containing the random seed
                for each corresponding row in `inputs`.
        positions: The positions of the tokens in the sequence. Used for deterministic sampling
                to get the unique seed for each position.

    Returns:
        A tensor of shape (n,) where the i-th element is an index sampled
        from the distribution in `inputs[i]` using `seed[i]`.
    r   ry   rs   TrG   keepdim)r   rp   r7   uint64r   r   r   r   iinfouint32r   r   r   r   r   rz   neg_add_rJ   )r]   r   rD   nmcol_indiceshashedr   r0   r0   r1   rn   !  s   
rn   c                 C   s@   |du rt j| dd}n	tt | ||}|dt j}|S )zA sampling implementation with native pytorch operations, without
    top-k, top-p, or min-p filtering.

    Note: For deterministic sampling from logprobs, use Sampler._sample_from_logprobs instead.
    Nrs   rt   rE   )r7   rv   rn   rX   ro   rp   rq   )ra   rc   rD   rr   r[   r0   r0   r1   rg   L  s   
rg   c                 C   s`   | j ddd\}}tj|dd}d||| |ddk< ||jddd t|d||S )NrE   Tr   rF   r   rs   r   )r   r7   r   ro   rU   sum
zeros_likescatter_)ra   ri   r   r   r   r0   r0   r1   top_p_normalize_probs_torcha  s
   r   rC   c                    s  t |} j}tjdd |D |d}t|  }|dkr/ fdd|D dd |D fS ttj||d|}tjdd |D |tj	d} ||f }tj
|| dd	}	g }
g }t|D ](\}}|d
ur{t |dkr{|
|	|  || q`|
 d |g  q`|
|fS )aK  
    Vectorized batch processing for token ID logprobs extraction.

    Uses a single GPU kernel call for the entire batch instead of multiple
    separate calls, significantly improving performance for large batches.

    Args:
        logprobs: Log probabilities tensor [batch_size, vocab_size]
        token_ids_logprobs: List of token IDs to extract logprobs for

    Example:
        # Input: batch_size=3, vocab_size=5
        logprobs = torch.tensor([
            [-1.2, -2.1, -0.8, -3.0, -1.5],  # batch 0
            [-0.5, -1.8, -2.2, -1.1, -2.7],  # batch 1
            [-2.0, -0.9, -1.4, -2.8, -1.6],  # batch 2
        ])
        token_ids_logprobs = [[1, 3], [2], [0, 2, 4]]

        # Output:
        # values = [tensor([-2.1, -3.0]), tensor([-2.2]), tensor([-2.0, -1.4, -1.6])]
        # indices = [[1, 3], [2], [0, 2, 4]]
    c                 S   s   g | ]}t |pg qS r0   r   r   r0   r0   r1   
<listcomp>  s    z:get_token_ids_logprobs_batch_optimized.<locals>.<listcomp>r   r   c                    s   g | ]}  d qS )r   )	new_emptyr~   _r]   r0   r1   r         c                 S   s   g | ]}g qS r0   r0   r   r0   r0   r1   r     s    c                 S   s   g | ]}|pg D ]}|qqS r0   r0   )r~   r   token_idr0   r0   r1   r     s    )r   r   rF   N)r   r   r7   tensorr   r   itemrepeat_interleaver   longsplit_with_sizestolist	enumerateappendr   )r]   rC   
batch_sizer   token_lengthstotal_tokensrow_indicesr   gathered_logprobssplit_logprobsoutput_token_ids_logprobs_valoutput_token_ids_logprobs_idxir   r0   r   r1   r   m  sB   
	r   rs   sampling_batch_infonum_tokens_in_batchc                    s   | j d t | ksJ d| j d  dt  d| d j D ]H\}\}}|jddd }|j d t ksJJ d|j d  d	t  dt||}|| |  fd
d|D | |< td|j	j
 d q#dS )a
  Apply custom logit processors to the logits.
    This function will modify the logits in-place.
    num_tokens_in_batch is needed to support spec decoding, where each batch can contain multiple
    tokens. By default, we assume each batch contains only 1 token.
    r   zThe batch size of logits (z8) does not match the batch size of sampling_batch_info (z) x num_tokens_in_batch ()T)as_tuplezThe number of batch mask (z4) does not match the number of sampling_batch_info (c                    s   g | ]} j | qS r0   )custom_params)r~   r   r   r0   r1   r     r   z0apply_custom_logit_processor.<locals>.<listcomp>zCustom logit processor z is applied.N)r   r   custom_logit_processoritemsnonzeror7   r   r:   debugr/   r   )r2   r   r   r   	processor
batch_maskbatch_indicesr0   r   r1   r6     s4   	
r6   r   )NN)rs   )Aloggingtypingr   r   r   r   r   r7   torch.distributedr   r   r   sglang.srt.distributedr   sglang.srt.layers.dp_attentionr	   r
   "sglang.srt.layers.logits_processorr   sglang.srt.layers.utils.hashr   sglang.srt.layers.utils.logprobr   r   'sglang.srt.sampling.sampling_batch_infor   #sglang.srt.sampling.sampling_paramsr   r   r   sglang.srt.utils.commonr   r   r   r   
sgl_kernelr   r   r   r   r   	getLoggerr   r:   r   r   r   str__annotations__r   Moduler   r   r   r   r   rl   rw   compilern   rg   r   r   r   r6   r0   r0   r0   r1   <module>   s   
 
  s
0

4,




[