o
    
۾i9                  
   @   sJ  d dl Z d dlmZ d dlmZ d dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZmZ eeZG dd	 d	ejZe jd
dde jde jfddZde jde jdB de jdB de jfddZde jde jde jfddZde jdeee jf de jfddZde jde jdB de jdB deee jf de jf
ddZdd ZdS )    N)version)envs)rocm_aiter_ops)LogprobsMode)init_logger)CpuArchEnumcurrent_platformc                       s|  e Zd ZdZddeddf fddZdejd	ee	ej
f d
ejdB dejdB deejejdB f f
ddZdejd	ee	ej
f d
ejdB dejdB deejejdB f f
ddZdejd	ee	ej
f d
ejdB dejdB deejejdB f f
ddZdejd	ee	ej
f d
ejdB dejdB deejejdB f f
ddZdejd
ejdB dejdB d	ee	ej
f dejf
ddZ  ZS )TopKTopPSamplerz
    Module that performs optional top-k and top-p filtering followed by
    weighted random sampling of logits.

    Implementations may update the logits tensor in-place.
    raw_logprobslogprobs_modereturnNc                    s8  t    || _|dvrJt rJtjr@ddlm} t	 }|d us#J |
|s4| }td| dtjddd | j| _nWtd	 | j| _nMt rdt }|tjtjfv r_| j| _n8| j| _n3|dvrt rzdd l}tjj| _td
 | j | _W n t!y   t"d | j| _Y nw | j| _t#| _#d S )Nprocessed_logitsprocessed_logprobsr   )FlashInferBackendz/FlashInfer does not support compute capability z&, unset VLLM_USE_FLASHINFER_SAMPLER=1.z,Using FlashInfer for top-p & top-k sampling.global)scopezFlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads.z9Using aiter sampler on ROCm (lazy import, sampling-only).z[aiter.ops.sampling is not available on ROCm. Falling back to forward_native implementation.)$super__init__r   r   is_cudar   VLLM_USE_FLASHINFER_SAMPLER%vllm.v1.attention.backends.flashinferr   get_device_capabilitysupports_compute_capabilityas_version_strRuntimeErrorlogger	info_onceforward_cudaforward
debug_onceforward_nativeis_cpuget_cpu_architecturer   RISCVPOWERPCforward_cpur   
is_enabledaiter.ops.samplingtorchopsaiter	aiter_opsforward_hipImportErrorwarning_onceapply_top_k_top_p)selfr   r   
capabilitycapability_strarchr+   	__class__ X/home/ubuntu/.local/lib/python3.10/site-packages/vllm/v1/sample/ops/topk_topp_sampler.pyr      s`   







zTopKTopPSampler.__init__logits
generatorskpc                 C   sZ   |  |||}d}| jdkr|}n| jdkr|jdtjd}|jdtjd}t|||fS )z
        PyTorch-native implementation of top-k and top-p sampling.

        The logits tensor may be updated in-place.
        Nr   r   dimdtype)r0   r   log_softmaxr)   float32softmaxrandom_sample)r1   r9   r:   r;   r<   logits_to_returnprobsr7   r7   r8   r!   \   s   

zTopKTopPSampler.forward_nativec                 C   sZ   |du r|du s
|r|rt d | ||||S | jdvs"J dt| |||dfS )z;More optimized implementation for top-k and top-p sampling.NziFlashInfer 0.2.3+ does not support per-request generators. Falling back to PyTorch-native implementation.r   z5FlashInfer does not support returning logits/logprobs)r   r    r!   r   flashinfer_sample
contiguous)r1   r9   r:   r;   r<   r7   r7   r8   r   q   s   zTopKTopPSampler.forward_cudac           
      C   s   |  |||}d}| jdkr|}n| jdkr|jdtjd}t||jd kr-t||fS |jdtjd}t	|}|
  | D ]\}}	|| j
|	d qB||jddd|fS )	z
        PyTorch-native implementation of top-k and top-p sampling for CPU.

        The logits tensor may be updated in-place.
        Nr   r   r=   r>   r   	generatorr?   )r0   r   rA   r)   rB   lenshapecompiled_random_samplerC   
empty_likeexponential_itemsdiv_argmaxview)
r1   r9   r:   r;   r<   rE   rF   qirJ   r7   r7   r8   r&      s   


zTopKTopPSampler.forward_cpuc                 C   sr   d}	 |d u r|d u s|r|rt d | ||||S | jdvs%J d|r/| ||||S | ||||d fS )NTzVaiter sampler does not support per-request generators; falling back to PyTorch-native.r   z9aiter sampler does not support returning logits/logprobs.)r   r/   r!   r   aiter_sample)r1   r9   r:   r;   r<   DISABLE_AITER_SAMPLERr7   r7   r8   r-      s   zTopKTopPSampler.forward_hipc           
      C   s   |du}|du}|r/|r/|j dtjd }| jj|dgt|t|R ddi}|dS |rP|j dtjd }| jj|dgt|R ddi}|dS |rr|j dtjd }| jj	|gt|R  }	tj
|	dddS td)	z#Sample from logits using aiter ops.Nr=   r>   deterministicT   )num_samplesz6aiter_sample was called with no active top-k or top-p.)rC   r)   rB   rH   r,   top_k_top_p_sampling_from_probs_to_tensor_scalar_tuplerT   top_p_sampling_from_probstop_k_renorm_probsmultinomialr   )
r1   r9   r;   r<   r:   	use_top_k	use_top_prF   next_token_idsrenorm_probsr7   r7   r8   rW      sD   

zTopKTopPSampler.aiter_sample)r
   )__name__
__module____qualname____doc__r   r   r)   Tensordictint	Generatortupler!   r   r&   r-   rW   __classcell__r7   r7   r5   r8   r	      st    B



r	   T)dynamicr9   r   c                 C   s:   | j dtjd}t|}|  ||jdddS )Nr=   r>   rK   )rC   r)   rB   rO   rP   divrS   rT   )r9   rF   rU   r7   r7   r8   rN      s   
rN   r;   r<   c           	      C   s   |du r|du r
| S t | |S | jddd\}}|dur>|d|tj }|d|jdd}||k }||t	d  |durk|j
dd}tj|d|d}|d|jdd k}d|dddf< ||t	d  |jd||d	} | S )
zApply top-k and top-p masks to the logits.

    If a top-p is used, this function will sort the logits tensor,
    which can be slow for large batches.

    The logits tensor may be updated in-place.
    Nr=   F)r?   
descendingrZ   rK   inf)r?   out)r?   indexsrc)apply_top_k_onlysortsizetor)   longgather	unsqueezemasked_fill_floatrC   cumsumscatter)	r9   r;   r<   logits_sort
logits_idx
top_k_mask
probs_sort	probs_sum
top_p_maskr7   r7   r8   r0      s$   
r0   c                 C   s   || j d k}||d}| }|dd}| j|ddjd| }|	|dt
d  | 	| |k t
d  | S )z
    Apply top-k mask to the logits.

    This implementation doesn't involve sorting the entire vocab.

    The logits tensor may be updated in-place.
    rZ   rK   rr   )rM   masked_fillmaxsub_r|   topkvaluesr{   rz   r}   r~   )r9   r;   no_top_k_mask	max_top_kk_indexr   r7   r7   r8   rv     s   rv   rF   r:   c                 C   sb   t | }t|| jd kr|  |r%| D ]\}}|| j|d q| |jdddS )zRandomly sample from the probabilities.

    We use this function instead of torch.multinomial because torch.multinomial
    causes CPU-GPU synchronization.
    r   rI   r=   rK   )	r)   rO   rL   rM   rP   rQ   rR   rS   rT   )rF   r:   rU   rV   rJ   r7   r7   r8   rD   7  s   
	rD   c                 C   s   ddl }t|jtdk rtd|du r|du rJ |du r3| jdtjd}|jj	||dd}n |du rI| jdtjd}|jj
||dd}n
|jj| ||dd}|dS )	ab  Sample from the logits using FlashInfer.

    Statistically, this function is equivalent to the `random_sample` function.
    However, this function is faster because it avoids sorting the logits tensor
    via rejection sampling.

    NOTE: The outputs of this function do not necessarily match the outputs of
    the `random_sample` function. It only guarantees that the outputs are
    statistically equivalent.

    NOTE: This function includes CPU-GPU synchronization, while `random_sample`
    does not. Call this function at the end of the forward pass to minimize
    the synchronization overhead.
    r   Nz0.2.3zCFlashInfer version >= 0.2.3 required for top-k and top-p sampling. r=   r>   T)rY   )
flashinferr   parse__version__r.   rC   r)   rB   samplingr^   top_k_sampling_from_probs top_k_top_p_sampling_from_logitsrT   )r9   r;   r<   r:   r   rF   rc   r7   r7   r8   rG   O  s(   
rG   c                 C   s   t | tjr
| dfS d | fS )Nr   )
isinstancer)   ri   )xr7   r7   r8   r]     s   r]   )r)   torch.nnnn	packagingr   vllmr   vllm._aiter_opsr   vllm.config.modelr   vllm.loggerr   vllm.platformsr   r   re   r   Moduler	   compileri   rN   r0   rv   rj   rk   rl   rD   rG   r]   r7   r7   r7   r8   <module>   s`    
Z
+


1