o
    
۾i$                     @   s  d dl Zd dlZd dlmZ d dlmZmZ d dlm	Z	 d dl
mZ G dd dZejdejd	ejfd
dZdejdejdejdejdejdejdejdejdejdeddfddZejddgddejfddZdejdededejdejddfdd Zd!edefd"d#ZdS )$    N)SamplingParams)tltriton)cdiv)UvaBackedTensorc                   @   s   e Zd ZdededejfddZdededd	fd
dZdej	de
jde
jdd	fddZdej	dej	de
jdej	dej	dedd	fddZd	S )PenaltiesStatemax_num_reqs
vocab_sizedevicec                 C   s   || _ || _|| _t|tjd| _t|tjd| _t|tjd| _t	j
|td| _| jj	d | j  tj
| j t| jdtj| jd| _tj
| j | jtj| jd| _g | _d S )Ndtype      ?    )r   r
   )r   r	   r
   r   torchfloat32repetition_penaltyfrequency_penaltypresence_penaltynpzerosbooluse_penaltyfillcopy_to_uvar   int32prompt_bin_maskoutput_bin_counts_penalties_reqs)selfr   r	   r
    r   W/home/ubuntu/.local/lib/python3.10/site-packages/vllm/v1/worker/gpu/sample/penalties.py__init__   s&   


zPenaltiesState.__init__req_idxsampling_paramsreturnNc                 C   sT   |j | j j|< |j| jj|< |j| jj|< t|}|| j|< |r(| j| d S d S N)r   r   r   r   r   r   append)r   r"   r#   
do_penaltyr   r   r    add_request*   s   
zPenaltiesState.add_requestprefill_token_idsprefill_lensprompt_lensc                 C   sh   | j D ]}t|| t|| t|| | j| | j|  q| j   | j  | j  | j	  d S r%   )
r   bincountintr   r   clearr   r   r   r   )r   r)   r*   r+   r"   r   r   r    apply_staged_writes4   s   





z"PenaltiesState.apply_staged_writeslogitsidx_mappingidx_mapping_np	input_idsexpanded_local_posnum_speculative_tokensc                 C   sB   t | j| s
d S t||||| jj| jj| jj| j| j	|
 d S r%   )
r   anyr   apply_penaltiesr   gpur   r   r   r   )r   r0   r1   r2   r3   r4   r5   r   r   r    r7   I   s   	zPenaltiesState.apply_penalties)__name__
__module____qualname__r-   r   r
   r!   r   r(   Tensorr   ndarrayr/   r7   r   r   r   r    r      s6    

r   
BLOCK_SIZEMAX_SPEC_LENc           )      C   sD  t d}t || }t || }t || }t || }|dk}|dk}|dk}|p2|p2|}|s7d S t d}|| t d| }||k }t j| ||  | |d}|t j}t j|
||  | |dd}t || }|| }t j|ft jd}t |D ]} | |k rt || |  d }!||!k}"||"t j }q|| }#|#dk}$|r|| d t d|d  }%t j|||	  |% |%t 	|dk dd}&|&d d d f t ddd d d f ? d@ }'|'t j
}'|'|}'t |'|$B |d}(|t |dkd|( |(9 }|||# 8 }|||$ 8 }t j| ||  | ||d d S )	Nr   r              mask)rC   otherr   r   )r   
program_idloadarangetor   r   r   static_ranger   int1reshapewherestore))
logits_ptrlogits_strideidx_mapping_ptrtoken_ids_ptrexpanded_local_pos_ptrrepetition_penalty_ptrfrequency_penalty_ptrpresence_penalty_ptrprompt_bin_mask_ptrprompt_bin_mask_strideoutput_bin_counts_ptroutput_bin_counts_strider	   r>   r?   	token_idxreq_state_idxrep_penaltyfreq_penaltypres_penaltyuse_rep_penaltyuse_freq_penaltyuse_pres_penaltyr   	block_idxblockrC   r0   base_output_countspos	start_idxdraft_countsprev_pos
prev_tokentoken_matchr   output_bin_maskpacked_blockpacked_maskr   scaler   r   r    _penalties_kerneld   s^   

,
 ro   r0   r1   	token_idsr4   r   r   r   r   r   r5   r$   c
                 C   s^   | j \}
}d}t||}t|
|f | | d||||||||d||d|||	d d S )Ni    r   )r>   r?   )shaper   r   ro   stride)r0   r1   rp   r4   r   r   r   r   r   r5   
num_tokensr	   r>   
num_blocksr   r   r    r7      s(   


r7   prefill_len
prompt_len)do_not_specializec                 C   s   t d}|| |krd S || t d| }|| |k rG||k }t j| | |d}	|	d }
|	d }t |fdt j|> }t j||
 ||d |d | |krn||k }|||kM }t j| | |d}	t j||	 d|d d S d S )Nr   rB   r   rA   )r   rE   rG   rF   fullr   	atomic_or
atomic_add)prefill_token_ids_ptrru   rv   rV   rX   r>   rb   rc   rC   prefill_tokensidxbit_idxbitr   r   r    _bincount_kernel   s"   
	r   r)   c                 C   s>   |   |   d}t||}t|f | |||||d d S )Ni   )r>   )zero_r   r   r   )r)   ru   rv   r   r   r>   rt   r   r   r    r,      s   
r,   r#   c                 C   s   | j dkp| jdkp| jdkS )Nr   r@   )r   r   r   )r#   r   r   r    r   
  s
   
r   )numpyr   r   vllm.sampling_paramsr   vllm.triton_utilsr   r   vllm.utils.math_utilsr   vllm.v1.worker.gpu.buffer_utilsr   r   jit	constexprro   r<   r-   r7   r   r,   r   r   r   r   r   r    <module>   sl   XR	

"
