o
    i;                     @   s   d dl Z d dlmZmZ d dlmZmZ d dlmZ dZ	ej
dd Zej
dejfd	d
Zdede jde jdededefddZdedefddZdedede jde jdef
ddZej
dejfddZdS )    N)
VllmConfigreplace)tltriton)CommonAttentionMetadatac                 C   s   t jdd}||krdS t | | }d}|dkr|}nt | | d }	||	 }t || }
|d |
 }t |dk|d}t || d d }|| }t || | t || | dS )a  
    Fused kernel for Eagle prepare_input_padded. This kernel computes the
    token index to sample for each request, taking into account the number
    of draft tokens and the number of valid sampled tokens (which is one more than
    the number of accepted tokens).
    r   axisN   )r   
program_idloadwherestore)cu_num_draft_tokens_ptrvalid_sampled_tokens_count_ptrquery_start_loc_gpu_ptrtoken_indices_to_sample_ptrnum_rejected_tokens_gpu_ptrnum_reqsreq_idxcu_draft_currnum_draft_tokenscu_draft_prevvalid_countnum_rejected_tokensq_last_tok_idxindex_to_sample r   O/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/v1/spec_decode/utils.py"eagle_prepare_inputs_padded_kernel   s    r   BLOCK_SIZE_TOKENSc
                 C   s8  t jdd}
|
|krdS t ||
 }|r7t ||
 }t jddt jd}t ||
 | t ||
 | dS t d|	}||k }| |
|  }t j|| |dd}|dk||k @ |@ }t |}|dkrt t 	||d}t t 	||k|d}t ||
 | nt ||
 }t ||
 | t ||
 | dS )a  
    Fused kernel for Eagle prepare_next_token_ids_padded. This kernel computes the
    number of valid (1 + accepted) tokens for each request, and the corresponding
    "next" token id to sample from during speculative decoding. This is the
    "last accepted token" from the sampled tokens, or the backup token if no
    tokens were accepted or if the request is marked as discarded.
    r   r   Nr   dtyper   maskother)
r   r   r   fulluint32r   arangesummaxr   )sampled_token_ids_ptrdiscard_request_mask_ptrbackup_next_token_ids_ptrnext_token_ids_ptrr   
vocab_sizenum_sampled_tokens_per_reqr   stride_sampled_token_idsr    r   is_discardedbackup_tokenr   
token_offs
token_maskrow_ptr	token_idsis_valid_masklast_valid_indexlast_valid_tokenr   r   r   &eagle_prepare_next_token_padded_kernel9   s0   
r;   cadnew_positionsis_rejected_token_mask
block_sizenum_new_tokensmax_model_lenc                 C   s   | j j\}}tj|| jjd}tj||  | t|d}tj	||d d}	|| |	|  }
| j 
d|
 }|	| }|| | }||k}||t ||t |S )N)device)output_sizer
   )r*   r   )block_table_tensorshapetorchr(   query_start_locrB   repeat_interleavenaive_query_lenslenclampviewmasked_fill_PADDING_SLOT_ID)r<   r=   r>   r?   r@   rA   
batch_sizen_blocks_per_reqreq_indicesclamped_positionsblock_table_indices
block_numsblock_offsetsnew_slot_mappingexceeds_max_model_lenr   r   r   compute_new_slot_mappingx   s"   
rX   target_model_vllm_configreturnc                 C   sD   | }|j dusJ d|j }t|j|jjd}t|d||jd}|S )an  The vllm_config is configured for the target model, e.g.
    its quant_config and parallel_config. But the draft model is potentially
    quantized differently, and has potentially different tensor_parallel_size.
    This function creates a new vllm_config configured for the drafter.
    The vllm_config is useful when loading the draft model with get_model().
    Nzspeculative_config is not set)rank)quant_configparallel_configmodel_config)speculative_configr   draft_parallel_configr]   r[   draft_model_config)rY   oldold_spec_confignew_parallel_confignewr   r   r   "create_vllm_config_for_draft_model   s   	
rf   common_attn_metadataNr(   rV   c              	   C   s|   | }|j ||dt|j    }|j|tjt|jtjd  }|j|||j| |j|	 |  |j
| |j| |d}|S )ah  
    Creates a new CommonAttentionMetadata with all query lengths increased by N.
    Also all seq lens are increased by N.
    This is useful e.g. in speculative decoding with parallel drafting, where we
    extend each sequence by N tokens and predict all tokens in one pass.
    The slot mapping is computed externally, as it requires more information.
    Nr!   )rG   query_start_loc_cpuseq_lensnum_actual_tokensmax_query_lenmax_seq_lenslot_mapping)rG   rJ   ri   rF   r(   int32r   rj   rk   rO   rl   rm   )rg   rh   r(   rV   r<   new_query_start_locnew_query_start_loc_cpunew_cadr   r   r   extend_all_queries_by_N   s   

rs   c           0      C   sT  t jdd}t jdd}t |	| }t |	| d }t |
| }|r4|| }d}|||d   }n|| d }d}|||  }|| d }|| | }|| t d| }||k }||k }||k}||k||| k @ }||| k} || }!|| | }"t |"|d }#t j| |# ||@ dd}$t || }%t || }&t ||&|$}$t |||$}$t | ||$}$|%| }'t | d|'}'| |@ }(||@ })||k||| k @ }*|| }+|| |+ },|r|| }-||-k }.|| }/t j||/ |!|.d t j||! |$|d t j||! |'|d t j||! |(|d t j||! |)|d t j||, |!|*|@ d dS )z
    Copy and expand inputs from the target model to the drafting buffers for Eagle
    speculative decoding. This kernel handles padding slots and parallel drafting
    tokens, if enabled.
    r   r   r
   r#   )r$   N)r   r   r   r(   minimumr   r   )0target_token_ids_ptrtarget_positions_ptrr.   out_input_ids_ptrout_positions_ptrout_is_rejected_token_mask_ptrout_is_masked_token_mask_ptrout_new_token_indices_ptrout_hidden_state_mapping_ptrquery_start_loc_ptrquery_end_loc_ptrpadding_token_idparallel_drafting_token_idtotal_input_tokensnum_padding_slots_per_requestshift_input_idsr    request_idxtoken_batch_idxrG   next_query_start_locquery_end_locnum_valid_tokensinput_offsetoutput_startnum_rejectedtotal_output_tokensj	in_boundsis_valid_regionis_bonus_regionis_parallel_draft_regionis_rejected_regionout_idxin_idxin_idx_clampedr7   	start_posbonus_token	positionsis_rejected_outis_masked_outis_new_token_regionnew_token_local_idxnew_token_out_idxnum_input_tokens_this_requestis_input_regionsrc_idxr   r   r   #copy_and_expand_eagle_inputs_kernel   s~   
	



r   )rF   vllm.configr   r   vllm.triton_utilsr   r    vllm.v1.attention.backends.utilsr   rN   jitr   	constexprr;   TensorintrX   rf   rs   r   r   r   r   r   <module>   sX   
*
>
 

"