o
    i$                     @   s  d dl Z d dlmZ d dlZd dlmZ d dlmZ d dlm	Z	m
Z
 d dlmZ d dlmZmZ d dlmZ d d	lmZ e
jd
e	jfddZdd Zdedeee ef fddZdee dee dee dedeedf dee dededededeeef fddZdee dee dee fd d!Zd"eded#ed$eeef d%ed&eeef deeef deedf fd'd(Z d"eded%ed&eeef d$eeef deeef deedf fd)d*Z!dS )+    N)Any)CacheConfig)MambaStateCopyFunc)tltriton)SchedulerOutput)KVCacheConfig	MambaSpec)CachedRequestState)GPUInputBatch
BLOCK_SIZEc                 C   s   t d}t | | }t || }t || }t d|}td||D ]1}	|	| |k }
||	 | t t j}||	 | t t j}t j||
d}t j|||
d q&d S )Nr   )mask)	r   
program_idloadarangerangetopointer_typeuint8store)src_ptrsdst_ptrssizesr   pidsrc_ptrdst_ptrsizeoffsetsir   curr_src_ptrcurr_dst_ptrdata r"   P/home/ubuntu/vllm_env/lib/python3.10/site-packages/vllm/v1/worker/mamba_utils.pybatch_memcpy_kernel   s   
r$   c                 C   sP   | j d }|j d |ksJ |j d |ksJ |f}d}t| | |||d d S )Nr   i   )r   )shaper$   )r   r   r   batchgridr   r"   r"   r#   batch_memcpy&   s   
r(   kv_cache_configreturnc                    s   g }g  t t| jD ]}| j| j}t|tr"||  | qt|dks-J dt fdd D s:J | d fS )Nr   zno mamba layers in the modelc                 3   s    | ]	} d  |kV  qdS )r   Nr"   ).0specmamba_specsr"   r#   	<genexpr>9   s    z#get_mamba_groups.<locals>.<genexpr>)r   lenkv_cache_groupskv_cache_spec
isinstancer	   appendall)r)   mamba_group_idsr   r2   r"   r-   r#   get_mamba_groups0   s   


r7   src_state_listdest_state_listnum_elements_listmamba_state_copy_funcs.r6   src_block_idxdest_block_idxaccept_token_bias	req_stateforward_contextc              	   C   s   ||kr
|dkr
d S |D ]K}|	j | }|| }|j| j}|D ]7}|
| }|jd }t||D ]&\}}|||||d }| |j |||   ||j|	   q/qqd S )Nr      )
	block_idsr1   layer_nameskv_cachezipr4   
start_addrdata_ptrnum_elementselement_size)r8   r9   r:   r)   r;   r6   r<   r=   r>   r?   r@   mamba_group_idrB   dest_block_idrC   
layer_name	attention	kv_cachesstatestate_copy_func	copy_specr"   r"   r#   collect_mamba_copy_meta=   s&   

rR   c                 C   s~   t | dkrd S t | t |ksJ t | t |ksJ tj| dtjd}tj|dtjd}tj|dtjd}t||| d S )Nr   cuda)devicedtype)r0   torchtensorint64int32r(   )r8   r9   r:   src_state_ptrsdst_state_ptrsrH   r"   r"   r#   do_mamba_copy_block^   s   r\   scheduler_outputcache_configmamba_state_idxinput_batchrequestsc                 C   s  t |\}}	|	j}
|jsJ |	j}| j}| jpt }t||D ]}|	|d q g }g }g }t
|jD ]K\}}|| }||}|du rL|jd | }t|j|d  }|d |
 }|||< |dkr||krt|||||||||j| d || d|j|< q4t||| dS )zc
    Copy the mamba state of previous step to the last
    (1 + num_speculative_blocks) block.
    NrA   r   )r7   num_speculative_blocksenable_prefix_caching
block_sizefinished_req_idspreempted_req_idsset	itertoolschainpop	enumeratereq_idsgetnum_computed_tokensr0   rB   rR   num_accepted_tokens_cpur\   )r]   r)   r^   r_   r`   ra   r@   r;   r6   
mamba_specrc   re   rf   rg   req_idr8   r9   r:   r   r?   prev_state_idx
num_blockscurr_state_idxr"   r"   r#   preprocess_mamban   sH   


rv   c                 C   s   | j }| j}|j}	t|\}
}g }g }g }t|jD ]X\}}|| }|j}t||g }|| }|	| }|| | }|| d }||j	 |j	 }||krr|| }|| }||j	 d }t
||||||
||||| ||krrd|	|< qt||| dS )z
    If a blocks is converted from partial block to full block in this step, copy the
    state from the block for running state to the new full block.
    rA   N)num_scheduled_tokensscheduled_spec_decode_tokensrp   r7   rl   rm   ro   r0   rn   re   rR   r\   )r]   r)   r`   ra   r_   r@   r;   num_scheduled_tokens_dict!scheduled_spec_decode_tokens_dictrp   r6   rq   r8   r9   r:   r   rr   r?   ro   num_draft_tokensrw   num_accepted_tokensnum_tokens_running_statenew_num_computed_tokensaligned_new_computed_tokensr>   r<   r=   r"   r"   r#   postprocess_mamba   sN   
r   )"ri   typingr   rV   vllm.configr   ,vllm.model_executor.layers.mamba.mamba_utilsr   vllm.triton_utilsr   r   vllm.v1.core.sched.outputr   vllm.v1.kv_cache_interfacer   r	   vllm.v1.worker.gpu_input_batchr
   &vllm.v1.worker.lora_model_runner_mixinr   jit	constexprr$   r(   tuplelistintr7   dictstrrR   r\   rv   r   r"   r"   r"   r#   <module>   s   

	


!





C



