o
    
۾i                     @   s~  U d dl Z d dlmZ d dlmZmZmZmZ d dlm	Z	m
Z
mZmZmZ d dlZd dlZd dlmZ d dlmZmZ d dlmZ d dlmZmZ e	rWd d	lmZ d d
lmZ d dlm Z  d dl!m"Z" d dl#m$Z$ d dl%m&Z& d dl'm(Z(m)Z)m*Z*m+Z+m,Z, e$e-Z.ed Z/da0e/dB e1d< dZ2de3de4fddZ5e j6dd Z7de/fddZ8eG dd dZ9dede:e3 de;d  de<e3e9f fd!d"Z=d#e<e3e9f de9fd$d%Z>	 dad&e?d'e+d(e?de@e+eejAgejAf f fd)d*ZBd'e+de+fd+d,ZC	-dbd'e+d.e?de@e?e?e?e?e?e?f fd/d0ZD	-	1dcd'e+d.e?d2e4de@e?e?e?e?f fd3d4ZE	 dad5ejAd6e?d7e?de:e@e?e?f  fd8d9ZF	-dbd:d;d<d=d.e?de4fd>d?ZGd@ejAdAe?dejAfdBdCZHdDejAdejAfdEdFZIdGe3dHe
dIe:e@e3e
e
f  de
fdJdKZJeG dLdM dMeZKdNe3dOe;e( de;e( fdPdQZLdRejAdSejMfdTdUZN	-		-dddVejAdWe?dXe?dB dYe?dejAf
dZd[ZOd\ejAdVejAd]ed^e3dejAf
d_d`ZPdS )e    N)Callable)	dataclassfieldfieldsmake_dataclass)TYPE_CHECKINGAnyLiteralProtocolget_args)runtime_checkable)
VllmConfigget_layers_from_vllm_config)cdiv)KVCacheSpec	MambaSpec)SchedulerOutput)
InputBatch)get_kv_connector_cache_layout)init_logger)AttentionLayerBase)AttentionBackendAttentionImplAttentionMetadataCommonAttentionMetadatasubclass_attention_backend)NHDHND_KV_CACHE_LAYOUT_OVERRIDEvaluereturnc                 C   s   | t tv S N)r   KVCacheLayoutType)r     r$   T/home/ubuntu/.local/lib/python3.10/site-packages/vllm/v1/attention/backends/utils.pyis_valid_kv_cache_layout/   s   r&   c                  C   sT   d } t d urt } td|  | S tj} | d u rt } | S t| s"J td|  | S )NzM`_KV_CACHE_LAYOUT_OVERRIDE` variable detected. Setting KV cache layout to %s.zT`VLLM_KV_CACHE_LAYOUT` environment variable detected. Setting KV cache layout to %s.)r   logger	info_onceenvsVLLM_KV_CACHE_LAYOUTr   r&   cache_layoutr$   r$   r%   get_kv_cache_layout3   s$   r-   r,   c                 C   s   | a d S r"   )r   r+   r$   r$   r%   set_kv_cache_layoutQ   s   r.   c                   @   sj   e Zd ZU dZeed< edB ed< eed< dZeed< e	dddZ
edB ed	< e	dddZedB ed
< dS )PerLayerParametersa  
    Currently, FlashInfer backend only support models in which all layers share
    the same values for the following hyperparameters. Should not be used for
    trtllm-gen backend since it supports different values for the following
    hyperparameters.
    window_leftNlogits_soft_capsm_scaleF	has_sinks)defaultcomparehas_same_window_leftshas_same_all_params)__name__
__module____qualname____doc__int__annotations__floatr3   boolr   r6   r7   r$   r$   r$   r%   r/   V   s   
 r/   vllm_configlayer_namescls_r   c                 C   s   t | t|}i }| D ]8\}}|j}t||sJ t|dd}|dur(|d nd}	t|dd}
|j}t|dddu}t|	|
||||< q|S )zc
    Scan layers in `layer_names` and determine some hyperparameters
    to use during `plan`.
    sliding_windowNr   r   r1   sinks)r   r   itemsimpl
isinstancegetattrscaler/   )r@   rA   rB   layersper_layer_paramskeylayerrF   window_sizer0   r1   r2   r3   r$   r$   r%   get_per_layer_parametersh   s$   
rO   rK   c                    s\   t | dks
J dt|  }|d  t fdd|D  _t fdd|D  _ S )ad  
    Currently, FlashInfer backend other than trtllm-gen
    only support models in which all layers share
    the same values for the following hyperparameters:
    - `window_left`
    - `logits_soft_cap`
    - `sm_scale`

    So this function asserts that all layers share the same values for these
    hyperparameters and returns the global values.
    r   z'No attention layers found in the model.c                 3   s    | ]	}|j  j kV  qd S r"   )r0   .0paramsglobal_paramsr$   r%   	<genexpr>   s    
z/infer_global_hyperparameters.<locals>.<genexpr>c                 3   s    | ]}| kV  qd S r"   r$   rP   rS   r$   r%   rU      s    
)lenlistvaluesallr6   r7   )rK   
param_setsr$   rS   r%   infer_global_hyperparameters   s   

r[   attn_chunk_sizecommon_attn_metadata
block_sizec                    s  |j  }|j }|j}|jj}|dd  |d d  }|jd }t| || |   |	tj
}	| ||    }
dt||	 |  }t|}|d t|| |}tjtj
d| }t||| d }t||	 |}|	||dk< t|| |d   | |dk ||dk< tjd tj
d}tj||dd  d d|d< tj|d | tj
d}|
||d < || }t||||  t|
|  }|| }| | dksJ d|  d| | | }|d d d f tj|tj
d }|dj|jd d d}ttj|tj
d|| }t| t| fd	d
}||}t|}t|}t| }t||j|dd|j|ddt||j| |||jd|t|d|fS )N   r   r   dtype)outzattn_chunk_size z  is not divisible by block_size )maxc                    s   |  f  dS )Nr   )view)block_tablebatch_indices_torchblock_indices_torchvirtual_batchesr$   r%   <lambda>Q  s    
z6make_local_attention_virtual_batches.<locals>.<lambda>T)devicenon_blocking)query_start_loc_cpuquery_start_locseq_lensnum_reqsnum_actual_tokensmax_query_lenmax_seq_lenblock_table_tensorslot_mappingcausal_seq_lens_cpu_num_computed_tokens_cpu)rm   numpyseq_lens_cpurt   rn   rk   shapenpminimumastypeint32r   cumsumrepeatarangeemptyfullreshapecliptorch
from_numpyr<   rc   r   torV   rq   ru   )r\   r]   r^   query_start_loc_npseq_lens_npre   rk   	q_seqlensactual_batch_sizeq_tokens_in_first_blocktokens_in_last_blocklocal_blockscu_num_blocksblock_offsetsr   rarangeseqlens_q_localcu_seqlens_q_localseqlens_k_localnum_computed_tokens_localk_seqstarts_absoluteblock_startspages_per_local_batchblock_indicesbatch_indicesmake_block_tableblock_table_localrm   rz   rs   r$   rf   r%   $make_local_attention_virtual_batches   s   





	


r   c                 C   s  | j dkr| S | jd usJ | jd usJ | j}| j}|d | }| j}| j}tj||dd  dd}tj||d}tj|d |j	|j
d}d|d< tj|dd|dd < t|  }	t|  }
t||jddd	| j||
|	| j| j| jd| j| jd
} | S )Nr_   T)right)	minlength)rk   ra   r   dimcpu)rl   )rn   rm   ro   rp   rq   rr   rs   rt   ru   rv   rw   rx   )rr   logits_indices_paddednum_logits_indicesrp   rn   r   	bucketizebincountr   rk   ra   r   r<   rc   itemsumr   r   ro   rs   rt   ru   rw   rx   )r]   r   r   logits_indicesrp   rn   request_idsnum_decode_tokensdecode_query_start_locdecode_max_query_lentotal_num_decode_tokensr$   r$   r%   1make_kv_sharing_fast_prefill_common_attn_metadataj  sB   
	r   r_   decode_thresholdc                 C   s  | j }| j}| j}| j}| j}||kr|dd|ddfS |dd |dd  }||k}||k|@ }	| jdd }
|	 jdd }|
}||
  }t	|sZ|dd|ddfS || }|| }t	|	so||d||dfS || }|| }|||  }|| }||||||fS )a  
    Assuming a reordered batch, finds the boundary between prefill and decode
    requests.

    Args:
        common_attn_metadata: CommonAttentionMetadata object containing the
            batch metadata.
        decode_threshold: The maximum query length to be considered a decode.

    Returns:
        num_decodes: The number of decode requests.
        num_extends: The number of extend requests.
        num_prefills: The number of prefill requests.
        num_decode_tokens: The number of tokens in the decode requests.
        num_extend_tokens: The number of tokens in the extend requests.
        num_prefill_tokens: The number of tokens in the prefill requests.
    r   r_   Nr   r   )
rr   rp   rq   rm   rz   r<   argmaxr   r   any)r]   r   rr   rp   
num_tokensrn   ro   
query_lensis_prefill_or_extend
is_prefillfirst_extendfirst_prefillnum_decodesr   num_prefills_or_extendsnum_prefill_or_extend_tokensnum_extendsnum_prefillsnum_prefill_tokensnum_extend_tokensr$   r$   r%   "split_decodes_prefills_and_extends  sJ   

	r   Frequire_uniformc                 C   s4  | j }| j}| j}| j}||kr|r|dkr|d|dfS |dd |dd  }|d  |kr6d|d|fS |r^t||d k|dkB rW||d  |ksQJ d|d|dfS ||d k}n||k}t|sm|d|dfS | j	dd }	t|d|	 |ksJ |	}
||
 }||	  }|| }|
|||fS )a  
    Assuming a reordered batch, finds the boundary between prefill and decode
    requests.

    Args:
        common_attn_metadata: CommonAttentionMetadata object containing the
            batch metadata.
        decode_threshold: The maximum query length to be considered a decode.
        require_uniform: If True, requires that all decode requests have the
            same query length. When set, some queries may be considered prefills
            even if they are <= decode_threshold, in order to ensure uniformity.

    Returns:
        num_decodes: The number of decode requests.
        num_prefills: The number of prefill requests.
        num_decode_tokens: The number of tokens in the decode requests.
        num_prefill_tokens: The number of tokens in the prefill requests.
    r_   r   Nr   ztokens not padded correctlyr   )
rr   rp   rq   rm   r   r   rY   r   r<   r   )r]   r   r   rr   rp   r   rn   r   r   r   r   r   r   r   r$   r$   r%   split_decodes_and_prefills  s6   
r   rz   workspace_sizerequest_offsetc           	      C   s   g }dt | }}t| |k sJ ||k rT|d}}||k rE|| |   } |krE||7 }|d7 }||k rE|| |   } |ks-||| || f ||k s|S )a  
    Split the prefill requests into chunks such that the total sequence length
    of each chunk is less than or equal to the workspace size.

    Args:
        seq_lens_cpu: The sequence lengths of the prefill requests on CPU.
        workspace_size: The maximum workspace size (in tokens) per chunk.
        request_offset: The offset to add to the request indices.
    Returns:
        A list of tuples of (reqs_start, reqs_end) representing chunk boundaries.
    r   r_   )rV   r   rY   r   append)	rz   r   r   chunk_boundsinstartchunk_totalsr$   r$   r%   split_prefill_chunks%  s   
  r   input_batchr   scheduler_outputr   c                    s^  t | j} fdd| jD }t|}| jd| }|dk}||k| @ }||k| @ }	tj|jtjd}
d|
|	< d|
|< t|	 }t|		 }tj|tjd}d|||| < d||| d< |
|k}|
 sldS t|d }tj|
| d	d
}|| }dd t||D }|D ] }|| }||kr| || |||}|||< |}||ksqdS )z
    Reorders the batch to split into prefill and decode requests; places all
    requests with <= decode_threshold tokens at the front of the batch.

    Returns:
        True if the batch was modified, False otherwise.
    c                    s   g | ]} j | qS r$   )num_scheduled_tokens)rQ   idr   r$   r%   
<listcomp>U  s    
z?reorder_batch_to_split_decodes_and_prefills.<locals>.<listcomp>Nr   r`   r_      Fstable)kindc                 S   s   i | ]\}}t |t |qS r$   )r<   )rQ   srcdstr$   r$   r%   
<dictcomp>u  s    z?reorder_batch_to_split_decodes_and_prefills.<locals>.<dictcomp>T)rV   req_idsr|   arraynum_computed_tokens_cpuzerosr{   r   r<   r   r   whereargsortzipswap_statesget)r   r   r   rp   r   num_scheduled_tokens_npnum_computed_tokens_npr   	is_decode	is_extendreq_regionsr   r   target_regions
needs_swaporig_indicessorted_ordersrc_indicessrc_dest_mapr   r   next_dstr$   r   r%   +reorder_batch_to_split_decodes_and_prefills@  sD   


r   query
batch_sizec                 C   sv   |   dksJ d|    d| jd }| jd }| jd }|| dks/J d|d||| }| ||||S )	z
    Reshapes the query tensor for the specified batch size, so that
    it has shape (batch_size, seq_len, num_heads, head_dim).
       zquery must be 3D, got Dr   r_   r   ztotal_tokens=z  is not divisible by batch_size=r   r{   rd   )r   r   total_tokens	num_headshead_dimseq_lenr$   r$   r%   reshape_query_for_spec_decode  s    


r   attn_outputc                 C   s^   |   dkr| S |   dksJ d|    d| jd | jd  }| || jd | jd S )zo
    Reshapes the attention output tensor, so that
    the batch_size and seq_len dimensions are combined.
    r      zattn_output must be 4D, got r   r   r_   r   r   )r   r   r$   r$   r%   #reshape_attn_output_for_spec_decode  s
    r   name_prefixmetadata_clsr   c                 C   s   | |j  }t|||fd}|S )zH
    Return a new subclass of `metadata_cls` with additional fields
    )bases)r8   r   )r   r   r   nameWrappedr$   r$   r%   subclass_attention_metadata  s   
r  c                   @   s0   e Zd ZU dZejdB ed< dZedB ed< dS )KVSharingFastPrefillMetadataNr   r   )	r8   r9   r:   r   r   Tensorr=   r   r<   r$   r$   r$   r%   r    s   
 r  prefixunderlying_attn_backendc                 C   s*   |  }G dd d|}t| ||d}|S )Nc                	       s2   e Zd Z	ddedededef fddZ  ZS )	zGcreate_fast_prefill_custom_backend.<locals>.FastPrefillAttentionBuilderFcommon_prefix_lenr]   
fast_buildr!   c                    s6   t |}t |||}G dd d|jt}|||S )Nc                   @   s   e Zd Zdd ZdS )z|create_fast_prefill_custom_backend.<locals>.FastPrefillAttentionBuilder.build.<locals>.KVSharingFastPrefillAttentionMetadatac                 S   s:   t |jD ]}t| |jt||j q|j| _|j| _d S r"   )r   	__class__setattrr  rH   r   r   )selfmetadatar]   _fieldr$   r$   r%   __init__  s
   zcreate_fast_prefill_custom_backend.<locals>.FastPrefillAttentionBuilder.build.<locals>.KVSharingFastPrefillAttentionMetadata.__init__N)r8   r9   r:   r  r$   r$   r$   r%   %KVSharingFastPrefillAttentionMetadata  s    r  )r   superbuildr
  r  )r  r  r]   r	  new_common_attn_metadatar  r  r
  r$   r%   r    s   

zMcreate_fast_prefill_custom_backend.<locals>.FastPrefillAttentionBuilder.build)F)	r8   r9   r:   r<   r   r?   r   r  __classcell__r$   r$   r  r%   FastPrefillAttentionBuilder  s    r  )r   attention_backend_clsbuilder_cls)get_builder_clsr   )r  r  underlying_builderr  attn_backendr$   r$   r%   "create_fast_prefill_custom_backend  s   r  query_start_loc_p_cpurk   c             	   C   s  | j jdksJ |  }i }d }d }dD ]}| |  }i ||< ||| d< |  || d< ttt	t
||}||| d< t
|| d }	|	|| d< td|	d }
g }t|D ]\}}|t| q_tj|tjd	}||| d
< |d u rtj|
fttj|d}tj|
fttj|d}n| |
k r||
t ||
t |d|	 | |d|	 | ||| d< ||| d< q|||fS )Nr   )   numstotmlist	mlist_leni   r   r`   
offsetlistra   rk   r   	batch_ptrtoken_chunk_offset_ptr)rk   typediffr   r   r   r   r|   r   r   rV   rc   	enumerateextendrangetensorr   r   PAD_SLOT_IDnelementresize_fill_copy_)r  rk   seqlens	nums_dictr%  r&  BLOCK_Mr  r!  r"  MAX_NUM_PROGRAMSr#  idxnumr$   r$   r%   compute_causal_conv1d_metadata  sR   
r8  ro   dcp_sizedcp_rankcp_kv_cache_interleave_sizec           
      C   s   |  d}|du rtj|tj| jdd|d}ntj|ggtj| jd}| tjdd|j	d }|| | | }|||  }t
|||  d|}|| }	|	dS )zWhile using dcp, kv_cache size stored on each rank may be different,
    use this function to calculate split decode seq_lens of each dcp rank.
    Only consider dcp now, we can extend the case of cp based on this.
    r   Nr$  r_   r   )sizer   r   r   rk   	unsqueezer   r,  r   r{   r   squeeze)
ro   r9  r:  r;  num_requestsrank_offsetsseq_lens_tiledbase	remainderdcp_local_seq_lensr$   r$   r%   get_dcp_local_seq_lens  s6   



rE  re   kv_cache_specmamba_cache_modec                 C   sd   |dv r| S t |tsJ tj|d |j dd}tjd|j | jd}|d| }t	| d|S )an  
    Get the block table tensor for mamba kernels from the input
    common_attn_metadata.block_table_tensor given different mamba cache modes.

    - "all":   input  (#requests, cdiv(max_model_len, block_size));
               output (#requests, cdiv(max_model_len, block_size)).

    - "none":  input  (#requests, 1 + num_speculative_blocks);
               output (#requests, 1 + num_speculative_blocks).

    - "align": input  (#requests, cdiv(max_model_len, block_size));
               output (#requests, 1 + num_speculative_blocks), which are the last
               1 + num_speculative_blocks of each request.
    )rY   noner_   r   )min)rk   )
rG   r   r   clampr^   r   num_speculative_blocksrk   r=  gather)re   ro   rF  rG  start_indicesoffsetsindices_to_gatherr$   r$   r%   mamba_get_block_table_tensor<  s   rP  )r   )r_   )r_   F)r_   Nr_   )Q	functoolscollections.abcr   dataclassesr   r   r   r   typingr   r   r	   r
   r   ry   r|   r   typing_extensionsr   vllm.configr   r   vllm.utils.math_utilsr   vllm.v1.kv_cache_interfacer   r   vllm.v1.core.sched.outputr   vllm.v1.worker.gpu_input_batchr   	vllm.envsr)   /vllm.distributed.kv_transfer.kv_connector.utilsr   vllm.loggerr   /vllm.model_executor.layers.attention_layer_baser   vllm.v1.attention.backendr   r   r   r   r   r8   r'   r#   r   r=   r-  strr?   r&   	lru_cacher-   r.   r/   rW   r'  dictrO   r[   r<   tupler  r   r   r   r   r   r   r   r   r  r  r  rk   r8  rE  rP  r$   r$   r$   r%   <module>   s&  



!

U
 
=
E
>

C

-
6
(