import logging
from typing import TYPE_CHECKING

logger = logging.getLogger(__name__)

if TYPE_CHECKING:
    from sglang.srt.layers.attention.base_attn_backend import AttentionBackend
    from sglang.srt.model_executor.model_runner import ModelRunner

ATTENTION_BACKENDS = {}


def register_attention_backend(name):
    def decorator(fn):
        ATTENTION_BACKENDS[name] = fn
        return fn

    return decorator
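# NOTE: Each creator below is registered under its CLI backend name (e.g.
# "triton", "flashinfer"), takes a ModelRunner, and returns an AttentionBackend
# instance. As an illustration only (names here are hypothetical, not part of
# this module), an out-of-tree backend could hook in the same way:
#
#     @register_attention_backend("my_backend")
#     def create_my_backend(runner):
#         from my_pkg.attention import MyAttnBackend  # hypothetical import
#         return MyAttnBackend(runner)
#
# and a caller would resolve it with ATTENTION_BACKENDS["my_backend"](runner).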
@register_attention_backend("flashinfer")
def create_flashinfer_backend(runner):
    import torch

    if not runner.use_mla_backend:
        from sglang.srt.layers.attention.flashinfer_backend import (
            FlashInferAttnBackend,
        )

        # Init streams
        if runner.server_args.speculative_algorithm == "EAGLE":
            if (
                not hasattr(runner, "plan_stream_for_flashinfer")
                or not runner.plan_stream_for_flashinfer
            ):
                runner.plan_stream_for_flashinfer = torch.cuda.Stream()
        return FlashInferAttnBackend(
            runner, init_new_workspace=runner.init_new_workspace
        )
    else:
        from sglang.srt.layers.attention.flashinfer_mla_backend import (
            FlashInferMLAAttnBackend,
        )

        return FlashInferMLAAttnBackend(runner)


@register_attention_backend("trtllm_mla")
def create_trtllm_mla_backend(runner):
    if not runner.use_mla_backend:
        raise ValueError("trtllm_mla backend can only be used with MLA models.")
    from sglang.srt.layers.attention.trtllm_mla_backend import TRTLLMMLABackend

    return TRTLLMMLABackend(runner)


@register_attention_backend("aiter")
def create_aiter_backend(runner):
    from sglang.srt.layers.attention.aiter_backend import AiterAttnBackend

    return AiterAttnBackend(runner)


@register_attention_backend("wave")
def create_wave_backend(runner):
    from sglang.srt.layers.attention.wave_backend import WaveAttnBackend

    return WaveAttnBackend(runner)


@register_attention_backend("ascend")
def create_ascend_backend(runner):
    from sglang.srt.hardware_backend.npu.attention.ascend_backend import (
        AscendAttnBackend,
    )

    return AscendAttnBackend(runner)


@register_attention_backend("nsa")
def create_nsa_backend(runner):
    from sglang.srt.layers.attention.nsa_backend import NativeSparseAttnBackend

    return NativeSparseAttnBackend(runner)


@register_attention_backend("triton")
def create_triton_backend(runner):
    assert not runner.model_config.is_encoder_decoder, (
        "Cross attention is not supported in the triton attention backend. "
        "Please use `--attention-backend flashinfer`."
    )
    if runner.server_args.enable_double_sparsity:
        from sglang.srt.layers.attention.double_sparsity_backend import (
            DoubleSparseAttnBackend,
        )

        return DoubleSparseAttnBackend(runner)
    else:
        from sglang.srt.layers.attention.triton_backend import TritonAttnBackend

        return TritonAttnBackend(runner)
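# The creators below all follow the same shape: import the backend module
# lazily inside the function (so optional dependencies are only needed for the
# backend actually selected) and construct it from the ModelRunner.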
@register_attention_backend("torch_native")
def create_torch_native_backend(runner):
    from sglang.srt.layers.attention.torch_native_backend import (
        TorchNativeAttnBackend,
    )

    return TorchNativeAttnBackend(runner)


@register_attention_backend("flex_attention")
def create_flex_attention_backend(runner):
    from sglang.srt.layers.attention.torch_flex_backend import TorchFlexAttnBackend

    return TorchFlexAttnBackend(runner)


@register_attention_backend("flashmla")
def create_flashmla_backend(runner):
    from sglang.srt.layers.attention.flashmla_backend import FlashMLABackend

    return FlashMLABackend(runner)


@register_attention_backend("fa3")
def create_flashattention_v3_backend(runner):
    import torch

    assert (
        torch.cuda.get_device_capability()[0] == 8 and not runner.use_mla_backend
    ) or torch.cuda.get_device_capability()[0] == 9, (
        "FlashAttention v3 Backend requires SM>=80 and SM<=90. "
        "Please use `--attention-backend flashinfer`."
    )
    from sglang.srt.layers.attention.flashattention_backend import (
        FlashAttentionBackend,
    )

    return FlashAttentionBackend(runner)


@register_attention_backend("fa4")
def create_flashattention_v4_backend(runner):
    from sglang.srt.layers.attention.flashattention_backend import (
        FlashAttentionBackend,
    )

    return FlashAttentionBackend(runner, fa_impl_ver=4)


@register_attention_backend("cutlass_mla")
def create_cutlass_mla_backend(runner):
    from sglang.srt.layers.attention.cutlass_mla_backend import CutlassMLABackend

    return CutlassMLABackend(runner)


@register_attention_backend("trtllm_mha")
def create_trtllm_mha_backend(runner):
    if runner.use_mla_backend:
        raise ValueError("trtllm_mha backend can only be used with non-MLA models.")
    from sglang.srt.layers.attention.trtllm_mha_backend import TRTLLMHAAttnBackend

    return TRTLLMHAAttnBackend(runner)


@register_attention_backend("intel_amx")
def create_intel_amx_backend(runner):
    from sglang.srt.layers.attention.intel_amx_backend import IntelAMXAttnBackend

    return IntelAMXAttnBackend(runner)


@register_attention_backend("dual_chunk_flash_attn")
def create_dual_chunk_flash_attn_backend(runner):
    from sglang.srt.layers.attention.dual_chunk_flashattention_backend import (
        DualChunkFlashAttentionBackend,
    )

    return DualChunkFlashAttentionBackend(runner)
def attn_backend_wrapper(runner: "ModelRunner", full_attn_backend: "AttentionBackend"):
    """
    Wrapper for special models like hybrid GDN, so we don't
    need to change the code of the original attention backend.
    """
    assert not (
        runner.hybrid_gdn_config is not None and runner.use_mla_backend
    ), "hybrid_gdn can only be used with non-MLA models."

    if cfg := runner.mambaish_config:
        from sglang.srt.layers.attention.fla.utils import check_environments
        from sglang.srt.layers.attention.hybrid_linear_attn_backend import (
            GDNAttnBackend,
            HybridLinearAttnBackend,
            KimiLinearAttnBackend,
            LightningAttentionBackend,
            Mamba2AttnBackend,
        )
        from sglang.srt.utils import is_blackwell, is_npu

        check_environments()
        if runner.hybrid_gdn_config is not None:
            if is_blackwell():
                assert (
                    runner.server_args.attention_backend == "triton"
                    or runner.server_args.attention_backend == "trtllm_mha"
                ), "triton or trtllm_mha backend are the only supported backends on Blackwell GPUs for hybrid GDN models, use --attention-backend triton or --attention-backend trtllm_mha to specify the backend."
            if is_npu():
                assert (
                    runner.server_args.attention_backend == "ascend"
                ), "ascend backend is the only supported backend on NPU for hybrid GDN models, use --attention-backend ascend to specify the backend."
            logger.info("Using hybrid linear attention backend for hybrid GDN models.")
            linear_attn_backend = GDNAttnBackend(runner)
        elif runner.mamba2_config is not None:
            linear_attn_backend = Mamba2AttnBackend(runner)
        elif runner.kimi_linear_config is not None:
            linear_attn_backend = KimiLinearAttnBackend(runner)
        elif runner.hybrid_lightning_config is not None:
            linear_attn_backend = LightningAttentionBackend(runner)
        else:
            raise ValueError(
                "Expected hybrid GDN or NemotronH models, but got unknown model."
            )
        full_attn_layers = cfg.full_attention_layer_ids
        return HybridLinearAttnBackend(
            full_attn_backend, linear_attn_backend, full_attn_layers
        )

    return full_attn_backend


@register_attention_backend("intel_xpu")
def create_intel_xpu_backend(runner):
    from sglang.srt.layers.attention.xpu_backend import XPUAttentionBackend

    return XPUAttentionBackend(runner)