o
    پi                     @  s   d Z ddlmZ ddlmZ ddlmZmZ ddlZddlm	Z	 ddl
mZ ddlmZ dd	lmZ er@dd
lmZ ddlmZ G dd deZG dd de	jZedgde ddddd#d!d"ZdS )$zRadix attention.    )annotations)Enum)TYPE_CHECKINGOptionalN)nn)register_split_op)get_forward_context)register_custom_op)QuantizationConfig)ForwardBatchc                   @  s   e Zd ZdZdZdZdZdS )AttentionTypezO
    Attention type.
    Use string to be compatible with `torch.compile`.
    decoderdecoder_bidirectionalencoder_onlyN)__name__
__module____qualname____doc__DECODERDECODER_BIDIRECTIONALENCODER_ONLY r   r   U/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/radix_attention.pyr   !   s
    r   c                
      sH   e Zd ZdZdddddddejddf
d& fddZ	 d'd(d$d%Z  ZS ))RadixAttentionz-
    The attention layer implementation.
    g        FNONEtanhN 	num_headsinthead_dimscalingfloatnum_kv_headslayer_id	logit_cap
v_head_dimsliding_window_sizeis_cross_attentionboolpos_encoding_modestrlogit_capping_methodquant_configOptional[QuantizationConfig]	attn_typer   	use_iropeprefixc                   s   t    || _|| _|| _|| _|| _|dkr|n|| _|| _|| _	|| _
|p)d| _|	| _|| _d | _d | _d | _d | _d | _|d urL|j| |d| _| jd urW| j|  || _|
| _|| _d| _d S )Nr   )r1   )super__init__tp_q_head_numtp_k_head_numtp_v_head_numr    qk_head_dimr&   r!   r$   r%   r'   r(   r0   k_scalev_scalek_scale_floatv_scale_floatquant_methodget_quant_methodcreate_weightsr/   r*   r,   xai_temperature_len)selfr   r    r!   r#   r$   r%   r&   r'   r(   r*   r,   r-   r/   r0   r1   	__class__r   r   r3   4   s4   



zRadixAttention.__init__Tforward_batchr   save_kv_cachec                 K  s   |d ur*|d us
J d|vr!| d| j| j}| d| j| j}n	| d| j| j}|j r^t d ur^| j| jkrI||j	d | j
| j f}nt|}t|||||| jfi | |S |jj|||| ||fi |S )Nk_roper   r   )viewr5   r7   r6   r&   forward_mode	is_extendr   	new_emptyshaper4   torch
empty_likeunified_attention_with_outputr$   attn_backendforward)r@   qkvrC   rD   kwargsoutputr   r   r   rO   c   s4   	
zRadixAttention.forward)r   r   r    r   r!   r"   r#   r   r$   r   r%   r"   r&   r   r'   r   r(   r)   r*   r+   r,   r+   r-   r.   r/   r   r0   r)   r1   r+   )T)rC   r   rD   r)   )	r   r   r   r   r   r   r3   rO   __classcell__r   r   rA   r   r   /   s    5r   rT   )mutates_args)q_roperE   sinksquerytorch.TensorkeyvaluerD   r)   r$   r   rW   Optional[torch.Tensor]rE   rX   returnNonec                C  s   t  }	|	j}
|	j}|| }i }|d ur||d< |d ur||d< |d ur'||d< |
jj| ||||
|fi |}| | ksLJ d|  d|  ||j| d S )NrW   rE   rX   z Output tensor element mismatch: z != )	r   rC   attention_layersrN   rO   numelrF   rJ   copy_)rY   r[   r\   rT   rD   r$   rW   rE   rX   contextrC   r`   attention_layerrS   retr   r   r   rM      s*   rM   )rY   rZ   r[   rZ   r\   rZ   rT   rZ   rD   r)   r$   r   rW   r]   rE   r]   rX   r]   r^   r_   )r   
__future__r   enumr   typingr   r   rK   r   )sglang.srt.compilation.compilation_configr   0sglang.srt.compilation.piecewise_context_managerr   sglang.srt.utils.custom_opr	   *sglang.srt.layers.quantization.base_configr
   ,sglang.srt.model_executor.forward_batch_infor   r   Moduler   rM   r   r   r   r   <module>   s(   
[	