# NOTE: This file is a reconstruction of sglang/srt/layers/attention/vision.py
# recovered from a compiled CPython 3.10 bytecode dump. Identifiers, constants,
# signatures, and docstrings are taken verbatim from the dump where legible;
# function bodies are best-effort sketches of the decompiled logic.
from __future__ import annotations

import dataclasses
import functools
import math
from functools import lru_cache, partial
from typing import Any, Callable, Optional, Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange

from sglang.jit_kernel.norm import can_use_fused_inplace_qknorm
from sglang.srt.distributed import (
    split_tensor_along_last_dim,
    tensor_model_parallel_all_gather,
)
from sglang.srt.distributed import utils as dist_utils
from sglang.srt.environ import envs
from sglang.srt.layers.attention.triton_ops.prefill_attention import (
    context_attention_fwd,
)
from sglang.srt.layers.dp_attention import (
    get_attention_tp_rank,
    get_attention_tp_size,
)
from sglang.srt.layers.layernorm import RMSNorm
from sglang.srt.layers.linear import (
    ColumnParallelLinear,
    QKVParallelLinear,
    RowParallelLinear,
)
from sglang.srt.layers.quantization import QuantizationConfig
from sglang.srt.layers.rotary_embedding import apply_rotary_pos_emb
from sglang.srt.models.utils import apply_qk_norm
from sglang.srt.server_args import get_global_server_args
from sglang.srt.utils import (
    add_prefix,
    get_bool_env_var,
    get_device_capability,
    is_blackwell_supported,
    is_cuda,
    is_hip,
    is_npu,
    print_info_once,
)
from sglang.srt.utils.multi_stream_utils import (
    maybe_execute_in_parallel,
    with_multi_stream,
)

_is_cuda = is_cuda()
_is_npu = is_npu()
_is_hip = is_hip()

if _is_cuda:
    from sgl_kernel.flash_attn import flash_attn_varlen_func
if _is_npu:
    import torch_npu

_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip

ROTARY_EMBED_CLASSES = {
    "normal": apply_rotary_pos_emb,
}


# Single-slot cache used to reuse a computed cu_seqlens tensor across calls.
@dataclasses.dataclass
class SingletonCache:
    data: Any = None

    def set_data(self, value: Any) -> None:
        self.data = value

    def get_data(self) -> Optional[Any]:
        return self.data

    def empty(self) -> bool:
        return self.get_data() is None
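
# Illustrative only (not part of the original module): SingletonCache is a
# single-slot store -- empty() gates whether a caller recomputes and stores.
def _singleton_cache_example() -> None:
    cache = SingletonCache()
    if cache.empty():
        cache.set_data(torch.tensor([0, 16, 32], dtype=torch.int32))
    assert cache.get_data() is not None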
@lru_cache(maxsize=128)
def _get_cu_seqlens_for_shape(batch_size: int, seqlen: int, device) -> torch.Tensor:
    """
    Generates cumulative sequence lengths (cu_seqlens) for a given batch_size, seqlen, and device.
    Caches the result based on these parameters.
    """
    cu_seqlens = torch.arange(
        0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32, device=device
    )
    return cu_seqlens


def resolve_seqlens(
    cu_seqlens: torch.Tensor | SingletonCache | None,
    bsz: int,
    seq_len: int,
    *,
    device: torch.device,
) -> torch.Tensor:
    if cu_seqlens is None:
        resolved_seqlens = _get_cu_seqlens_for_shape(bsz, seq_len, device=device)
    elif isinstance(cu_seqlens, SingletonCache):
        if cu_seqlens.empty():
            cu_seqlens.set_data(_get_cu_seqlens_for_shape(bsz, seq_len, device=device))
        resolved_seqlens = cu_seqlens.get_data()
    else:
        resolved_seqlens = cu_seqlens
    assert isinstance(
        resolved_seqlens, torch.Tensor
    ), "cu_seqlens must be a torch.Tensor"
    return resolved_seqlens
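
# Illustrative only (not part of the original module): for bsz=2 and
# seq_len=16 the cached helper yields the flattened-batch offsets
# [0, 16, 32]; a SingletonCache passed instead is filled once, then reused.
def _resolve_seqlens_example() -> None:
    lens = resolve_seqlens(None, 2, 16, device=torch.device("cpu"))
    print(lens.tolist())  # [0, 16, 32]
    cache = SingletonCache()
    resolve_seqlens(cache, 2, 16, device=torch.device("cpu"))
    assert not cache.empty()  # populated on first call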
class VisionSdpaAttention(nn.Module):
    """
    Scaled Dot Product Attention inner product

    """

    def __init__(
        self,
        head_dim: int,
        num_heads: int,
        num_kv_heads: int,
        dropout: float = 0.0,
        flatten_batch: bool = False,
        softmax_in_single_precision: bool = False,
        **kwargs,
    ) -> None:
        super().__init__()
        self.head_size = head_dim
        self.num_heads = num_heads
        self.num_kv_heads = num_kv_heads
        self.flatten_batch = flatten_batch
        self.softmax_in_single_precision = softmax_in_single_precision
        self.dropout = dropout
        self.scale = 1.0 / math.sqrt(self.head_size)

    @staticmethod
    @lru_cache(maxsize=128)
    def _generate_mask_cache(
        s: int, flatten_batch: bool, cu_seqlens: tuple
    ) -> torch.BoolTensor:
        """
        Generate a boolean attention mask with caching mechanism.
        Args:
            s: sequence length
            flatten_batch: whether to flatten batch dimension
            cu_seqlens: tuple of cumulative sequence lengths
        Returns:
            attention mask tensor of shape [b, 1, s, s] or [1, s, s]
        """
        if flatten_batch:
            mask = torch.zeros([1, s, s], dtype=torch.bool)
            for i in range(1, len(cu_seqlens)):
                start = cu_seqlens[i - 1]
                end = cu_seqlens[i]
                mask[..., start:end, start:end] = True
        else:
            # Broadcast row/column indices against per-sample lengths to build
            # a block-diagonal [b, 1, s, s] mask (view shapes reconstructed).
            row_indices = torch.arange(s).view(1, 1, 1, s)
            col_indices = torch.arange(s).view(1, 1, s, 1)
            seq_lens = torch.tensor(
                [end - start for start, end in zip(cu_seqlens[:-1], cu_seqlens[1:])],
            ).view(-1, 1, 1, 1)
            mask = (row_indices < seq_lens) & (col_indices < seq_lens)
        return mask

    def generate_patch_attention_mask(
        self,
        s: int,
        cu_seqlens: Optional[torch.Tensor],
        flatten_batch: bool = False,
    ) -> Optional[torch.Tensor]:
        """
        Creates a non-causal 4D mask of shape `(b, 1, s, s)` or `(1, 1, s, s)`.
        Args:
            s: sequence length
            cu_seqlens: cumulative sequence lengths tensor. If not, returns an empty mask
            flatten_batch: whether to flatten batch dimension
        Returns:
            attention mask tensor or None
        """
        if cu_seqlens is None:
            return None
        cu_seqlens_tuple = tuple(cu_seqlens.cpu().tolist())
        return self._generate_mask_cache(s, flatten_batch, cu_seqlens_tuple)

    def forward(
        self,
        q: torch.Tensor,
        k: torch.Tensor,
        v: torch.Tensor,
        bsz: int,
        cu_seqlens: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> torch.Tensor:
        """
        Args:
            cu_seqlens: [b]
        Returns:
             [b * s, h, head_size]
        """
        if self.flatten_batch:
            assert bsz == 1, "flatten_batch is True, bsz must be 1"
        assert q.dim() == 3

        s = q.shape[0] // bsz
        if attention_mask is None:
            attention_mask = self.generate_patch_attention_mask(
                s, cu_seqlens, flatten_batch=self.flatten_batch
            )
        if attention_mask is None:
            if self.softmax_in_single_precision:
                raise RuntimeError("Empty attention mask")
        else:
            attention_mask = attention_mask.to(device=q.device)

        q, k, v = [rearrange(x, "(b s) h d -> b h s d", b=bsz) for x in [q, k, v]]

        if self.softmax_in_single_precision:
            k = rearrange(k, "b h s d -> b h d s")
            attn_weights = torch.matmul(q, k) * self.scale
            del k
            # Additive mask: disallowed positions get the dtype minimum.
            attention_mask = (~attention_mask) * torch.finfo(q.dtype).min
            attn_weights = attn_weights + attention_mask
            del attention_mask
            attn_weights = nn.functional.softmax(
                attn_weights, dim=-1, dtype=torch.float32
            ).to(q.dtype)
            attn_weights = nn.functional.dropout(
                attn_weights, p=self.dropout, training=False
            )
            output = torch.matmul(attn_weights, v)
            del attn_weights, v
        else:
            output = F.scaled_dot_product_attention(
                q,
                k,
                v,
                attn_mask=attention_mask,
                dropout_p=self.dropout,
                is_causal=False,
            )

        return rearrange(output, "b h s d -> (b s) h d")
$rV   c                      s*   e Zd ZdZ fddZdddZ  ZS )VisionTritonAttentionz<
    Triton-implemented attention without a causal mask
    c                   s8   t    d|v r|d nd}|rd| _d S t | _d S )Nuse_data_parallelFrD   )r_   r`   r   tp_sizer0   re   r   rf   r1   r2   r`     s   
zVisionTritonAttention.__init__r   rC   r   r   rK   rM   rN   rA   rO   r,   c              
   K  s   t j r.d|vrtdt|tstd|d }t|||||d |d |d dd |S t||||jd	}t	
|}|dd
 |d
d  }	|	  }
t||||| |	 |
dd |S )r   	output_wsz0output_ws should be prepared for cuda-graph modez+cuda-graph mode cu_seqlens should be a listr   rD      F)r   rQ   Nrp   )r   SGLANG_VIT_ENABLE_CUDA_GRAPHgetr   rR   listr   rU   rG   rH   
empty_likemaxitemcuda)r0   r   r   r   rK   rN   rO   re   r   r{   
max_seqlenr1   r1   r2   r      s@   


zVisionTritonAttention.forwardr   rC   r   rC   r   rC   rK   rM   rN   rA   rO   rA   r,   rC   )r9   r:   r;   r   r`   r   r   r1   r1   rf   r2   r     s    
class VisionFlash3Attention(nn.Module):
    def __init__(self, **kwargs) -> None:
        if not _is_cuda:
            raise Exception("VisionFlash3Attention is only available for cuda")
        super().__init__()
        use_data_parallel = kwargs.get("use_data_parallel", False)
        self.tp_size = 1 if use_data_parallel else get_attention_tp_size()

    def forward(
        self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
        cu_seqlens: torch.Tensor | SingletonCache | None,
        bsz: int, seq_len: int, **kwargs,
    ) -> torch.Tensor:
        if envs.SGLANG_VIT_ENABLE_CUDA_GRAPH.get():
            # CUDA-graph mode: cu_seqlens is the prepared workspace list
            # (list indices reconstructed from the bytecode).
            max_seqlen = cu_seqlens[2]
            return flash_attn_varlen_func(
                q, k, v,
                cu_seqlens_q=cu_seqlens[1],
                cu_seqlens_k=cu_seqlens[1],
                max_seqlen_q=max_seqlen,
                max_seqlen_k=max_seqlen,
            )

        cu_seqlens = resolve_seqlens(cu_seqlens, bsz, seq_len, device=q.device)
        cu_seqlens = cu_seqlens.to(torch.int32).to(q.device)
        seq_lens = cu_seqlens[1:] - cu_seqlens[:-1]
        max_seqlen = seq_lens.max().item()
        output = flash_attn_varlen_func(
            q, k, v,
            cu_seqlens_q=cu_seqlens,
            cu_seqlens_k=cu_seqlens,
            max_seqlen_q=max_seqlen,
            max_seqlen_k=max_seqlen,
        )
        return output


class VisionFlash4Attention(nn.Module):
    def __init__(self, **kwargs) -> None:
        if not _is_cuda:
            raise Exception("VisionFlash4Attention is only available for cuda")
        super().__init__()

    def forward(
        self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
        cu_seqlens: torch.Tensor | SingletonCache | None,
        bsz: int, seq_len: int, **kwargs,
    ) -> torch.Tensor:
        if cu_seqlens is None:
            cu_seqlens = _get_cu_seqlens_for_shape(bsz, seq_len, device=q.device)
        elif isinstance(cu_seqlens, SingletonCache):
            if cu_seqlens.empty():
                cu_seqlens.set_data(
                    _get_cu_seqlens_for_shape(bsz, seq_len, device=q.device)
                )
            cu_seqlens = cu_seqlens.get_data()
        cu_seqlens = cu_seqlens.to(torch.int32).to(q.device)
        seq_lens = cu_seqlens[1:] - cu_seqlens[:-1]
        max_seqlen = seq_lens.max().item()
        output = flash_attn_varlen_func(
            q, k, v,
            cu_seqlens_q=cu_seqlens,
            cu_seqlens_k=cu_seqlens,
            max_seqlen_q=max_seqlen,
            max_seqlen_k=max_seqlen,
            ver=4,  # FlashAttention-4 kernel; version flag recovered from the dump
        )
        return output


class VisionAiterAttention(nn.Module):
    def __init__(self, **kwargs) -> None:
        if not _is_hip:
            raise Exception("aiter_attn is only available for AMD")
        try:
            from aiter import flash_attn_varlen_func as aiter_flash_attn_varlen_func
        except ImportError as e:
            raise ImportError(
                "aiter is AMD specific kernel library. "
                "Please make sure aiter is installed on your AMD device."
            ) from e
        self.flash_attn_varlen_func = aiter_flash_attn_varlen_func
        super().__init__()

    def forward(
        self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
        cu_seqlens: torch.Tensor | SingletonCache | None,
        bsz: int, seq_len: int, **kwargs,
    ) -> torch.Tensor:
        cu_seqlens = resolve_seqlens(cu_seqlens, bsz, seq_len, device=q.device)
        cu_seqlens = cu_seqlens.to(torch.int32).to(q.device)
        seq_lens = cu_seqlens[1:] - cu_seqlens[:-1]
        max_seqlen = seq_lens.max().item()
        output = self.flash_attn_varlen_func(
            q=q, k=k, v=v,
            cu_seqlens_q=cu_seqlens,
            cu_seqlens_k=cu_seqlens,
            max_seqlen_q=max_seqlen,
            max_seqlen_k=max_seqlen,
        )
        return output
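
# Illustrative only (not part of the original module): the backend classes in
# this file share a duck-typed forward contract, which is what allows
# VisionAttention to swap them via QKV_BACKEND_IMPL below.
def _backend_contract_example(
    backend: nn.Module,
    q: torch.Tensor,  # [b * s, heads, head_dim]
    k: torch.Tensor,
    v: torch.Tensor,
    bsz: int,
    seq_len: int,
) -> torch.Tensor:
    # Returns attention output in the same [b * s, heads, head_dim] layout.
    return backend.forward(q=q, k=k, v=v, cu_seqlens=None, bsz=bsz, seq_len=seq_len)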
zVisionAiterAttention.forwardr   r   r1   r1   rf   r2   r     s    r   c                      r   )VisionAscendAttentionc                   r   )Nz6VisionAscendAttention is only available for ascend npu)_is_npur   r_   r`   r   rf   r1   r2   r`     r   zVisionAscendAttention.__init__r   rC   r   r   rK   rM   rN   rA   rO   r,   c              
   K  s   t |||dd}|dd |dd  }|jr|d}|j\}	}
}|jd }t|}tj||||tj|d |
||d |S )r   r~   rQ   rD   Nrp   g      )querykeyr+   rO   scale_valuerY   rZ   out)	rU   r   r   r   rH   r   	torch_npu_npu_flash_attention_unpadrJ   )r0   r   r   r   rK   rN   rO   re   r{   _rY   ra   rZ   r   r1   r1   r2   r     s$   



zVisionAscendAttention.forwardr   r   r1   r1   rf   r2   r     s    r   )triton_attnsdpafa3fa4ascend_attn
aiter_attnc                      s   e Zd ZdZ																dBdC fd%d&Z	dDdEd+d,ZdFd/d0ZdGd4d5ZdGd6d7Z					dHdId@dAZ	  Z
S )JVisionAttentiona  
        Multi-headed attention without any cache, mostly used for multimodal transformers.


    Args:
        use_qkv_parallel (bool, optional): If True, use QKV-parallel attention.
        softmax_in_single_precision (bool, defaults to False):
            if ``True``, the softmax will be performed in single-precision
            Otherwise, it will be performed in half-precision

    """

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        projection_size: int,
        use_qkv_parallel: bool,
        qkv_backend: Optional[str] = None,
        quant_config: Optional[QuantizationConfig] = None,
        dropout: float = 0.0,
        softmax_in_single_precision: bool = False,
        flatten_batch: bool = False,
        prefix: str = "",
        proj_bias: bool = True,
        num_dummy_heads: int = 0,
        qkv_bias: bool = True,
        qk_normalization: bool = False,
        qk_normalization_by_head_size: bool = False,
        layer_norm_eps: float = 1e-6,
        customized_position_embedding_applier: Callable[
            [torch.Tensor, torch.Tensor, Any, Any],
            Tuple[torch.Tensor, torch.Tensor],
        ] = None,
        use_data_parallel: bool = False,
        use_dp_attention_reduce: bool = False,
        aux_stream: Optional[torch.cuda.Stream] = None,
        **kwargs,
    ) -> None:
        super().__init__()
        self.tp_rank = 0 if use_data_parallel else get_attention_tp_rank()
        self.tp_size = 1 if use_data_parallel else get_attention_tp_size()
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        self.hidden_size_per_attention_head = dist_utils.divide(
            projection_size, num_heads
        )
        self.num_attention_heads_per_partition = dist_utils.divide(
            num_dummy_heads + num_heads, self.tp_size
        )
        self.num_attention_kv_heads_per_partition = dist_utils.divide(
            num_dummy_heads + num_heads, self.tp_size
        )
        self.q_size = self.num_attention_heads_per_partition * self.head_dim
        self.kv_size = self.num_attention_kv_heads_per_partition * self.head_dim
        self.qk_normalization = qk_normalization
        self.qk_normalization_by_head_size = qk_normalization_by_head_size
        self.dummy_dim = (num_dummy_heads + num_heads) * self.head_dim

        if self.qk_normalization_by_head_size:
            # Per-head RMSNorm (GLM-OCR style); argument layout reconstructed.
            self.q_norm, self.k_norm = self._init_qk_norm(
                self.head_dim, layer_norm_eps
            )
        elif self.qk_normalization:
            self.q_norm, self.k_norm = self._init_qk_norm(
                self.dummy_dim, layer_norm_eps
            )

        _passed_backend = qkv_backend
        qkv_backend = self._determine_attention_backend(_passed_backend)
        if (
            get_global_server_args().mm_attention_backend is None
            and _passed_backend is None
        ):
            print_info_once(f"Multimodal attention backend not set. Use {qkv_backend}.")
        print_info_once(f"Using {qkv_backend} as multimodal attention backend.")

        self.qkv_backend = QKV_BACKEND_IMPL[qkv_backend](
            head_dim=self.head_dim,
            num_heads=self.num_attention_heads_per_partition,
            num_kv_heads=self.num_attention_kv_heads_per_partition,
            dropout=dropout,
            flatten_batch=flatten_batch,
            softmax_in_single_precision=softmax_in_single_precision,
            use_data_parallel=use_data_parallel,
        )

        self.use_qkv_parallel = use_qkv_parallel
        if use_qkv_parallel:
            self.qkv_proj = QKVParallelLinear(
                hidden_size=embed_dim,
                head_size=self.head_dim,
                total_num_heads=num_dummy_heads + num_heads,
                total_num_kv_heads=num_dummy_heads + num_heads,
                bias=qkv_bias,
                quant_config=quant_config,
                tp_rank=self.tp_rank,
                tp_size=self.tp_size,
                prefix=add_prefix("qkv_proj", prefix),
            )
        else:
            self.qkv_proj = ColumnParallelLinear(
                input_size=embed_dim,
                output_size=3 * self.dummy_dim,
                bias=qkv_bias,
                quant_config=quant_config,
                tp_rank=self.tp_rank,
                tp_size=self.tp_size,
                prefix=add_prefix("qkv_proj", prefix),
            )
        self.proj = RowParallelLinear(
            input_size=self.dummy_dim,
            output_size=embed_dim,
            bias=proj_bias,
            quant_config=quant_config,
            tp_rank=self.tp_rank,
            tp_size=self.tp_size,
            prefix=add_prefix("proj", prefix),
            reduce_results=use_dp_attention_reduce,
        )

        self.customized_position_embedding_applier = (
            customized_position_embedding_applier
        )
        self.aux_stream = aux_stream
        self.ln_events = (
            [torch.cuda.Event(), torch.cuda.Event()] if aux_stream is not None else []
        )

    def _init_qk_norm(
        self, norm_dim: int, eps: float, var_hidden_size: Optional[int] = None
    ):
        norm_kwargs = (
            dict(weight_dtype=torch.float32, cast_x_before_out_mul=True)
            if get_global_server_args().rl_on_policy_target is not None
            else {}
        )
        q_norm = RMSNorm(
            norm_dim, eps=eps, var_hidden_size=var_hidden_size, **norm_kwargs
        )
        k_norm = RMSNorm(
            norm_dim, eps=eps, var_hidden_size=var_hidden_size, **norm_kwargs
        )
        return q_norm, k_norm

    def _determine_attention_backend(self, passed_backend: Optional[str]) -> str:
        """Decide the multimodal attention backend string.

        Priority: server args override > constructor arg > platform default.

        Platform defaults:
        - CUDA: "triton_attn"
        - Non-CUDA: "sdpa"
        """
        override_backend = get_global_server_args().mm_attention_backend
        if override_backend is not None:
            backend = override_backend
        elif passed_backend is not None:
            backend = passed_backend
        elif _is_cuda:
            # Capability check reconstructed: Hopper (sm90) picks the fa3
            # kernel, other CUDA archs fall back to triton_attn.
            major, minor = get_device_capability()
            if major == 9:
                backend = "fa3"
            else:
                backend = "triton_attn"
        elif _is_hip:
            # gfx9-class AMD GPUs use the AITER kernel when enabled
            # (branch reconstructed from the bytecode).
            major, _ = get_device_capability()
            backend = "aiter_attn" if (major == 9 and _use_aiter) else "triton_attn"
        else:
            backend = "sdpa"
        if backend == "fa3" and is_blackwell_supported():
            raise ValueError("The 'fa3' backend is not supported on Blackwell GPUs")
        return backend

    def _apply_qk_norm_head_size(self, q: torch.Tensor, k: torch.Tensor):
        """apply qk norm for GLM-OCR vit attn"""
        q_by_head = q.reshape(-1, self.head_dim)
        q_by_head = self.q_norm(q_by_head)
        k_by_head = k.reshape(-1, self.head_dim)
        k_by_head = self.k_norm(k_by_head)
        q = q_by_head.view(q.shape)
        k = k_by_head.view(k.shape)
        return q, k

    def _apply_qk_norm(self, q: torch.Tensor, k: torch.Tensor):
        """apply qk norm for internvl vit attn"""

        # The two closures normalize q and k over the flattened head dims,
        # gathering/splitting across tensor-parallel ranks when needed
        # (flatten span reconstructed from the bytecode).
        def q_l2norm():
            q_ = q.flatten(-2, -1)
            if self.tp_size > 1:
                q_ = tensor_model_parallel_all_gather(q_.contiguous())
            q_ = self.q_norm(q_)
            if self.tp_size > 1:
                q_splitter = partial(
                    split_tensor_along_last_dim, num_partitions=self.tp_size
                )
                q_ = q_splitter(q_)[self.tp_rank]
            return q_.unflatten(-1, (-1, self.head_dim))

        def k_l2norm():
            k_ = k.flatten(-2, -1)
            if self.tp_size > 1:
                k_ = tensor_model_parallel_all_gather(k_.contiguous())
            k_ = self.k_norm(k_)
            if self.tp_size > 1:
                k_splitter = partial(
                    split_tensor_along_last_dim, num_partitions=self.tp_size
                )
                k_ = k_splitter(k_)[self.tp_rank]
            return k_.unflatten(-1, (-1, self.head_dim))

        with with_multi_stream(True):
            q, k = maybe_execute_in_parallel(
                q_l2norm, k_l2norm, self.ln_events, self.aux_stream
            )
        return q, k

    def forward(
        self,
        x: torch.Tensor,
        cu_seqlens: Optional[torch.Tensor] = None,
        position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        rotary_pos_emb_cos: Optional[torch.Tensor] = None,
        rotary_pos_emb_sin: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> torch.Tensor:
        """
        Args:
            x: [b, s, embed_dim]
            cu_seqlens: [b]
        Returns:
             [s, b, head * head_size]
        """
        if x.dim() == 2:
            x = x.unsqueeze(0)
        assert x.dim() == 3, x.shape

        # Guard reconstructed: with an RL on-policy target configured, the
        # rotary cos/sin tables are cast to the activation dtype up front.
        if (
            get_global_server_args().rl_on_policy_target is not None
            and position_embeddings is not None
        ):
            assert isinstance(position_embeddings, tuple), (
                "expected position_embeddings to be a tuple of two tensors,\n"
                f"but got {type(position_embeddings)}, change if needed"
            )
            position_embeddings = tuple(t.to(x.dtype) for t in position_embeddings)

        x_shape = x.shape
        bsz, s, _ = x_shape
        head = self.num_attention_heads_per_partition
        kv_head = self.num_attention_kv_heads_per_partition
        attn_output_ws = kwargs.get("output_ws")

        if self.use_qkv_parallel:
            # [b, s, embed_dim] --> [b, s, q_size + 2 * kv_size]
            qkv, _ = self.qkv_proj(x)
            q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
            # [b, s, ...] --> [b * s, head, head_size]
            q = q.reshape(bsz * s, head, -1).contiguous()
            k = k.reshape(bsz * s, kv_head, -1).contiguous()
            v = v.reshape(bsz * s, kv_head, -1).contiguous()
            if self.qk_normalization and not self.qk_normalization_by_head_size:
                q, k = self._apply_qk_norm(q, k)
        else:
            # [b, s, embed_dim] --> [s, b, embed_dim]
            x = rearrange(x, "b s ... -> s b ...")
            qkv, _ = self.qkv_proj(x)
            # [s, b, head * 3 * head_size] --> [s, b, head, 3 * head_size]
            new_x_shape = qkv.size()[:-1] + (
                head,
                3 * self.hidden_size_per_attention_head,
            )
            qkv = qkv.view(*new_x_shape)
            # Split the last dimension into q/k/v, then restore [b, s, head, head_size].
            q, k, v = dist_utils.split_tensor_along_last_dim(qkv, 3)
            q, k, v = [
                rearrange(t, "s b ... -> b s ...").contiguous() for t in (q, k, v)
            ]
            if self.qk_normalization and not self.qk_normalization_by_head_size:
                q, k = self._apply_qk_norm(q, k)

        cos = None
        sin = None
        if position_embeddings is not None:
            if self.customized_position_embedding_applier is not None:
                q, k = self.customized_position_embedding_applier(
                    q, k, position_embeddings, x_shape
                )
            else:
                cos, sin = position_embeddings
        elif rotary_pos_emb_cos is not None and rotary_pos_emb_sin is not None:
            cos = rotary_pos_emb_cos
            sin = rotary_pos_emb_sin

        if cos is not None and sin is not None:
            original_shape = q.shape
            q = q.view(-1, head, self.head_dim)
            k = k.view(-1, head, self.head_dim)
            # Half-dim rotary tables are doubled up before application.
            if cos.shape[-1] != self.head_dim:
                cos = torch.cat([cos, cos], dim=-1)
                sin = torch.cat([sin, sin], dim=-1)
            q, k = apply_rotary_pos_emb(q, k, cos, sin)
            q = q.view(original_shape)
            k = k.view(original_shape)

        if q.dim() == 4:
            q = rearrange(q, "b s ... -> (b s) ...")
        if k.dim() == 4:
            k = rearrange(k, "b s ... -> (b s) ...")
        if v.dim() == 4:
            v = rearrange(v, "b s ... -> (b s) ...")
        assert q.dim() == 3, q.dim()
        assert k.dim() == 3, k.dim()
        assert v.dim() == 3, v.dim()

        if self.qk_normalization and self.qk_normalization_by_head_size:
            # Fused-kernel fast path (guard reconstructed from the bytecode).
            if can_use_fused_inplace_qknorm(self.q_norm, q.dtype):
                head_dim_for_norm = self.head_dim
                q, k = apply_qk_norm(
                    q,
                    k,
                    q_norm=self.q_norm,
                    k_norm=self.k_norm,
                    head_dim=head_dim_for_norm,
                    alt_stream=self.aux_stream,
                )
            else:
                q, k = self._apply_qk_norm_head_size(q, k)

        output = self.qkv_backend.forward(
            q=q,
            k=k,
            v=v,
            bsz=bsz,
            seq_len=s,
            cu_seqlens=cu_seqlens,
            attention_mask=attention_mask,
            output_ws=attn_output_ws,
        )
        assert output.dim() == 3, output.dim()

        if self.use_qkv_parallel:
            # [b * s, h, head_size] --> [b, s, h * head_size]
            output = rearrange(output, "(b s) ... h d -> b s ... (h d)", b=bsz)
            output, _ = self.proj(output)
        else:
            # [b * s, h, head_size] --> [s, b, h * head_size]
            context_layer = rearrange(
                output, "(b s) h d -> s b (h d)", b=bsz, s=s
            ).contiguous()
            output, _ = self.proj(context_layer)
            output = output.view(bsz, s, -1)

        return output
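
# Illustrative only (not part of the original module): resolving a backend
# string through QKV_BACKEND_IMPL; the "sdpa" backend runs on any device and
# needs no tensor-parallel setup.
def _backend_lookup_example() -> None:
    backend = QKV_BACKEND_IMPL["sdpa"](head_dim=64, num_heads=8, num_kv_heads=8)
    q = k = v = torch.randn(2 * 196, 8, 64)  # [b * s, head, head_dim]
    out = backend.forward(q, k, v, 2)  # bsz=2; no mask -> unmasked attention
    print(out.shape)  # torch.Size([392, 8, 64])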