o
    پi{J                     @   s  d dl mZmZ d dlZd dlZd dlmZ d dlm	Z	 d dl
mZ e	 Zejejddiddejddid	dejddid
dejddiddejddid	dejddiddejddid	dejddid
dejddiddejddid	dejddid
dejddiddejddiddejddiddejddiddejddiddejddiddgdgdZejdejdejfddZeeZd@ddZG dd dZejg ejddidddejddid	ddejddid
ddejddiddejddid	dejddid
dejddidddejddid	ddejddid
ddejddid	d	dejddid
d	dejddid	dejddid
dejddid	ddejddid
ddejddid	dejddid
dejddid	dejddid
dejddiddejddid	ddejddid
ddejddidddejddid	ddejddid
ddejddidddejddid	dejddid
dejddiddejddid	ddejddid
ddejddidddejddid	ddejddid
ddejddidddd gdZejd!ejd ejdejfd"d#ZeeZd@d$d%Zejd!ejd ejdejfd&d'ZdAd(d)ZG d*d+ d+Zejd,ejd ejdejfd-d.Zed/d0	dBd1ejd/ejd2eej d3ejfd4d5Zejd6ejd7ejd ejdejfd8d9Z 			dCd:d;Z!ejd6ejd7ejd ejdejfd<d=Z"			dCd>d?Z#dS )D    )OptionalTupleN)is_hip)register_custom_op
BLOCK_SIZE      )kwargs	num_warps         i          i   i   i    i @  i   n_ele)configskeysoftcap_constc                 C   s   t jdd}|| }|t d| }||k }t j|| |d}	|	t j}
|
| }t d| }|d }|d }|| | }t j| | ||d d S )Nr   axismask      )tl
program_idarangeloadtofloat32expstore)
output_ptr	input_ptrr   r   r   pidblock_startoffsetsr   xfxfxsexpedtopbottomoutput r.   Q/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/elementwise.pyfused_softcap_kernel%   s   r0   Fc                    sd   t j| t jd}|  |r fdd}t| ||  | |S tt df ||  |ddd |S )N)dtypec                    s   t  | d fS )Nr   )tritoncdiv)meta
n_elementsr.   r/   <lambda>B   s    zfused_softcap.<locals>.<lambda>r   r   r   r
   )torch
empty_liker   numelfused_softcap_kernel_autotunedr0   r2   r3   )r'   r   autotuner-   gridr.   r5   r/   fused_softcap>   s   r?   c                   @   sf   e Zd ZdefddZdd Zdejdejfdd	Zdejdejfd
dZ	ddejdejfddZ
dS )Softcapr   c                 C   s
   || _ d S N)r   )selfr   r.   r.   r/   __init__M   s   
zSoftcap.__init__c                 O      | j |i |S rA   forwardrB   argsr	   r.   r.   r/   __call__P      zSoftcap.__call__r'   returnc                 C   s   |j r| |S | |S rA   )is_cudaforward_cudaforward_nativerB   r'   r.   r.   r/   rF   S   s   

zSoftcap.forwardc                 C   s   t | | j | j S rA   )r9   tanhfloatr   rO   r.   r.   r/   rN   Y   s   zSoftcap.forward_nativeFc                 C   s   t || j|dS N)r=   )r?   r   )rB   r'   r=   r.   r.   r/   rM   \   rJ   zSoftcap.forward_cudaNF)__name__
__module____qualname__rQ   rC   rI   r9   TensorrF   rN   rM   r.   r.   r.   r/   r@   L   s    r@   r   )r	   r
   
num_stages
hidden_dimepsc	                 C   sD  t jdd}	|	| }
t d|}||k }t j||
 | |dd}|t j}t t j|| dd| | }t j||
 | |dd}t j|| |dd}|t j}||| | |j }t j	||
 | ||d |t j}t t j|| dd| | }t j|| |dd}|t j}t j	| |
 | || | |d d S Nr   r           r   otherr   )
r   r   r   r   r   r   sqrtsumr1   r!   )r"   mid_ptr	activ_ptrresidual_ptrweight1_ptrweight2_ptrrZ   rY   r   r$   input_startr&   r   a_armsrw1_w1a2rrms2w2_w2r.   r.   r/   "fused_dual_residual_rmsnorm_kernel   s2    
 


rq   c              
   C   s   t | jdks	J | j|jkr| j|jks)J d| jd|jd| jd|jt| t| }}| j\}}	|rNt|f ||| |||||	d ||fS trRdnd}
t|	t	t
tt|	d	|
d
d}t|f ||| |||f||	d| ||fS )Nr   zx.shape=z residual.shape=z	 x.dtype=z residual.dtype=rZ   rY   r   r   r   r   r8   )lenshaper1   r9   r:   +fused_dual_residual_rmsnorm_kernel_autotune_is_hipr2   next_power_of_2maxminr3   rq   )r'   residualweight1weight2rZ   r=   r-   midbsrY   	max_warpsconfigr.   r.   r/   fused_dual_residual_rmsnorm   s>   "
	r   c                 C   s   t jddt j}|| }t d|}||k }	t j|| | |	dd}
|
t j}t t j|| dd| | }t j|| |	dd}|t j}|| | }t j	| | | ||	d d S r[   )
r   r   r   int64r   r   r   r_   r`   r!   )r"   rb   
weight_ptrrZ   rY   r   r$   rf   r&   r   rg   rh   ri   rk   rl   a_rmsr.   r.   r/   fused_rmsnorm_kernel   s   	 

r   c           
   	   C   s   t | jdks	J |r| }nt| }| j\}}trdnd}t|tttt	|d|dd}	t
|f || |f||d|	 |S )Nr   r   r   r   r   r8   rr   )rs   rt   r9   r:   rv   r2   rw   rx   ry   r3   r   )
r'   weightrZ   r=   inplacer-   r~   rY   r   r   r.   r.   r/   fused_rmsnorm   s&   

r   c                	   @   s   e Zd ZdZdddZdd Zdejd	ejdeejejf fd
dZ		ddejd	ejdeejejf fddZ
dejd	ejdeejejf fddZdejd	ejdeejejf fddZdS )FusedDualResidualRMSNormzK
    Fused implementation of
    y = RMSNorm2(RMSNorm1(x) + residual))
    rK   Nc                 C   sF   || _ || _| j j| _| j j| jjksJ | j jj| jjjks!J d S rA   )rmsnorm1rmsnorm2variance_epsilonr   rt   )rB   r   r   r.   r.   r/   rC     s
   
z!FusedDualResidualRMSNorm.__init__c                 O   rD   rA   rE   rG   r.   r.   r/   rI   !  rJ   z!FusedDualResidualRMSNorm.__call__r'   rz   c                 C   s   |j r	| ||S | ||S rA   )rL   rM   forward_flashinfer)rB   r'   rz   r.   r.   r/   rF   $  s   z FusedDualResidualRMSNorm.forwardFc                 C   s   t ||| jj| jj| j|dS rR   )r   r   r   r   r   )rB   r'   rz   r=   r.   r.   r/   rM   ,  s   z%FusedDualResidualRMSNorm.forward_cudac                 C   s    |  |}|| }| ||fS rA   )r   r   rB   r'   rz   normed1r.   r.   r/   r   8  s   
z+FusedDualResidualRMSNorm.forward_flashinferc                 C   s$   | j |}|| }| j||fS rA   )r   rN   r   r   r.   r.   r/   rN   A  s   z'FusedDualResidualRMSNorm.forward_native)rK   NrS   )rT   rU   rV   __doc__rC   rI   r9   rW   r   rF   rM   r   rN   r.   r.   r.   r/   r     sB    

	

	r   	combine_kc                 C   s   t d}|| }|| | }t d|}	|	|k }
t d|}t j|| |d d d f |  |	d d d f  |
d d d f dd}t j|dd}t j|| |	 |
dd}|| d }t j| | |	 ||
d d S )Nr   r\   r]   r   g;f?r   )r   r   r   r   r`   r!   )out_hidden_statesmoe_hidden_statesmlp_hidden_statesr   rY   r   r$   start_index_mlpstart_index_rmoer&   r   combine_k_offsetsmoe_xmlp_x
combined_xr.   r.   r/   experts_combine_kernelK  s*   
	r   r   )	out_shaper   output_bufferrK   c           	   	   C   s   |   sJ |  sJ t| jdkrd}n| jd }|d u r%t|}n||jd}| | ks8J |d |  |j}|j\}}t	
|ttt	
t	|dddd}t|f || |||fi | |S )Nr   r   r   r   r   r8   )is_contiguousrs   rt   r9   r:   viewr1   reshaper;   r2   rw   rx   ry   r3   r   )	r   r   r   r   r   flat_output_bufferr~   rY   r   r.   r.   r/   experts_combine_tritonj  s8   

	r   	quant_maxstatic_scalec                 C   s   t jdd}|| d }|| }	t d|}
t d||k }|t d| }t d|}t j|| |
 |ddt j}t j|| | |ddt j}ddt |d   | }|||jj }|d urit	 t j
| |	 | ||d	 d S )
Nr   r   r   r\   r]   g      ?g      ?g;f?r   )r   r   r   r   r   r   erfr1   
element_tyNotImplementedErrorr!   )out_hidden_states_ptrout_scales_ptrhidden_states_ptrr   r   rY   r   r$   rf   output_startinput1_offsr   input3_offsoutput_offsx1x3gelu_x1outr.   r.   r/   gelu_and_mul_kernel  s,   
r   c              	   C     | j \}}|d }|d u rtj||f|p| j| jd}n|j ||fks%J |j|p+| jks/J |}d }d}	|d urM|d u rItj|ftj| jd}n|}d}	trQdnd}
dttt	
t	|d|
d	i}t|f ||| f|d urvt|jnd |	|t	
|d
| |d ur||fS |d fS Nr   )r1   deviceFTr   r   r
   r   r   )r   r   rY   r   )rt   r9   emptyr1   r   r   rv   rx   ry   r2   rw   r3   r   finfohidden_statesscalesquantizer   r~   in_hidden_dimrY   r   
out_scalesr   r   r   r.   r.   r/   gelu_and_mul_triton  P   
r   c                 C   s   t jdd}|| d }|| }	t d|}
t d||k }|t d| }t d|}t j|| |
 |ddt j}t j|| | |ddt j}|t | }|||jj }|d urct	 t j
| |	 | ||d d S )Nr   r   r   r\   r]   r   )r   r   r   r   r   r   sigmoidr1   r   r   r!   )r   r   r   r   r   rY   r   r$   rf   r   r   r   r   r   r   r   silu_x1r   r.   r.   r/   silu_and_mul_kernel  s,   
r   c              	   C   r   r   )rt   r9   r   r1   r   r   rv   rx   ry   r2   rw   r3   r   r   r   r.   r.   r/   silu_and_mul_triton  r   r   rS   )FFrA   )NNN)$typingr   r   r9   r2   triton.languagelanguager   sglang.srt.utilsr   sglang.srt.utils.custom_opr   rv   r=   Configfused_softcap_autotunejit	constexprr0   r<   r?   r@   rmsnorm_autotunerq   ru   r   r   r   r   r   rW   r   r   r   r   r   r.   r.   r.   r/   <module>   sh   
	
 !"#%*	-
#
7,'
8'