o
    پid                     @   sJ  d Z ddlmZ ddlmZ ddlZddlZddlmZ	 eG dd dZ
ejde	jde	jd	e	jd
e	jde	jf
ddZejde	jde	jde	jd	e	jd
e	jf
ddZejde	jde	jde	jd	e	jd
e	jf
ddZejde	jde	jde	jfddZejde	jde	jde	jfddZejde	jde	jfddZ				dddZdS ) z?
Copyright (c) Ant Financial Service Group and its affiliates.
    )	dataclass)OptionalNc                   @   s   e Zd ZU eed< eed< ejed< ejed< ejed< ejed< dZeed< dZeed	< dZ	eed
< dZ
eed< dZeej ed< dS )	SegLaMeta
batch_sizemax_q_length	q_offsets	s_offsets	q_lengthss_scalesr   s_offsets_strideq_offsets_strides_scales_stridedecay_scales_strideNmask)__name__
__module____qualname__int__annotations__torchTensorr   r   r   r   r   r    r   r   ]/home/ubuntu/.local/lib/python3.10/site-packages/sglang/srt/layers/attention/linear/seg_la.pyr      s   
 



r   HEAD_DIM	SPLIT_DIMBLOCKEVENDECOUPLEc           4   	   C   sX  t d}t d}t d}t || }t || }t || }t || }t ||  }t d|}t d|}t d|}|dkrKd S | ||  ||  |d d d f | |d d d f   } |||  ||  |d d d f | |d d d f   }!|||  ||  ||  |d d d f | |d d d f   }"|||
  ||  ||  |d d d f |
 |d d d f   }#|||	  || |  ||  |d d d f | |d d d f   }$t j|$|dkdt j}%|dkrtd||D ]}&t |&|}&|r4t | |&|  t j}'t t |!|&|  t j}(t |"|&|  t j})nQt j| |&|  |&| d d d f |k ddt j}'t t j|!|&|  |&| d d d f |k ddt j}(t j|"|&|  |&| d d d f |k ddt j})|r|r|}*nt	|||& }*|*d | }+t 
||+ },t |+dk|,d}-t |+dkd|, d}.|'|.d d d f  }'|(|-d d d f  }(t |'|(| }/t |d d d f |d d d f k|/d}/t |/|)}0t 
||* }1|1| }2t |'|%|2 |0 }0|%|1 t |(|) }%nt |'|(| }/t 
||d d d f |d d d f   }-t |d d d f |d d d f k|-d}-|/|-9 }/t |/|)}0t 
||d d d f d  | }3t j|'|3 |%|0d}0|rj|}*nt	|||& }*|*d | }+t |+dk|+d	}+t 
||+ }-t 
||* }1|%|1 t |(|-d d d f  |) }%|rt |#|&|
  |0|jj qt j|#|&|
  |0|jj|&| d d d f |k d qt |$|%|jj d S t t | t j| }'t t |!t j}(t |"t j})|%t 
| |(|)  }%t j|'|% dd
d}0t |#|0|jj t |$|%|jj d S )Nr         r           r   other)acci'  T)axis	keep_dims)tl
program_idloadarangetofloat32rangemultiple_oftransminexpwheredotstoredtype
element_tysum)4QKVSOutsoftmax_scalestride_qstride_kstride_vstride_sstride_or   r   r	   r
   decay_scalesr   r   r   r   r   bidhidsids_scaleq_lengthq_offsets_offsetdecay_scaleoffs_boffs_doffs_sq_ptrsk_ptrsv_ptrsout_ptrss_ptrsstatenqkvbb_offsedbdecays
inv_decaysqkoblock_decayblock_decay_plus	decay_arrr   r   r   seg_la_kernel!   s  


""""
"
 


***"$ 
rd   K_SPLIT_DIMV_SPLIT_DIMc           6      C   s  t d}t d}t d}|| }|| }|| }t d}t || }t || }t || }t || }t ||  } t d|}!t d|}"t d|}#|dkr\d S | ||  ||  ||  |!d d d f | |"d d d f   }$|||  ||  ||  |!d d d f | |"d d d f   }%|||  ||  ||  |!d d d f | |#d d d f   }&|||
  || |  ||  ||  |!d d d f | | |#d d d f   }'|||	  || |  || |  ||  |"d d d f | |#d d d f   }(t j|(|dkdt j})td||D ]J}*t |*|}*|rit |$|*|  t j}+t 	t |%|*|  t j},t |&|*|  t j}-|}.|.d |! }/t 
| |/ }0d|0 }1nyt j|$|*|  |*|! d d d f |k ddt j}+t 	t j|%|*|  |*|! d d d f |k ddt j},t j|&|*|  |*|! d d d f |k ddt j}-t|||* }.|.d |! }/t 
| |/ }2t |/dk|2d}0t |/dkd|2 d}1|+|1d d d f  }+|,|0d d d f  },t |+|,| }3t |!d d d f |!d d d f k|3d}3t |3|-}4t 
| |. }5t |+|)|5 | |4 }4|)|5 t |,|- })|rLt |'|*| |  |4|jj qt j|'|*| |  |4|jj|*|! d d d f |k d qt |(|)|jj d S )Nr   r   r   r    r!   r"   r#   )r(   r)   num_programsr*   r+   r,   r-   r.   r/   r0   r2   r1   r3   r4   r5   r6   r7   )6r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   r   r   r	   r
   rD   r   re   rf   r   r   rE   rF   kvidNkidvidHrH   rI   rJ   rK   rL   rM   offs_koffs_vrP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   rZ   r[   r]   r^   block_decaysr_   r`   ra   r   r   r   seg_la_p_kernel   s   



"""	
&	

" 
"
"*&
rp   c           7      C   s  t d}t d}t d}|| }|| }|| }t d}t || }t || }t || }t || } t ||  }!t d|}"t d|}#t d|}$| dkr\d S | ||  ||  ||  |"d d d f | |#d d d f   }%|||  ||  ||  |"d d d f | |#d d d f   }&|||	  ||  ||  |"d d d f |	 |$d d d f   }'|||  || |  ||  ||  |"d d d f | | |$d d d f   }(|| |
  || |  || |  ||  |#d d d f | |$d d d f   })t j|)|dkdt j}*|rut |%t j}+t t |&t j},t |'t j}-t ||| |  t d|d d d f |  t d|d d d f  t j}.t 	|.dd }/t 
|/}0|0|/ }1nt j|%|"d d d f |k dt j}+t t j|&|"d d d f |k dt j},t j|'|"d d d f |k dt j}-t j||| |  t d|d d d f |  t d|d d d f  t d|d d d f |k t d|d d d f |k @ dt j}.t 	|.dd }/t 
|/}0|0|/ }1t |!|1 }2d|2 }3|+|3d d d f  }+|,|2d d d f  },t |+|,| }4|4|.t j }4t |4|-}5t |!|0d  }6t |+|*|6 | |5 }5|rat |(|5|jj d S t j|(|5|jj|"d d d f |k d d S )Nr   r   r   r    r!   )r(   r)   rg   r*   r+   r,   r-   r0   int32r8   maxr2   r4   r5   r6   r7   )7r9   r:   r;   r<   r=   Maskr>   r?   r@   rA   rB   rC   r   r   r	   r
   rD   r   re   rf   r   r   rE   rF   rh   ri   rj   rk   rl   rH   rI   rJ   rK   rL   rM   rm   rn   rP   rQ   rR   rS   rT   rU   rW   rX   rY   r   	positionsmax_posr[   r]   r^   r_   r`   ra   r   r   r   seg_la_s_kernel[  s   



"""	
&	

"


&,&

.rv   c           %      C   s  t d}t d}t d}|| }|| }|| }t d}t || }|dkr-d S t ||  }t d|}t d|}| ||  ||  ||  | }|||  ||  ||  | }|||  ||  ||  | }|||
  || |  ||  ||  | }|||	  || |  || |  ||  |d d d f | |d d d f   }t |t j} t |t j}!t |t j}"t |t j| }#| t | |!d d d f |"  } t j|#d d d f |  dd}$t 	||$|j
j t 	|| |j
j d S Nr   r   r   r    )r&   )r(   r)   rg   r*   r+   r,   r-   r2   r8   r5   r6   r7   )%r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   r   rD   r   re   rf   rE   rF   rh   ri   rj   rk   rl   rK   rL   rm   rn   rP   rQ   rR   rS   rT   rU   rX   rY   rW   r`   r   r   r   seg_la_d_kernel  s`   



   
	

""rx   c           +      C   s  t d}t d}t d}|| }|| }|| }t d}t || }|dkr-d S t t ||  }t d|}t d|}| || |  ||  ||  | }||| |  ||  ||  | } ||| |	  ||  ||  | }!|||  || | |  ||  ||  | }"|||
  || |  || |  ||  |d d d f | |d d d f   }#t |#t j}$t || }|||  || |  || |  ||  |d d d f | |d d d f   }%t|D ]l}&t |t j| }'t | t j}(t |!t j})|$| |(d d d f |)  }$t j	|'d d d f |$ dd}*t 
|"|*|jj t 
|%|$|jj ||7 }| |7 } |!|	7 }!|"|| 7 }"|%|| | 7 }%qd S rw   )r(   r)   rg   r*   r2   r+   r,   r-   r.   r8   r5   r6   r7   )+r9   r:   r;   r<   CACHESr=   r>   r?   r@   rA   rB   stride_crC   r   cache_indicesrD   stepr   re   rf   rE   rF   rh   ri   rj   rk   rl   rK   rL   rm   rn   rP   rQ   rR   rS   rT   rU   c_ptrsirW   rX   rY   r`   r   r   r   seg_la_mtp_kernel)  s   



$$$


"

"	r   DIM	NUM_BLOCKc              	   C   s   t d}t d}t j|ft jd}t|D ]}|t | || |  ||  t d| t j7 }qt 	|||  t d| | d S )Nr   )r6   )
r(   r)   rg   zerosr-   r.   r*   r+   r,   r5   )TOr   r   pidlengthxr~   r   r   r   seg_la_sum_kernel  s   

,$r   Fc
                 C   sz  | j \}
}}|j \}}}|j}|d u r|d }t|
|}||ks%J d|dkrPd}|dkr2dnd}d}d}|| }|| }tj||
||f| j| jd}|||| f}|d urd	}d}|
| }t| | ||||||| 	d
|	d
|	d
|	d
|	d
|	d
|j
||||||||d n|jd ur|jd}|d d d }||k}t| | |||||j|| 	d
|	d
|	d
|	d
|	d
|j
|j|j|j|f|||||||d n>d}|dkr|| d
knd	}t| | |||||| 	d
|	d
|	d
|	d
|	d
|j
|j|j|j||||||||d |dkrJ|
dk r,|d
}|S tj|
||f| j| jd}t|
f |||| |ddd |S |d
 }|S |dkr^d}d}d}d}nd}d}d}d}|| }|| }tj||
||f| j| jd}|||| f}t| | |||||| 	d
|	d
|	d
|	d
|	d
|j
||||||d |dkr|d
}|S |d
 }|S )Ng      z%seg_la does NOT support GQA currentlyr       r   @      )devicer6   Fr   )r   re   rf   	num_warps
num_stagesr          )r   re   rf   r   r   r   r   i   )r   r   r   r      )shaper   tritoncdivr   emptyr   r6   r   strider   r   sizerv   r   r	   r
   rp   r8   r   rx   )rW   rX   rY   srD   metacachesr{   r>   decoupler   qo_headsr   _kv_headsbs
MAX_LENGTHre   rf   r   r   k_dim_blockv_dim_blocktmpgridr   r   r|   msr`   r   r   r   
seg_la_fwd  sD  





upg


=;r   )NNNF)__doc__dataclassesr   typingr   r   r   triton.languagelanguager(   r   jit	constexprrd   rp   rv   rx   r   r   r   r   r   r   r   <module>   s    -  F[