o
    ÓÙ¾iza  ã                   @   sR  d dl Z d dlZd dlmZ d dlmZ d dlZd dlmZ d dlm	Z	 d dl
m  m  m  mZ ddlmZ ddlmZ eG dd	„ d	eƒƒZeG d
d„ deƒƒZejdeje dejfdd„ƒZej		ddejdejdejdedejdeje deje fdd„ƒZej		ddejdejdejdejdeje deje fdd„ƒZdS )é    N)ÚTuple)Ú	dataclass)ÚFloat32é   )Ú
ParamsBase)ÚSeqlenInfoQKc                   @   sb  e Zd ZU eed< eje ed< ej	ed< ej	ed< dZ
eje ed< dZedB ed< e		d%dedeje deje dedB fd	d
„ƒZd&dd„Z	d'dejdeeB dB defdd„Z	d'dejdeeB dB defdd„Zej		d(dej	deje deje dej	fdd„ƒZej	d)dedeej	B dB dej	fdd „ƒZejd!ej	d"ej	ddfd#d$„ƒZdS )*ÚSoftmaxÚ
scale_log2Únum_rowsÚrow_maxÚrow_suméP   ÚarchNÚsoftmax_scalec                 C   s*   t  |t¡}t  |t¡}t| |||||ƒS ©N)ÚcuteÚmake_rmem_tensorr   r   )r	   r
   r   r   r   r   © r   úb/home/ubuntu/.local/lib/python3.10/site-packages/sglang/jit_kernel/flash_attention/cute/softmax.pyÚcreate   s   zSoftmax.createÚreturnc                 C   s    | j  tj ¡ | j d¡ d S )Nç        )r   Úfillr   Úinfr   )Úselfr   r   r   Úreset%   s   zSoftmax.resetÚ	acc_S_rowÚinit_valc                 C   ó   t j||| jdS ©N)r   )ÚutilsÚfmax_reducer   )r   r   r   r   r   r   Ú_compute_row_max)   ó   zSoftmax._compute_row_maxÚacc_S_row_expc                 C   r   r   )r    Úfadd_reducer   )r   r$   r   r   r   r   Ú_compute_row_sum.   r#   zSoftmax._compute_row_sumFTÚacc_SÚis_firstÚ	check_infc                 C   sh  t  |¡}t | jt¡}| j}| j}| j}| j}	t	j
t |¡ddD ]}
||
df  ¡ }t j|t	 | ¡r9||
 nd|	d}t j|tjjdd}||
 }|||
< t	 |¡r_|tj kr]dn|}t	 |¡r~|| }t  || | ¡}t j|d|	d}d||
< n&|| }t  || | ¡}t  || | ¡||
< t j|||
 ||
  |	d}|||
< ||
df  |¡ q"|S )	zÝApply online softmax and return the row_scale to rescale O.

        :param acc_S: acc_S tensor
        :type acc_S: cute.Tensor
        :param is_first: is first n_block
        :type is_first: cutlass.Constexpr
        T©Úunroll_fullN)r   r   é   ©Úwidthr   ç      ð?)r    Úmake_acc_tensor_mn_viewr   Úmake_fragment_liker   r   r   r	   r   ÚcutlassÚrangeÚsizeÚloadr!   Ú
const_exprÚwarp_reduceÚfmaxr   Úexp2fr%   Ústore)r   r'   r(   r)   Úacc_S_mnÚ	row_scaler   r   r	   r   Úrr   Úrow_max_curÚrow_max_prevÚrow_max_cur_scaledr$   Úacc_S_row_sumr   r   r   Úonline_softmax3   s@   
ý


ÿzSoftmax.online_softmaxr/   Úfinal_scaleÚsink_valc              	   C   s\  t  |duot|tjƒ¡rt |¡t | j¡ksJ ‚| j}| j}| j}| 	t
j| ¡ tjdd¡ t |t¡}t jt |¡ddD ]j}t  |du¡rot|tjƒsR|n|| }t tj¡}	||  t
 ||	 || |  ¡7  < || dkp||| || k}
tj |
s†|| nd¡| ||< || }t d¡}|
s¥|| | t
 |¡ | ntj ||< qA|S )	zAFinalize the online softmax by computing the scale and logsumexp.Nr,   r-   Tr*   r   r/   g       @)r2   r6   Ú
isinstancer   ÚTensorr4   r   r   r	   r:   r    r7   r5   ÚoperatorÚaddr1   r   r3   ÚmathÚlog2Úer9   r   Ú
rcp_approxÚlogÚlog2fr   )r   rC   rD   r   r   r	   r<   r=   Úsink_val_curÚLOG2_EÚacc_O_mn_row_is_zero_or_nanÚrow_sum_curÚLN2r   r   r   Úfinalizeq   s.   &þ
ÿýzSoftmax.finalizeÚacc_Or<   c                 C   sl   t  |¡}t |¡tj|dgdksJ ‚tjt |¡ddD ]}||df  ||df  ¡ ||  ¡ qdS )zÏScale each row of acc_O by the given scale tensor.
        :param acc_O: input tensor
        :type acc_O: cute.Tensor
        :param row_scale: row_scale tensor
        :type row_scale: cute.Tensor
        r   ©ÚmodeTr*   N)r    r0   r   r4   r2   r3   r:   r5   )r   rU   r<   Úacc_O_mnr=   r   r   r   Ú	rescale_O”   s
   
(ÿzSoftmax.rescale_O)r   N)r   Nr   )FT)r/   N)Ú__name__Ú
__module__Ú__qualname__r   Ú__annotations__r2   Ú	ConstexprÚintr   rF   r   r   Ústaticmethodr   r   Ú	TensorSSAÚfloatr"   r&   ÚjitÚboolrB   rT   rY   r   r   r   r   r      sv   
 

üÿþýü

ÿÿ
ÿ
þÿÿ
ÿ
þüþýüû=ÿÿÿþ" r   c                   @   s  e Zd ZU dZeje ed< e		d!de	deje de	dB fdd„ƒZ
ejdejd	ed
ee	e	f fdd„ƒZ	d"dejde	d	ed
dfdd„Zejdejde	fdd„ƒZej				d#dejdejdeje deje deje deje fdd„ƒZejdejde	dejfdd „ƒZdS )$ÚSoftmaxSm100r   Úrescale_thresholdNr	   r   c              	   C   s6   d}d}t  |t¡}t  |t¡}t| ||||||dS )Nr   éd   )rf   )r   r   r   re   )r	   rf   r   r
   r   r   r   r   r   r   r   ¦   s   ùzSoftmaxSm100.creater   r(   r   c                 C   s°   t  |¡r|  |¡}|t jj kr|nd}d}n7| jd }| j||d}|t jj kr-|nd}|| | j }t |¡}t  | j	dk¡rO|| j	 krO|}|}d}|| jd< ||fS )Nr   r   ©r   r/   )
r2   r6   r"   r   r   r   r	   r    r9   rf   )r   r   r(   Úrow_max_newÚrow_max_safeÚ	acc_scaleÚrow_max_oldÚ
acc_scale_r   r   r   Úupdate_row_maxº   s    




zSoftmaxSm100.update_row_maxFr$   r<   c                 C   s6   t  | ¡r| jd | nd }| j||d| jd< d S )Nr   rh   )r2   r6   r   r&   )r   r$   r<   r(   r   r   r   r   Úupdate_row_sumÎ   s   zSoftmaxSm100.update_row_sumr   c                 C   sˆ   t  |j¡d dksJ dƒ‚|| j }tjdt  |j¡dddD ]!}t || ||d  f| j| jf| | f¡\||< ||d < q d S )Né   r   ú.acc_S_row must have an even number of elementsTr*   r   )r   r4   Úshaper	   r2   r3   r    Úfma_packed_f32x2)r   r   r   Úrow_max_scaledÚir   r   r   Úscale_subtract_rowmax×   s   


ýÿz"SoftmaxSm100.scale_subtract_rowmaxé   r,   r   Úacc_S_row_convertedÚe2eÚe2e_freqÚe2e_resÚe2e_frg_limitc                 C   s¼  t  |j¡d dksJ dƒ‚d}|d dksJ ‚t  |¡| }t  |¡| dks*J ‚t  |t  |¡¡}	t  |t  |¡¡}
t |¡D ]š}t dt j|	dgdd¡D ]w}t | ¡ryt j 	|	||f ¡|	||f< t j 	|	|d |f ¡|	|d |f< qPt || || k pˆ||| k¡r«t j 	|	||f ¡|	||f< t j 	|	|d |f ¡|	|d |f< qPt
 |	||f |	|d |f ¡\|	||f< |	|d |f< qP|
d |f  |	d |f  ¡  |j¡¡ qAd S )Nrp   r   rq   é    rV   r   )r   r4   rr   Úlogical_divideÚmake_layoutr2   Úrange_constexprr6   r   Úexp2r    Úex2_emulation_2r:   r5   ÚtoÚelement_type)r   r   rx   ry   rz   r{   r|   Úfrg_tileÚfrg_cntÚacc_S_row_frgÚacc_S_row_converted_frgÚjÚkr   r   r   Úapply_exp2_convertæ   s4   

ÿ&ÿ&ÿÿîzSoftmaxSm100.apply_exp2_convertc                 C   st  t  |j¡d dksJ dƒ‚| | j }t dt  |j¡d¡D ]}t || ||d  f| j| jf||f¡\||< ||d < qd}|d dksIJ ‚t  |¡| }t  |¡| dks[J ‚t  |t  	|¡¡}t  |t  	|¡¡}	t |¡D ]E}
t dt j|dgdd¡D ]"}t j
 |||
f ¡|||
f< t j
 ||d |
f ¡||d |
f< q|	d |
f  |d |
f  ¡  |j¡¡ qrd S )Nrp   r   rq   r   r}   rV   )r   r4   rr   r	   r2   r€   r    rs   r~   r   r   r   r:   r5   rƒ   r„   )r   r   r   rx   Úminus_row_max_scaledru   r…   r†   r‡   rˆ   r‰   rŠ   r   r   r   Úscale_apply_exp2_convert  s0   
ý
ÿ
&ÿóz%SoftmaxSm100.scale_apply_exp2_convert)r   N)F)Frw   r,   r   )rZ   r[   r\   rf   r2   r^   rb   r]   r`   r   r   r   rc   ra   r_   r   rn   ro   rF   rv   rd   r‹   r   r   r   r   r   re   ¢   sp   
 ýÿþý"ÿÿÿÿ
þ	þýùþýüûúù(þýüre   Úqhead_per_kvheadr   c                 C   s   t  |dk¡r	| S | | S )z,Convert q_idx to packed format for Pack-GQA.r   )r2   r6   )Úq_idxrŽ   r   r   r   Úfloor_if_packedB  s   r   FÚ	score_modÚvec_sizeÚqk_acc_dtypeÚseqlen_infoÚconstant_q_idxÚtranspose_indicesc           (   
   C   sÐ  t  |¡rt  d¡}t  d¡}n
t  d¡}t  d¡}t  t | j¡¡}t ||¡}t |t j¡}t |t j¡ 	|f¡}t |t j¡}t  |dkoK|du ¡rTt |t j¡}t j
d||ddD ]}t j
|ddD ]}| ||  | ||< t  |dko||du ¡r˜|||  | }|| }|||  }|| | ||< t  |duo¡|	du¡rÛt  |du ¡rÅ|	\}}t|||  | |ƒ}t||ƒ\}}|||< n|	\}}t|||  | |ƒ\}} | ||< qg|du rìt|||  | |ƒ||< |||  | ||< qg| ¡ }!| ¡ }"t  |du ¡r| ¡ }#n|}$t |$t j¡ 	|f¡}#t  |dko#|du ¡r+| ¡ }%nt |t j¡ 	|f¡}%g }&t  |du¡rB|}&||!||%|#|"|
|&d}'| |'¡ t j
|ddD ]}|| | || < qYq]dS )aB  Shared implementation for applying score modification.

    Args:
        score_tensor: The scores to modify (acc_S for flash_fwd, tSrS_t2r for sm100)
        index_tensor: Index positions (tScS for flash_fwd, tScS_t2r for sm100)
        score_mod: The score modification function to apply
        batch_idx: Batch index
        head_idx: Head index
        softmax_scale: Scale to apply
        vec_size: Vector size for processing elements
        qk_acc_dtype: Data type for accumulator
        aux_tensors: Optional aux_tensors for FlexAttention
        fastdiv_mods: Tuple of (seqlen_q_divmod, seqlen_k_divmod) for wrapping
        seqlen_info: Sequence length info
        constant_q_idx: If provided, use this constant for all q_idx values
                        If None, compute q_idx per-element
        qhead_per_kvhead_packgqa: Pack-GQA replication factor. Divide q_idx by this
                                  when greater than 1 so score mods see logical heads.
        transpose_indices: If True, swap q_idx/kv_idx in index_tensor (for bwd kernel where S is transposed)
    r   r   NTr*   ©r   Úkv_idxr”   Úaux_tensors)r2   r6   r   r4   rr   r   ÚInt32r    Úscalar_to_ssaÚbroadcast_tor3   r   Údivmodr5   r:   )(Úscore_tensorÚindex_tensorr‘   Ú	batch_idxÚhead_idxr   r’   r“   r™   Úfastdiv_modsr”   r•   rŽ   r–   Ú	q_idx_posÚ
kv_idx_posÚn_valsÚ	score_vecÚ
kv_idx_vecÚbatch_idx_ssaÚ	q_idx_vecÚhead_idx_vecru   r‰   Úq_idx_packedÚq_idx_logicalÚhead_offsetÚseqlen_q_divmodÚseqlen_k_divmodÚq_idx_flooredÚ_Úq_idx_wrappedÚkv_idx_wrappedÚ	score_ssaÚ
kv_idx_ssaÚ	q_idx_ssaÚq_idx_constÚhead_idx_ssaÚaux_argsÚpost_mod_scoresr   r   r   Úapply_score_mod_innerM  sx   
(


ÿ



ù
ÿÀr»   Úscore_mod_bwdc           *      C   sò  t  |¡rt  d¡}t  d¡}n
t  d¡}t  d¡}t  t | j¡¡}t ||¡}t ||¡}t |t j¡}t |t j¡ 	|f¡}t |t j¡}t  |dkoQ|du ¡rZt |t j¡}t j
d||ddD ]}t j
|ddD ]—}| ||  ||< |||  | ||< t  |dkoŠ|du ¡r¦|||  | }|| }|||  }|| | ||< t  |	duo¯|
du¡rét  |du ¡rÓ|
\}}t|||  | |ƒ}t||ƒ\} }!|!||< n|
\} }t|||  | |ƒ\} }"|"||< qm|du rút|||  | |ƒ||< |||  | ||< qm| ¡ }#| ¡ }$| ¡ }%t  |du ¡r| ¡ }&nt |t j¡ 	|f¡}&t  |dko3|du ¡r;| ¡ }'nt |t j¡ 	|f¡}'g }(t  |	du¡rR|	}(||#|$||'|&|%||(d})| |)¡ t j
|ddD ]}|| | || < qjqcdS )aÅ  Apply backward score modification (joint graph).

    Args:
        grad_tensor: in/out: dlogits rewritten in-place with d(scaled_scores)
        score_tensor: pre-mod scores (unscaled QK tile), scaled by softmax_scale internally
        index_tensor: Index positions (same as forward)
        score_mod_bwd: The backward score modification function (joint graph)
        batch_idx: Batch index
        head_idx: Head index
        softmax_scale: Scale to apply to score_tensor
        vec_size: Vector size for processing elements
        qk_acc_dtype: Data type for accumulator
        aux_tensors: Optional aux_tensors for FlexAttention
        fastdiv_mods: Tuple of (seqlen_q_divmod, seqlen_k_divmod) for wrapping
        seqlen_info: Sequence length info
        constant_q_idx: If provided, use this constant for all q_idx values
        qhead_per_kvhead: Pack-GQA replication factor
        transpose_indices: If True, swap q_idx/kv_idx in index_tensor
    r   r   NTr*   r—   )r2   r6   r   r4   rr   Úmake_fragmentrš   r    r›   rœ   r3   r   r   r5   r:   )*Úgrad_tensorrž   rŸ   r¼   r    r¡   r   r’   r“   r™   r¢   r”   r•   rŽ   r–   r£   r¤   r¥   Úgrad_vecr¦   r§   r¨   r©   rª   ru   r‰   r«   r¬   r­   r®   r¯   r°   r±   r²   r³   Úgrad_ssar´   rµ   r¶   r¸   r¹   Úgrad_out_ssar   r   r   Úapply_score_mod_bwd_innerÏ  s~   
(


ÿ



ø
ÿÃrÂ   )r   F)rI   rG   Útypingr   Údataclassesr   r2   Úcutlass.cuter   r   Ú,sglang.jit_kernel.flash_attention.cute.utilsÚ
jit_kernelÚflash_attentionr    Úcute_dsl_utilsr   r”   r   r   re   rc   r^   r_   rF   r   rd   r»   rÂ   r   r   r   r   Ú<module>   sr      þý
òýùøõôóò ñüø	÷óòñ