o
    ½Ù¾iIa  ã                   @   sF  d dl Z d dlZd dlmZ d dlmZ d dlZd dlmZ d dlm	Z	 d dl
m  mZ d dlmZ d dlmZ eG dd„ deƒƒZeG d	d
„ d
eƒƒZejdeje dejfdd„ƒZej		ddejdejdejdedejdeje deje fdd„ƒZej		ddejdejdejdejdeje deje fdd„ƒZdS )é    N)ÚTuple)Ú	dataclass)ÚFloat32)Ú
ParamsBase)ÚSeqlenInfoQKc                   @   sb  e Zd ZU eed< eje ed< ej	ed< ej	ed< dZ
eje ed< dZedB ed< e		d%dedeje deje dedB fd	d
„ƒZd&dd„Z	d'dejdeeB dB defdd„Z	d'dejdeeB dB defdd„Zej		d(dej	deje deje dej	fdd„ƒZej	d)dedeej	B dB dej	fdd „ƒZejd!ej	d"ej	ddfd#d$„ƒZdS )*ÚSoftmaxÚ
scale_log2Únum_rowsÚrow_maxÚrow_suméP   ÚarchNÚsoftmax_scalec                 C   s*   t  |t¡}t  |t¡}t| |||||ƒS ©N)ÚcuteÚmake_rmem_tensorr   r   )r   r	   r   r   r
   r   © r   úR/home/ubuntu/.local/lib/python3.10/site-packages/flash_attn_origin/cute/softmax.pyÚcreate   s   zSoftmax.createÚreturnc                 C   s    | j  tj ¡ | j d¡ d S )Nç        )r
   Úfillr   Úinfr   )Úselfr   r   r   Úreset%   s   zSoftmax.resetÚ	acc_S_rowÚinit_valc                 C   ó   t j||| jdS ©N)r   )ÚutilsÚfmax_reducer   )r   r   r   r   r   r   Ú_compute_row_max)   ó   zSoftmax._compute_row_maxÚacc_S_row_expc                 C   r   r   )r   Úfadd_reducer   )r   r#   r   r   r   r   Ú_compute_row_sum.   r"   zSoftmax._compute_row_sumFTÚacc_SÚis_firstÚ	check_infc                 C   sh  t  |¡}t | jt¡}| j}| j}| j}| j}	t	j
t |¡ddD ]}
||
df  ¡ }t j|t	 | ¡r9||
 nd|	d}t j|tjjdd}t	 |¡rW|tj krUdn|}t	 |¡rv|| }t  || | ¡}t j|d|	d}d||
< n*||
 }|| }t  || | ¡}t  || | ¡||
< t j|||
 ||
  |	d}|||
< |||
< ||
df  |¡ q"|S )	zÝApply online softmax and return the row_scale to rescale O.

        :param acc_S: acc_S tensor
        :type acc_S: cute.Tensor
        :param is_first: is first n_block
        :type is_first: cutlass.Constexpr
        T©Úunroll_fullN)r   r   é   ©Úwidthr   ç      ð?)r   Úmake_acc_tensor_mn_viewr   Úmake_fragment_liker
   r   r   r   r   ÚcutlassÚrangeÚsizeÚloadr    Ú
const_exprÚwarp_reduceÚfmaxr   Úexp2fr$   Ústore)r   r&   r'   r(   Úacc_S_mnÚ	row_scaler
   r   r   r   Úrr   Úrow_max_curÚrow_max_cur_scaledr#   Úacc_S_row_sumÚrow_max_prevr   r   r   Úonline_softmax3   s@   
ý


ÿzSoftmax.online_softmaxr.   Úfinal_scaleÚsink_valc              	   C   s\  t  |duot|tjƒ¡rt |¡t | j¡ksJ ‚| j}| j}| j}| 	t
j| ¡ tjdd¡ t |t¡}t jt |¡ddD ]j}t  |du¡rot|tjƒsR|n|| }t tj¡}	||  t
 ||	 || |  ¡7  < || dkp||| || k}
tj |
s†|| nd¡| ||< || }t d¡}|
s¥|| | t
 |¡ | ntj ||< qA|S )	zAFinalize the online softmax by computing the scale and logsumexp.Nr+   r,   Tr)   r   r.   g       @)r1   r5   Ú
isinstancer   ÚTensorr3   r   r
   r   r9   r   r6   r4   ÚoperatorÚaddr0   r   r2   ÚmathÚlog2Úer8   r   Ú
rcp_approxÚlogÚlog2fr   )r   rB   rC   r   r
   r   r;   r<   Úsink_val_curÚLOG2_EÚacc_O_mn_row_is_zero_or_nanÚrow_sum_curÚLN2r   r   r   Úfinalizeo   s.   &þ
ÿýzSoftmax.finalizeÚacc_Or;   c                 C   sl   t  |¡}t |¡tj|dgdksJ ‚tjt |¡ddD ]}||df  ||df  ¡ ||  ¡ qdS )zÏScale each row of acc_O by the given scale tensor.
        :param acc_O: input tensor
        :type acc_O: cute.Tensor
        :param row_scale: row_scale tensor
        :type row_scale: cute.Tensor
        r   ©ÚmodeTr)   N)r   r/   r   r3   r1   r2   r9   r4   )r   rT   r;   Úacc_O_mnr<   r   r   r   Ú	rescale_O’   s
   
(ÿzSoftmax.rescale_O)r   N)r   Nr   )FT)r.   N)Ú__name__Ú
__module__Ú__qualname__r   Ú__annotations__r1   Ú	ConstexprÚintr   rE   r   r   Ústaticmethodr   r   Ú	TensorSSAÚfloatr!   r%   ÚjitÚboolrA   rS   rX   r   r   r   r   r      sv   
 

üÿþýü

ÿÿ
ÿ
þÿÿ
ÿ
þüþýüû;ÿÿÿþ" r   c                   @   s  e Zd ZU dZeje ed< e		d!de	deje de	dB fdd„ƒZ
ejdejd	ed
ee	e	f fdd„ƒZ	d"dejde	d	ed
dfdd„Zejdejde	fdd„ƒZej				d#dejdejdeje deje deje deje fdd„ƒZejdejde	dejfdd „ƒZdS )$ÚSoftmaxSm100r   Úrescale_thresholdNr   r   c              	   C   s6   d}d}t  |t¡}t  |t¡}t| ||||||dS )Né   éd   )re   )r   r   r   rd   )r   re   r   r	   r   r
   r   r   r   r   r   ¤   s   ùzSoftmaxSm100.creater   r'   r   c                 C   s°   t  |¡r|  |¡}|t jj kr|nd}d}n7| jd }| j||d}|t jj kr-|nd}|| | j }t |¡}t  | j	dk¡rO|| j	 krO|}|}d}|| jd< ||fS )Nr   r   ©r   r.   )
r1   r5   r!   r   r   r
   r   r   r8   re   )r   r   r'   Úrow_max_newÚrow_max_safeÚ	acc_scaleÚrow_max_oldÚ
acc_scale_r   r   r   Úupdate_row_max¸   s    




zSoftmaxSm100.update_row_maxFr#   r;   c                 C   s6   t  | ¡r| jd | nd }| j||d| jd< d S )Nr   rh   )r1   r5   r   r%   )r   r#   r;   r'   r   r   r   r   Úupdate_row_sumÌ   s   zSoftmaxSm100.update_row_sumr
   c                 C   sˆ   t  |j¡d dksJ dƒ‚|| j }tjdt  |j¡dddD ]!}t || ||d  f| j| jf| | f¡\||< ||d < q d S )Né   r   ú.acc_S_row must have an even number of elementsTr)   rf   )r   r3   Úshaper   r1   r2   r   Úfma_packed_f32x2)r   r   r
   Úrow_max_scaledÚir   r   r   Úscale_subtract_rowmaxÕ   s   


ýÿz"SoftmaxSm100.scale_subtract_rowmaxé   r+   rf   Úacc_S_row_convertedÚe2eÚe2e_freqÚe2e_resÚe2e_frg_limitc                 C   s¼  t  |j¡d dksJ dƒ‚d}|d dksJ ‚t  |¡| }t  |¡| dks*J ‚t  |t  |¡¡}	t  |t  |¡¡}
t |¡D ]š}t dt j|	dgdd¡D ]w}t | ¡ryt j 	|	||f ¡|	||f< t j 	|	|d |f ¡|	|d |f< qPt || || k pˆ||| k¡r«t j 	|	||f ¡|	||f< t j 	|	|d |f ¡|	|d |f< qPt
 |	||f |	|d |f ¡\|	||f< |	|d |f< qP|
d |f  |	d |f  ¡  |j¡¡ qAd S )Nrp   r   rq   é    rU   rf   )r   r3   rr   Úlogical_divideÚmake_layoutr1   Úrange_constexprr5   r   Úexp2r   Úex2_emulation_2r9   r4   ÚtoÚelement_type)r   r   rx   ry   rz   r{   r|   Úfrg_tileÚfrg_cntÚacc_S_row_frgÚacc_S_row_converted_frgÚjÚkr   r   r   Úapply_exp2_convertä   s4   

ÿ&ÿ&ÿÿîzSoftmaxSm100.apply_exp2_convertc                 C   st  t  |j¡d dksJ dƒ‚| | j }t dt  |j¡d¡D ]}t || ||d  f| j| jf||f¡\||< ||d < qd}|d dksIJ ‚t  |¡| }t  |¡| dks[J ‚t  |t  	|¡¡}t  |t  	|¡¡}	t |¡D ]E}
t dt j|dgdd¡D ]"}t j
 |||
f ¡|||
f< t j
 ||d |
f ¡||d |
f< q|	d |
f  |d |
f  ¡  |j¡¡ qrd S )Nrp   r   rq   rf   r}   rU   )r   r3   rr   r   r1   r€   r   rs   r~   r   r   r   r9   r4   rƒ   r„   )r   r   r
   rx   Úminus_row_max_scaledru   r…   r†   r‡   rˆ   r‰   rŠ   r   r   r   Úscale_apply_exp2_convert  s0   
ý
ÿ
&ÿóz%SoftmaxSm100.scale_apply_exp2_convert)r   N)F)Frw   r+   rf   )rY   rZ   r[   re   r1   r]   ra   r\   r_   r   r   r   rb   r`   r^   r   rn   ro   rE   rv   rc   r‹   r   r   r   r   r   rd       sp   
 ýÿþý"ÿÿÿÿ
þ	þýùþýüûúù(þýürd   Úqhead_per_kvheadr   c                 C   s   t  |dk¡r	| S | | S )z,Convert q_idx to packed format for Pack-GQA.rf   )r1   r5   )Úq_idxrŽ   r   r   r   Úfloor_if_packed@  s   r   rf   FÚ	score_modÚvec_sizeÚqk_acc_dtypeÚseqlen_infoÚconstant_q_idxÚtranspose_indicesc           (   
   C   sÐ  t  |¡rt  d¡}t  d¡}n
t  d¡}t  d¡}t  t | j¡¡}t ||¡}t |t j¡}t |t j¡ 	|f¡}t |t j¡}t  |dkoK|du ¡rTt |t j¡}t j
d||ddD ]}t j
|ddD ]}| ||  | ||< t  |dko||du ¡r˜|||  | }|| }|||  }|| | ||< t  |duo¡|	du¡rÛt  |du ¡rÅ|	\}}t|||  | |ƒ}t||ƒ\}}|||< n|	\}}t|||  | |ƒ\}} | ||< qg|du rìt|||  | |ƒ||< |||  | ||< qg| ¡ }!| ¡ }"t  |du ¡r| ¡ }#n|}$t |$t j¡ 	|f¡}#t  |dko#|du ¡r+| ¡ }%nt |t j¡ 	|f¡}%g }&t  |du¡rB|}&||!||%|#|"|
|&d}'| |'¡ t j
|ddD ]}|| | || < qYq]dS )aB  Shared implementation for applying score modification.

    Args:
        score_tensor: The scores to modify (acc_S for flash_fwd, tSrS_t2r for sm100)
        index_tensor: Index positions (tScS for flash_fwd, tScS_t2r for sm100)
        score_mod: The score modification function to apply
        batch_idx: Batch index
        head_idx: Head index
        softmax_scale: Scale to apply
        vec_size: Vector size for processing elements
        qk_acc_dtype: Data type for accumulator
        aux_tensors: Optional aux_tensors for FlexAttention
        fastdiv_mods: Tuple of (seqlen_q_divmod, seqlen_k_divmod) for wrapping
        seqlen_info: Sequence length info
        constant_q_idx: If provided, use this constant for all q_idx values
                        If None, compute q_idx per-element
        qhead_per_kvhead_packgqa: Pack-GQA replication factor. Divide q_idx by this
                                  when greater than 1 so score mods see logical heads.
        transpose_indices: If True, swap q_idx/kv_idx in index_tensor (for bwd kernel where S is transposed)
    rf   r   NTr)   ©r   Úkv_idxr”   Úaux_tensors)r1   r5   r   r3   rr   r   ÚInt32r   Úscalar_to_ssaÚbroadcast_tor2   r   Údivmodr4   r9   )(Úscore_tensorÚindex_tensorr‘   Ú	batch_idxÚhead_idxr   r’   r“   r™   Úfastdiv_modsr”   r•   rŽ   r–   Ú	q_idx_posÚ
kv_idx_posÚn_valsÚ	score_vecÚ
kv_idx_vecÚbatch_idx_ssaÚ	q_idx_vecÚhead_idx_vecru   r‰   Úq_idx_packedÚq_idx_logicalÚhead_offsetÚseqlen_q_divmodÚseqlen_k_divmodÚq_idx_flooredÚ_Úq_idx_wrappedÚkv_idx_wrappedÚ	score_ssaÚ
kv_idx_ssaÚ	q_idx_ssaÚq_idx_constÚhead_idx_ssaÚaux_argsÚpost_mod_scoresr   r   r   Úapply_score_mod_innerK  sx   
(


ÿ



ù
ÿÀr»   Úscore_mod_bwdc           *      C   sò  t  |¡rt  d¡}t  d¡}n
t  d¡}t  d¡}t  t | j¡¡}t ||¡}t ||¡}t |t j¡}t |t j¡ 	|f¡}t |t j¡}t  |dkoQ|du ¡rZt |t j¡}t j
d||ddD ]}t j
|ddD ]—}| ||  ||< |||  | ||< t  |dkoŠ|du ¡r¦|||  | }|| }|||  }|| | ||< t  |	duo¯|
du¡rét  |du ¡rÓ|
\}}t|||  | |ƒ}t||ƒ\} }!|!||< n|
\} }t|||  | |ƒ\} }"|"||< qm|du rút|||  | |ƒ||< |||  | ||< qm| ¡ }#| ¡ }$| ¡ }%t  |du ¡r| ¡ }&nt |t j¡ 	|f¡}&t  |dko3|du ¡r;| ¡ }'nt |t j¡ 	|f¡}'g }(t  |	du¡rR|	}(||#|$||'|&|%||(d})| |)¡ t j
|ddD ]}|| | || < qjqcdS )aÅ  Apply backward score modification (joint graph).

    Args:
        grad_tensor: in/out: dlogits rewritten in-place with d(scaled_scores)
        score_tensor: pre-mod scores (unscaled QK tile), scaled by softmax_scale internally
        index_tensor: Index positions (same as forward)
        score_mod_bwd: The backward score modification function (joint graph)
        batch_idx: Batch index
        head_idx: Head index
        softmax_scale: Scale to apply to score_tensor
        vec_size: Vector size for processing elements
        qk_acc_dtype: Data type for accumulator
        aux_tensors: Optional aux_tensors for FlexAttention
        fastdiv_mods: Tuple of (seqlen_q_divmod, seqlen_k_divmod) for wrapping
        seqlen_info: Sequence length info
        constant_q_idx: If provided, use this constant for all q_idx values
        qhead_per_kvhead: Pack-GQA replication factor
        transpose_indices: If True, swap q_idx/kv_idx in index_tensor
    rf   r   NTr)   r—   )r1   r5   r   r3   rr   Úmake_fragmentrš   r   r›   rœ   r2   r   r   r4   r9   )*Úgrad_tensorrž   rŸ   r¼   r    r¡   r   r’   r“   r™   r¢   r”   r•   rŽ   r–   r£   r¤   r¥   Úgrad_vecr¦   r§   r¨   r©   rª   ru   r‰   r«   r¬   r­   r®   r¯   r°   r±   r²   r³   Úgrad_ssar´   rµ   r¶   r¸   r¹   Úgrad_out_ssar   r   r   Úapply_score_mod_bwd_innerÍ  s~   
(


ÿ



ø
ÿÃrÂ   )rf   F)rH   rF   Útypingr   Údataclassesr   r1   Úcutlass.cuter   r   Úflash_attn_origin.cute.utilsr   Ú%flash_attn_origin.cute.cute_dsl_utilsr   Ú"flash_attn_origin.cute.seqlen_infor   r   rd   rb   r]   r^   rE   r   rc   r»   rÂ   r   r   r   r   Ú<module>   sr      þý
òýùøõôóò ñüø	÷óòñ