o
    i-                     @   sp   d dl Z d dlZd dlmZ d dlZd dlmZ d dlmZ d dlm  m	Z	 G dd dZ
G dd de
ZdS )    N)Tuple)Float32c                
   @   s   e Zd Z	d!dedeje deje fddZd"d	d
Z	d#de	j
deeB dB defddZ	d#de	j
deeB dB defddZe	j		d$de	jdeje deje de	jfddZe	jd%dede	jfddZe	jde	jde	jddfdd ZdS )&SoftmaxP   
scale_log2num_rowsarchc                 C   s,   || _ t|t| _t| j| _|| _d S N)r   cutemake_fragmentr   row_maxmake_fragment_likerow_sumr   )selfr   r   r    r   M/home/ubuntu/vllm_env/lib/python3.10/site-packages/flash_attn/cute/softmax.py__init__   s   
zSoftmax.__init__returnNc                 C   s    | j tj  | jd d S )N        )r   fillr   infr   )r   r   r   r   reset   s   zSoftmax.reset	acc_S_rowinit_valc                 C      t j||| jdS N)r   )utilsfmax_reducer   )r   r   r   r   r   r   _compute_row_max      zSoftmax._compute_row_maxacc_S_row_expc                 C   r   r   )r   fadd_reducer   )r   r    r   r   r   r   _compute_row_sum#   r   zSoftmax._compute_row_sumFTacc_Sis_first	check_infc                 C   s\  t |}t| jt}tjt| jddD ]}||df 	 }| j
|t| r/| j| ndd}t j|tjjdd}t|rL|tj krJdn|}t|rj|| j }	t || j |	 }
| |
}d||< n.| j| }|| j }	t || j |	 }
t || | j ||< | j|
| j| ||  d}|| j|< || j|< ||df |
 q|S )	zApply online softmax and return the row_scale to rescale O.

        :param acc_S: acc_S tensor
        :type acc_S: cute.Tensor
        :param is_first: is first n_block
        :type is_first: cutlass.Constexpr
        Tunroll_fullNr      widthr         ?)r   make_acc_tensor_mn_viewr
   r   r   r   cutlassrangesizeloadr   
const_exprwarp_reducer   fmaxr   r   exp2fr"   r   store)r   r#   r$   r%   acc_S_mn	row_scalerr   row_max_currow_max_cur_scaledr    acc_S_row_sumrow_max_prevr   r   r   online_softmax(   s4   









zSoftmax.online_softmaxr,   final_scalec                 C   s   | j tj| j  tjdd t| j	t
}tjt| j ddD ]F}| j | dkp4| j | | j | k}tj|s?| j | nd| ||< | j | }td}|sa| j	| | j t| | nt
j | j |< q"|S )zAFinalize the online softmax by computing the scale and logsumexp.r)   r*   Tr&   r   r,   g       @)r   r6   r   r3   r1   operatoraddr
   r   r   r   r.   r/   r0   r   
rcp_approxmathlogr   log2fr   )r   r?   r8   r9   acc_O_mn_row_is_zero_or_nanrow_sum_curLN2r   r   r   finalizeV   s     

 
zSoftmax.finalizeacc_Or8   c                 C   sl   t |}t|tj|dgdksJ tjt|ddD ]}||df ||df  ||   qdS )zScale each row of acc_O by the given scale tensor.
        :param acc_O: input tensor
        :type acc_O: cute.Tensor
        :param row_scale: row_scale tensor
        :type row_scale: cute.Tensor
        r   modeTr&   N)r   r-   r
   r0   r.   r/   r6   r1   )r   rJ   r8   acc_O_mnr9   r   r   r   	rescale_Om   s
   
(zSoftmax.rescale_O)r   )r   Nr	   )FT)r,   )__name__
__module____qualname__r   r.   	Constexprintr   r   r
   	TensorSSAfloatr   r"   jitTensorboolr>   rI   rN   r   r   r   r   r      sT    





- r   c                       s   e Zd Zd dedeje f fddZej	dej
dedeeef fd	d
Z	d!dej
dededdfddZej	dejdefddZej					d"dejdejdeje deje deje deje fddZej	dejdedejfddZ  ZS )#SoftmaxSm100r   r   rescale_thresholdc                    s   t  j|ddd || _d S )N   d   )r   r   )superr   rZ   )r   r   rZ   	__class__r   r   r   |   s   
zSoftmaxSm100.__init__r   r$   r   c                 C   s   t |r| |}|t jj kr|nd}d}n7| jd }| j||d}|t jj kr-|nd}|| | j }t|}t | j	dkrO|| j	 krO|}|}d}|| jd< ||fS )Nr   r   r(   r,   )
r.   r2   r   r   r   r   r   r   r5   rZ   )r   r   r$   row_max_newrow_max_safe	acc_scalerow_max_old
acc_scale_r   r   r   update_row_max   s    




zSoftmaxSm100.update_row_maxFr    r8   Nc                 C   s6   t | r| jd | nd }| j||d| jd< d S )Nr   r(   )r.   r2   r   r"   )r   r    r8   r$   r   r   r   r   update_row_sum   s   zSoftmaxSm100.update_row_sumr   c                 C   s   t |jd dksJ d|| j }tjdt |jdddD ]"}t j|| ||d  f| j| jf| | f\||< ||d < q d S )N   r   .acc_S_row must have an even number of elementsTr&   r[   )r
   r0   shaper   r.   r/   r   fma_packed_f32x2)r   r   r   row_max_scaledir   r   r   scale_subtract_rowmax   s   


z"SoftmaxSm100.scale_subtract_rowmax   r)   r[   acc_S_row_convertede2ee2e_freqe2e_rese2e_frg_limitc                 C   s  t |jd dksJ dd}|d dksJ t || }t || dks*J t |t |}	t |t |}
t|D ]}tdt j|	dgddD ]w}t| ryt j	|	||f |	||f< t j	|	|d |f |	|d |f< qPt|| || k p||| krt j	|	||f |	||f< t j	|	|d |f |	|d |f< qPt
|	||f |	|d |f \|	||f< |	|d |f< qP|
d |f |	d |f  |j qAd S )Nrg   r   rh       rK   r[   )r
   r0   ri   logical_dividemake_layoutr.   range_constexprr2   r   exp2r   e2e_asm2r6   r1   toelement_type)r   r   ro   rp   rq   rr   rs   frg_tilefrg_cntacc_S_row_frgacc_S_row_converted_frgjkr   r   r   apply_exp2_convert   s,   

&"&:zSoftmaxSm100.apply_exp2_convertc                 C   sv  t |jd dksJ d| | j }tdt |jdD ] }t j|| ||d  f| j| jf||f\||< ||d < qd}|d dksJJ t || }t || dks\J t |t 	|}t |t 	|}	t|D ]E}
tdt j|dgddD ]"}t j
|||
f |||
f< t j
||d |
f ||d |
f< q|	d |
f |d |
f  |j qsd S )Nrg   r   rh   r[   rt   rK   )r
   r0   ri   r   r.   rw   r   rj   ru   rv   rx   r6   r1   rz   r{   )r   r   r   ro   minus_row_max_scaledrl   r|   r}   r~   r   r   r   r   r   r   scale_apply_exp2_convert   s0   


&z%SoftmaxSm100.scale_apply_exp2_convert)r   )F)Frn   r)   r[   )rO   rP   rQ   r   r.   rR   rU   r   r
   rV   rT   rS   r   re   rf   rW   rm   rX   r   r   __classcell__r   r   r^   r   rY   {   s\    "
	#rY   )rC   r@   typingr   r.   cutlass.cuter
   r   flash_attn.cute.utilsr   r   rY   r   r   r   r   <module>   s   m