o
    iiQ                     @   s&  d dl mZ d dlZd dlZd dlmZ d dlmZm	Z	 d dl
mZmZmZmZmZmZ dd Zdd Zejejd	d
ddddddejd
d	ddddddejd
dddddddejdd
ddddddejd	d	ddddddejd	dddddddejdd	ddddddejd	dddddddejddddddddejd	d
d	dddddejd
d	d	dddddejd
dd	dddddejdd
d	dddddejd	d	d	dddddejd	dddddddejdd	ddddddejd	dddddddejddddddddge  g dee	dddeddd iejdejdejdejdejd ejdejd!ejd"ejd#ejd$ejd%ejfd&d'Z		(	)d9d*ejd+ejd,eej d-ed.ed/ejfd0d1Zejejd	d
ddddddejd
d	ddddddejd
dddddddejdd
ddddddejd	d	ddddddejd	dddddddejdd	ddddddejd	dddddddejddddddddejd	d
d	dddddejd
d	d	dddddejd
dd	dddddejdd
d	dddddejd	d	d	dddddejd	dddddddejdd	ddddddejd	dddddddejddddddddge  g dee	dddedd2d iejdejdejdejdejd ejdejd%ejfd3d4Z	(	d:d5ejd+ejd-ed6eej d/ejf
d7d8ZdS );    )OptionalN)early_config_pruneestimate_matmul_time)gelugelu_approxgelu_approx_grad	gelu_gradsquared_relusquared_relu_gradc                    s    fddS )Nc                    s   |     S )N)zero_)nargsname R/home/ubuntu/vllm_env/lib/python3.10/site-packages/flash_attn/ops/triton/linear.py<lambda>   s    zinit_to_zero.<locals>.<lambda>r   r   r   r   r   init_to_zero   s   r   c                  C   s`   g } dD ])}dD ]$}dD ]}dD ]}|dkrdnd}|  tj|||dd	||d
 qqqq| S )N)               )       )r   @   )r   r         r   r   r      BLOCK_MBLOCK_NBLOCK_KSPLIT_K
num_stages	num_warps)appendtritonConfig)configsr$   block_mblock_kblock_nr%   r   r   r   get_configs_io_bound   s*   r-   r   r   r   r   r   r      r#   r   r   r   r   )CACHE_KEY_MCACHE_KEY_NCACHE_KEY_K
   )r   
perf_modeltop_k)r)   keyprune_configs_byEVEN_Kc                 C      | d | d | d   dkS NKr!   r"   r   r   argsr   r   r   r          r   r   GROUP_Mr    r!   r"   
A_ROWMAJOR
B_COLMAJORBIASSAVE_ACT_INPUT
ACTIVATIONc           .      C   s  t jdd}|| d | }|| d | }|| }|| }t|||  |} || ||   }!|| |  }"|!| t d| }#|"| t d| }$t t |#| ||}%t t |$| ||}&t d|}'|r~||%dddf | |'dddf   }n||%dddf | |'dddf |   }|r||'dddf |&dddf |   }n||'dddf | |&dddf |   }t j||ft jd}(t|d| D ]S})|rt 	|}*t 	|}+n t j	||'dddf |)k dd}*t j	||'dddf |)k dd}+|(t 
|*|+7 }(|r||7 }n||| 7 }|r||7 }q||| 7 }q|rCt j	||$ |$|k ddt j}|(|dddf 7 }(|r`||%dddf |  |&dddf  },t |,|( |dkrjt|(}(n|d	krtt|(}(n	|d
kr}t|(}(|!| t d| }#|"| t d| }$| |#dddf |  |$dddf  } |#|k dddf |$|k dddf @ }-t | |( dS )aW  
    Kernel for computing Out = activation(A x W + C)
    - Input has shape (M, K)
    - Weight has shape (K, N)
    - Bias has shape (N,)
    - Output has shape (M, N)
    - ActInputs (optional) has shape (M, N)
    'ActInputs' optionally saves the A x W + C intermediate for backward computations
    This kernel will consolidate over K
    r   axisr   Ndtype        maskotherr   r   r	   )tl
program_idminarangemax_contiguousmultiple_ofzerosfloat32rangeloaddottostorer   r   r	   ).C	ACT_INPUTABbiasMNr:   r/   r0   r1   	stride_cm	stride_am	stride_ak	stride_bn	stride_bkr   r>   r    r!   r"   r7   r?   r@   rA   rB   rC   pidgrid_mgrid_nwidthgroup_id
group_sizepid_mpid_nrmrnramrbnrkacckabact_in_ptrsrJ   r   r   r   
kernel_fwd5   sd   ~*,*,
  

 (




((rw   idFxweightr]   
activationsave_act_inputreturnc                    s`  |dv sJ | j dd | j d }}| }| ||}|ddkr/|ddkr/| }|ddkrA|ddkrA| }|durI| nd}| j|jks]J d| j d|j |durs| j|jkssJ d| j d|j |j d |j d ksJ d	|j  d
|j  |du s|j d |j d ksJ d|j \ }	|j \}	tj f| j| jd}
|rt	|
nd} fdd}t
| |
||||dur|n|  |	 d d |	d f|
d|d|d|d|d|du|||ddk|ddkdd |s|
jg ||
j d R  S |
jg ||
j d R  |jg ||j d R  fS )a  
    Compute e = activation(x @ weight.T + bias).
    This wrapper kicks the `kernel_fwd` Triton kernel
    :param x: input tensor
    :param weight: weight matrix
    :param bias: an optional bias tensor
    :param activation: Activation name. Needs to be a Triton kernel.
    :param act_input: an optional tensor to save the activation inputs (for backward)
    :return: result tensor
    rx   r   r   r	   Nr   r   z/Input and weight must have the same dtype, got  and z-Input and bias must have the same dtype, got Incompatible dimensions:  - z2Incompatible dimensions in between weight and biasdevicerG   c                    "   t  | d t | d  fS Nr   r    r'   cdivMETAr^   r_   r   r   r   9     " z#triton_linear_act.<locals>.<lambda>r   r.   )r`   ra   rb   rd   rc   rA   rB   rC   r?   r@   r>   )shapenumelreshapestride
contiguousrG   torchemptyr   
empty_likerw   )ry   rz   r]   r{   r|   batch_shapen	batch_dim
x_reshapedr:   output	act_inputgridr   r   r   triton_linear_act  sr   

r   c                 C   r8   r9   r   r;   r   r   r   r     r=   c           *      C   s  t jdd}|| d | }|| d | }|| }|| }t|||  |}|| ||  }|| | }|| t d| }|| t d| }t t || ||} t t || ||}!t d|}"|| dddf | |"dddf |   }||"dddf | |!dddf |   }t j||ft jd}#t|d| D ]C}$|rt 	|}%t 	|}&n t j	||"dddf |$k dd}%t j	||"dddf |$k dd}&|#t 
|%|&7 }#||| 7 }||| 7 }q|dkr
|| dddf |
  |!dddf  }'t 	|'|#j}(|d	kr|#t|(9 }#n|d
kr"|#t|(9 }#n|dkr-|#t|(9 }#|| t d| }|| t d| }| |dddf |
  |dddf  } ||k dddf ||k dddf @ })t j| |#|)d dS )a=  
    Kernel for computing Out = activation(A x W + C)
    - Input has shape (M, K)
    - Weight has shape (K, N)
    - Output has shape (M, N)
    - ActInputs (optional) has shape (M, N)
    'ActInputs' optionally saves the A x W + C intermediate for backward computations
    This kernel will consolidate over K
    r   rD   r   NrF   rH   rI   rx   r   r   r	   )rJ   )rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rG   r   r   r
   rX   )*rY   rZ   r[   r\   r^   r_   r:   r/   r0   r1   r`   ra   rb   rd   rc   r   r>   r    r!   r"   r7   rC   re   rf   rg   rh   ri   rj   rk   rl   rm   rn   ro   rp   rq   rr   rs   rt   ru   rv   r   rJ   r   r   r   
kernel_bwd^  sN   x,,
  
(


((r   grad_outputr   c                    s  |dv sJ | j dd | j d }}| }| ||}|ddkr/|ddkr/| }|ddkrA|ddkrA| }| j|jksSJ d| j d|j |j d |j d ksiJ d|j  d	|j  |d
krx|dusxJ d| |j \ }|j \}tj f| j| jd}	 fdd}
t	|
 |	||| | d d |d |	d|d|d|d|d|dd |	jg ||	j d R  S )ap  
    Compute e = activation(grad_output @ weight + bias).
    This wrapper kicks the `kernel_fwd` Triton kernel
    :param grad_output: input tensor
    :param weight: weight matrix
    :param activation: Activation name. Needs to be a Triton kernel.
    :param act_input: an optional tensor to save the activation inputs (for backward)
    :return: result tensor
    r~   Nr   r   r   z5grad_output and weight must have the same dtype, got r   r   r   rx   z%act_input is required for activation r   c                    r   r   r   r   r   r   r   r   ;  r   z"triton_dgrad_act.<locals>.<lambda>r   r.   )r`   ra   rb   rd   rc   rC   r>   )
r   r   r   r   r   rG   r   r   r   r   )r   rz   r{   r   r   r   r   grad_output_reshapedr:   
grad_inputr   r   r   r   triton_dgrad_act  sP   

r   )Nrx   F)rx   N)typingr   r   r'   triton.languagelanguagerL   triton.ops.matmul_perf_modelr   r   #flash_attn.ops.triton.k_activationsr   r   r   r   r	   r
   r   r-   autotuner(   
heuristicsjit	constexprrw   Tensorstrboolr   r   r   r   r   r   r   <module>   s   ?@H !" 
\?@Hh