o
    پiUJ                  ,   @   s  d dl mZ d dlZd dlZd dlmZ d dlmZ dd Zdd Z	dd	 Z
eje
 g d
dejdejdejdejdejdejdejdejdejdejdejdejdejfddZ													d=dedededed ee d!ee d"ee d#ee d$ed%ee d&eej d'eej d(ed)ed*ed+ee d,ee d-eeeeeeeeff$d.d/Z										d>dedee deded+ed ee d!ee d"ee d#ee d$ed%ee d(ed)ed*ed,ee d-eeeeeeff d0d1ZG d2d3 d3Z					4										d?d5d6Zejdejdejdejdejfd7d8Z		d@dedee dee ded)ed+ee fd9d:Z					4									dAd;d<ZdS )B    )OptionalN)Tensorc                 C   s"   | d ur|  ddkr|  S | S )N   )stride
contiguousx r
   [/home/ubuntu/.local/lib/python3.10/site-packages/sglang/jit_kernel/diffusion/triton/norm.pymaybe_contiguous_lastdim
   s   "r   c                 C   s   | d ur|   S d S N)r   r   r
   r
   r   maybe_contiguous   s   r   c                     sF   g } d t t t  ddd u rd fdddD S )Ni   	warp_size    c                    s&   g | ]}|  krt ji |d qS ))	num_warps)tritonConfig).0
warp_countmax_threads_per_blockr   r
   r   
<listcomp>"   s
    z+triton_autotune_configs.<locals>.<listcomp>)r               r   )getattrtorchget_device_moduleget_device_propertiescurrent_device)configsr
   r   r   triton_autotune_configs   s   
r#   )	NHAS_RESIDUALSTORE_RESIDUAL_OUTIS_RMS_NORMHAS_BIAS
HAS_WEIGHTHAS_X1HAS_W1HAS_B1)r"   keyr'   BLOCK_Nr%   r&   r)   r(   HAS_DROPOUTSTORE_DROPOUT_MASKHAS_ROWSCALEr*   r+   r,   c'           :      C   s  t d}'| |'| 7 } ||'| 7 }|r||'| 7 }|r!|	|'| 7 }	|$r)||'| 7 }|%r1||'| 7 }t d|}(t j| |( |(|k ddt j})|#rXt |
|' t j}*|)|*9 })|!rt jt ||' t j|(dd|k}+t |+|)d|  d})|"rt j	||'|  |( |+|(|k d |$rt j||( |(|k ddt j},|#rt |
| |' t j}*|,|*9 },|!rt jt || |' t j|(dd|k}+t |+|,d|  d},|"rt j	||'|  |( |+|(|k d |)|,7 })|rt j||( |(|k ddt j}-|)|-7 })|rt j	|	|( |)|(|k d |s7t j
|)dd| }.t 	||' |. t |(|k |)|. d}/t j
|/|/ dd| }0nt |(|k |)d}/t j
|/|/ dd| }0d	t |0|  }1t 	||' |1 |(|k }2|rwt j||( |2dt j}3|rw|3d7 }3| rt j||( |2dt j}4|s|)|. |1 n|)|1 }5|r| r|5|3 |4 n|5|3 }6n	| r|5|4 n|5}6t j	||( |6|2d |%rt j||( |2dt j}7|r|7d7 }7|&rt j||( |2dt j}8|&r|5|7 |8 n|5|7 }9t j	||( |9|2d d S d S )
Nr           maskother   )n_rounds      ?r4   axisr   )tl
program_idarangeloadtofloat32randuint32wherestoresumsqrt):XYWBRESIDUALX1W1B1Y1RESIDUAL_OUTROWSCALESEEDSDROPOUT_MASKDROPOUT_MASK1MeanRstdstride_x_rowstride_y_rowstride_res_rowstride_res_out_rowstride_x1_rowstride_y1_rowMr$   eps	dropout_pzero_centered_weightr'   r.   r%   r&   r)   r(   r/   r0   r1   r*   r+   r,   rowcolsr	   rowscale	keep_maskx1residualmeanxbarvarrstdr4   wbx_hatyw1b1y1r
   r
   r   _layer_norm_fwd_1pass_kernel+   s   
? $  $  rs   r2   Fr	   weightbiasr_   rg   rf   weight1bias1r`   rd   	out_dtyperesidual_dtypera   is_rms_normreturn_dropout_maskoutresidual_outreturnc                 C   s   |d u rt j| |
d u r| jn|
d}|d ur|j}|d u rE|d us6|d ur*|| jks6|dks6|	d us6|d urEt j| |d ur?|n| jd}nd }t| ||||||||||	||||d\}}}}}}|d u rf| }||||||||fS )N)dtyper2   )
rg   rf   rv   rw   r`   rd   ra   rz   r{   r}   )r   
empty_liker   _layer_norm_fwd_impl)r	   rt   ru   r_   rg   rf   rv   rw   r`   rd   rx   ry   ra   rz   r{   r|   r}   rr   rh   rk   seedsdropout_maskdropout_mask1r
   r
   r   _layer_norm_fwd   sD   r   c                 C   s>  | j \}}| ddksJ |d ur$|ddksJ |j ||fks$J |d ur9|j |fks0J |ddks9J |d urN|ddksFJ |j |fksNJ |d uri|j | j ksZJ |
d u s`J |ddksiJ |d ur~|j |fksuJ |ddks~J |d ur|j |fksJ |ddksJ |
d ur|
 sJ |
j |fksJ |j | j ksJ |ddksJ |d ur|j | j ksJ |ddksJ |d urt|}|ddksJ nd }|stj|ftj| jdnd }tj|ftj| jd}|	dkrtjd|d u r	|nd| f| jtj	d}nd }|r>|	dkr>tj||| jtj
d}|d ur;tj||| jtj
d}nd }nd\}}d	|   }t|t|}||krYtd
t | jj tjt|f g | ||d urw|n| ||||||||
|||||| d|d|d ur|dnd|d ur|dnd|d ur|dnd|d ur|dnd||||	t||||d u|d u|d u|d u|	dk|d u|
d uR |d u|d u|d ud W d    n	1 sw   Y  ||||||fS )Nr   r   )r   devicer2   l        r   )r   r   )NN   4This layer norm doesn't support feature dim >= 64KB.r   )r*   r+   r,   )shaper   is_contiguousr   r   emptyrA   r   randintint64boolelement_sizeminr   next_power_of_2RuntimeErrorr   indexlibrarywrap_tritonrs   int)r	   rt   ru   r_   r|   rg   rf   rv   rw   r`   rd   ra   rz   r{   r}   r^   r$   rr   rh   rk   r   r   r   MAX_FUSED_SIZEr.   r
   r
   r   r      s
  


 

	
 !"#$%&
+r   c                   @   s8   e Zd Ze															dddZdS )LayerNormFnNư>r2   Fc                 C   st  | j }t| d| j d } |d ur$|j |ksJ t|d|j d }|d urB|j |ks/J |	d u s7J dt|d|j d }|d urJ| }t|}t|}t|}|	d ura|	d }	|d urh|jn|rmtjnd }|d ur||d|j d }|d ur|d|j d }t| |||||||||	|||||||d\}}}}}}}}||}|d ur||}||fS |S )Nr   z1rowscale is not supported with parallel LayerNorm)	r`   rd   rx   ry   ra   rz   r{   r|   r}   )	r   r   reshaper   r   r   r   rA   r   )r	   rt   ru   rg   rf   rv   rw   r_   r`   rd   prenormresidual_in_fp32ra   rz   r{   rx   r|   r}   
x_shape_ogry   ro   rr   rh   rk   r   r   r   r
   r
   r   forwardw  sf   

zLayerNormFn.forwardNNNNr   r2   NFFFFFNNN)__name__
__module____qualname__staticmethodr   r
   r
   r
   r   r   u  s$    r   r   c                 C   s,   t | |||||||||	|
|||||||S r   r   r   )r	   rt   ru   rg   rf   rv   rw   r_   r`   rd   r   r   ra   rz   r{   rx   r|   r}   r
   r
   r   layer_norm_fn  s(   r   c                 C   sv  t d}| || 7 } ||| 7 }|
r|d7 }|r|d7 }t d|}t j| | ||k ddt j}|	sUt j|dd| }t ||k || d}t j|| dd| }nt ||k |d}t j|| dd| }dt ||  }|	sz|| | n|| }|
rt j|| ||k ddt j}|| }n|}|rt j|| ||k ddt j}||7 }t j	|| |||k d d S )Nr   r2   r3   r:   r   r8   r9   )
r<   r=   r>   r?   r@   rA   rF   rD   rG   rE   )rH   rI   rJ   rK   rX   rY   r^   r$   r_   r'   r)   r(   r.   rb   rc   r	   rh   ri   rj   rk   rn   rl   ro   rm   r
   r
   r   _norm_infer_kernel  s2   
  
 r   c                 C   s  | j \}}|  } |d ur|j |fksJ |ddksJ |d ur3|j |fks*J |ddks3J |d u r<t| }d|   }t|t|}	||	krRt	dtt
|	d dd}
t|f | ||d urh|n| |d uro|n| | d|d|||||d u|d u|	|
d |S )	Nr   r   r   r      r   r   )r'   r)   r(   r.   r   )r   r   r   r   r   r   r   r   r   r   maxr   )r	   rt   ru   r_   rz   r|   r^   r$   r   r.   r   r
   r
   r   
norm_infer  s@   

r   c                 C   s,   t | |||||||||	|
||d||||S )NTr   )r	   rt   ru   rg   rf   rv   rw   r_   r`   rd   r   r   ra   r{   rx   r|   r}   r
   r
   r   rms_norm_fnF  s(   r   )NNNNr2   NNNFFFNN)
NNNNr2   NFFFNr   )FN)NNNNr   r2   NFFFFNNN)typingr   r   r   triton.languagelanguager<   r   r   r   r#   autotunejit	constexprrs   floatr   r   r   r   r   r   r   r   r   r
   r
   r
   r   <module>   sz    !"#$%&' 	

E	

zT
*
1
.