o
    پi                     @   sp  U d Z ddlZddlmZmZmZ ddlm  mZ	 ddl
Z
ddlmZ ddlZddlmZ ddlmZ eeZi aeeef ed< i aeeejf ed< dZd	Zd
ZdZdZdZdZ dZ!dZ"dZ#dZ$dZ%ee% Z&d	Z'dd Z(dd Z)da*dd Z+dd Z,					d-dejdejdejdejd ejd!ejd"ejd#ejd$ejd%eej d&ee- d'e.d(e-d)e-d*ejfd+d,Z/dS ).z?CuTe DSL Fused Sigmoid Gating Delta Rule Kernel for GDN Decode.    N)DictOptionalTuple)cpasync)from_dlpack_compiled_kernels_cu_seqlens_cache       $                     c               (      s  dt  d t  tjdtjdtjdtjdtjt	 dtjdtjd	tjd
tjdtjdtjdtjdtjdtjdtjt
 dtjt
 dtjt
 dtjt	 dtjt	 dtjt f& fdd} tjdtjdtjdtjdtjt	 dtjdtjd	tjd
tjdtjdtjdtjdtjdtjdtjt
 dtjt
 dtjt
 dtjt	 dtjt	 dtjt f& fdd}tjdtjdtjdtjdtjt	 dtjdtjd	tjd
tjdtjdtjdtjdtjdtjdtjt
 dtjt
 dtjt
 dtjt	 dtjt	 dtjt f&dd}tjdtjdtjdtjdtjt	 dtjdtjd	tjd
tjdtjdtjdtjdtjdtjdtjt
 dtjt
 dtjt
 dtjt	 dtjt	 dtjt f&dd}| |||fS )z;Define CuTe DSL kernels for normal and varlen decode modes.r   r
   tiled_copy_load	h0_sourcesmem_layout_stagednum_v_tilesqkvabA_logdt_biaso
h0_indicessoftplus_betasoftplus_thresholdscaleHHVuse_qk_l2normc           d   	      s`  t j \}}}|d }t j }t j|}t j \}}}|t }|t }|t }|| }|| }|| }|||  }|| }|dkr| } | }!| }"|"|! }#tj	 }$|$
tj|d}%t jtfdd}&|$
tj|&d}'t jtfdd}(t jtfdd})|$
tj|(d}*|$
tj|)d}+|tk rt||d||f |*|< t||d||f |+|< |||ddf },t |,ttfd}-| |}.ttd |}/t|/D ].}0||0 }1|0t }2|-dd|1f }3|%dd|2f }4|.|3}5|.|4}6t | |5|6 t j  qt|	| }7t|
| }8t||d|f }9t||d|f }:d	};d	}<|dkrv|9|8 }=||= }>d	}?|>|kr]t |>}@td
|@ }Att |A}Bttd
| |B }?n|=}?t |7 |? }Cd
d
t |:   }<t |C};t j|;d};t j|<d}<t j  |rRd	}Dd	}E|tk r|+| }F|*| }G|F|F }D|G|G }EdD ]}H|Dt jj|D|Hddd7 }D|Et jj|E|Hddd7 }Eq|dkr|D|'|< |E|'|d < t j  d	}Id	}J|dkr(d	}Kd	}L|k r|'| }K|'|d  }LdD ]}H|Kt jj|K|Hddd7 }K|Lt jj|L|Hddd7 }Lq|dkr(t |Kd |'d< t |Ld |'d< t j  |'d }I|'d }J|tk rL|*| |J |*|< |+| | |I |+|< t j  n|tk r_|+| | |+|< t j  t|D ]D}0||0 }1|0t }2t jd t j  |0|/ }M|M|k r||M }N|Mt }O|-dd|Nf }P|%dd|Of }Q|.|P}5|.|Q}6t | |5|6 t j  |1t |# }Rt||d||Rf }Sd	}Ttj ddD ]}U|U }V|V|  }W|%|W|#|2f |; }X|*|W }Y|T|X|Y 7 }TqdD ]}H|Tt jj|T|H ddd7 }Tq|S|T |< }Zt j|Z|!}Zd	}[tj ddD ]/}U|U }V|V|  }W|%|W|#|2f |; }\|*|W }Y|+|W }]|\|Y|Z  }^|^|%|W|#|2f< |[|^|] 7 }[qdD ]}H|[t jj|[|H ddd7 }[qH| dkrp|1t |# }_t |[||d||_f< t j  t D ]+}U||Ud  }`|`t }a|`t }b|atk r|%|a|b|2f }X|1t |b }c|X||||a|cf< qyt j  qhdS dS )z*Small batch kernel for (N, 1, ...) format.r
   r   r	      strideNr   Nr&                 ?r   r   r   r   r&      offsetmaskmask_and_clampr   r   r&   ư>r   unrollr   r   r&   !cutearch
thread_idxwarp_idxmake_warp_uniform	block_idxNUM_BLOCKS_PER_STATE_SMALLcutlassutilsSmemAllocatorallocate_tensorFloat32make_layoutTILE_V_SMALLTILE_K
local_tile	get_slicemin
NUM_STAGESrangepartition_Spartition_Dcopycp_async_commit_groupexplogshuffle_syncbarriershuffle_sync_bflyrsqrtcp_async_wait_grouprange_dynamicBFloat16dr   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   tidx_in_warp_tidr<   r>   	batch_idxbatch_innernum_v_tiles_per_blockstart_v_tilei_ni_hvi_hpool_idxk_localv_localv_basev_idxsmemsDatasmem_o_layoutsmem_osmem_k_layoutsmem_q_layoutsKsQ
gSrc_batchgSrcthr_copy_loadprefetch_countv_tile_offsetv_tilestage	gSrc_tilesData_stagethr_gSrc	thr_sDatar_A_log	r_dt_biasr_ar_br_gr_betaxbeta_x
softplus_x
exp_beta_x	log_input
log_result	r_g_valuesum_q_partialsum_k_partialq_valk_valr0   
inv_norm_q
inv_norm_klocal_sum_qlocal_sum_knext_v_tile_offsetnext_v_tile
next_stage	gSrc_next
sData_nextv_globalr_vsum_hkk_iterk_basek_idxh_valr_k_valv_newsum_hqh_oldr_q_valh_newv_global_outflat_idxk_writev_writev_global_writeNUM_K_ITERS_SMALLNUM_WARPS_SMALLROWS_PER_ITER_SMALLV_PER_WARP_SMALL Q/home/ubuntu/.local/lib/python3.10/site-packages/sglang/jit_kernel/cutedsl_gdn.pygdn_kernel_small_batch)   sh  




























 uz/_define_kernels.<locals>.gdn_kernel_small_batchc           d   	      s\  t j \}}}|d }t j }t j|}t j \}}}|t }|t }|t }|| }|| }|| }|||  }|| }|dkr| } | }!| }"|"|! }#tj	 }$|$
tj|d}%t jtfdd}&|$
tj|&d}'t jtfdd}(t jtfdd})|$
tj|(d}*|$
tj|)d}+|tk rt|d|||f |*|< t|d|||f |+|< |||ddf },t |,ttfd}-| |}.ttd |}/t|/D ].}0||0 }1|0t }2|-dd|1f }3|%dd|2f }4|.|3}5|.|4}6t | |5|6 t j  qt|	| }7t|
| }8t|||f }9t|||f }:d	};d	}<|dkrt|9|8 }=||= }>d	}?|>|kr[t |>}@td
|@ }Att |A}Bttd
| |B }?n|=}?t |7 |? }Cd
d
t |:   }<t |C};t j|;d};t j|<d}<t j  |rPd	}Dd	}E|tk r|+| }F|*| }G|F|F }D|G|G }EdD ]}H|Dt jj|D|Hddd7 }D|Et jj|E|Hddd7 }Eq|dkr|D|'|< |E|'|d < t j  d	}Id	}J|dkr&d	}Kd	}L|k r|'| }K|'|d  }LdD ]}H|Kt jj|K|Hddd7 }K|Lt jj|L|Hddd7 }Lq|dkr&t |Kd |'d< t |Ld |'d< t j  |'d }I|'d }J|tk rJ|*| |J |*|< |+| | |I |+|< t j  n|tk r]|+| | |+|< t j  t|D ]D}0||0 }1|0t }2t jd t j  |0|/ }M|M|k r||M }N|Mt }O|-dd|Nf }P|%dd|Of }Q|.|P}5|.|Q}6t | |5|6 t j  |1t |# }Rt|d|||Rf }Sd	}Ttj ddD ]}U|U }V|V|  }W|%|W|#|2f |; }X|*|W }Y|T|X|Y 7 }TqdD ]}H|Tt jj|T|H ddd7 }Tq|S|T |< }Zt j|Z|!}Zd	}[tj ddD ]/}U|U }V|V|  }W|%|W|#|2f |; }\|*|W }Y|+|W }]|\|Y|Z  }^|^|%|W|#|2f< |[|^|] 7 }[qdD ]}H|[t jj|[|H ddd7 }[qF| dkrn|1t |# }_t |[|d|||_f< t j  t D ]+}U||Ud  }`|`t }a|`t }b|atk r|%|a|b|2f }X|1t |b }c|X||||a|cf< qwt j  qfdS dS )z8Small batch kernel for varlen decode (1, N, ...) format.r
   r   r	   r%   r'   Nr)   r&   r*   r+   r,   r-   r.   r/   r   r3   r4   r   r5   r7   r8   rZ   r   r   r   gdn_kernel_small_batch_varlen  sh  




























 uz6_define_kernels.<locals>.gdn_kernel_small_batch_varlenc           ^   	   S   s(  t j \}}}|d }t j }t j|}t j \}}}|| }|| }|||  }|| }|dkr|t }|t }|t }|| }tj	 } | 
tj|d}!t jtfdd}"| 
tj|"d}#t jtfdd}$t jtfdd}%| 
tj|$d}&| 
tj|%d}'|tk rt||d||f |&|< t||d||f |'|< |||ddf }(t |(ttfd})| |}*ttd |}+t|+D ]*},|,t }-|)dd|,f }.|!dd|-f }/|*|.}0|*|/}1t | |0|1 t j  qt|	| }2t|
| }3t||d|f }4t||d|f }5d	}6d	}7|dkrb|4|3 }8||8 }9d	}:|9|krIt |9};td
|; }<tt |<}=ttd
| |= }:n|8}:t |2 |: }>d
d
t |5   }7t |>}6t j|6d}6t j|7d}7t j  |r>d	}?d	}@|tk r|'| }A|&| }B|A|A }?|B|B }@dD ]}C|?t jj|?|Cddd7 }?|@t jj|@|Cddd7 }@q|dkr|?|#|< |@|#|d < t j  d	}Dd	}E|dkrd	}Fd	}G|tk r|#| }F|#|d  }GdD ]}C|Ft jj|F|Cddd7 }F|Gt jj|G|Cddd7 }Gq|dkrt |Fd |#d< t |Gd |#d< t j  |#d }D|#d }E|tk r8|&| |E |&|< |'| | |D |'|< t j  n|tk rK|'| | |'|< t j  t|D ]<},|,t }-t jd t j  |,|+ }H|H|k r|Ht }I|)dd|Hf }J|!dd|If }K|*|J}0|*|K}1t | |0|1 t j  |,t | }Lt||d||Lf }Md	}Ntj t!ddD ]}O|Ot" }P|P| }Q|!|Q||-f |6 }R|&|Q }S|N|R|S 7 }NqdD ]}C|Nt jj|N|Ct ddd7 }Nq|M|N |7 }Tt j|T|}Td	}Utj t!ddD ]/}O|Ot" }P|P| }Q|!|Q||-f |6 }V|&|Q }S|'|Q }W|V|S|T  }X|X|!|Q||-f< |U|X|W 7 }UqdD ]}C|Ut jj|U|Ct ddd7 }Uq,|dkrT|,t | }Yt#|U||d||Yf< t j  tt!D ]+}O||Od  }Z|Zt }[|Zt }\|[tk r|!|[|\|-f }R|,t |\ }]|R||||[|]f< q]t j  qTdS dS )z*Large batch kernel for (N, 1, ...) format.r
   r   r	   r%   r'   Nr)   r&   r*   r+   r,   r-   r.   r/   r   r7   r4   r5   r   $r9   r:   r;   r<   r=   r>   
V_PER_WARPr@   rA   rB   rC   rD   rE   TILE_VrG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rU   NUM_WARPS_LARGErV   rW   rX   NUM_K_ITERSROWS_PER_ITERrY   ^r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r[   r\   r]   r<   r^   rb   rc   rd   re   rf   rg   rh   ri   rj   rk   rl   rm   rn   ro   rp   rq   rr   rs   rt   ru   rw   rx   ry   rz   r{   r|   r}   r~   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r0   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   gdn_kernel_large_batch  sN  




























 tz/_define_kernels.<locals>.gdn_kernel_large_batchc           ^   	   S   s$  t j \}}}|d }t j }t j|}t j \}}}|| }|| }|||  }|| }|dkr|t }|t }|t }|| }tj	 } | 
tj|d}!t jtfdd}"| 
tj|"d}#t jtfdd}$t jtfdd}%| 
tj|$d}&| 
tj|%d}'|tk rt|d|||f |&|< t|d|||f |'|< |||ddf }(t |(ttfd})| |}*ttd |}+t|+D ]*},|,t }-|)dd|,f }.|!dd|-f }/|*|.}0|*|/}1t | |0|1 t j  qt|	| }2t|
| }3t|||f }4t|||f }5d	}6d	}7|dkr`|4|3 }8||8 }9d	}:|9|krGt |9};td
|; }<tt |<}=ttd
| |= }:n|8}:t |2 |: }>d
d
t |5   }7t |>}6t j|6d}6t j|7d}7t j  |r<d	}?d	}@|tk r|'| }A|&| }B|A|A }?|B|B }@dD ]}C|?t jj|?|Cddd7 }?|@t jj|@|Cddd7 }@q|dkr|?|#|< |@|#|d < t j  d	}Dd	}E|dkrd	}Fd	}G|tk r|#| }F|#|d  }GdD ]}C|Ft jj|F|Cddd7 }F|Gt jj|G|Cddd7 }Gq|dkrt |Fd |#d< t |Gd |#d< t j  |#d }D|#d }E|tk r6|&| |E |&|< |'| | |D |'|< t j  n|tk rI|'| | |'|< t j  t|D ]<},|,t }-t jd t j  |,|+ }H|H|k r|Ht }I|)dd|Hf }J|!dd|If }K|*|J}0|*|K}1t | |0|1 t j  |,t | }Lt|d|||Lf }Md	}Ntj t!ddD ]}O|Ot" }P|P| }Q|!|Q||-f |6 }R|&|Q }S|N|R|S 7 }NqdD ]}C|Nt jj|N|Ct ddd7 }Nq|M|N |7 }Tt j|T|}Td	}Utj t!ddD ]/}O|Ot" }P|P| }Q|!|Q||-f |6 }V|&|Q }S|'|Q }W|V|S|T  }X|X|!|Q||-f< |U|X|W 7 }UqdD ]}C|Ut jj|U|Ct ddd7 }Uq*|dkrR|,t | }Yt#|U|d|||Yf< t j  tt!D ]+}O||Od  }Z|Zt }[|Zt }\|[tk r|!|[|\|-f }R|,t |\ }]|R||||[|]f< q[t j  qRdS dS )z8Large batch kernel for varlen decode (1, N, ...) format.r
   r   r	   r%   r'   Nr)   r&   r*   r+   r,   r-   r.   r/   r   r7   r4   r5   r   r   r   r   r   r   gdn_kernel_large_batch_varlen  sN  




























 tz6_define_kernels.<locals>.gdn_kernel_large_batch_varlen)rF   rG   r9   kernel	TiledCopyTensorLayoutr@   	Constexprintfloatbool)r   r   r   r   r   r   r   _define_kernels!   sZ  	
 i	
 i	
 Z	
 [r   c               /      sB  t  \ tjdtjdtjdtjdtjdtjdtjdtjdtjd	tjd
tjdtjdtjt dtjt dtjt dtjt dtjt dtjt dtjt dtjt dtjt dtjt dtjt dt	j
f.fdd} tjdtjdtjdtjdtjdtjdtjdtjdtjd	tjd
tjdtjdtjt dtjt dtjt dtjt dtjt dtjt dtjt dtjt dtjt dtjt dtjt dt	j
f.fdd}tjdtjdtjdtjdtjdtjdtjdtjdtjd	tjd
tjdtjdtjt dtjt dtjt dtjt dtjt dtjt dtjt dtjt dtjt dtjt dtjt dt	j
f. fdd}tjdtjdtjdtjdtjdtjdtjdtjdtjd	tjd
tjdtjdtjt dtjt dtjt dtjt dtjt dtjt dtjt dtjt dtjt dtjt dtjt dt	j
f.fdd}| |||fS ) z?Create JIT-compiled launcher functions for all kernel variants.
cu_seqlensr   r   r   r   r   r   r   r   r   r   r   r    r!   BTr"   r#   KVuse_initial_stater$   streamc           $           |j j\}}}}|	j jd }|| }tjtjtjjdtj	dd}t
|t}tjtttftdtt fd}tjddd} td	}!t|| |!}"d
t t t d
t  d
t d  d }# |"|||||||||||
|	||||||j|t ddftddg|#|d d S Nr   
cache_moder	   num_bits_per_copyr&   r'   )r
   r   )r   r&   r&   r   r   r   @   gridblockrj   r   layoutshaper9   make_copy_atomr   	CopyG2SOpLoadCacheModeGLOBALr@   rD   ceil_divrF   rE   rG   rK   TILE_V_SMALL_PADDEDmake_tiled_copy_tvlaunchr?   NUM_THREADS$r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r   r   r"   r#   r   r   r   r$   r   	pool_sizehv_dimk_dimv_dim	n_indices
batch_size	copy_atomnum_v_tiles_smallsmem_layout_smallthread_layout_smallval_layout_smalltiled_copy_load_smallsmem_bytes_small)	gdn_smallr   r   run_small_batch  j   


z._create_jit_functions.<locals>.run_small_batchc           $         r   r   r   r   )gdn_small_varlenr   r   run_small_batch_varlen  r   z5_create_jit_functions.<locals>.run_small_batch_varlenc           $           |j j\}}}}|	j jd }|| }tjtjtjjdtj	dd}t
|t}tjtttftdtt fd}tjddd} td	}!t|| |!}"d
t t t d
t  d
t d  d }# |"|||||||||||
|	||||||j|ddftddg|#|d d S Nr   r   r	   r   r&   r'   )r
   r   )r   r&   r   r   r   r   r   r   r   r9   r   r   r   r   r   r@   rD   r   r   rE   rG   rK   TILE_V_PADDEDr   r   NUM_THREADS_LARGE$r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r   r   r"   r#   r   r   r   r$   r   r   r   r   r   r   r   r   r   base_smem_layoutthread_layout
val_layoutr   
smem_bytes)	gdn_larger   r   run_large_batchZ  Z   
&
z._create_jit_functions.<locals>.run_large_batchc           $         r   r   r   r   )gdn_large_varlenr   r   run_large_batch_varlen  r  z5_create_jit_functions.<locals>.run_large_batch_varlen)r   r9   jitr   r@   r   r   r   r   cudaCUstream)r   r   r   r  r   )r   r  r   r   r   _create_jit_functions  s  	
N	
N	
I	
Jr  c                   C   s   t d u rt a t S )N)_jit_functionsr  r   r   r   r   _get_jit_functions  s   r	  c           +      C   s  | |||||||f}|t v rt | S tj| d tjdd}	|rdtjd| ||tjdd}
tjd| ||tjdd}tjd| ||tjdd}tj| |tjdd}tj| |tjdd}tjd| ||tjdd}nFtj| d||tjdd}
tj| d||tjdd}tj| d||tjdd}tj| d|tjdd}tj| d|tjdd}tj| d||tjdd}tj|tjdd}tj|tjdd}tj||||tjdd}tj| tjdd}t|	dd}t|
dd}t|dd}t|dd}t|dd}t|dd}t|dd}t|dd}t|dd}t|dd}t|dd}ttj	 j
}t \} }!}"}#|r.|r+|!n| }$n|r3|#n|"}$|d }%d}&d}'|rBdn| }(|rI| nd})tj|$|||||||||||f|&|'|%|(|)||||d	d	|d
}*|*t |< td|  d| d| d| d| d| d| d|  |*S )z/Get or compile the kernel for given dimensions.r&   r  dtypedevicer   assumed_align      r+         4@T)r   r    r!   r   r   r"   r   r   r#   r   r$   r   z CuTe DSL GDN kernel compiled: N=z, H=z, HV=z, K=z, V=z, pool_size=z, small_batch=z	, varlen=)r   torchzerosint32bfloat16float32r   r  r  current_streamcuda_streamr	  r9   compileloggerinfo)+Nr"   r#   r   r   r   use_small_batchis_varlen_decodekeyr   r   r   r   r   r   r   r   r   r   r   cu_seqlens_tensorq_tensork_tensorv_tensora_tensorb_tensorA_log_tensordt_bias_tensorh0_source_tensorh0_indices_tensoro_tensorr   	run_smallrun_small_varlen	run_largerun_large_varlenkernel_funcr!   r   r    	B_compile	T_compilecompiled_kernelr   r   r   _get_compiled_kernel   s   2r2  Tr+   r  r   r   r   r   r   r   r   initial_state_sourceinitial_state_indicesr   r!   use_qk_l2norm_in_kernelr   r    returnc           )      C   s"  |j \}}}}|j d }|j d }|j d }|dko!||ko!|dk}|
du r*|d }
|tk }| dkrG| || |  }|||||}n| dkrU|j d }|}ntd|j  |r| dkrj|d}| dkru|d}|jd|||tj	d	}n!| dkr|
d}| dkr|
d}|j|d||tj	d	}d
d |||fD \}}}|	dur|	}n|t|jf}|tvrtj|d tj|jdt|< t| }t| ddjdd}t| ddj|jd d}t| ddj|jd d}t| ddj|jd d}t| ddj|jd d} t| ddj|jd d}!t|  ddjdd}"t| ddjdd}#t| ddj|jd d}$t| ddjdd}%t| ddj|jd d}&ttj j}'t||||||||}(|(||||| |!|"|#|$|%|&|' |S )zBCuTe DSL implementation of fused sigmoid gating delta rule update.r      r   r&   Nr  r   z'Unexpected initial_state_source shape: )r  c                 S   s   g | ]}|  qS r   )
contiguous).0tr   r   r   
<listcomp>  s    zBcutedsl_fused_sigmoid_gating_delta_rule_update.<locals>.<listcomp>r
  r   r  )leading_dim)r   SMALL_BATCH_THRESHOLDdimnumelview
ValueErrorsqueeze	new_emptyr  r  	unsqueezestrr  r   aranger  r   detachmark_layout_dynamicndimr  r  r  r  r2  ))r   r   r   r   r   r   r   r3  r4  r   r!   r5  r   r    B_qT_qr"   r   r#   r   r  r  r  r   r   r   cu_seqlens_to_use	cache_keyr  r   r!  r"  r#  r$  r%  r&  r'  r(  r)  r   r1  r   r   r   .cutedsl_fused_sigmoid_gating_delta_rule_update]  s   










rN  )NNTr+   r  )0__doc__loggingtypingr   r   r   cuda.bindings.driverbindingsdriverr  r@   cutlass.cuter9   r  cutlass.cute.nvgpur   cutlass.cute.runtimer   	getLogger__name__r  r   object__annotations__r   r   rG   r   r   rF   r   rK   r   r?   r   r   r   r   r   r=  r   r  r  r	  r2  r   r   rN  r   r   r   r   <module>   s    
         Ag	
