o
    پim                 <   @   sp  d Z 	 ddlZddlmZmZ ddlZddlZddlmZ ddl	m
Z
 ddlmZ ddlm  mZ z
ddlmZ dZW n eyK   d	Zd
d ZY nw dZdZdZdZdZdZdZdZdZdZdZdZ dZ!dZ"dZ#dZ$dZ%ee% Z&dZ'dZ(dZ)ddde*de*de*fddZ+ddde*de*de*fddZ,ej-dej.dej/dej0dej1e* d ej1e* d!ej/d"ej/d#ej/d$ej/d%ej/d&ej/d'ej/d(ej/d)ej/d*ej/d+ej1e2 d,ej1e2 d-ej1e2 d.ej1e* d/ej1e* d0ej1e* d1ej1e* d2ej1e* d3ej1e* d4ej1e3 d5ej1e3 d6ej1e3 f6d7d8Z4ej-dej.dej/dej0dej1e* d ej1e* d!ej/d"ej/d#ej/d$ej/d%ej/d&ej/d'ej/d(ej/d)ej/d*ej/d+ej1e2 d,ej1e2 d-ej1e2 d.ej1e* d/ej1e* d0ej1e* d1ej1e* d2ej1e* d3ej1e* d4ej1e3 d5ej1e3 d6ej1e3 f6d9d:Z5ej6dej/d!ej/d"ej/d#ej/d$ej/d%ej/d&ej/d'ej/d(ej/d)ej/d*ej/d+ej1e2 d,ej1e2 d-ej1e2 d.ej1e* d/ej1e* d0ej1e* d1ej1e* d2ej1e* d3ej1e* d4ej1e3 d5ej1e3 d6ej1e3 d;ej7f0d<d=Z8ej6dej/d!ej/d"ej/d#ej/d$ej/d%ej/d&ej/d'ej/d(ej/d)ej/d*ej/d+ej1e2 d,ej1e2 d-ej1e2 d.ej1e* d/ej1e* d0ej1e* d1ej1e* d2ej1e* d3ej1e* d4ej1e3 d5ej1e3 d6ej1e3 d;ej7f0d>d?Z9ej:d/e*d0e*d1e*d.e*d2e*d3e*d@ej;d-e2d5e3fdAdBZ<ej:d/e*d0e*d1e*d.e*d2e*d3e*d@ej;d-e2d5e3fdCdDZ=e			ded$ej/d%ej/d&ej/dEej/d!ej/d"ej/d#ej/d'ej/d-ee2 dFeej/ d5e3deej/ej/f fdGdHZ>ej-dej.dej/dej0d ej1e* d$ej/d%ej/d&ej/d"ej/d'ej/d!ej/d#ej/d(ej/d)ej/d+ej1e2 d,ej1e2 d-ej1e2 d1ej1e* d.ej1e* d5ej1e3 f&dIdJZ?ej-dej.dej/dej0d ej1e* d$ej/d%ej/d&ej/d"ej/d'ej/d!ej/d#ej/d(ej/d)ej/d+ej1e2 d,ej1e2 d-ej1e2 d1ej1e* d.ej1e* d5ej1e3 f&dKdLZ@ej6d*ej/d$ej/d%ej/d&ej/d"ej/d'ej/d!ej/d#ej/dej/d)ej/d(ej/d+ej1e2 d,ej1e2 d-ej1e2 d/ej1e* d0ej1e* d1ej1e* d.ej1e* d2ej1e* d3ej1e* d4ej1e3 d5ej1e3 d;ej7f.dMdNZAej6d*ej/d$ej/d%ej/d&ej/d"ej/d'ej/d!ej/d#ej/dej/d)ej/d(ej/d+ej1e2 d,ej1e2 d-ej1e2 d/ej1e* d0ej1e* d1ej1e* d.ej1e* d2ej1e* d3ej1e* d4ej1e3 d5ej1e3 d;ej7f.dOdPZBe			ded$ej/d%ej/d&ej/dEej/d!ej/d"ej/d#ej/d'ej/d-ee2 dFeej/ d5e3deej/ej/f fdQdRZCej-dej/dSej/dej1e* d ej1e* dTej1e* d!ej/d"ej/d#ej/d$ej/d%ej/d&ej/d'ej/d(ej/d)ej/d*ej/d+ej1e2 d,ej1e2 d-ej1e2 d.ej1e* d/ej1e* d0ej1e* d1ej1e* d2ej1e* d3ej1e* d4ej1e3 d5ej1e3 d6ej1e3 dUej1e3 dVej1e3 f:dWdXZDej6dej/dSej/d!ej/d"ej/d#ej/d$ej/d%ej/d&ej/d'ej/d(ej/d)ej/d*ej/d+ej1e2 d,ej1e2 d-ej1e2 d.ej1e* d/ej1e* d0ej1e* d1ej1e* d2ej1e* d3ej1e* dTej1e* dej1e* d4ej1e3 d5ej1e3 d6ej1e3 dUej1e3 dVej1e3 d;ej7f:dYdZZEej:d/e*d0e*d1e*d.e*d2e*d3e*d[e*d\e*dUe3dVe3d-e2d5e3dTe*de*fd]d^ZFe					dfd$ej/d%ej/d&ej/d_ej/d`ej/d!ej/d"ej/d#ej/d'ej/d-ee2 dFeej/ daeej/ dUe3d5e3deej/ej/f fdbdcZGdS )ga3  
Copyright (c) 2025 by FlashInfer team.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
    N)OptionalTuple)cpasync)from_dlpack   )flashinfer_apiTFc                 C   s   | S )N )funcr   r   I/home/ubuntu/.local/lib/python3.10/site-packages/flashinfer/gdn_decode.pyr   ;   s   r                $               
batch_sizeseq_lenreturnc                 C   s   dS )zSelect vec_size for MTP kernel.

    Always use vec_size=4 (32 threads per group = full warp, 4 groups per block).
    Full warp shuffle is more efficient and achieves >= 1.0x speedup vs Triton.
    r   r   r   r   r   r   r
   get_vec_size_mtpi   s   r   c                 C   s4   | dkrdS | dkrdS | dkrdS | dkrdS dS )zSelect optimal TILE_V for MTP kernel based on batch size and sequence length.

    With vec_size=4, num_groups=4, rows_per_group = tile_v / 4.
    Tuned via grid search for optimal performance.
    r   r   r   r   r   @   r   r   r   r   r
   get_tile_v_mtpr   s   r   tiled_copy_load	h0_sourcesmem_layout_stagedvec_sizenum_v_tilesA_logadt_biasqkvbo
h0_indices
cu_seqlenssoftplus_betasoftplus_thresholdscaleHVBTHKVuse_initial_stateuse_qk_l2norm	is_varlenc           a   
   C   s  t j \}}}|d }t j }t j|}t j \}}}|t } |t }!|t }"| | }#| | }$|$||  }%d}&t||$ }'t||#|&|$f }(t||$ })t||#|&|$f }*tj	
 }+|+tj|d},|+tjt |fd}-|+tjt |fd}.t t j|fddtj}/t t j|fddtj}0t t j|fddtj}1t t j|fddtj}2t t j|fddtj}3t t j|fddtj}4|| }5t j  || ddf }6t |dttf| ddf}7t |6ttfd	}8| |}9|!|" }:ttd |"};t|:|:|; D ]-}<|<|: t }=|8dd|<f }>|,dd|=f }?|9|>}@|9|?}At | |@|A t j  qt |ddd|f|#|&|%|f}Bt |	ddd|f|#|&|%|f}Ct |B|2 t |C|3 t|D ]}Dt|2|D |0|D< t|3|D |/|D< qkt |
ddd|f|#|&|$|f}Et |E|4 t|D ]}Dt|4|D |.|5|D < qt j  d
}Fd
}G|dkr|(|) }H||H }Id
}J|I|krt j|Idd}Ktd|K }Ltt j|Ldd}Mttd| |M }Jn|H}Jt j|'dd |J }Nddt j|* dd  }Gt j|Ndd}Ft j|Fd}Ft j|Gd}G|rd
}Od
}Pt|D ]}D|O|0|D |0|D  7 }O|P|/|D |/|D  7 }Pq%dD ]}Q|Ot jj |O|Qddd7 }O|Pt jj |P|Qddd7 }Pq?t j!|Od dd}Rt j!|Pd dd}St|D ]}D|0|D |R |0|D< |/|D |S |/|D< qrt|D ]}D|0|D | |0|D< q|:|" }Tt|:|TD ]}<|<|: t }=t j"d t j  |<|; }U|U|Tk r|U|: t }V|8dd|Uf }W|,dd|Vf }X|9|W}@|9|X}At | |@|A t j  tdtdD ]}Y|d }Zd
}[t |,d|df|Y|Z ||=f}\t |\|1 t|D ]}D|1|D |F |1|D< |[|1|D |/|D  7 }[qdD ]}Q|[t jj |[|Qddd7 }[q(|.|<t |Y |Z  |[ }]|]|G }]d
}^t|D ]}D|1|D  |/|D |] 7  < |^|1|D |0|D  7 }^qOt |7dd|dfd|Y|Z ||<f}_t |1|_ dD ]}Q|^t jj |^|Qddd7 }^q|<t |Y |Z }`|dkr|`|k rt|^|-|`< qqt j  ||:t kr||Tt k r|-| ||#|&|$|f< dS dS dS zCEach block uses pipeline to load one batch and vectorized writebackr   r   r   r   r   strideNr   )Nr           Tfastmath      ?r   r   r   r   r      offsetmaskmask_and_clampư>r   )#cutearch
thread_idxwarp_idxmake_warp_uniform	block_idxNUM_BLOCKS_PER_STATEcutlassFloat32utilsSmemAllocatorallocate_tensorBFloat16make_layoutmake_rmem_tensorbarrier
local_tileTILE_VTILE_K	get_slicemin
NUM_STAGESrangepartition_Spartition_Dcopycp_async_commit_groupautovec_copyrange_constexprexplogshuffle_syncshuffle_sync_bflyrsqrtcp_async_wait_group)ar   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   tidx_lane_idrI   rK   	batch_idxbatch_innernum_v_tiles_per_blocki_ni_hvi_hi_tr_A_logr_a	r_dt_biasr_bsmemsDatasOutputsVr_kr_qr_hr_q_bf16r_k_bf16r_v_bf16k_start
gSrc_batchgDstgSrcthr_copy_loadstart_v_tilesprefetch_countv_tilesstage	gSrc_tilesData_stagethr_gSrc	thr_sDataq_tilek_tileiv_tiler_gr_betaxbeta_x
softplus_x
exp_beta_x	log_input
log_result	r_g_valuesum_qsum_krB   
inv_norm_q
inv_norm_kend_v_tilesnext_v_tiles
next_stage	gSrc_next
sData_nextrow
row_offsetsum_hk
sData_tilev_newsum_hq	gDst_tileo_idxr   r   r
   *gdn_decode_kernel_small_batch_pretranspose   s<   















.r   c           \   
   C   sP  t j \}}}|d }t j }t j|}t j \}}}|| } || }!|!||  }"d}#t||! }$t|| |#|!f }%t||! }&t|| |#|!f }'tj	 }(|(
tj|d})|(
tjt |fd}*|(
tjt |fd}+t t j|fddtj},t t j|fddtj}-t t j|fddtj}.t t j|fddtj}/t t j|fddtj}0t t j|fddtj}1|| }2t j  ||ddf }3t |dttf|ddf}4t |3ttfd	}5| |}6ttd |}7t|7D ]*}8|8t }9|5dd|8f }:|)dd|9f };|6|:}<|6|;}=t | |<|= t j  qt |ddd|f| |#|"|f}>t |	ddd|f| |#|"|f}?t |>|/ t |?|0 t|D ]}@t|/|@ |-|@< t|0|@ |,|@< qUt |
ddd|f| |#|!|f}At |A|1 t|D ]}@t|1|@ |+|2|@ < qt j  d
}Bd
}C|dkr|%|& }D||D }Ed
}F|E|krt j|Edd}Gtd|G }Htt j|Hdd}Ittd| |I }Fn|D}Ft j|$dd |F }Jddt j|' dd  }Ct j|Jdd}Bt j|Bd}Bt j|Cd}C|rpd
}Kd
}Lt|D ]}@|K|-|@ |-|@  7 }K|L|,|@ |,|@  7 }LqdD ]}M|Kt jj|K|Mddd7 }K|Lt jj|L|Mddd7 }Lq)t j |Kd dd}Nt j |Ld dd}Ot|D ]}@|-|@ |N |-|@< |,|@ |O |,|@< q\t|D ]}@|-|@ | |-|@< qut|D ]	}8|8t }9t j!d t j  |8|7 }P|P|k r|Pt }Q|5dd|Pf }R|)dd|Qf }S|6|R}<|6|S}=t | |<|= t j  tdtdD ]}T|d }Ud
}Vt |)d|df|T|U ||9f}Wt |W|. t|D ]}@|.|@ |B |.|@< |V|.|@ |,|@  7 }VqdD ]}M|Vt jj|V|Mddd7 }Vq	|+|8t |T |U  |V }X|X|C }Xd
}Yt|D ]}@|.|@  |,|@ |X 7  < |Y|.|@ |-|@  7 }Yq0t |4dd|dfd|T|U ||8f}Zt |.|Z dD ]}M|Yt jj|Y|Mddd7 }Yqc|8t |T |U }[|dkr|[|k rt|Y|*|[< qϐqt j  ||k r|*| || |#|!|f< dS dS r6   )"rF   rG   rH   rI   rJ   rK   rM   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   rZ   r[   r\   r]   r^   r_   r`   ra   rb   rc   rd   re   rf   rg   rh   )\r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   ri   rj   rk   rI   rl   ro   rp   rq   rr   rs   rt   ru   rv   rw   rx   ry   rz   r{   r|   r}   r~   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rB   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r
   (gdn_decode_kernel_big_batch_pretranspose  s2   















.
r   streamc           #      C   s8  | j jd | j jd | j jd }}}tjtjtjjdtj	dd}tj
ddd	}t
d
}t|||}t|t}|| | d d d  td } tj
tttftdtt fd	}!dt t t d|  d|  d }"t|| |!| ||||||||||	|
||||||||||||j|t ddftddg|"|d dS )z>Launch original pipelined kernel for small batch pretranspose.r   r   r   
cache_moder   num_bits_per_copyr   r   r   r   r8   r   r   r      r   gridblockrw   r   N)layoutshaperF   make_copy_atomr   	CopyG2SOpLoadCacheModeGLOBALrM   rN   rS   make_tiled_copy_tvceil_divrW   rX   r[   r   launchrL   NUM_THREADS#r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r   r   v_dimk_dim	copy_atomthread_layout
val_layoutr   r   r   r   
smem_bytesr   r   r
   .run_gdn_decode_kernel_small_batch_pretranspose  st   




$
r   c           #      C   s4  | j jd | j jd | j jd }}}tjtjtjjdtj	dd}tj
ddd	}t
d
}t|||}t|t}|| | d d d  td } tj
tttftdtt fd	}!dt t t d|  d|  d }"t|| |!| ||||||||||	|
||||||||||||j|ddftddg|"|d d S )Nr   r   r   r   r   r   r   r   r8   r   r   r   r   r   )r   r   rF   r   r   r   r   r   rM   rN   rS   r   r   rW   rX   r[   r   r   r   r   r   r   r
   ,run_gdn_decode_kernel_big_batch_pretranspose  st   




$
r   dtypec	           	      C      i S )zECache compiled kernel for given configuration (pretranspose version).r   	r.   r/   r0   r-   r1   r2   r   r,   r4   r   r   r
   _get_compiled_decode_kernel~     r   c	           	      C   r   )zECache compiled kernel for given configuration (nontranspose version).r   r   r   r   r
   (_get_compiled_decode_kernel_nontranspose  r   r   stateoutputc           '      C   sf  | j \}}}}|dksJ d| |j \}}}}|j ||||fks6J d| d| d| d| d|j  
|dksAJ d	| |dksLJ d
| |t dks\J dt d| | jtjtjfv smJ d| j |jtjks{J d|j |jtjksJ d|j |du r|d }|	du}|r|	jn| j}|	du rtj||||ftj| jd}	|	|| ||}||||||| j||
f	}t
| }d|vs|d j| jkrtj|tj| jd|d< tj|d tj| jd|d< |d }|d }d|vrvttj j}t|dd}t|dd}t|dd}t|dd}t| dd}t|dd}t|dd} t|dd}!t|	dd}"t|dd}#t|dd}$|dkrNt}%nt}%tj|%||||||| |!|"|#|$fdd|||||||d|
d|dd}&|&|d< n|d }&ttj j}|d ||||| ||||	||| | s||	|||| |	j|kr|	|}	|	|fS ) a`  Gated Delta Rule Decode kernel for single-token generation.

    This implements the decode phase of gated delta rule linear attention,
    processing one token at a time and updating the recurrent state.

    Args:
        q (torch.Tensor):
            Current query of shape ``[B, 1, H, K]``. Must be float16/bfloat16.
        k (torch.Tensor):
            Current key of shape ``[B, 1, H, K]``. Must be float16/bfloat16.
        v (torch.Tensor):
            Current value of shape ``[B, 1, HV, V]``. Must be float16/bfloat16.
        state (torch.Tensor):
            Current state of shape ``[B, HV, V, K]`` (v-major layout).
            Must be float32. Will be updated in-place.
        A_log (torch.Tensor):
            Log decay parameter of shape ``[HV]``. Must be float32.
        a (torch.Tensor):
            Input-dependent decay of shape ``[B, 1, HV]``. Must be float16/bfloat16.
        dt_bias (torch.Tensor):
            Decay bias of shape ``[HV]``. Must be bfloat16 or float32.
        b (torch.Tensor):
            Update gate (beta) input of shape ``[B, 1, HV]``. Must be float16/bfloat16.
        scale (Optional[float]):
            Scale factor for queries. If None, defaults to ``1 / sqrt(K)``.
        output (Optional[torch.Tensor]):
            Pre-allocated output tensor of shape ``[B, 1, HV, V]``.
            If None, will be allocated automatically.
        use_qk_l2norm (bool):
            Whether to apply L2 normalization to q and k. Default: ``True``.

    Returns:
        Tuple[torch.Tensor, torch.Tensor]:
            - output: Output tensor of shape ``[B, 1, HV, V]``
            - state: Updated state tensor of shape ``[B, HV, V, K]``

    Note:
        - Requires SM90 (Hopper) architecture
        - State is updated in-place
        - K and V must be multiples of 4 for vectorized loads
        - State layout is v-major: [B, HV, V, K]
    r    Decode only supports T=1, got T=Expected state shape [B=, HV=, V=, K=], got r   K must be at least 128, got K=V must be at least 128, got V=r   V must be divisible by ( to prevent out-of-bounds access, got V= q must be float16/bfloat16, got state must be float32, got A_log must be float32, got N      ࿩r   devicer(   r)   compiledr   assumed_alignr   r=         4@TF--enable-tvm-ffi)r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r   options)r   rW   r   torchfloat16bfloat16float32zerosr   reshaper   int32cudaCUstreamcurrent_streamcuda_streamr   r   r   rF   compileis_contiguouscopy_to)'r#   r$   r%   r   r    r!   r"   r&   r,   r   r4   r.   r/   r0   r1   rj   r-   r2   output_providedtarget_dtyper   	cache_keycacher(   r)   r   h0_source_tensorA_log_tensora_tensordt_bias_tensorq_tensork_tensorv_tensorb_tensoro_tensorh0_indices_tensorcu_seqlens_tensorrun_funcr   r   r   r
   $gated_delta_rule_decode_pretranspose  s   9"





r
  c           i   	   C   s  t j \}}}|d }t j }t j|}t j \}}}d}t| }d| }t| }|t }|t }|t }|| }|| } || }!|!||  }"||  }#|#dkr|| }$|| }%|| }&|&|% }'t	j
 }(|(t	j|d})t jtfdd}*|(t	j|*d}+t jtfdd},t jtfdd}-|(t	j|,d}.|(t	j|-d}/|tk rt	|| d|"|f |.|< t	|| d|"|f |/|< |#| |! }0||0ddf }1t |1ttfd}2| |}3t	td	 |}4t|4D ].}5||5 }6|5t }7|2dd|6f }8|)dd|7f }9|3|8}:|3|9};t | |:|; t j  qt	|	|! }<t	|
|! }=t	|| d|!f }>t	|| d|!f }?d
}@d
}A|dkr|>|= }B||B }Cd
}D|C|krtt j|Cdd}Et	d|E }Ft	t j|Fdd}Gt	t	d| |G }Dn|B}Dt j|<dd |D }Hddt j|? dd  }At j|Hdd}@t j|@d}@t j|Ad}At j  |rsd
}Id
}J|tk r|/| }K|.| }L|K|K }I|L|L }JdD ]}M|It jj|I|Mddd7 }I|Jt jj|J|Mddd7 }Jq|dkr|I|+|< |J|+|d < t j  d
}Nd
}O|dkrId
}Pd
}Q||k r|+| }P|+|d  }QdD ]}M|Pt jj|P|Mddd7 }P|Qt jj|Q|Mddd7 }Qq|dkrIt j|Pd dd|+d< t j|Qd dd|+d	< t j  |+d }N|+d	 }O|tk rm|.| |O |.|< |/| | |N |/|< t j  n|tk r|/| | |/|< t j  t|D ]B}5||5 }6|5t }7t jd t j  |5|4 }R|R|k r||R }S|Rt }T|2dd|Sf }U|)dd|Tf }V|3|U}:|3|V};t | |:|; t j  |6t |' }Wt	|| d|!|Wf }Xd
}Yt|ddD ]}Z|Z| }[|[|$ }\|)|\|'|7f |@ }]|.|\ }^|Y|]|^ 7 }YqdD ]}M|Yt jj|Y|M| ddd7 }Yq|X|Y |A }_t j|_|%}_d
}`t|ddD ]/}Z|Z| }[|[|$ }\|)|\|'|7f |@ }a|.|\ }^|/|\ }b|a|^|_  }c|c|)|\|'|7f< |`|c|b 7 }`q5dD ]}M|`t jj|`|M| ddd7 }`qg|$dkr|6t |' }dt	|`|| d|!|df< t j  t	 |D ]*}Z||Zd  }e|et }f|et }g|ftk r|)|f|g|7f }]|6t |g }h|]||0|f|hf< qt j  qdS dS )zDSmall batch kernel for (N, 1, ...) format with K-major state layout.r   r   r   r   r7   r8   Nr   Nr   r:   Tr;   r=   r>   r?   r@   rA   )r   r   rE   r   unrollr   r   r   )!rF   rG   rH   rI   rJ   rK   TILE_V_SMALL_NT	TILE_K_NTNUM_BLOCKS_PER_STATE_SMALL_NTrM   rO   rP   rQ   rN   rS   rV   rY   rZ   NUM_STAGES_NTr\   r]   r^   r_   r`   rc   rd   re   rU   rf   rg   rh   rR   rb   )ir   r   r   r   r#   r$   r%   r!   r&   r    r"   r'   r(   r*   r+   r,   r0   r-   r4   ri   rj   in_warp_tidrI   rK   NUM_WARPS_SMALLV_PER_WARP_SMALLROWS_PER_ITER_SMALLNUM_K_ITERS_SMALLrl   rm   rn   start_v_tilero   rp   rq   pool_idxk_localv_localv_basev_idxrw   rx   smem_o_layoutsmem_osmem_k_layoutsmem_q_layoutsKsQflat_idxr   r   r   r   v_tile_offsetr   r   r   r   r   r   rs   ru   rt   rv   r   r   r   r   r   r   r   r   r   sum_q_partialsum_k_partialq_valk_valrB   r   r   local_sum_qlocal_sum_knext_v_tile_offsetnext_v_tiler   r   r   v_globalr_vr   k_iterk_basek_idxh_valr_k_valr   r   h_oldr_q_valh_newv_global_outflat_tidk_writev_writev_global_writer   r   r
   *gdn_decode_kernel_small_batch_nontransposeY  sr  


























 wr=  c           _   	   C   sJ  t j \}}}|d }t j }t j|}t j \}}}|| }|| }|||  }|| }|dkr|t }|t }|t }|| }tj	 } | 
tj|d}!t jtfdd}"| 
tj|"d}#t jtfdd}$t jtfdd}%| 
tj|$d}&| 
tj|%d}'|tk rt||d||f |&|< t||d||f |'|< || | }(||(ddf })t |)ttfd}*| |}+ttd |},t|,D ]*}-|-t }.|*dd|-f }/|!dd|.f }0|+|/}1|+|0}2t | |1|2 t j  qt|	| }3t|
| }4t||d|f }5t||d|f }6d	}7d	}8|dkrq|5|4 }9||9 }:d	};|:|krRt j|:d
d}<td|< }=tt j|=d
d}>ttd| |> };n|9};t j|3d
d |; }?ddt j|6 d
d  }8t j|?d
d}7t j|7d}7t j|8d}8t j  |rQd	}@d	}A|tk r|'| }B|&| }C|B|B }@|C|C }AdD ]}D|@t jj|@|Dddd7 }@|At jj|A|Dddd7 }Aq|dkr|@|#|< |A|#|d < t j  d	}Ed	}F|dkr'd	}Gd	}H|tk r|#| }G|#|d  }HdD ]}D|Gt jj|G|Dddd7 }G|Ht jj|H|Dddd7 }Hq|dkr't j|Gd d
d|#d< t j|Hd d
d|#d< t j  |#d }E|#d }F|tk rK|&| |F |&|< |'| | |E |'|< t j  n|tk r^|'| | |'|< t j  t|D ]:}-|-t }.t jd t j  |-|, }I|I|k r|It }J|*dd|If }K|!dd|Jf }L|+|K}1|+|L}2t | |1|2 t j  |-t | }Mt||d||Mf }Nd	}Ott ddD ]}P|Pt! }Q|Q| }R|!|R||.f |7 }S|&|R }T|O|S|T 7 }OqdD ]}D|Ot jj|O|Dt ddd7 }Oq|N|O |8 }Ut j|U|}Ud	}Vtt ddD ]/}P|Pt! }Q|Q| }R|!|R||.f |7 }W|&|R }T|'|R }X|W|T|U  }Y|Y|!|R||.f< |V|Y|X 7 }VqdD ]}D|Vt jj|V|Dt ddd7 }Vq=|dkre|-t | }Zt"|V||d||Zf< t j  t#t D ]*}P||Pd  }[|[t }\|[t }]|\tk r|!|\|]|.f }S|-t |] }^|S||(|\|^f< qot j  qgdS dS )zDLarge batch kernel for (N, 1, ...) format with K-major state layout.r   r   r   r7   r8   Nr  r   r:   Tr;   r=   r>   r?   r@   rA   r   r  rE   r  r   )$rF   rG   rH   rI   rJ   rK   V_PER_WARP_NTrM   rO   rP   rQ   rN   rS   	TILE_V_NTr  rV   rY   rZ   r  r\   r]   r^   r_   r`   rc   rd   re   rU   rf   NUM_WARPS_LARGE_NTrg   rh   NUM_K_ITERS_NTROWS_PER_ITER_NTrR   rb   )_r   r   r   r   r#   r$   r%   r!   r&   r    r"   r'   r(   r*   r+   r,   r0   r-   r4   ri   rj   r  rI   rl   ro   rp   rq   r  r  r  r  r  rw   rx   r  r  r   r!  r"  r#  r$  r   r   r   r   r   r   r   r   r   r   rs   ru   rt   rv   r   r   r   r   r   r   r   r   r   r&  r'  r(  r)  rB   r   r   r*  r+  r-  r   r   r   r.  r/  r   r0  r1  r2  r3  r4  r   r   r5  r6  r7  r8  r9  r:  r;  r<  r   r   r
   (gdn_decode_kernel_big_batch_nontransposeK  sP  


























 vrC  c           "      C   s  |j j\}}}|	j jd  |}tjtjtjjdtj	dd}t
|t}tjtttftdtt fd}tjddd}td	}t|||} d
t t t d
t  d
t d  d }!t| |||||||||||
|	||||||j|t ddftddg|!|d d S )Nr   r   r   r   r   r8   )r   r   )r   r   r   r   r   r   r   )r   r   rF   r   r   r   r   r   rM   rN   r   r  rS   r  r  TILE_V_SMALL_PADDED_NTr   r=  r   r  NUM_THREADS_NT)"r)   r#   r$   r%   r!   r&   r    r"   r   r(   r'   r*   r+   r,   r.   r/   r0   r-   r1   r2   r3   r4   r   batch_hv_dimr   r   r   r   num_v_tiles_smallsmem_layout_smallthread_layout_smallval_layout_smalltiled_copy_load_smallsmem_bytes_smallr   r   r
   .run_gdn_decode_kernel_small_batch_nontranspose)  sj   


rM  c           "      C   s   |j j\}}}|	j jd  |}tjtjtjjdtj	dd}t
|t}tjtttftdtt fd}tjddd}td	}t|||} d
t t t d
t  d
t d  d }!t| |||||||||||
|	||||||j|ddftddg|!|d d S )Nr   r   r   r   r   r8   )r   r   )r   r   r   r   r   r   r   )r   r   rF   r   r   r   r   r   rM   rN   r   r?  rS   r  r  TILE_V_PADDED_NTr   rC  r   NUM_THREADS_LARGE_NT)"r)   r#   r$   r%   r!   r&   r    r"   r   r(   r'   r*   r+   r,   r.   r/   r0   r-   r1   r2   r3   r4   r   rF  r   r   r   r   r   base_smem_layoutr   r   r   r   r   r   r
   ,run_gdn_decode_kernel_big_batch_nontransposez  sf   


rQ  c           )      C   sh  | j \}}}}|dksJ d| |j \}}}}|j ||||fks6J d| d| d| d| d|j  
|dksAJ d	| |dksLJ d
| |t dks\J dt d| | jtjtjfv smJ d| j |jtjks{J d|j |jtjksJ d|j |du r|d }|	du}|r|	jn| j}|	du rtj||||ftj| jd}	|	 }|
|| ||}||||||| j||
f	}t| }d|vs|d j| jkrtj|tj| jd|d< tj|d tj| jd|d< |d }|d }d|vr{ttj j}|tk }|rt}nt}t|dd}t|dd}t|dd}t|dd} t| dd}!t|dd}"t|dd}#t|dd}$t|	dd}%t|dd}&t|dd}'tj||'|!|"|#||$|| ||&|%fdd|||||||d|
|dd}(|(|d< n|d }(ttj j}|(|| |||||||||	| | | kr|| |	j|kr|	|}	|	|fS )a  Gated Delta Rule Decode kernel (K-major layout, no transpose needed).

    This implements the decode phase of gated delta rule linear attention,
    processing one token at a time and updating the recurrent state.
    This version uses K-major state layout [B, HV, K, V] which is more natural
    and doesn't require transposition.

    Args:
        q (torch.Tensor):
            Current query of shape ``[B, 1, H, K]``. Must be float16/bfloat16.
        k (torch.Tensor):
            Current key of shape ``[B, 1, H, K]``. Must be float16/bfloat16.
        v (torch.Tensor):
            Current value of shape ``[B, 1, HV, V]``. Must be float16/bfloat16.
        state (torch.Tensor):
            Current state of shape ``[B, HV, K, V]`` (k-major layout).
            Must be float32. Will be updated in-place.
        A_log (torch.Tensor):
            Log decay parameter of shape ``[HV]``. Must be float32.
        a (torch.Tensor):
            Input-dependent decay of shape ``[B, 1, HV]``. Must be float16/bfloat16.
        dt_bias (torch.Tensor):
            Decay bias of shape ``[HV]``. Must be bfloat16 or float32.
        b (torch.Tensor):
            Update gate (beta) input of shape ``[B, 1, HV]``. Must be float16/bfloat16.
        scale (Optional[float]):
            Scale factor for queries. If None, defaults to ``1 / sqrt(K)``.
        output (Optional[torch.Tensor]):
            Pre-allocated output tensor of shape ``[B, 1, HV, V]``.
            If None, will be allocated automatically.
        use_qk_l2norm (bool):
            Whether to apply L2 normalization to q and k. Default: ``True``.

    Returns:
        Tuple[torch.Tensor, torch.Tensor]:
            - output: Output tensor of shape ``[B, 1, HV, V]``
            - state: Updated state tensor of shape ``[B, HV, K, V]``

    Note:
        - Requires SM90 (Hopper) architecture
        - State is updated in-place
        - K and V must be multiples of 4 for vectorized loads
        - State layout is k-major: [B, HV, K, V] (no transpose needed)
    r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   Nr   r   r(   r)   r   r   r   r=   r   Tr   )r*   r+   r,   r.   r/   r0   r-   r1   r2   r3   r4   r   r   )r   r?  r   r   r   r   r   r   r   
contiguousviewr   aranger   r   r   r   r   SMALL_BATCH_THRESHOLD_NTrM  rQ  r   rF   r   data_ptrr   r   ))r#   r$   r%   r   r    r!   r"   r&   r,   r   r4   r.   r/   r0   r1   rj   r-   r2   r   r   state_contiguousr   r   r   r(   r)   r   use_small_batchr	  r   r   r   r  r  r  r  r  r  r  r  r   r   r   r
   gated_delta_rule_decode  s   ;"




rY  intermediate_statestile_vdisable_state_updatecache_intermediate_statesc           ]   
   C   sT  t j \}}}|d }t j } t j| } || }!d|! }"d|" }#||! }$||! }%| |" |% }&t j \}'}}|'| }(|'| })|)| }*|)| }+|*||  },||+ }-t||* }.t||* }/tj	 }0|0
tjt j||f|d dfdd}1|0
tjt j||f|d dfdd}2|0
tjt |fd}3|0
tjt |fd}4t t j|fddtj}5t t j|fddtj}6t t j|fddtj}7t t j|fddtj}8t t j|fddtj}9|-dkr|$| }:t|D ]o};t |ddd|f|+|;|,|$f}<t |	ddd|f|+|;|,|$f}=t |<|8 t |=|9 t|D ]}>t|8|> |5|>< t|9|> |6|>< q%t|rd	}?d	}@t|D ]}>|?|5|> |5|>  7 }?|@|6|> |6|>  7 }@qJd
D ]}A|?t jj|?|Addd7 }?|@t jj|@|Addd7 }@qdt j|?d dd| }Bt j|@d dd}Ct|D ]}>|5|> |B |5|>< |6|> |C |6|>< qnt|D ]}>|5|> | |5|>< q||!k rt|D ]}>|5|> |1|;|:|> f< |6|> |2|;|:|> f< qt||+|;|*f }Dt||+|;|*f }E|D|/ }F||F }Gt j|Gdd}Htd| t jtd|H dd }I|G|kr!tdntd	}J|J|I td|J |F  }Kt j|.dd |K }Ltdtdt j|E dd  }Mt j|Ldd}N|dkrc|N|3|;< |M|4|;< qt j  ||# }Ot|OD ]4}P|(| |&|O  |P }Q|Q|k r|-| |* }Rt | dd|f|R|Q|$f}St |S|7 t|D ]};t |1d|f|;|$f}Tt |2d|f|;|$f}Ut |T|5 t |U|6 |3|; }N|4|; }Mt|D ]}>|7|> |N |7|>< qd	}Vt|D ]}>|V|7|> |6|>  7 }Vqd
D ]}A|Vt jj|V|Addd7 }Vqt|
|+|;|*|Qf }W|W|V |M }Xt|D ]}>|7|>  |6|> |X 7  < qt|rQ|+| | |;|  |* }Yt |dd|f|Y|Q|$f}Zt |7|Z d	}[t|D ]}>|[|7|> |5|>  7 }[qXd
D ]}A|[t jj|[|Addd7 }[qh|$dkrt|[||+|;|*|Qf< qt| rt | dd|f|R|Q|$f}\t |7|\ qrdS dS )av  
    Parallel MTP kernel - each block handles one [TILE_V, TILE_K] tile.

    Grid: (B * HV * num_v_tiles, 1, 1)
    Each block:
    - Loads its v_tile of state into registers
    - Processes all T time steps with state in registers
    - Writes output and optionally updates state

    This matches Triton's parallelization strategy for better small-batch performance.
    r   r   r   r   r8   r   r7   r   r:   r>   r?   r@   rA   rE   Tr;   r=   N)rF   rG   rH   rI   rJ   rK   rM   rN   rO   rP   rQ   rS   rT   rR   rb   rV   ra   
const_exprrf   rg   rc   rd   rU   )]r   rZ  r   r   r[  r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   r5   r\  r]  ri   rj   rk   rI   threads_per_groupgroups_per_warp
num_groupslane_in_groupgroup_in_warp	group_idxrl   i_vtmprp   ro   rq   	cache_idxrs   ru   rw   r#  r"  sGsBetar|   r{   r}   r~   r   r   rr   r   r   r   r   r   rB   inv_norm_q_scaledr   rt   rv   r   r   r   softplus_valuse_softplusr   r   r   r   rows_per_grouprow_in_groupr  flat_state_idxh_tilesQ_tilesK_tiler   r/  r   r$  
inter_tiler   
h_tile_outr   r   r
   gdn_verify_kernel_mtp  sH  +









 \ru  c           #      C   s   | j jd | j jd | j jd }}}t||} || |  }!d| |d  d| |d   d|  d|  d }"t| ||| |||||||||	|
|||||||||||||||j|!ddftddg|"|d d S )Nr   r   r   r   r   r   r   )r   r   rF   r   ru  r   NUM_THREADS_MTP)#r   rZ  r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r[  r   r3   r4   r5   r\  r]  r   rj   r   r   r   	grid_sizer   r   r   r
   run_gdn_verify_kernel_mtp  sj   
!



rx  	pool_sizecache_stepsc                 C   r   )z2Cache compiled MTP kernel for given configuration.r   )r.   r/   r0   r-   r1   r2   ry  rz  r\  r]  r,   r4   r[  r   r   r   r
   _get_compiled_mtp_kernel
	  s   r{  initial_stateinitial_state_indicesintermediate_states_bufferc           0      C   s,  | j \}}}}|j \}}}}|j d }t||}t||}|j ||||fks:J d| d| d| d| d|j  
|dksEJ d| |dksPJ d	| || dks`J d
| d| | jtjtjfv sqJ d| j |jtjksJ d|j |jtjksJ d|j |	du r|d }	|
du}|r|
jn| j}|
du rtj||||ftj| j	d}
|
tj|| ||}|du}|r|j d }|j d }||ksJ d| d| d|
tj|| | || }n|}tjdddtj| j	d}|||||||||||	|||f}t| } d| vs!| d j	| j	kr/tj|d tj| j	d| d< | d }!d| vrttj j}"t|dd}#t|dd}$t|dd}%t|dd}&t|dd}'t| dd}(t|dd})t|dd}*t|dd}+t|
dd},t|dd}-t|!dd}.tjt|#|$|%|&|'|(|)|*|+|,|-|.fi ddddd|	d|d |d!|d"|d#|d$|d%|d&|d'd(d)|d*d+d,|d-|d.|"d/d0}/|/| d< n| d }/ttj j}"|/|||||| ||||
||!|" |s| s|||||| |
j|kr|

|}
|
|fS )1am  
    Gated Delta Rule MTP Kernel (Multiple Token Processing).

    This function processes multiple tokens (T > 1) in sequence, typically used for
    speculative decoding verification. It supports intermediate state caching for
    potential rollback scenarios.

    Args:
        q (torch.Tensor):
            Query tensor of shape ``[B, T, H, K]``.
        k (torch.Tensor):
            Key tensor of shape ``[B, T, H, K]``.
        v (torch.Tensor):
            Value tensor of shape ``[B, T, HV, V]``.
        initial_state (torch.Tensor):
            Initial state tensor of shape ``[pool_size, HV, V, K]`` (K-last layout).
        initial_state_indices (torch.Tensor):
            Indices mapping each batch to its initial state, shape ``[B]``.
        A_log (torch.Tensor):
            Log decay parameter of shape ``[HV]``.
        a (torch.Tensor):
            Input-dependent decay of shape ``[B, T, HV]``.
        dt_bias (torch.Tensor):
            Decay bias of shape ``[HV]``.
        b (torch.Tensor):
            Update gate input of shape ``[B, T, HV]``.
        scale (Optional[float]):
            Scaling factor for queries. If None, uses ``1/sqrt(K)``.
        output (Optional[torch.Tensor]):
            Pre-allocated output tensor of shape ``[B, T, HV, V]``.
        intermediate_states_buffer (Optional[torch.Tensor]):
            Buffer for caching intermediate states, shape ``[pool_size, T, HV, V, K]``.
            If None, intermediate states are not cached.
        disable_state_update (bool):
            If True, the initial state is not updated. Default: ``True``.
        use_qk_l2norm (bool):
            Whether to apply L2 normalization to q and k. Default: ``True``.

    Returns:
        Tuple[torch.Tensor, torch.Tensor]:
            - output: Output tensor of shape ``[B, T, HV, V]``
            - initial_state: Updated state tensor (unchanged if disable_state_update=True)

    Note:
        - Requires SM90 (Hopper) architecture
        - Supports T > 1 (multiple token processing)
        - State layout is K-last: [pool_size, HV, V, K]
        - Optimized for speculative decoding verification scenarios
    r   z(Expected initial_state shape [pool_size=r   r   r   r   r   r   r   r   r   r   z#initial_state must be float32, got r   Nr   r   r   z9intermediate_states_buffer second dimension (cache_steps=z) must be at least T=z" to prevent out-of-bounds indexingr)   r   r   r   r*   r=   r+   r   r,   r-   r.   r/   r0   r1   r2   r[  r   r3   Tr4   r5   Fr\  r]  r   r   r   )r   r   r   r   r   r   r   r   r   r   r   r   rR  r{  r   r   r   r   r   r   rF   r   rx  r   r   )0r#   r$   r%   r|  r}  r    r!   r"   r&   r,   r   r~  r\  r4   r.   r/   r0   r1   rj   r-   r2   ry  r[  r   r   r   r   r]  buffer_sizerz  rZ  r   r   r)   r   r   intermediate_states_tensorr   r   r  r  r  r  r  r  r  r  r   r   r   r
   gated_delta_rule_mtp	  s&  C


"







!
r  r7   )NNT)NNNTT)H__doc__	functoolstypingr   r   r   rM   cutlass.cuterF   cutlass.cute.nvgpur   cutlass.cute.runtimer   cuda.bindings.driverbindingsdriverr   api_loggingr   _FLASHINFER_AVAILABLEImportErrorrW   rX   r[   r   rL   r  r?  rN  r  rD  r  rE  r  rO  r@  r>  rB  rA  rU  
TILE_K_MTPrv  intr   r   kernel	TiledCopyTensorLayout	Constexprfloatboolr   r   jitr   r   r   r   r   r   r   r
  r=  rC  rM  rQ  rY  ru  rx  r{  r  r   r   r   r
   <module>   s   		
  	
  	
g	
q		
	
 9	
 r	
 ^	
P	
S
	
 L	
  	
Y	
	
